giga_spatial-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- giga_spatial-0.6.0.dist-info/METADATA +141 -0
- giga_spatial-0.6.0.dist-info/RECORD +47 -0
- giga_spatial-0.6.0.dist-info/WHEEL +5 -0
- giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
- giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
- gigaspatial/__init__.py +1 -0
- gigaspatial/config.py +226 -0
- gigaspatial/core/__init__.py +0 -0
- gigaspatial/core/io/__init__.py +5 -0
- gigaspatial/core/io/adls_data_store.py +325 -0
- gigaspatial/core/io/data_api.py +113 -0
- gigaspatial/core/io/data_store.py +147 -0
- gigaspatial/core/io/local_data_store.py +92 -0
- gigaspatial/core/io/readers.py +265 -0
- gigaspatial/core/io/writers.py +128 -0
- gigaspatial/core/schemas/__init__.py +0 -0
- gigaspatial/core/schemas/entity.py +244 -0
- gigaspatial/generators/__init__.py +2 -0
- gigaspatial/generators/poi.py +636 -0
- gigaspatial/generators/zonal/__init__.py +3 -0
- gigaspatial/generators/zonal/base.py +370 -0
- gigaspatial/generators/zonal/geometry.py +439 -0
- gigaspatial/generators/zonal/mercator.py +78 -0
- gigaspatial/grid/__init__.py +1 -0
- gigaspatial/grid/mercator_tiles.py +286 -0
- gigaspatial/handlers/__init__.py +40 -0
- gigaspatial/handlers/base.py +761 -0
- gigaspatial/handlers/boundaries.py +305 -0
- gigaspatial/handlers/ghsl.py +772 -0
- gigaspatial/handlers/giga.py +145 -0
- gigaspatial/handlers/google_open_buildings.py +472 -0
- gigaspatial/handlers/hdx.py +241 -0
- gigaspatial/handlers/mapbox_image.py +208 -0
- gigaspatial/handlers/maxar_image.py +291 -0
- gigaspatial/handlers/microsoft_global_buildings.py +548 -0
- gigaspatial/handlers/ookla_speedtest.py +199 -0
- gigaspatial/handlers/opencellid.py +290 -0
- gigaspatial/handlers/osm.py +356 -0
- gigaspatial/handlers/overture.py +126 -0
- gigaspatial/handlers/rwi.py +157 -0
- gigaspatial/handlers/unicef_georepo.py +806 -0
- gigaspatial/handlers/worldpop.py +266 -0
- gigaspatial/processing/__init__.py +4 -0
- gigaspatial/processing/geo.py +1054 -0
- gigaspatial/processing/sat_images.py +39 -0
- gigaspatial/processing/tif_processor.py +477 -0
- gigaspatial/processing/utils.py +49 -0
gigaspatial/core/io/data_store.py
@@ -0,0 +1,147 @@
+from abc import ABC, abstractmethod
+from typing import Any, List, Generator, Union
+
+
+class DataStore(ABC):
+    """
+    Abstract base class defining the interface for data store implementations.
+    This class serves as a parent for both local and cloud-based storage solutions.
+    """
+
+    @abstractmethod
+    def read_file(self, path: str) -> Any:
+        """
+        Read contents of a file from the data store.
+
+        Args:
+            path: Path to the file to read
+
+        Returns:
+            Contents of the file
+
+        Raises:
+            IOError: If file cannot be read
+        """
+        pass
+
+    @abstractmethod
+    def write_file(self, path: str, data: Any) -> None:
+        """
+        Write data to a file in the data store.
+
+        Args:
+            path: Path where to write the file
+            data: Data to write to the file
+
+        Raises:
+            IOError: If file cannot be written
+        """
+        pass
+
+    @abstractmethod
+    def file_exists(self, path: str) -> bool:
+        """
+        Check if a file exists in the data store.
+
+        Args:
+            path: Path to check
+
+        Returns:
+            True if file exists, False otherwise
+        """
+        pass
+
+    @abstractmethod
+    def list_files(self, path: str) -> List[str]:
+        """
+        List all files in a directory.
+
+        Args:
+            path: Directory path to list
+
+        Returns:
+            List of file paths in the directory
+        """
+        pass
+
+    @abstractmethod
+    def walk(self, top: str) -> Generator:
+        """
+        Walk through directory tree, similar to os.walk().
+
+        Args:
+            top: Starting directory for the walk
+
+        Returns:
+            Generator yielding tuples of (dirpath, dirnames, filenames)
+        """
+        pass
+
+    @abstractmethod
+    def open(self, file: str, mode: str = "r") -> Union[str, bytes]:
+        """
+        Context manager for file operations.
+
+        Args:
+            file: Path to the file
+            mode: File mode ('r', 'w', 'rb', 'wb')
+
+        Yields:
+            File-like object
+
+        Raises:
+            IOError: If file cannot be opened
+        """
+        pass
+
+    @abstractmethod
+    def is_file(self, path: str) -> bool:
+        """
+        Check if path points to a file.
+
+        Args:
+            path: Path to check
+
+        Returns:
+            True if path is a file, False otherwise
+        """
+        pass
+
+    @abstractmethod
+    def is_dir(self, path: str) -> bool:
+        """
+        Check if path points to a directory.
+
+        Args:
+            path: Path to check
+
+        Returns:
+            True if path is a directory, False otherwise
+        """
+        pass
+
+    @abstractmethod
+    def remove(self, path: str) -> None:
+        """
+        Remove a file.
+
+        Args:
+            path: Path to the file to remove
+
+        Raises:
+            IOError: If file cannot be removed
+        """
+        pass
+
+    @abstractmethod
+    def rmdir(self, dir: str) -> None:
+        """
+        Remove a directory and all its contents.
+
+        Args:
+            dir: Path to the directory to remove
+
+        Raises:
+            IOError: If directory cannot be removed
+        """
+        pass
gigaspatial/core/io/local_data_store.py
@@ -0,0 +1,92 @@
+from pathlib import Path
+import os
+from typing import Any, List, Generator, Tuple, Union, IO
+
+from .data_store import DataStore
+
+
+class LocalDataStore(DataStore):
+    """Implementation for local filesystem storage."""
+
+    def __init__(self, base_path: Union[str, Path] = ""):
+        super().__init__()
+        self.base_path = Path(base_path).resolve()
+
+    def _resolve_path(self, path: str) -> Path:
+        """Resolve path relative to base directory."""
+        return self.base_path / path
+
+    def read_file(self, path: str) -> bytes:
+        full_path = self._resolve_path(path)
+        with open(full_path, "rb") as f:
+            return f.read()
+
+    def write_file(self, path: str, data: Union[bytes, str]) -> None:
+        full_path = self._resolve_path(path)
+        self.mkdir(str(full_path.parent), exist_ok=True)
+
+        if isinstance(data, str):
+            mode = "w"
+            encoding = "utf-8"
+        else:
+            mode = "wb"
+            encoding = None
+
+        with open(full_path, mode, encoding=encoding) as f:
+            f.write(data)
+
+    def file_exists(self, path: str) -> bool:
+        return self._resolve_path(path).is_file()
+
+    def list_files(self, path: str) -> List[str]:
+        full_path = self._resolve_path(path)
+        return [
+            str(f.relative_to(self.base_path))
+            for f in full_path.iterdir()
+            if f.is_file()
+        ]
+
+    def walk(self, top: str) -> Generator[Tuple[str, List[str], List[str]], None, None]:
+        full_path = self._resolve_path(top)
+        for root, dirs, files in os.walk(full_path):
+            rel_root = str(Path(root).relative_to(self.base_path))
+            yield rel_root, dirs, files
+
+    def list_directories(self, path: str) -> List[str]:
+        full_path = self._resolve_path(path)
+
+        if not full_path.exists():
+            return []
+
+        if not full_path.is_dir():
+            return []
+
+        return [d.name for d in full_path.iterdir() if d.is_dir()]
+
+    def open(self, path: str, mode: str = "r") -> IO:
+        full_path = self._resolve_path(path)
+        self.mkdir(str(full_path.parent), exist_ok=True)
+        return open(full_path, mode)
+
+    def is_file(self, path: str) -> bool:
+        return self._resolve_path(path).is_file()
+
+    def is_dir(self, path: str) -> bool:
+        return self._resolve_path(path).is_dir()
+
+    def remove(self, path: str) -> None:
+        full_path = self._resolve_path(path)
+        if full_path.is_file():
+            os.remove(full_path)
+
+    def rmdir(self, directory: str) -> None:
+        full_path = self._resolve_path(directory)
+        if full_path.is_dir():
+            os.rmdir(full_path)
+
+    def mkdir(self, path: str, exist_ok: bool = False) -> None:
+        full_path = self._resolve_path(path)
+        full_path.mkdir(parents=True, exist_ok=exist_ok)
+
+    def exists(self, path: str) -> bool:
+        return self._resolve_path(path).exists()
gigaspatial/core/io/readers.py
@@ -0,0 +1,265 @@
+import pandas as pd
+import geopandas as gpd
+from pathlib import Path
+import json
+import io
+import zipfile
+import gzip
+
+from .data_store import DataStore
+
+
+def read_json(data_store: DataStore, path, **kwargs):
+    with data_store.open(path, "r") as f:
+        return json.load(f, **kwargs)
+
+
+def read_kmz(file_obj, **kwargs):
+    """Helper function to read KMZ files and return a GeoDataFrame."""
+    try:
+        with zipfile.ZipFile(file_obj) as kmz:
+            # Find the KML file in the archive (usually doc.kml)
+            kml_filename = next(
+                name for name in kmz.namelist() if name.endswith(".kml")
+            )
+
+            # Read the KML content
+            kml_content = io.BytesIO(kmz.read(kml_filename))
+
+            gdf = gpd.read_file(kml_content)
+
+            # Validate the GeoDataFrame
+            if gdf.empty:
+                raise ValueError(
+                    "The KML file is empty or does not contain valid geospatial data."
+                )
+
+            return gdf
+
+    except zipfile.BadZipFile:
+        raise ValueError("The provided file is not a valid KMZ file.")
+    except StopIteration:
+        raise ValueError("No KML file found in the KMZ archive.")
+    except Exception as e:
+        raise RuntimeError(f"An error occurred: {e}")
+
+
+def read_gzipped_json_or_csv(file_path, data_store):
+    """Reads a gzipped file, attempting to parse it as JSON (lines=True) or CSV."""
+
+    with data_store.open(file_path, "rb") as f:
+        g = gzip.GzipFile(fileobj=f)
+        text = g.read().decode("utf-8")
+        try:
+            df = pd.read_json(io.StringIO(text), lines=True)
+            return df
+        except json.JSONDecodeError:
+            try:
+                df = pd.read_csv(io.StringIO(text))
+                return df
+            except pd.errors.ParserError:
+                print(f"Error: Could not parse {file_path} as JSON or CSV.")
+                return None
+
+
+def read_dataset(data_store: DataStore, path: str, compression: str = None, **kwargs):
+    """
+    Read data from various file formats stored in both local and cloud-based storage.
+
+    Parameters:
+    ----------
+    data_store : DataStore
+        Instance of DataStore for accessing data storage.
+    path : str, Path
+        Path to the file in data storage.
+    compression : str, optional
+        Compression format; inferred from the file extension when None.
+    **kwargs : dict
+        Additional arguments passed to the specific reader function.
+
+    Returns:
+    -------
+    pandas.DataFrame or geopandas.GeoDataFrame
+        The data read from the file.
+
+    Raises:
+    ------
+    FileNotFoundError
+        If the file doesn't exist in data storage.
+    ValueError
+        If the file type is unsupported or if there's an error reading the file.
+    """
+
+    # Define supported file formats and their readers
+    BINARY_FORMATS = {
+        ".shp",
+        ".zip",
+        ".parquet",
+        ".gpkg",
+        ".xlsx",
+        ".xls",
+        ".kmz",
+        ".gz",
+    }
+
+    PANDAS_READERS = {
+        ".csv": pd.read_csv,
+        ".xlsx": lambda f, **kw: pd.read_excel(f, engine="openpyxl", **kw),
+        ".xls": lambda f, **kw: pd.read_excel(f, engine="xlrd", **kw),
+        ".json": pd.read_json,
+        # ".gz": lambda f, **kw: pd.read_csv(f, compression="gzip", **kw),
+    }
+
+    GEO_READERS = {
+        ".shp": gpd.read_file,
+        ".zip": gpd.read_file,
+        ".geojson": gpd.read_file,
+        ".gpkg": gpd.read_file,
+        ".parquet": gpd.read_parquet,
+        ".kmz": read_kmz,
+    }
+
+    COMPRESSION_FORMATS = {
+        ".gz": "gzip",
+        ".bz2": "bz2",
+        ".zip": "zip",
+        ".xz": "xz",
+    }
+
+    try:
+        # Check if file exists
+        if not data_store.file_exists(path):
+            raise FileNotFoundError(f"File '{path}' not found in data store")
+
+        path_obj = Path(path)
+        suffixes = path_obj.suffixes
+        file_extension = suffixes[-1].lower() if suffixes else ""
+
+        if compression is None and file_extension in COMPRESSION_FORMATS:
+            compression_format = COMPRESSION_FORMATS[file_extension]
+
+            # if file has multiple extensions (e.g., .csv.gz), get the inner format
+            if len(suffixes) > 1:
+                inner_extension = suffixes[-2].lower()
+
+                if inner_extension == ".tar":
+                    raise ValueError(
+                        "Tar archives (.tar.gz) are not directly supported"
+                    )
+
+                if inner_extension in PANDAS_READERS:
+                    try:
+                        with data_store.open(path, "rb") as f:
+                            return PANDAS_READERS[inner_extension](
+                                f, compression=compression_format, **kwargs
+                            )
+                    except Exception as e:
+                        raise ValueError(f"Error reading compressed file: {str(e)}")
+                elif inner_extension in GEO_READERS:
+                    try:
+                        with data_store.open(path, "rb") as f:
+                            if compression_format == "gzip":
+                                decompressed_data = gzip.decompress(f.read())
+                                return GEO_READERS[inner_extension](
+                                    io.BytesIO(decompressed_data), **kwargs
+                                )
+                            else:
+                                raise ValueError(
+                                    f"Compression format {compression_format} not supported for geo data"
+                                )
+                    except Exception as e:
+                        raise ValueError(f"Error reading compressed geo file: {str(e)}")
+            else:
+                # if just .gz without clear inner type, assume csv
+                try:
+                    with data_store.open(path, "rb") as f:
+                        return pd.read_csv(f, compression=compression_format, **kwargs)
+                except Exception as e:
+                    raise ValueError(
+                        f"Error reading compressed file as CSV: {str(e)}. "
+                        f"If not a CSV, specify the format in the filename (e.g., .json.gz)"
+                    )
+
+        # Special handling for compressed files
+        if file_extension == ".zip":
+            # For zip files, we need to use binary mode
+            with data_store.open(path, "rb") as f:
+                return gpd.read_file(f)
+
+        # Determine if we need binary mode based on file type
+        mode = "rb" if file_extension in BINARY_FORMATS else "r"
+
+        # Try reading with appropriate reader
+        if file_extension in PANDAS_READERS:
+            try:
+                with data_store.open(path, mode) as f:
+                    return PANDAS_READERS[file_extension](f, **kwargs)
+            except Exception as e:
+                raise ValueError(f"Error reading file with pandas: {str(e)}")
+
+        if file_extension in GEO_READERS:
+            try:
+                with data_store.open(path, "rb") as f:
+                    return GEO_READERS[file_extension](f, **kwargs)
+            except Exception as e:
+                # For parquet files, try pandas reader if geopandas fails
+                if file_extension == ".parquet":
+                    try:
+                        with data_store.open(path, "rb") as f:
+                            return pd.read_parquet(f, **kwargs)
+                    except Exception as e2:
+                        raise ValueError(
+                            f"Failed to read parquet with both geopandas ({str(e)}) "
+                            f"and pandas ({str(e2)})"
+                        )
+                raise ValueError(f"Error reading file with geopandas: {str(e)}")
+
+        # If we get here, the file type is unsupported
+        supported_formats = sorted(set(PANDAS_READERS.keys()) | set(GEO_READERS.keys()))
+        supported_compressions = sorted(COMPRESSION_FORMATS.keys())
+        raise ValueError(
+            f"Unsupported file type: {file_extension}\n"
+            f"Supported formats: {', '.join(supported_formats)}\n"
+            f"Supported compressions: {', '.join(supported_compressions)}"
+        )
+
+    except Exception as e:
+        if isinstance(e, (FileNotFoundError, ValueError)):
+            raise
+        raise RuntimeError(f"Unexpected error reading dataset: {str(e)}")
+
+
+def read_datasets(data_store: DataStore, paths, **kwargs):
+    """
+    Read multiple datasets from data storage at once.
+
+    Parameters:
+    ----------
+    data_store : DataStore
+        Instance of DataStore for accessing data storage.
+    paths : list of str
+        Paths to files in data storage.
+    **kwargs : dict
+        Additional arguments passed to read_dataset.
+
+    Returns:
+    -------
+    dict
+        Dictionary mapping paths to their corresponding DataFrames/GeoDataFrames.
+    """
+    results = {}
+    errors = {}
+
+    for path in paths:
+        try:
+            results[path] = read_dataset(data_store, path, **kwargs)
+        except Exception as e:
+            errors[path] = str(e)
+
+    if errors:
+        error_msg = "\n".join(f"- {path}: {error}" for path, error in errors.items())
+        raise ValueError(f"Errors reading datasets:\n{error_msg}")
+
+    return results
gigaspatial/core/io/writers.py
@@ -0,0 +1,128 @@
+import pandas as pd
+import geopandas as gpd
+from pathlib import Path
+import json
+import io
+
+from .data_store import DataStore
+
+
+def write_json(data, data_store: DataStore, path, **kwargs):
+    with data_store.open(path, "w") as f:
+        json.dump(data, f, **kwargs)
+
+
+def write_dataset(data, data_store: DataStore, path, **kwargs):
+    """
+    Write a DataFrame or GeoDataFrame to various file formats in local or cloud-based storage.
+
+    Parameters:
+    ----------
+    data : pandas.DataFrame or geopandas.GeoDataFrame
+        The data to write to data storage.
+    data_store : DataStore
+        Instance of DataStore for accessing data storage.
+    path : str
+        Path where the file will be written in data storage.
+    **kwargs : dict
+        Additional arguments passed to the specific writer function.
+
+    Raises:
+    ------
+    ValueError
+        If the file type is unsupported or if there's an error writing the file.
+    TypeError
+        If input data is not a DataFrame or GeoDataFrame.
+    """
+
+    # Define supported file formats and their writers
+    BINARY_FORMATS = {".shp", ".zip", ".parquet", ".gpkg", ".xlsx", ".xls"}
+
+    PANDAS_WRITERS = {
+        ".csv": lambda df, buf, **kw: df.to_csv(buf, **kw),
+        ".xlsx": lambda df, buf, **kw: df.to_excel(buf, engine="openpyxl", **kw),
+        ".json": lambda df, buf, **kw: df.to_json(buf, **kw),
+        ".parquet": lambda df, buf, **kw: df.to_parquet(buf, **kw),
+    }
+
+    GEO_WRITERS = {
+        ".geojson": lambda gdf, buf, **kw: gdf.to_file(buf, driver="GeoJSON", **kw),
+        ".gpkg": lambda gdf, buf, **kw: gdf.to_file(buf, driver="GPKG", **kw),
+        ".parquet": lambda gdf, buf, **kw: gdf.to_parquet(buf, **kw),
+    }
+
+    try:
+        # Input validation
+        if not isinstance(data, (pd.DataFrame, gpd.GeoDataFrame)):
+            raise TypeError("Input data must be a pandas DataFrame or GeoDataFrame")
+
+        # Get file suffix and ensure it's lowercase
+        suffix = Path(path).suffix.lower()
+
+        # Determine if we need binary mode based on file type
+        mode = "wb" if suffix in BINARY_FORMATS else "w"
+
+        # Handle different data types and formats
+        if isinstance(data, gpd.GeoDataFrame):
+            if suffix not in GEO_WRITERS:
+                supported_formats = sorted(GEO_WRITERS.keys())
+                raise ValueError(
+                    f"Unsupported file type for GeoDataFrame: {suffix}\n"
+                    f"Supported formats: {', '.join(supported_formats)}"
+                )
+
+            try:
+                with data_store.open(path, "wb") as f:
+                    GEO_WRITERS[suffix](data, f, **kwargs)
+            except Exception as e:
+                raise ValueError(f"Error writing GeoDataFrame: {str(e)}")
+
+        else:  # pandas DataFrame
+            if suffix not in PANDAS_WRITERS:
+                supported_formats = sorted(PANDAS_WRITERS.keys())
+                raise ValueError(
+                    f"Unsupported file type for DataFrame: {suffix}\n"
+                    f"Supported formats: {', '.join(supported_formats)}"
+                )
+
+            try:
+                with data_store.open(path, mode) as f:
+                    PANDAS_WRITERS[suffix](data, f, **kwargs)
+            except Exception as e:
+                raise ValueError(f"Error writing DataFrame: {str(e)}")
+
+    except Exception as e:
+        if isinstance(e, (TypeError, ValueError)):
+            raise
+        raise RuntimeError(f"Unexpected error writing dataset: {str(e)}")
+
+
+def write_datasets(data_dict, data_store: DataStore, **kwargs):
+    """
+    Write multiple datasets to data storage at once.
+
+    Parameters:
+    ----------
+    data_dict : dict
+        Dictionary mapping paths to DataFrames/GeoDataFrames.
+    data_store : DataStore
+        Instance of DataStore for accessing data storage.
+    **kwargs : dict
+        Additional arguments passed to write_dataset.
+
+    Raises:
+    ------
+    ValueError
+        If there are any errors writing the datasets.
+    """
+    errors = {}
+
+    for path, data in data_dict.items():
+        try:
+            write_dataset(data, data_store, path, **kwargs)
+        except Exception as e:
+            errors[path] = str(e)
+
+    if errors:
+        error_msg = "\n".join(f"- {path}: {error}" for path, error in errors.items())
+        raise ValueError(f"Errors writing datasets:\n{error_msg}")