earthforge-vector 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthforge_vector-0.1.0/.gitignore +31 -0
- earthforge_vector-0.1.0/PKG-INFO +13 -0
- earthforge_vector-0.1.0/README.md +3 -0
- earthforge_vector-0.1.0/pyproject.toml +18 -0
- earthforge_vector-0.1.0/src/earthforge/vector/__init__.py +6 -0
- earthforge_vector-0.1.0/src/earthforge/vector/convert.py +356 -0
- earthforge_vector-0.1.0/src/earthforge/vector/errors.py +21 -0
- earthforge_vector-0.1.0/src/earthforge/vector/info.py +245 -0
- earthforge_vector-0.1.0/src/earthforge/vector/query.py +384 -0
- earthforge_vector-0.1.0/tests/test_vector_convert.py +226 -0
- earthforge_vector-0.1.0/tests/test_vector_info.py +174 -0
- earthforge_vector-0.1.0/tests/test_vector_query.py +176 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
*.egg
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
*.whl
|
|
9
|
+
|
|
10
|
+
# Virtual environments
|
|
11
|
+
.venv/
|
|
12
|
+
venv/
|
|
13
|
+
|
|
14
|
+
# IDE
|
|
15
|
+
.vscode/
|
|
16
|
+
.idea/
|
|
17
|
+
*.swp
|
|
18
|
+
*.swo
|
|
19
|
+
|
|
20
|
+
# Testing
|
|
21
|
+
.pytest_cache/
|
|
22
|
+
.coverage
|
|
23
|
+
htmlcov/
|
|
24
|
+
.mypy_cache/
|
|
25
|
+
|
|
26
|
+
# OS
|
|
27
|
+
.DS_Store
|
|
28
|
+
Thumbs.db
|
|
29
|
+
|
|
30
|
+
# Claude
|
|
31
|
+
.claude/
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: earthforge-vector
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: EarthForge vector format inspection (GeoParquet, FlatGeobuf, GeoJSON).
|
|
5
|
+
License-Expression: GPL-3.0-or-later
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: earthforge-core>=0.1.0
|
|
8
|
+
Requires-Dist: pyarrow>=14.0
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# earthforge-vector
|
|
12
|
+
|
|
13
|
+
Vector format inspection for EarthForge. Part of the [EarthForge](../../README.md) toolkit.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "earthforge-vector"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "EarthForge vector format inspection (GeoParquet, FlatGeobuf, GeoJSON)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = "GPL-3.0-or-later"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"earthforge-core>=0.1.0",
|
|
14
|
+
"pyarrow>=14.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[tool.hatch.build.targets.wheel]
|
|
18
|
+
packages = ["src/earthforge"]
|
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"""Vector format conversion.
|
|
2
|
+
|
|
3
|
+
Converts between vector geospatial formats with a focus on producing valid
|
|
4
|
+
GeoParquet output. Supports Shapefile, GeoJSON, and other OGR-readable
|
|
5
|
+
formats as input. Writes GeoParquet with proper ``geo`` metadata including
|
|
6
|
+
CRS, geometry types, encoding, and bounding box.
|
|
7
|
+
|
|
8
|
+
Uses GDAL/OGR for reading source formats and pyarrow for writing Parquet.
|
|
9
|
+
Falls back to geopandas if available, but does not require it.
|
|
10
|
+
|
|
11
|
+
Usage::
|
|
12
|
+
|
|
13
|
+
from earthforge.vector.convert import convert_vector
|
|
14
|
+
|
|
15
|
+
result = await convert_vector("buildings.shp", output="buildings.parquet")
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import asyncio
|
|
21
|
+
import json
|
|
22
|
+
from functools import partial
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
from pydantic import BaseModel, Field
|
|
27
|
+
|
|
28
|
+
from earthforge.vector.errors import VectorError
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ConvertResult(BaseModel):
    """Structured result from a vector format conversion.

    Returned by :func:`convert_vector` / :func:`_convert_vector_sync`.

    Attributes:
        source: Input file path.
        output: Output file path.
        input_format: Source format name (e.g. ``"ESRI Shapefile"``).
        output_format: Target format (e.g. ``"geoparquet"``).
        feature_count: Number of features converted.
        geometry_type: Geometry type (e.g. ``"Polygon"``).
        crs: CRS identifier string.
        bbox: Bounding box ``[west, south, east, north]``.
        file_size_bytes: Output file size in bytes.
    """

    # Field titles are display labels — presumably consumed by a CLI table
    # renderer elsewhere in EarthForge; TODO confirm against the CLI code.
    source: str = Field(title="Source")
    output: str = Field(title="Output")
    input_format: str = Field(title="Input Format")
    output_format: str = Field(title="Output Format")
    feature_count: int = Field(title="Features")
    geometry_type: str | None = Field(default=None, title="Geometry Type")
    crs: str | None = Field(default=None, title="CRS")
    bbox: list[float] | None = Field(default=None, title="BBox")
    file_size_bytes: int | None = Field(default=None, title="Size (bytes)")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _ogr_type_to_arrow(ogr_type: int) -> str:
    """Map an OGR field type constant to a pyarrow type name.

    Parameters:
        ogr_type: OGR field type constant (``ogr.OFT*``).

    Returns:
        Arrow type name string. Dates/datetimes are carried as strings, and
        any unrecognized type degrades to ``"string"``.
    """
    from osgeo import ogr

    if ogr_type == ogr.OFTInteger:
        return "int32"
    if ogr_type == ogr.OFTInteger64:
        return "int64"
    if ogr_type == ogr.OFTReal:
        return "float64"
    if ogr_type == ogr.OFTBinary:
        return "binary"
    if ogr_type in (ogr.OFTString, ogr.OFTDate, ogr.OFTDateTime):
        return "string"
    # Unknown/exotic OGR types fall back to string so conversion never fails.
    return "string"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _ogr_geom_type_name(ogr_geom_type: int) -> str:
    """Convert an OGR geometry type constant to a human-readable name.

    Both flat (2D) and 25D (Z) variants map to the same base name; anything
    unrecognized maps to ``"Unknown"``.

    Parameters:
        ogr_geom_type: OGR geometry type constant (``ogr.wkb*``).

    Returns:
        Geometry type name.
    """
    from osgeo import ogr

    flat = {
        ogr.wkbPoint: "Point",
        ogr.wkbLineString: "LineString",
        ogr.wkbPolygon: "Polygon",
        ogr.wkbMultiPoint: "MultiPoint",
        ogr.wkbMultiLineString: "MultiLineString",
        ogr.wkbMultiPolygon: "MultiPolygon",
        ogr.wkbGeometryCollection: "GeometryCollection",
    }
    # 25D (Z) constants collapse onto the same base names as their 2D forms.
    with_z = {
        ogr.wkbPoint25D: "Point",
        ogr.wkbLineString25D: "LineString",
        ogr.wkbPolygon25D: "Polygon",
        ogr.wkbMultiPoint25D: "MultiPoint",
        ogr.wkbMultiLineString25D: "MultiLineString",
        ogr.wkbMultiPolygon25D: "MultiPolygon",
    }
    return (flat | with_z).get(ogr_geom_type, "Unknown")
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _extract_crs_info(spatial_ref: Any) -> tuple[str | None, dict[str, Any] | None]:
|
|
110
|
+
"""Extract CRS identifier and PROJJSON from an OGR SpatialReference.
|
|
111
|
+
|
|
112
|
+
Parameters:
|
|
113
|
+
spatial_ref: OGR SpatialReference object.
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
Tuple of (crs_string, projjson_dict).
|
|
117
|
+
"""
|
|
118
|
+
if spatial_ref is None:
|
|
119
|
+
return None, None
|
|
120
|
+
|
|
121
|
+
# Try to get authority:code
|
|
122
|
+
auth_name = spatial_ref.GetAuthorityName(None)
|
|
123
|
+
auth_code = spatial_ref.GetAuthorityCode(None)
|
|
124
|
+
crs_string = f"{auth_name}:{auth_code}" if auth_name and auth_code else None
|
|
125
|
+
|
|
126
|
+
# Build PROJJSON for GeoParquet metadata
|
|
127
|
+
projjson: dict[str, Any] | None = None
|
|
128
|
+
try:
|
|
129
|
+
projjson_str = spatial_ref.ExportToPROJJSON()
|
|
130
|
+
if projjson_str:
|
|
131
|
+
projjson = json.loads(projjson_str)
|
|
132
|
+
except Exception:
|
|
133
|
+
# Fall back to building minimal PROJJSON
|
|
134
|
+
if crs_string:
|
|
135
|
+
projjson = {
|
|
136
|
+
"type": "GeographicCRS" if spatial_ref.IsGeographic() else "ProjectedCRS",
|
|
137
|
+
"name": spatial_ref.GetName() or crs_string,
|
|
138
|
+
"id": {"authority": auth_name, "code": int(auth_code) if auth_code else 0},
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
if crs_string is None and spatial_ref.GetName():
|
|
142
|
+
crs_string = spatial_ref.GetName()
|
|
143
|
+
|
|
144
|
+
return crs_string, projjson
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _convert_vector_sync(
    source: str,
    *,
    output: str | None = None,
    target_format: str = "geoparquet",
    compression: str = "snappy",
) -> ConvertResult:
    """Convert a vector file to GeoParquet synchronously.

    Reads every feature via GDAL/OGR, materializes attributes and WKB
    geometries in memory, then writes a single Parquet file carrying
    GeoParquet ``geo`` schema metadata.

    Parameters:
        source: Path to the input vector file (Shapefile, GeoJSON, etc.).
        output: Output file path. If ``None``, derives from source name.
        target_format: Target format (currently only ``"geoparquet"``).
        compression: Parquet compression codec (``"snappy"``, ``"zstd"``, ``"gzip"``).

    Returns:
        Structured conversion result.

    Raises:
        VectorError: If the source cannot be read or conversion fails.
    """
    if target_format != "geoparquet":
        raise VectorError(f"Unsupported target format: {target_format}")

    try:
        from osgeo import ogr
    except ImportError as exc:
        raise VectorError(
            "GDAL/OGR is required for vector conversion: install GDAL Python bindings"
        ) from exc

    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError as exc:
        raise VectorError(
            "pyarrow is required for GeoParquet output: pip install earthforge[vector]"
        ) from exc

    # Open source. Depending on GDAL's exception mode, failure surfaces as
    # either a RuntimeError or a None return — handle both.
    try:
        ds = ogr.Open(source)
    except RuntimeError as exc:
        raise VectorError(f"Failed to open vector file '{source}'") from exc
    if ds is None:
        raise VectorError(f"Failed to open vector file '{source}'")

    layer = ds.GetLayer(0)
    if layer is None:
        raise VectorError(f"No layers found in '{source}'")

    input_format = ds.GetDriver().GetName()
    layer_defn = layer.GetLayerDefn()
    feature_count = layer.GetFeatureCount()
    geom_type = _ogr_geom_type_name(layer.GetGeomType())

    # Extract CRS
    spatial_ref = layer.GetSpatialRef()
    crs_string, projjson = _extract_crs_info(spatial_ref)

    # OGR reports extent as (xmin, xmax, ymin, ymax); GeoParquet bbox order
    # is [west, south, east, north].
    extent = layer.GetExtent()
    bbox = [extent[0], extent[2], extent[1], extent[3]] if extent else None

    # Build field definitions
    field_names: list[str] = []
    field_types: list[str] = []
    for i in range(layer_defn.GetFieldCount()):
        field_def = layer_defn.GetFieldDefn(i)
        field_names.append(field_def.GetName())
        field_types.append(_ogr_type_to_arrow(field_def.GetType()))

    # Read all features
    arrays: dict[str, list[Any]] = {name: [] for name in field_names}
    # Missing geometry becomes None (a Parquet null) rather than b"": an
    # empty byte string is invalid WKB and trips GeoParquet readers.
    geometries: list[bytes | None] = []

    layer.ResetReading()
    feature = layer.GetNextFeature()
    actual_count = 0
    while feature is not None:
        # Read attribute fields
        for i, name in enumerate(field_names):
            if not feature.IsFieldSet(i) or feature.IsFieldNull(i):
                arrays[name].append(None)
            elif field_types[i] == "int32":
                arrays[name].append(feature.GetFieldAsInteger(i))
            elif field_types[i] == "int64":
                arrays[name].append(feature.GetFieldAsInteger64(i))
            elif field_types[i] == "float64":
                arrays[name].append(feature.GetFieldAsDouble(i))
            else:
                arrays[name].append(feature.GetFieldAsString(i))

        # Read geometry as WKB
        geom = feature.GetGeometryRef()
        geometries.append(bytes(geom.ExportToWkb()) if geom is not None else None)

        actual_count += 1
        feature = layer.GetNextFeature()

    ds = None  # Close the OGR dataset and release file handles.

    if feature_count < 0:
        # Some drivers return -1 for "count unknown"; use the actual tally.
        feature_count = actual_count

    # Build pyarrow table. The type map is loop-invariant — build it once.
    type_map = {
        "int32": pa.int32(),
        "int64": pa.int64(),
        "float64": pa.float64(),
        "string": pa.string(),
        "binary": pa.binary(),
    }
    pa_columns: dict[str, Any] = {
        name: pa.array(arrays[name], type=type_map.get(arrow_type, pa.string()))
        for name, arrow_type in zip(field_names, field_types, strict=True)
    }
    pa_columns["geometry"] = pa.array(geometries, type=pa.binary())
    table = pa.table(pa_columns)

    # Build GeoParquet metadata. Per the spec an empty geometry_types list
    # means "any type" — the literal string "Unknown" is not a valid entry.
    geo_metadata: dict[str, Any] = {
        "version": "1.1.0",
        "primary_column": "geometry",
        "columns": {
            "geometry": {
                "encoding": "WKB",
                "geometry_types": [geom_type] if geom_type != "Unknown" else [],
            }
        },
    }

    if bbox:
        geo_metadata["columns"]["geometry"]["bbox"] = bbox
    if projjson:
        geo_metadata["columns"]["geometry"]["crs"] = projjson

    # Attach geo metadata to the schema, preserving any existing keys.
    existing = dict(table.schema.metadata or {})
    existing[b"geo"] = json.dumps(geo_metadata).encode("utf-8")
    table = table.replace_schema_metadata(existing)

    # Determine output path
    if output is None:
        output = str(Path(source).with_suffix(".parquet"))

    # Write GeoParquet
    try:
        pq.write_table(table, output, compression=compression)
    except Exception as exc:
        raise VectorError(f"Failed to write GeoParquet '{output}': {exc}") from exc

    file_size: int | None = None
    try:
        file_size = Path(output).stat().st_size
    except OSError:
        pass

    return ConvertResult(
        source=source,
        output=output,
        input_format=input_format,
        output_format="geoparquet",
        feature_count=actual_count,
        geometry_type=geom_type,
        crs=crs_string,
        bbox=bbox,
        file_size_bytes=file_size,
    )
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
async def convert_vector(
    source: str,
    *,
    output: str | None = None,
    target_format: str = "geoparquet",
    compression: str = "snappy",
) -> ConvertResult:
    """Convert a vector file to GeoParquet.

    Reads the source using GDAL/OGR and writes GeoParquet with proper ``geo``
    metadata. Supports Shapefile, GeoJSON, GPKG, and any OGR-supported format.
    The blocking conversion runs in the default thread executor so the event
    loop is not stalled.

    Parameters:
        source: Path to the input vector file.
        output: Output file path. If ``None``, replaces extension with ``.parquet``.
        target_format: Target format (default: ``"geoparquet"``).
        compression: Parquet compression codec (default: ``"snappy"``).

    Returns:
        Structured conversion result.

    Raises:
        VectorError: If the conversion fails.
    """
    def _run() -> ConvertResult:
        # Closure captures the arguments so no functools.partial is needed.
        return _convert_vector_sync(
            source,
            output=output,
            target_format=target_format,
            compression=compression,
        )

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _run)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Vector-specific error types.
|
|
2
|
+
|
|
3
|
+
All exceptions inherit from :class:`~earthforge.core.errors.EarthForgeError`
|
|
4
|
+
so the CLI can catch them uniformly and map to appropriate exit codes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from earthforge.core.errors import EarthForgeError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class VectorError(EarthForgeError):
    """Base error for vector operations.

    Parameters:
        message: Human-readable error description.
        exit_code: Process exit code (default: 20).
    """

    def __init__(self, message: str, *, exit_code: int = 20) -> None:
        # Exit code 20 presumably marks vector-domain failures for the CLI
        # exit-code mapping described in the module docstring — confirm
        # against earthforge.core.errors.
        super().__init__(message, exit_code=exit_code)
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""Deep metadata extraction for vector geospatial formats.
|
|
2
|
+
|
|
3
|
+
Reads Parquet/GeoParquet file metadata via pyarrow without loading data into
|
|
4
|
+
memory. Extracts schema, row counts, geometry columns, CRS, bounding box, and
|
|
5
|
+
encoding information from GeoParquet ``geo`` metadata.
|
|
6
|
+
|
|
7
|
+
For non-Parquet vector formats (GeoJSON, FlatGeobuf), provides basic file-level
|
|
8
|
+
metadata. Deep inspection of those formats may be added in later milestones.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import json
|
|
15
|
+
from functools import partial
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from pydantic import BaseModel, Field
|
|
20
|
+
|
|
21
|
+
from earthforge.vector.errors import VectorError
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ColumnInfo(BaseModel):
    """Metadata for a single column in a vector dataset.

    Attributes:
        name: Column name.
        type: Arrow type string (e.g. ``"int64"``, ``"binary"``).
        is_geometry: Whether this column contains geometry data, per the
            GeoParquet ``geo`` metadata (not inferred from the Arrow type).
    """

    name: str = Field(title="Column")
    type: str = Field(title="Type")
    is_geometry: bool = Field(default=False, title="Geometry")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class VectorInfo(BaseModel):
    """Structured metadata for a vector geospatial file.

    Produced by :func:`inspect_vector`. Fields that cannot be determined
    from the file are left at ``None`` / empty defaults.

    Attributes:
        source: The file path that was inspected.
        format: Detected vector format (e.g. ``"geoparquet"``, ``"parquet"``).
        row_count: Total number of rows/features.
        num_columns: Total number of columns.
        columns: Per-column metadata.
        geometry_column: Name of the primary geometry column, if any.
        geometry_types: List of geometry types found (e.g. ``["Point"]``).
        crs: CRS string from GeoParquet metadata, if available.
        bbox: Bounding box ``[west, south, east, north]``, if available.
        encoding: Geometry encoding (e.g. ``"WKB"``), if available.
        num_row_groups: Number of Parquet row groups.
        compression: Parquet compression codec, if applicable.
        file_size_bytes: File size in bytes.
    """

    # Field titles are display labels — presumably consumed by a CLI table
    # renderer elsewhere in EarthForge; TODO confirm against the CLI code.
    source: str = Field(title="Source")
    format: str = Field(title="Format")
    row_count: int = Field(title="Rows")
    num_columns: int = Field(title="Columns")
    columns: list[ColumnInfo] = Field(title="Column Details")
    geometry_column: str | None = Field(default=None, title="Geometry Column")
    geometry_types: list[str] = Field(default_factory=list, title="Geometry Types")
    crs: str | None = Field(default=None, title="CRS")
    bbox: list[float] | None = Field(default=None, title="Bounding Box")
    encoding: str | None = Field(default=None, title="Encoding")
    num_row_groups: int | None = Field(default=None, title="Row Groups")
    compression: str | None = Field(default=None, title="Compression")
    file_size_bytes: int | None = Field(default=None, title="Size (bytes)")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _read_parquet_info(source: str) -> VectorInfo:
    """Read metadata from a Parquet/GeoParquet file synchronously.

    Uses pyarrow to read only the file metadata and schema — no row data
    is loaded into memory. Parses the ``geo`` metadata key for GeoParquet-
    specific information (geometry column, CRS, bbox, encoding).

    Parameters:
        source: Path to a Parquet file.

    Returns:
        Structured vector metadata.

    Raises:
        VectorError: If the file cannot be read or is not a valid Parquet file.
    """
    try:
        import pyarrow.parquet as pq
    except ImportError as exc:
        msg = "pyarrow is required for Parquet inspection: pip install pyarrow"
        raise VectorError(msg) from exc

    # ParquetFile reads only the footer; any failure (missing file, not a
    # Parquet file, corrupt footer) is normalized to VectorError.
    try:
        pf = pq.ParquetFile(source)
    except Exception as exc:
        msg = f"Failed to read Parquet file '{source}': {exc}"
        raise VectorError(msg) from exc

    schema = pf.schema_arrow
    metadata = schema.metadata or {}
    num_rows = pf.metadata.num_rows
    num_row_groups = pf.metadata.num_row_groups

    # Detect compression from the first row group's first column chunk.
    # This assumes the codec is uniform across the file — TODO confirm
    # whether mixed-codec files need per-column reporting.
    compression: str | None = None
    if num_row_groups > 0 and schema:
        try:
            rg = pf.metadata.row_group(0)
            if rg.num_columns > 0:
                compression = rg.column(0).compression
        except Exception:  # noqa: S110 — best-effort metadata extraction
            pass

    # Parse GeoParquet metadata
    geo_meta = _parse_geo_metadata(metadata)
    geometry_column = geo_meta.get("primary_column")
    geometry_columns: set[str] = set()
    geometry_types: list[str] = []
    crs: str | None = None
    bbox: list[float] | None = None
    encoding: str | None = None

    # Details (types, encoding, bbox, CRS) come from the primary geometry
    # column's entry only.
    if geometry_column and "columns" in geo_meta:
        geometry_columns.add(geometry_column)
        col_meta = geo_meta["columns"].get(geometry_column, {})
        geometry_types = col_meta.get("geometry_types", [])
        encoding = col_meta.get("encoding")
        bbox_raw = col_meta.get("bbox")
        if isinstance(bbox_raw, list) and len(bbox_raw) == 4:
            bbox = [float(v) for v in bbox_raw]

        crs_obj = col_meta.get("crs")
        if isinstance(crs_obj, dict):
            # GeoParquet stores CRS as PROJJSON — extract the name or ID
            crs = _extract_crs_string(crs_obj)
        elif isinstance(crs_obj, str):
            crs = crs_obj

    # Also check for additional geometry columns
    if "columns" in geo_meta:
        geometry_columns.update(geo_meta["columns"].keys())

    # Build column info; a column is flagged as geometry only when the geo
    # metadata lists it, not based on its Arrow type.
    columns: list[ColumnInfo] = []
    for i in range(len(schema)):
        field = schema.field(i)
        columns.append(
            ColumnInfo(
                name=field.name,
                type=str(field.type),
                is_geometry=field.name in geometry_columns,
            )
        )

    # File size
    file_size: int | None = None
    try:
        file_size = Path(source).stat().st_size
    except OSError:
        pass

    # Presence of a primary geometry column is what distinguishes
    # GeoParquet from plain Parquet here.
    fmt = "geoparquet" if geometry_column else "parquet"

    return VectorInfo(
        source=source,
        format=fmt,
        row_count=num_rows,
        num_columns=len(schema),
        columns=columns,
        geometry_column=geometry_column,
        geometry_types=geometry_types,
        crs=crs,
        bbox=bbox,
        encoding=encoding,
        num_row_groups=num_row_groups,
        compression=compression,
        file_size_bytes=file_size,
    )
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _parse_geo_metadata(metadata: dict[bytes, bytes]) -> dict[str, Any]:
|
|
183
|
+
"""Parse the ``geo`` key from Parquet file metadata.
|
|
184
|
+
|
|
185
|
+
Parameters:
|
|
186
|
+
metadata: Raw Parquet schema metadata (bytes keys and values).
|
|
187
|
+
|
|
188
|
+
Returns:
|
|
189
|
+
Parsed GeoParquet metadata dict, or empty dict if not present.
|
|
190
|
+
"""
|
|
191
|
+
geo_bytes = metadata.get(b"geo")
|
|
192
|
+
if geo_bytes is None:
|
|
193
|
+
return {}
|
|
194
|
+
try:
|
|
195
|
+
result: dict[str, Any] = json.loads(geo_bytes)
|
|
196
|
+
return result
|
|
197
|
+
except (json.JSONDecodeError, UnicodeDecodeError):
|
|
198
|
+
return {}
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _extract_crs_string(crs_obj: dict[str, Any]) -> str:
|
|
202
|
+
"""Extract a human-readable CRS identifier from PROJJSON.
|
|
203
|
+
|
|
204
|
+
Tries ``id.code`` (e.g. ``"EPSG:4326"``), then ``name``, then falls
|
|
205
|
+
back to a truncated JSON representation.
|
|
206
|
+
|
|
207
|
+
Parameters:
|
|
208
|
+
crs_obj: PROJJSON CRS object.
|
|
209
|
+
|
|
210
|
+
Returns:
|
|
211
|
+
CRS identifier string.
|
|
212
|
+
"""
|
|
213
|
+
# Try EPSG-style authority:code
|
|
214
|
+
crs_id = crs_obj.get("id", {})
|
|
215
|
+
if isinstance(crs_id, dict):
|
|
216
|
+
authority = crs_id.get("authority")
|
|
217
|
+
code = crs_id.get("code")
|
|
218
|
+
if authority and code:
|
|
219
|
+
return f"{authority}:{code}"
|
|
220
|
+
|
|
221
|
+
# Fall back to name
|
|
222
|
+
name = crs_obj.get("name")
|
|
223
|
+
if isinstance(name, str):
|
|
224
|
+
return name
|
|
225
|
+
|
|
226
|
+
return json.dumps(crs_obj)[:100]
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
async def inspect_vector(source: str) -> VectorInfo:
    """Inspect a vector file and return structured metadata.

    Runs the synchronous pyarrow read in a thread executor to avoid blocking
    the event loop. Currently supports Parquet and GeoParquet files.

    Parameters:
        source: Path to a vector file.

    Returns:
        Structured vector metadata.

    Raises:
        VectorError: If the file cannot be read or format is unsupported.
    """
    # The closure captures `source`, so no functools.partial is needed.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, lambda: _read_parquet_info(source))
|