earthforge-vector 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthforge/vector/__init__.py +6 -0
- earthforge/vector/convert.py +356 -0
- earthforge/vector/errors.py +21 -0
- earthforge/vector/info.py +245 -0
- earthforge/vector/query.py +384 -0
- earthforge_vector-0.1.0.dist-info/METADATA +13 -0
- earthforge_vector-0.1.0.dist-info/RECORD +8 -0
- earthforge_vector-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"""Vector format conversion.
|
|
2
|
+
|
|
3
|
+
Converts between vector geospatial formats with a focus on producing valid
|
|
4
|
+
GeoParquet output. Supports Shapefile, GeoJSON, and other OGR-readable
|
|
5
|
+
formats as input. Writes GeoParquet with proper ``geo`` metadata including
|
|
6
|
+
CRS, geometry types, encoding, and bounding box.
|
|
7
|
+
|
|
8
|
+
Uses GDAL/OGR for reading source formats and pyarrow for writing Parquet.
|
|
9
|
+
geopandas is not required and is not used by this module.
|
|
10
|
+
|
|
11
|
+
Usage::
|
|
12
|
+
|
|
13
|
+
from earthforge.vector.convert import convert_vector
|
|
14
|
+
|
|
15
|
+
result = await convert_vector("buildings.shp", output="buildings.parquet")
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import asyncio
|
|
21
|
+
import json
|
|
22
|
+
from functools import partial
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
from pydantic import BaseModel, Field
|
|
27
|
+
|
|
28
|
+
from earthforge.vector.errors import VectorError
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ConvertResult(BaseModel):
    """Serializable record describing one completed vector conversion.

    Attributes:
        source: Path of the file that was read.
        output: Path of the file that was written.
        input_format: Name of the source format (e.g. ``"ESRI Shapefile"``).
        output_format: Name of the target format (e.g. ``"geoparquet"``).
        feature_count: How many features were converted.
        geometry_type: Geometry type name (e.g. ``"Polygon"``), if known.
        crs: CRS identifier string, if known.
        bbox: Extent as ``[west, south, east, north]``, if known.
        file_size_bytes: Size of the written file in bytes, if known.
    """

    # Required fields: no default, so they must be supplied by the caller.
    source: str = Field(title="Source")
    output: str = Field(title="Output")
    input_format: str = Field(title="Input Format")
    output_format: str = Field(title="Output Format")
    feature_count: int = Field(title="Features")

    # Optional fields: default to None when the value is unavailable.
    geometry_type: str | None = Field(default=None, title="Geometry Type")
    crs: str | None = Field(default=None, title="CRS")
    bbox: list[float] | None = Field(default=None, title="BBox")
    file_size_bytes: int | None = Field(default=None, title="Size (bytes)")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _ogr_type_to_arrow(ogr_type: int) -> str:
    """Translate an OGR field type constant into an Arrow type name.

    Parameters:
        ogr_type: OGR field type constant (one of the ``ogr.OFT*`` values).

    Returns:
        The Arrow type name. Date and datetime fields are carried as
        ``"string"``, and so is every type not listed in the table.
    """
    from osgeo import ogr

    # Everything that is carried as text shares one entry value.
    arrow_by_ogr = dict.fromkeys(
        (ogr.OFTString, ogr.OFTDate, ogr.OFTDateTime),
        "string",
    )
    arrow_by_ogr[ogr.OFTInteger] = "int32"
    arrow_by_ogr[ogr.OFTInteger64] = "int64"
    arrow_by_ogr[ogr.OFTReal] = "float64"
    arrow_by_ogr[ogr.OFTBinary] = "binary"

    try:
        return arrow_by_ogr[ogr_type]
    except KeyError:
        return "string"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _ogr_geom_type_name(ogr_geom_type: int) -> str:
    """Return a readable geometry type name for an OGR geometry constant.

    The 2.5D variants (``wkb*25D``) are reported under the same name as
    their 2D counterparts.

    Parameters:
        ogr_geom_type: OGR geometry type constant (an ``ogr.wkb*`` value).

    Returns:
        The geometry type name, or ``"Unknown"`` for any other constant.
    """
    from osgeo import ogr

    # Types that are registered in both their 2D and 2.5D flavour.
    paired_names = (
        "Point",
        "LineString",
        "Polygon",
        "MultiPoint",
        "MultiLineString",
        "MultiPolygon",
    )

    names_by_constant: dict[int, str] = {}
    for type_name in paired_names:
        names_by_constant[getattr(ogr, f"wkb{type_name}")] = type_name
        names_by_constant[getattr(ogr, f"wkb{type_name}25D")] = type_name

    # GeometryCollection is only registered in its 2D form.
    names_by_constant[ogr.wkbGeometryCollection] = "GeometryCollection"

    if ogr_geom_type in names_by_constant:
        return names_by_constant[ogr_geom_type]
    return "Unknown"
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _extract_crs_info(spatial_ref: Any) -> tuple[str | None, dict[str, Any] | None]:
|
|
110
|
+
"""Extract CRS identifier and PROJJSON from an OGR SpatialReference.
|
|
111
|
+
|
|
112
|
+
Parameters:
|
|
113
|
+
spatial_ref: OGR SpatialReference object.
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
Tuple of (crs_string, projjson_dict).
|
|
117
|
+
"""
|
|
118
|
+
if spatial_ref is None:
|
|
119
|
+
return None, None
|
|
120
|
+
|
|
121
|
+
# Try to get authority:code
|
|
122
|
+
auth_name = spatial_ref.GetAuthorityName(None)
|
|
123
|
+
auth_code = spatial_ref.GetAuthorityCode(None)
|
|
124
|
+
crs_string = f"{auth_name}:{auth_code}" if auth_name and auth_code else None
|
|
125
|
+
|
|
126
|
+
# Build PROJJSON for GeoParquet metadata
|
|
127
|
+
projjson: dict[str, Any] | None = None
|
|
128
|
+
try:
|
|
129
|
+
projjson_str = spatial_ref.ExportToPROJJSON()
|
|
130
|
+
if projjson_str:
|
|
131
|
+
projjson = json.loads(projjson_str)
|
|
132
|
+
except Exception:
|
|
133
|
+
# Fall back to building minimal PROJJSON
|
|
134
|
+
if crs_string:
|
|
135
|
+
projjson = {
|
|
136
|
+
"type": "GeographicCRS" if spatial_ref.IsGeographic() else "ProjectedCRS",
|
|
137
|
+
"name": spatial_ref.GetName() or crs_string,
|
|
138
|
+
"id": {"authority": auth_name, "code": int(auth_code) if auth_code else 0},
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
if crs_string is None and spatial_ref.GetName():
|
|
142
|
+
crs_string = spatial_ref.GetName()
|
|
143
|
+
|
|
144
|
+
return crs_string, projjson
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _read_field_value(feature: Any, index: int, arrow_type: str) -> Any:
    """Read one attribute of an OGR feature as a value matching *arrow_type*."""
    if not feature.IsFieldSet(index) or feature.IsFieldNull(index):
        return None
    if arrow_type == "int32":
        return feature.GetFieldAsInteger(index)
    if arrow_type == "int64":
        return feature.GetFieldAsInteger64(index)
    if arrow_type == "float64":
        return feature.GetFieldAsDouble(index)
    if arrow_type == "binary":
        # Read the raw payload; the string accessor would yield a text
        # rendering of the blob rather than its bytes.
        return bytes(feature.GetFieldAsBinary(index))
    return feature.GetFieldAsString(index)


def _convert_vector_sync(
    source: str,
    *,
    output: str | None = None,
    target_format: str = "geoparquet",
    compression: str = "snappy",
) -> ConvertResult:
    """Convert a vector file to GeoParquet synchronously.

    Reads the first layer of *source* with OGR, loads every feature into
    memory, and writes a Parquet file whose schema metadata carries a
    GeoParquet ``geo`` entry (WKB encoding, geometry types, bbox, CRS).

    Parameters:
        source: Path to the input vector file (Shapefile, GeoJSON, etc.).
        output: Output file path. If ``None``, derives from source name by
            replacing the suffix with ``.parquet``.
        target_format: Target format (currently only ``"geoparquet"``).
        compression: Parquet compression codec (``"snappy"``, ``"zstd"``, ``"gzip"``).

    Returns:
        Structured conversion result.

    Raises:
        VectorError: If the target format is unsupported, a required library
            is missing, the source cannot be opened, it has no layer, or the
            output cannot be written.
    """
    if target_format != "geoparquet":
        raise VectorError(f"Unsupported target format: {target_format}")

    try:
        from osgeo import ogr
    except ImportError as exc:
        raise VectorError(
            "GDAL/OGR is required for vector conversion: install GDAL Python bindings"
        ) from exc

    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError as exc:
        raise VectorError(
            "pyarrow is required for GeoParquet output: pip install earthforge[vector]"
        ) from exc

    # Open source. OGR either raises or returns None on failure, depending on
    # whether GDAL exceptions are enabled.
    try:
        ds = ogr.Open(source)
    except RuntimeError as exc:
        raise VectorError(f"Failed to open vector file '{source}'") from exc
    if ds is None:
        raise VectorError(f"Failed to open vector file '{source}'")

    layer = ds.GetLayer(0)
    if layer is None:
        raise VectorError(f"No layers found in '{source}'")

    input_format = ds.GetDriver().GetName()
    layer_defn = layer.GetLayerDefn()
    geom_type = _ogr_geom_type_name(layer.GetGeomType())
    crs_string, projjson = _extract_crs_info(layer.GetSpatialRef())

    # Build field definitions
    field_names: list[str] = []
    field_types: list[str] = []
    for i in range(layer_defn.GetFieldCount()):
        field_def = layer_defn.GetFieldDefn(i)
        field_names.append(field_def.GetName())
        field_types.append(_ogr_type_to_arrow(field_def.GetType()))

    # Read all features
    arrays: dict[str, list[Any]] = {name: [] for name in field_names}
    geometries: list[bytes | None] = []

    layer.ResetReading()
    feature = layer.GetNextFeature()
    while feature is not None:
        for i, name in enumerate(field_names):
            arrays[name].append(_read_field_value(feature, i, field_types[i]))

        # A missing geometry is stored as a null entry; an empty byte string
        # is not valid WKB and breaks readers that try to parse it.
        geom = feature.GetGeometryRef()
        geometries.append(bytes(geom.ExportToWkb()) if geom is not None else None)

        feature = layer.GetNextFeature()

    actual_count = len(geometries)

    # Only report an extent when at least one geometry exists; otherwise the
    # extent is meaningless and should not be written as a bbox.
    bbox: list[float] | None = None
    if any(wkb is not None for wkb in geometries):
        try:
            extent = layer.GetExtent()  # (xmin, xmax, ymin, ymax)
        except RuntimeError:
            extent = None
        if extent:
            bbox = [extent[0], extent[2], extent[1], extent[3]]

    ds = None  # Close OGR dataset

    # Build pyarrow table
    type_map = {
        "int32": pa.int32(),
        "int64": pa.int64(),
        "float64": pa.float64(),
        "string": pa.string(),
        "binary": pa.binary(),
    }
    pa_columns: dict[str, Any] = {}
    for name, arrow_type in zip(field_names, field_types, strict=True):
        pa_type = type_map.get(arrow_type, pa.string())
        pa_columns[name] = pa.array(arrays[name], type=pa_type)

    pa_columns["geometry"] = pa.array(geometries, type=pa.binary())
    table = pa.table(pa_columns)

    # Build GeoParquet metadata
    geo_metadata: dict[str, Any] = {
        "version": "1.1.0",
        "primary_column": "geometry",
        "columns": {
            "geometry": {
                "encoding": "WKB",
                # GeoParquet uses an empty list for "types not known";
                # "Unknown" is not one of its geometry type names.
                "geometry_types": [] if geom_type == "Unknown" else [geom_type],
                # Always written: an explicit null marks the CRS as unknown,
                # whereas an absent key is read as the OGC:CRS84 default.
                "crs": projjson,
            }
        },
    }

    if bbox:
        geo_metadata["columns"]["geometry"]["bbox"] = bbox

    # Attach geo metadata to schema
    existing = table.schema.metadata or {}
    existing[b"geo"] = json.dumps(geo_metadata).encode("utf-8")
    table = table.replace_schema_metadata(existing)

    # Determine output path
    if output is None:
        output = str(Path(source).with_suffix(".parquet"))

    # Write GeoParquet
    try:
        pq.write_table(table, output, compression=compression)
    except Exception as exc:
        raise VectorError(f"Failed to write GeoParquet '{output}': {exc}") from exc

    # File size is best-effort; the conversion itself already succeeded.
    file_size: int | None = None
    try:
        file_size = Path(output).stat().st_size
    except OSError:
        pass

    return ConvertResult(
        source=source,
        output=output,
        input_format=input_format,
        output_format="geoparquet",
        feature_count=actual_count,
        geometry_type=geom_type,
        crs=crs_string,
        bbox=bbox,
        file_size_bytes=file_size,
    )
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
async def convert_vector(
    source: str,
    *,
    output: str | None = None,
    target_format: str = "geoparquet",
    compression: str = "snappy",
) -> ConvertResult:
    """Convert a vector file to GeoParquet without blocking the event loop.

    Delegates to ``_convert_vector_sync``, run on the running loop's default
    executor.

    Parameters:
        source: Path to the input vector file.
        output: Output file path. If ``None``, replaces extension with ``.parquet``.
        target_format: Target format (default: ``"geoparquet"``).
        compression: Parquet compression codec (default: ``"snappy"``).

    Returns:
        Structured conversion result.

    Raises:
        VectorError: If the conversion fails.
    """
    job = partial(
        _convert_vector_sync,
        source,
        output=output,
        target_format=target_format,
        compression=compression,
    )
    event_loop = asyncio.get_running_loop()
    outcome = await event_loop.run_in_executor(None, job)
    return outcome
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Vector-specific error types.
|
|
2
|
+
|
|
3
|
+
All exceptions inherit from :class:`~earthforge.core.errors.EarthForgeError`
|
|
4
|
+
so the CLI can catch them uniformly and map to appropriate exit codes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from earthforge.core.errors import EarthForgeError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class VectorError(EarthForgeError):
    """Root exception type for failures raised by vector operations.

    Parameters:
        message: Human-readable description of what went wrong.
        exit_code: Process exit code forwarded to the base class
            (default: 20).
    """

    def __init__(self, message: str, *, exit_code: int = 20) -> None:
        # Only the default exit code differs from the base class; everything
        # else is handled by EarthForgeError.
        super().__init__(message, exit_code=exit_code)
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""Deep metadata extraction for vector geospatial formats.
|
|
2
|
+
|
|
3
|
+
Reads Parquet/GeoParquet file metadata via pyarrow without loading data into
|
|
4
|
+
memory. Extracts schema, row counts, geometry columns, CRS, bounding box, and
|
|
5
|
+
encoding information from GeoParquet ``geo`` metadata.
|
|
6
|
+
|
|
7
|
+
For non-Parquet vector formats (GeoJSON, FlatGeobuf), provides basic file-level
|
|
8
|
+
metadata. Deep inspection of those formats may be added in later milestones.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import json
|
|
15
|
+
from functools import partial
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from pydantic import BaseModel, Field
|
|
20
|
+
|
|
21
|
+
from earthforge.vector.errors import VectorError
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ColumnInfo(BaseModel):
    """Description of one column of a vector dataset.

    Attributes:
        name: Name of the column.
        type: Arrow type rendered as a string (e.g. ``"int64"``, ``"binary"``).
        is_geometry: ``True`` when the column holds geometry data.
    """

    name: str = Field(title="Column")
    type: str = Field(title="Type")
    # Columns are assumed to be plain attributes unless flagged otherwise.
    is_geometry: bool = Field(default=False, title="Geometry")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class VectorInfo(BaseModel):
    """Serializable summary of a vector geospatial file.

    Attributes:
        source: Path of the inspected file.
        format: Detected format name (e.g. ``"geoparquet"``, ``"parquet"``).
        row_count: Number of rows/features in the file.
        num_columns: Number of columns in the file.
        columns: One ``ColumnInfo`` entry per column.
        geometry_column: Name of the primary geometry column, if any.
        geometry_types: Geometry type names found (e.g. ``["Point"]``).
        crs: CRS string taken from GeoParquet metadata, if available.
        bbox: Extent as ``[west, south, east, north]``, if available.
        encoding: Geometry encoding (e.g. ``"WKB"``), if available.
        num_row_groups: Number of Parquet row groups.
        compression: Parquet compression codec, if applicable.
        file_size_bytes: Size of the file in bytes.
    """

    # Always-present structural information.
    source: str = Field(title="Source")
    format: str = Field(title="Format")
    row_count: int = Field(title="Rows")
    num_columns: int = Field(title="Columns")
    columns: list[ColumnInfo] = Field(title="Column Details")

    # Geospatial details; left at their defaults when no geo metadata exists.
    geometry_column: str | None = Field(default=None, title="Geometry Column")
    geometry_types: list[str] = Field(default_factory=list, title="Geometry Types")
    crs: str | None = Field(default=None, title="CRS")
    bbox: list[float] | None = Field(default=None, title="Bounding Box")
    encoding: str | None = Field(default=None, title="Encoding")

    # Storage-level details.
    num_row_groups: int | None = Field(default=None, title="Row Groups")
    compression: str | None = Field(default=None, title="Compression")
    file_size_bytes: int | None = Field(default=None, title="Size (bytes)")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _read_parquet_info(source: str) -> VectorInfo:
    """Read metadata from a Parquet/GeoParquet file synchronously.

    Only the footer metadata and schema are consulted; this function never
    calls a row-reading API. The ``geo`` schema metadata entry, when
    present, supplies the geometry column, geometry types, encoding, bbox
    and CRS. Malformed ``geo`` content is ignored piece by piece rather than
    raising.

    Parameters:
        source: Path to a Parquet file.

    Returns:
        Structured vector metadata.

    Raises:
        VectorError: If pyarrow is missing or the file cannot be opened as
            Parquet.
    """
    try:
        import pyarrow.parquet as pq
    except ImportError as exc:
        msg = "pyarrow is required for Parquet inspection: pip install pyarrow"
        raise VectorError(msg) from exc

    try:
        pf = pq.ParquetFile(source)
    except Exception as exc:
        msg = f"Failed to read Parquet file '{source}': {exc}"
        raise VectorError(msg) from exc

    schema = pf.schema_arrow
    file_meta = pf.metadata
    num_rows = file_meta.num_rows
    num_row_groups = file_meta.num_row_groups

    # Detect compression from the first row group's first column chunk
    compression: str | None = None
    if num_row_groups > 0 and schema:
        try:
            rg = file_meta.row_group(0)
            if rg.num_columns > 0:
                compression = rg.column(0).compression
        except Exception:  # noqa: S110 — best-effort metadata extraction
            pass

    # Parse GeoParquet metadata. The content comes from the file, so every
    # level is type-checked before use.
    geo_meta = _parse_geo_metadata(schema.metadata or {})
    if not isinstance(geo_meta, dict):
        geo_meta = {}
    geo_columns = geo_meta.get("columns")
    if not isinstance(geo_columns, dict):
        geo_columns = {}

    primary = geo_meta.get("primary_column")
    geometry_column: str | None = primary if isinstance(primary, str) and primary else None

    geometry_columns: set[str] = set()
    geometry_types: list[str] = []
    crs: str | None = None
    bbox: list[float] | None = None
    encoding: str | None = None

    if geometry_column and "columns" in geo_meta:
        geometry_columns.add(geometry_column)

        col_meta = geo_columns.get(geometry_column)
        has_col_meta = isinstance(col_meta, dict)
        if not has_col_meta:
            col_meta = {}

        raw_types = col_meta.get("geometry_types", [])
        if isinstance(raw_types, list):
            geometry_types = [t for t in raw_types if isinstance(t, str)]

        raw_encoding = col_meta.get("encoding")
        if isinstance(raw_encoding, str):
            encoding = raw_encoding

        # A 3D bbox is [xmin, ymin, zmin, xmax, ymax, zmax]; report its 2D
        # footprint so the result is always [west, south, east, north].
        bbox_raw = col_meta.get("bbox")
        if isinstance(bbox_raw, list) and len(bbox_raw) in (4, 6):
            picks = (0, 1, 2, 3) if len(bbox_raw) == 4 else (0, 1, 3, 4)
            try:
                bbox = [float(bbox_raw[i]) for i in picks]
            except (TypeError, ValueError):
                bbox = None

        if "crs" in col_meta:
            crs_obj = col_meta["crs"]
            if isinstance(crs_obj, dict):
                # GeoParquet stores CRS as PROJJSON — extract the name or ID
                crs = _extract_crs_string(crs_obj)
            elif isinstance(crs_obj, str):
                crs = crs_obj
            # An explicit null means "CRS unknown": leave crs as None.
        elif has_col_meta:
            # GeoParquet defines OGC:CRS84 as the CRS when the key is absent.
            crs = "OGC:CRS84"

    # Also check for additional geometry columns
    geometry_columns.update(geo_columns.keys())

    # Build column info
    columns = [
        ColumnInfo(
            name=field.name,
            type=str(field.type),
            is_geometry=field.name in geometry_columns,
        )
        for field in schema
    ]

    # File size is best-effort; it is not needed for the rest of the result.
    file_size: int | None = None
    try:
        file_size = Path(source).stat().st_size
    except OSError:
        pass

    fmt = "geoparquet" if geometry_column else "parquet"

    return VectorInfo(
        source=source,
        format=fmt,
        row_count=num_rows,
        num_columns=len(schema),
        columns=columns,
        geometry_column=geometry_column,
        geometry_types=geometry_types,
        crs=crs,
        bbox=bbox,
        encoding=encoding,
        num_row_groups=num_row_groups,
        compression=compression,
        file_size_bytes=file_size,
    )
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _parse_geo_metadata(metadata: dict[bytes, bytes]) -> dict[str, Any]:
|
|
183
|
+
"""Parse the ``geo`` key from Parquet file metadata.
|
|
184
|
+
|
|
185
|
+
Parameters:
|
|
186
|
+
metadata: Raw Parquet schema metadata (bytes keys and values).
|
|
187
|
+
|
|
188
|
+
Returns:
|
|
189
|
+
Parsed GeoParquet metadata dict, or empty dict if not present.
|
|
190
|
+
"""
|
|
191
|
+
geo_bytes = metadata.get(b"geo")
|
|
192
|
+
if geo_bytes is None:
|
|
193
|
+
return {}
|
|
194
|
+
try:
|
|
195
|
+
result: dict[str, Any] = json.loads(geo_bytes)
|
|
196
|
+
return result
|
|
197
|
+
except (json.JSONDecodeError, UnicodeDecodeError):
|
|
198
|
+
return {}
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _extract_crs_string(crs_obj: dict[str, Any]) -> str:
|
|
202
|
+
"""Extract a human-readable CRS identifier from PROJJSON.
|
|
203
|
+
|
|
204
|
+
Tries ``id.code`` (e.g. ``"EPSG:4326"``), then ``name``, then falls
|
|
205
|
+
back to a truncated JSON representation.
|
|
206
|
+
|
|
207
|
+
Parameters:
|
|
208
|
+
crs_obj: PROJJSON CRS object.
|
|
209
|
+
|
|
210
|
+
Returns:
|
|
211
|
+
CRS identifier string.
|
|
212
|
+
"""
|
|
213
|
+
# Try EPSG-style authority:code
|
|
214
|
+
crs_id = crs_obj.get("id", {})
|
|
215
|
+
if isinstance(crs_id, dict):
|
|
216
|
+
authority = crs_id.get("authority")
|
|
217
|
+
code = crs_id.get("code")
|
|
218
|
+
if authority and code:
|
|
219
|
+
return f"{authority}:{code}"
|
|
220
|
+
|
|
221
|
+
# Fall back to name
|
|
222
|
+
name = crs_obj.get("name")
|
|
223
|
+
if isinstance(name, str):
|
|
224
|
+
return name
|
|
225
|
+
|
|
226
|
+
return json.dumps(crs_obj)[:100]
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
async def inspect_vector(source: str) -> VectorInfo:
    """Inspect a vector file and return structured metadata.

    The synchronous reader ``_read_parquet_info`` is run on the running
    loop's default executor so the event loop is not blocked. Every source
    is handed to that Parquet reader.

    Parameters:
        source: Path to a vector file.

    Returns:
        Structured vector metadata.

    Raises:
        VectorError: If the file cannot be read or format is unsupported.
    """
    reader = partial(_read_parquet_info, source)
    event_loop = asyncio.get_running_loop()
    info = await event_loop.run_in_executor(None, reader)
    return info
|
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
"""Spatial and attribute queries against GeoParquet files.
|
|
2
|
+
|
|
3
|
+
Leverages pyarrow's row-group-level statistics and predicate pushdown to
|
|
4
|
+
read only the data that matches the query — critical for large files where
|
|
5
|
+
reading the full dataset would be impractical.
|
|
6
|
+
|
|
7
|
+
For bbox queries, the filter is applied against the ``bbox`` column covering
|
|
8
|
+
structure embedded in GeoParquet metadata. If per-row bounding box columns
|
|
9
|
+
(``bbox.xmin``, ``bbox.ymin``, etc.) are present, pyarrow can skip entire
|
|
10
|
+
row groups whose spatial extent doesn't intersect the query box.
|
|
11
|
+
|
|
12
|
+
Usage::
|
|
13
|
+
|
|
14
|
+
from earthforge.vector.query import query_features
|
|
15
|
+
|
|
16
|
+
result = await query_features("buildings.parquet", bbox=[-85, 37, -84, 38])
|
|
17
|
+
print(result.feature_count)
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import asyncio
|
|
23
|
+
import json
|
|
24
|
+
from functools import partial
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from pydantic import BaseModel, Field
|
|
28
|
+
|
|
29
|
+
from earthforge.vector.errors import VectorError
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class QueryResult(BaseModel):
    """Serializable outcome of a spatial/attribute query on a vector file.

    Attributes:
        source: Path of the file that was queried.
        feature_count: How many features matched the query.
        columns: Names of the columns present in the result.
        bbox_filter: Bounding box that was used as a filter, if any.
        features: Matching features as dicts (geometry as WKT if available).
        total_rows: Row count of the source file before filtering.
        row_groups_scanned: Number of Parquet row groups actually read.
        row_groups_total: Number of row groups in the file.
    """

    source: str = Field(title="Source")
    feature_count: int = Field(title="Features")
    columns: list[str] = Field(title="Columns")
    # None when the query was not restricted to a bounding box.
    bbox_filter: list[float] | None = Field(default=None, title="BBox Filter")
    features: list[dict[str, Any]] = Field(default_factory=list, title="Features")
    total_rows: int = Field(title="Total Rows")
    # Scan statistics are optional and default to None.
    row_groups_scanned: int | None = Field(default=None, title="Row Groups Scanned")
    row_groups_total: int | None = Field(default=None, title="Row Groups Total")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _parse_geo_metadata(metadata: dict[bytes, bytes]) -> dict[str, Any]:
|
|
57
|
+
"""Parse the GeoParquet ``geo`` metadata key.
|
|
58
|
+
|
|
59
|
+
Parameters:
|
|
60
|
+
metadata: Raw Parquet file-level metadata.
|
|
61
|
+
|
|
62
|
+
Returns:
|
|
63
|
+
Parsed geo metadata dict, or empty dict.
|
|
64
|
+
"""
|
|
65
|
+
geo_bytes = metadata.get(b"geo")
|
|
66
|
+
if geo_bytes is None:
|
|
67
|
+
return {}
|
|
68
|
+
try:
|
|
69
|
+
result: dict[str, Any] = json.loads(geo_bytes)
|
|
70
|
+
return result
|
|
71
|
+
except (json.JSONDecodeError, UnicodeDecodeError):
|
|
72
|
+
return {}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _build_bbox_filter(
    geo_meta: dict[str, Any],
    bbox: list[float],
    schema_names: set[str],
) -> Any:
    """Build a pyarrow filter expression for a bounding box query.

    Two sources of per-row extent columns are tried, in order:

    1. The GeoParquet ``covering.bbox`` entry of the primary geometry
       column. Each of ``xmin``/``ymin``/``xmax``/``ymax`` names a column as
       a list of path components (e.g. ``["bbox", "xmin"]``); a plain string
       is also accepted as a single top-level column name.
    2. Well-known dotted column names (``bbox.xmin`` ... or ``bbox.minx``
       ...) that are all present in *schema_names*.

    Parameters:
        geo_meta: Parsed GeoParquet metadata.
        bbox: Query bounding box ``[west, south, east, north]``.
        schema_names: Set of column names present in the file schema.

    Returns:
        A pyarrow compute expression selecting rows whose extent intersects
        the query box, or ``None`` if no suitable columns were found.

    Raises:
        ValueError: If *bbox* does not contain exactly four values.
    """
    import pyarrow.compute as pc

    west, south, east, north = bbox

    def as_dict(value: Any) -> dict[str, Any]:
        # The metadata comes from the file, so tolerate wrongly typed members.
        return value if isinstance(value, dict) else {}

    def is_column_ref(ref: Any) -> bool:
        if isinstance(ref, str):
            return bool(ref)
        return (
            isinstance(ref, (list, tuple))
            and len(ref) > 0
            and all(isinstance(part, str) and part for part in ref)
        )

    def to_field(ref: Any) -> Any:
        # pc.field takes nested path components as separate arguments;
        # handing it a list object directly is a TypeError.
        if isinstance(ref, str):
            return pc.field(ref)
        return pc.field(*ref)

    def intersects(xmin: Any, ymin: Any, xmax: Any, ymax: Any) -> Any:
        # Two boxes intersect when they overlap on both axes.
        return (
            (to_field(xmin) <= east)
            & (to_field(xmax) >= west)
            & (to_field(ymin) <= north)
            & (to_field(ymax) >= south)
        )

    # Check for GeoParquet bbox covering metadata
    primary_col = geo_meta.get("primary_column", "geometry")
    col_meta = as_dict(as_dict(geo_meta.get("columns")).get(primary_col))
    bbox_covering = as_dict(as_dict(col_meta.get("covering")).get("bbox"))

    covering_refs = [bbox_covering.get(key) for key in ("xmin", "ymin", "xmax", "ymax")]
    if all(is_column_ref(ref) for ref in covering_refs):
        return intersects(*covering_refs)

    # Check for common bbox struct columns (Overture Maps pattern)
    # NOTE(review): these dotted names are passed to pc.field as single
    # column names; confirm against the caller that schema_names holds
    # top-level names in that form.
    for col_set in (
        ("bbox.xmin", "bbox.ymin", "bbox.xmax", "bbox.ymax"),
        ("bbox.minx", "bbox.miny", "bbox.maxx", "bbox.maxy"),
    ):
        if all(c in schema_names for c in col_set):
            return intersects(*col_set)

    # No pushdown columns available — caller will post-filter with geometry
    return None
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _geometry_intersects_bbox(
|
|
140
|
+
wkb: bytes,
|
|
141
|
+
west: float,
|
|
142
|
+
south: float,
|
|
143
|
+
east: float,
|
|
144
|
+
north: float,
|
|
145
|
+
) -> bool:
|
|
146
|
+
"""Check if a WKB geometry's envelope intersects a bounding box.
|
|
147
|
+
|
|
148
|
+
Uses shapely for full geometry intersection if available. Falls back to
|
|
149
|
+
a minimal WKB point parser that checks if the point lies within the bbox.
|
|
150
|
+
|
|
151
|
+
Parameters:
|
|
152
|
+
wkb: Well-Known Binary geometry bytes.
|
|
153
|
+
west: Query bbox west.
|
|
154
|
+
south: Query bbox south.
|
|
155
|
+
east: Query bbox east.
|
|
156
|
+
north: Query bbox north.
|
|
157
|
+
|
|
158
|
+
Returns:
|
|
159
|
+
True if the geometry intersects the query bbox.
|
|
160
|
+
"""
|
|
161
|
+
try:
|
|
162
|
+
from shapely import from_wkb
|
|
163
|
+
from shapely.geometry import box
|
|
164
|
+
|
|
165
|
+
geom = from_wkb(wkb)
|
|
166
|
+
query_box = box(west, south, east, north)
|
|
167
|
+
return bool(geom.intersects(query_box))
|
|
168
|
+
except ImportError:
|
|
169
|
+
pass
|
|
170
|
+
|
|
171
|
+
# Fallback: parse WKB point coordinates for simple containment check
|
|
172
|
+
import struct
|
|
173
|
+
|
|
174
|
+
if len(wkb) >= 21:
|
|
175
|
+
try:
|
|
176
|
+
byte_order = wkb[0]
|
|
177
|
+
fmt = "<" if byte_order == 1 else ">"
|
|
178
|
+
wkb_type = struct.unpack(f"{fmt}I", wkb[1:5])[0]
|
|
179
|
+
if wkb_type == 1: # Point
|
|
180
|
+
x, y = struct.unpack(f"{fmt}dd", wkb[5:21])
|
|
181
|
+
return bool(west <= x <= east and south <= y <= north)
|
|
182
|
+
except (struct.error, IndexError):
|
|
183
|
+
pass
|
|
184
|
+
|
|
185
|
+
# For non-point geometries without shapely, include conservatively
|
|
186
|
+
return True
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _wkb_to_wkt(wkb: bytes) -> str | None:
    """Render a WKB geometry as WKT text.

    Prefers shapely for decoding. When shapely cannot be imported, the
    geometry is handed to ``_parse_wkb_point``, a minimal parser that only
    understands Point geometries.

    Parameters:
        wkb: Well-Known Binary geometry.

    Returns:
        WKT string, or None if conversion fails.
    """
    try:
        from shapely import from_wkb as decode_wkb

        rendered = str(decode_wkb(wkb).wkt)
    except ImportError:
        # shapely is not installed: use the minimal Point-only parser
        rendered = _parse_wkb_point(wkb)
    except Exception:
        # shapely is installed but could not decode this value
        rendered = None
    return rendered
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _parse_wkb_point(wkb: bytes) -> str | None:
|
|
216
|
+
"""Parse a WKB Point geometry to WKT without shapely.
|
|
217
|
+
|
|
218
|
+
Parameters:
|
|
219
|
+
wkb: Well-Known Binary bytes.
|
|
220
|
+
|
|
221
|
+
Returns:
|
|
222
|
+
WKT string if it's a Point, None otherwise.
|
|
223
|
+
"""
|
|
224
|
+
import struct
|
|
225
|
+
|
|
226
|
+
if len(wkb) < 21:
|
|
227
|
+
return None
|
|
228
|
+
try:
|
|
229
|
+
byte_order = wkb[0]
|
|
230
|
+
fmt = "<" if byte_order == 1 else ">"
|
|
231
|
+
wkb_type = struct.unpack(f"{fmt}I", wkb[1:5])[0]
|
|
232
|
+
if wkb_type == 1: # Point
|
|
233
|
+
x, y = struct.unpack(f"{fmt}dd", wkb[5:21])
|
|
234
|
+
return f"POINT ({x} {y})"
|
|
235
|
+
except (struct.error, IndexError):
|
|
236
|
+
pass
|
|
237
|
+
return None
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _query_features_sync(
    source: str,
    *,
    bbox: list[float] | None = None,
    columns: list[str] | None = None,
    limit: int | None = None,
    include_geometry: bool = True,
) -> QueryResult:
    """Execute a spatial/attribute query synchronously.

    When ``_build_bbox_filter`` can produce a pushdown expression, the bbox
    is applied by pyarrow while reading. Otherwise the primary geometry
    column is read and every row is tested with
    ``_geometry_intersects_bbox`` after the read. The geometry column is
    fetched for that post-filter even if the caller did not ask for it, and
    is removed again before results are built.

    Parameters:
        source: Path to a GeoParquet/Parquet file.
        bbox: Bounding box filter ``[west, south, east, north]`` in the file's CRS.
        columns: Columns to include in results. ``None`` returns all columns.
        limit: Maximum number of features to return.
        include_geometry: Whether to include geometry as WKT in results.

    Returns:
        Structured query result with matching features.

    Raises:
        VectorError: If the file cannot be read or query fails.
    """
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError as exc:
        raise VectorError(
            "pyarrow is required for vector queries: pip install earthforge[vector]"
        ) from exc

    # Only the footer is needed here; close the handle before the real read.
    try:
        with pq.ParquetFile(source) as pf:
            schema = pf.schema_arrow
            total_rows = pf.metadata.num_rows
            num_row_groups = pf.metadata.num_row_groups
    except Exception as exc:
        raise VectorError(f"Failed to open '{source}': {exc}") from exc

    geo_meta = _parse_geo_metadata(schema.metadata or {})
    primary_geom = geo_meta.get("primary_column", "geometry")

    # Build pyarrow filter for pushdown
    schema_names = set(schema.names)
    pa_filter = None
    if bbox:
        pa_filter = _build_bbox_filter(geo_meta, bbox, schema_names)

    # Without a pushdown expression the bbox has to be applied row by row
    # against the geometry column after reading.
    needs_post_filter = bool(bbox) and pa_filter is None

    # Determine columns to read. The geometry column is appended when it is
    # wanted in the output or required by the post-filter, but only if the
    # file actually has it (plain Parquet files do not).
    read_columns = columns
    geom_only_for_filter = False
    if (
        read_columns
        and primary_geom in schema_names
        and primary_geom not in read_columns
        and (include_geometry or needs_post_filter)
    ):
        read_columns = [*read_columns, primary_geom]
        geom_only_for_filter = not include_geometry

    # Read with filter pushdown via read_table (supports filters, unlike ParquetFile.read)
    try:
        table = pq.read_table(source, columns=read_columns, filters=pa_filter)
    except Exception as exc:
        raise VectorError(f"Query failed on '{source}': {exc}") from exc

    # Post-filter with geometry intersection if bbox provided but no pushdown
    if needs_post_filter and primary_geom in table.column_names:
        west, south, east, north = bbox
        mask = [
            # Non-bytes values (e.g. null geometries) cannot be tested; keep them.
            _geometry_intersects_bbox(raw, west, south, east, north)
            if isinstance(raw, bytes)
            else True
            for raw in table.column(primary_geom).to_pylist()
        ]
        # Explicit boolean type so an empty mask is still a valid filter.
        table = table.filter(pa.array(mask, type=pa.bool_()))

    if geom_only_for_filter and primary_geom in table.column_names:
        # The caller neither requested nor wants geometry; it was read only
        # to evaluate the bbox.
        table = table.select([c for c in table.column_names if c != primary_geom])

    # Apply limit
    if limit is not None and len(table) > limit:
        table = table.slice(0, limit)

    # Convert to feature dicts (materialise each column once, then walk rows)
    result_columns = table.column_names
    column_values = {name: table.column(name).to_pylist() for name in result_columns}
    features: list[dict[str, Any]] = []
    for row_index in range(table.num_rows):
        feature: dict[str, Any] = {}
        for col_name in result_columns:
            val = column_values[col_name][row_index]
            if col_name == primary_geom and isinstance(val, bytes):
                if include_geometry:
                    wkt = _wkb_to_wkt(val)
                    feature["geometry_wkt"] = wkt if wkt else "(binary)"
            else:
                feature[col_name] = val
        features.append(feature)

    return QueryResult(
        source=source,
        feature_count=len(features),
        columns=[c for c in result_columns if c != primary_geom or include_geometry],
        bbox_filter=bbox,
        features=features,
        total_rows=total_rows,
        # With pushdown the number of row groups actually read is not known here.
        row_groups_scanned=num_row_groups if pa_filter is None else None,
        row_groups_total=num_row_groups,
    )
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
async def query_features(
    source: str,
    *,
    bbox: list[float] | None = None,
    columns: list[str] | None = None,
    limit: int | None = None,
    include_geometry: bool = True,
) -> QueryResult:
    """Query features from a GeoParquet file without blocking the event loop.

    Async entry point: binds the arguments to ``_query_features_sync`` and
    runs that call on the running loop's default executor, awaiting its
    result.

    Parameters:
        source: Path to a GeoParquet/Parquet file.
        bbox: Bounding box filter ``[west, south, east, north]``.
        columns: Columns to include. ``None`` returns all.
        limit: Maximum features to return.
        include_geometry: Include geometry as WKT in results.

    Returns:
        Structured query result.

    Raises:
        VectorError: If the file cannot be read or query fails.
    """
    job = partial(
        _query_features_sync,
        source,
        bbox=bbox,
        columns=columns,
        limit=limit,
        include_geometry=include_geometry,
    )
    # Passing None as the executor selects the loop's default executor.
    return await asyncio.get_running_loop().run_in_executor(None, job)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: earthforge-vector
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: EarthForge vector format inspection (GeoParquet, FlatGeobuf, GeoJSON).
|
|
5
|
+
License-Expression: GPL-3.0-or-later
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: earthforge-core>=0.1.0
|
|
8
|
+
Requires-Dist: pyarrow>=14.0
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# earthforge-vector
|
|
12
|
+
|
|
13
|
+
Vector format inspection for EarthForge. Part of the [EarthForge](../../README.md) toolkit.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
earthforge/vector/__init__.py,sha256=GtTAK7s61ZFZDhvoJGyZFmo598-m1vQJLscZvZ1ARcw,249
|
|
2
|
+
earthforge/vector/convert.py,sha256=bCx8-MxmmfWasYxsrv0-z9a8QinxmTuxB1Hqvi8m41Q,11255
|
|
3
|
+
earthforge/vector/errors.py,sha256=meQNnjghRSbRAW6askDiZJ8uewQPvXCeDXOhuqheVTo,606
|
|
4
|
+
earthforge/vector/info.py,sha256=H5ltP-SfbOYlKEapy8JEvyySGkTmBHjxuAd0XhZi_Jw,8154
|
|
5
|
+
earthforge/vector/query.py,sha256=Kw0ZMBr_ygHhuXwShByGBvzj8OH8nBic5OUxjqxay1o,12532
|
|
6
|
+
earthforge_vector-0.1.0.dist-info/METADATA,sha256=8mO9tQYkRVV8LCwIMUfcEZAzoMXmZJd6o0QCGkksH9U,423
|
|
7
|
+
earthforge_vector-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
8
|
+
earthforge_vector-0.1.0.dist-info/RECORD,,
|