earthforge-vector 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ """EarthForge vector format inspection.
2
+
3
+ Provides deep metadata extraction for vector geospatial formats including
4
+ GeoParquet, FlatGeobuf, and GeoJSON. Uses pyarrow for Parquet/GeoParquet
5
+ introspection without loading full datasets into memory.
6
+ """
@@ -0,0 +1,356 @@
1
+ """Vector format conversion.
2
+
3
+ Converts between vector geospatial formats with a focus on producing valid
4
+ GeoParquet output. Supports Shapefile, GeoJSON, and other OGR-readable
5
+ formats as input. Writes GeoParquet with proper ``geo`` metadata including
6
+ CRS, geometry types, encoding, and bounding box.
7
+
8
+ Uses GDAL/OGR for reading source formats and pyarrow for writing Parquet.
9
+ geopandas is neither used nor required by this module.
10
+
11
+ Usage::
12
+
13
+ from earthforge.vector.convert import convert_vector
14
+
15
+ result = await convert_vector("buildings.shp", output="buildings.parquet")
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import asyncio
21
+ import json
22
+ from functools import partial
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ from pydantic import BaseModel, Field
27
+
28
+ from earthforge.vector.errors import VectorError
29
+
30
+
31
class ConvertResult(BaseModel):
    """Structured result from a vector format conversion.

    ``source``, ``output``, ``input_format``, ``output_format`` and
    ``feature_count`` are required; the remaining fields default to ``None``.

    Attributes:
        source: Input file path.
        output: Output file path.
        input_format: Source format name (e.g. ``"ESRI Shapefile"``).
        output_format: Target format (e.g. ``"geoparquet"``).
        feature_count: Number of features converted.
        geometry_type: Geometry type (e.g. ``"Polygon"``).
        crs: CRS identifier string.
        bbox: Bounding box ``[west, south, east, north]``.
        file_size_bytes: Output file size in bytes.
    """

    # Required fields.
    source: str = Field(title="Source")
    output: str = Field(title="Output")
    input_format: str = Field(title="Input Format")
    output_format: str = Field(title="Output Format")
    feature_count: int = Field(title="Features")
    # Optional fields; ``None`` means the value was not available.
    geometry_type: str | None = Field(default=None, title="Geometry Type")
    crs: str | None = Field(default=None, title="CRS")
    bbox: list[float] | None = Field(default=None, title="BBox")
    file_size_bytes: int | None = Field(default=None, title="Size (bytes)")
55
+
56
+
57
+ def _ogr_type_to_arrow(ogr_type: int) -> str:
58
+ """Map an OGR field type to a pyarrow type string.
59
+
60
+ Parameters:
61
+ ogr_type: OGR field type constant.
62
+
63
+ Returns:
64
+ Arrow type name string.
65
+ """
66
+ from osgeo import ogr
67
+
68
+ mapping = {
69
+ ogr.OFTInteger: "int32",
70
+ ogr.OFTInteger64: "int64",
71
+ ogr.OFTReal: "float64",
72
+ ogr.OFTString: "string",
73
+ ogr.OFTDate: "string",
74
+ ogr.OFTDateTime: "string",
75
+ ogr.OFTBinary: "binary",
76
+ }
77
+ return mapping.get(ogr_type, "string")
78
+
79
+
80
+ def _ogr_geom_type_name(ogr_geom_type: int) -> str:
81
+ """Convert OGR geometry type constant to human-readable name.
82
+
83
+ Parameters:
84
+ ogr_geom_type: OGR geometry type constant.
85
+
86
+ Returns:
87
+ Geometry type name.
88
+ """
89
+ from osgeo import ogr
90
+
91
+ mapping = {
92
+ ogr.wkbPoint: "Point",
93
+ ogr.wkbLineString: "LineString",
94
+ ogr.wkbPolygon: "Polygon",
95
+ ogr.wkbMultiPoint: "MultiPoint",
96
+ ogr.wkbMultiLineString: "MultiLineString",
97
+ ogr.wkbMultiPolygon: "MultiPolygon",
98
+ ogr.wkbGeometryCollection: "GeometryCollection",
99
+ ogr.wkbPoint25D: "Point",
100
+ ogr.wkbLineString25D: "LineString",
101
+ ogr.wkbPolygon25D: "Polygon",
102
+ ogr.wkbMultiPoint25D: "MultiPoint",
103
+ ogr.wkbMultiLineString25D: "MultiLineString",
104
+ ogr.wkbMultiPolygon25D: "MultiPolygon",
105
+ }
106
+ return mapping.get(ogr_geom_type, "Unknown")
107
+
108
+
109
+ def _extract_crs_info(spatial_ref: Any) -> tuple[str | None, dict[str, Any] | None]:
110
+ """Extract CRS identifier and PROJJSON from an OGR SpatialReference.
111
+
112
+ Parameters:
113
+ spatial_ref: OGR SpatialReference object.
114
+
115
+ Returns:
116
+ Tuple of (crs_string, projjson_dict).
117
+ """
118
+ if spatial_ref is None:
119
+ return None, None
120
+
121
+ # Try to get authority:code
122
+ auth_name = spatial_ref.GetAuthorityName(None)
123
+ auth_code = spatial_ref.GetAuthorityCode(None)
124
+ crs_string = f"{auth_name}:{auth_code}" if auth_name and auth_code else None
125
+
126
+ # Build PROJJSON for GeoParquet metadata
127
+ projjson: dict[str, Any] | None = None
128
+ try:
129
+ projjson_str = spatial_ref.ExportToPROJJSON()
130
+ if projjson_str:
131
+ projjson = json.loads(projjson_str)
132
+ except Exception:
133
+ # Fall back to building minimal PROJJSON
134
+ if crs_string:
135
+ projjson = {
136
+ "type": "GeographicCRS" if spatial_ref.IsGeographic() else "ProjectedCRS",
137
+ "name": spatial_ref.GetName() or crs_string,
138
+ "id": {"authority": auth_name, "code": int(auth_code) if auth_code else 0},
139
+ }
140
+
141
+ if crs_string is None and spatial_ref.GetName():
142
+ crs_string = spatial_ref.GetName()
143
+
144
+ return crs_string, projjson
145
+
146
+
147
def _convert_vector_sync(
    source: str,
    *,
    output: str | None = None,
    target_format: str = "geoparquet",
    compression: str = "snappy",
) -> ConvertResult:
    """Convert a vector file to GeoParquet synchronously.

    Reads the first layer of ``source`` with OGR, materialises all attribute
    values and WKB geometries in memory, and writes them as a Parquet table
    carrying a GeoParquet ``geo`` metadata entry.

    Parameters:
        source: Path to the input vector file (Shapefile, GeoJSON, etc.).
        output: Output file path. If ``None``, derives from source name.
        target_format: Target format (currently only ``"geoparquet"``).
        compression: Parquet compression codec (``"snappy"``, ``"zstd"``, ``"gzip"``).

    Returns:
        Structured conversion result.

    Raises:
        VectorError: If the source cannot be read or conversion fails.
    """
    if target_format != "geoparquet":
        raise VectorError(f"Unsupported target format: {target_format}")

    try:
        from osgeo import ogr
    except ImportError as exc:
        raise VectorError(
            "GDAL/OGR is required for vector conversion: install GDAL Python bindings"
        ) from exc

    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError as exc:
        raise VectorError(
            "pyarrow is required for GeoParquet output: pip install earthforge[vector]"
        ) from exc

    # Open source
    try:
        ds = ogr.Open(source)
    except RuntimeError as exc:
        raise VectorError(f"Failed to open vector file '{source}'") from exc
    if ds is None:
        raise VectorError(f"Failed to open vector file '{source}'")

    layer = None
    feature = None
    try:
        layer = ds.GetLayer(0)
        if layer is None:
            raise VectorError(f"No layers found in '{source}'")

        input_format = ds.GetDriver().GetName()
        layer_defn = layer.GetLayerDefn()
        geom_type = _ogr_geom_type_name(layer.GetGeomType())
        crs_string, projjson = _extract_crs_info(layer.GetSpatialRef())

        # Build field definitions
        field_names: list[str] = []
        field_types: list[str] = []
        for i in range(layer_defn.GetFieldCount()):
            field_def = layer_defn.GetFieldDefn(i)
            field_names.append(field_def.GetName())
            field_types.append(_ogr_type_to_arrow(field_def.GetType()))

        # Read all features
        arrays: dict[str, list[Any]] = {name: [] for name in field_names}
        geometries: list[bytes | None] = []
        has_geometry = False

        layer.ResetReading()
        feature = layer.GetNextFeature()
        while feature is not None:
            for i, name in enumerate(field_names):
                column = arrays[name]
                if not feature.IsFieldSet(i) or feature.IsFieldNull(i):
                    column.append(None)
                    continue
                kind = field_types[i]
                if kind == "int32":
                    column.append(feature.GetFieldAsInteger(i))
                elif kind == "int64":
                    column.append(feature.GetFieldAsInteger64(i))
                elif kind == "float64":
                    column.append(feature.GetFieldAsDouble(i))
                elif kind == "binary":
                    # Keep the raw bytes; the string accessor would store a
                    # textual rendering of the blob instead of its content.
                    column.append(bytes(feature.GetFieldAsBinary(i)))
                else:
                    column.append(feature.GetFieldAsString(i))

            # Read geometry as WKB. A missing geometry is stored as null:
            # an empty byte string is not valid WKB and breaks readers.
            geom = feature.GetGeometryRef()
            if geom is None:
                geometries.append(None)
            else:
                geometries.append(bytes(geom.ExportToWkb()))
                has_geometry = True

            feature = layer.GetNextFeature()

        # Only report an extent when at least one geometry was read; an
        # empty layer has no meaningful bounding box.
        bbox: list[float] | None = None
        if has_geometry:
            try:
                extent = layer.GetExtent()  # (xmin, xmax, ymin, ymax)
            except RuntimeError:
                extent = None
            if extent:
                bbox = [extent[0], extent[2], extent[1], extent[3]]
    finally:
        # Drop the OGR handles on every path so the dataset is released even
        # when reading fails.
        feature = layer = ds = None

    actual_count = len(geometries)

    # Build pyarrow table
    arrow_types = {
        "int32": pa.int32(),
        "int64": pa.int64(),
        "float64": pa.float64(),
        "string": pa.string(),
        "binary": pa.binary(),
    }
    pa_columns: dict[str, Any] = {
        name: pa.array(arrays[name], type=arrow_types.get(arrow_type, pa.string()))
        for name, arrow_type in zip(field_names, field_types, strict=True)
    }
    pa_columns["geometry"] = pa.array(geometries, type=pa.binary())
    table = pa.table(pa_columns)

    # Build GeoParquet metadata. The spec asks for an empty list when the
    # geometry types are not known, so "Unknown" is never written.
    # NOTE(review): 2.5D layers are reported with their 2D name (no " Z"
    # suffix) because _ogr_geom_type_name collapses them — confirm intent.
    geometry_meta: dict[str, Any] = {
        "encoding": "WKB",
        "geometry_types": [] if geom_type == "Unknown" else [geom_type],
    }
    if bbox:
        geometry_meta["bbox"] = bbox
    if projjson:
        geometry_meta["crs"] = projjson
    geo_metadata: dict[str, Any] = {
        "version": "1.1.0",
        "primary_column": "geometry",
        "columns": {"geometry": geometry_meta},
    }

    # Attach geo metadata to schema
    schema_metadata = dict(table.schema.metadata or {})
    schema_metadata[b"geo"] = json.dumps(geo_metadata).encode("utf-8")
    table = table.replace_schema_metadata(schema_metadata)

    # Determine output path
    if output is None:
        output = str(Path(source).with_suffix(".parquet"))

    # Write GeoParquet
    try:
        pq.write_table(table, output, compression=compression)
    except Exception as exc:
        raise VectorError(f"Failed to write GeoParquet '{output}': {exc}") from exc

    # The size is informational only; a failed stat must not fail the conversion.
    file_size: int | None = None
    try:
        file_size = Path(output).stat().st_size
    except OSError:
        pass

    return ConvertResult(
        source=source,
        output=output,
        input_format=input_format,
        output_format="geoparquet",
        feature_count=actual_count,
        geometry_type=geom_type,
        crs=crs_string,
        bbox=bbox,
        file_size_bytes=file_size,
    )
320
+
321
+
322
async def convert_vector(
    source: str,
    *,
    output: str | None = None,
    target_format: str = "geoparquet",
    compression: str = "snappy",
) -> ConvertResult:
    """Convert a vector file to GeoParquet without blocking the event loop.

    The blocking conversion (``_convert_vector_sync``) is handed to the
    running loop's default executor and awaited.

    Parameters:
        source: Path to the input vector file.
        output: Output file path. If ``None``, replaces extension with ``.parquet``.
        target_format: Target format (default: ``"geoparquet"``).
        compression: Parquet compression codec (default: ``"snappy"``).

    Returns:
        Structured conversion result.

    Raises:
        VectorError: If the conversion fails.
    """
    job = partial(
        _convert_vector_sync,
        source,
        output=output,
        target_format=target_format,
        compression=compression,
    )
    running_loop = asyncio.get_running_loop()
    result = await running_loop.run_in_executor(None, job)
    return result
@@ -0,0 +1,21 @@
1
+ """Vector-specific error types.
2
+
3
+ All exceptions inherit from :class:`~earthforge.core.errors.EarthForgeError`
4
+ so the CLI can catch them uniformly and map to appropriate exit codes.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from earthforge.core.errors import EarthForgeError
10
+
11
+
12
class VectorError(EarthForgeError):
    """Base error for vector operations.

    Adds no behavior of its own: it only supplies a vector-specific default
    exit code and forwards both arguments to ``EarthForgeError``.

    Parameters:
        message: Human-readable error description.
        exit_code: Process exit code (default: 20).
    """

    def __init__(self, message: str, *, exit_code: int = 20) -> None:
        # exit_code is keyword-only; callers override it only when needed.
        super().__init__(message, exit_code=exit_code)
@@ -0,0 +1,245 @@
1
+ """Deep metadata extraction for vector geospatial formats.
2
+
3
+ Reads Parquet/GeoParquet file metadata via pyarrow without loading data into
4
+ memory. Extracts schema, row counts, geometry columns, CRS, bounding box, and
5
+ encoding information from GeoParquet ``geo`` metadata.
6
+
7
+ For non-Parquet vector formats (GeoJSON, FlatGeobuf), provides basic file-level
8
+ metadata. Deep inspection of those formats may be added in later milestones.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import json
15
+ from functools import partial
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from pydantic import BaseModel, Field
20
+
21
+ from earthforge.vector.errors import VectorError
22
+
23
+
24
class ColumnInfo(BaseModel):
    """Metadata for a single column in a vector dataset.

    Attributes:
        name: Column name.
        type: Arrow type string (e.g. ``"int64"``, ``"binary"``).
        is_geometry: Whether this column contains geometry data
            (defaults to ``False``).
    """

    name: str = Field(title="Column")
    type: str = Field(title="Type")
    is_geometry: bool = Field(default=False, title="Geometry")
36
+
37
+
38
class VectorInfo(BaseModel):
    """Structured metadata for a vector geospatial file.

    ``source``, ``format``, ``row_count``, ``num_columns`` and ``columns`` are
    required. ``geometry_types`` defaults to an empty list and every other
    field defaults to ``None``.

    Attributes:
        source: The file path that was inspected.
        format: Detected vector format (e.g. ``"geoparquet"``, ``"parquet"``).
        row_count: Total number of rows/features.
        num_columns: Total number of columns.
        columns: Per-column metadata.
        geometry_column: Name of the primary geometry column, if any.
        geometry_types: List of geometry types found (e.g. ``["Point"]``).
        crs: CRS string from GeoParquet metadata, if available.
        bbox: Bounding box ``[west, south, east, north]``, if available.
        encoding: Geometry encoding (e.g. ``"WKB"``), if available.
        num_row_groups: Number of Parquet row groups.
        compression: Parquet compression codec, if applicable.
        file_size_bytes: File size in bytes.
    """

    # Required fields.
    source: str = Field(title="Source")
    format: str = Field(title="Format")
    row_count: int = Field(title="Rows")
    num_columns: int = Field(title="Columns")
    columns: list[ColumnInfo] = Field(title="Column Details")
    # Geometry-related fields; left at their defaults when not available.
    geometry_column: str | None = Field(default=None, title="Geometry Column")
    geometry_types: list[str] = Field(default_factory=list, title="Geometry Types")
    crs: str | None = Field(default=None, title="CRS")
    bbox: list[float] | None = Field(default=None, title="Bounding Box")
    encoding: str | None = Field(default=None, title="Encoding")
    # Physical file layout.
    num_row_groups: int | None = Field(default=None, title="Row Groups")
    compression: str | None = Field(default=None, title="Compression")
    file_size_bytes: int | None = Field(default=None, title="Size (bytes)")
70
+
71
+
72
def _read_parquet_info(source: str) -> VectorInfo:
    """Collect metadata from a Parquet/GeoParquet file synchronously.

    Only the Parquet footer and Arrow schema are consulted. GeoParquet
    details (geometry column, CRS, bbox, encoding) come from the ``geo``
    entry of the schema metadata when it is present.

    Parameters:
        source: Path to a Parquet file.

    Returns:
        Structured vector metadata.

    Raises:
        VectorError: If pyarrow is missing or the file cannot be opened as
            Parquet.
    """
    try:
        import pyarrow.parquet as pq
    except ImportError as exc:
        msg = "pyarrow is required for Parquet inspection: pip install pyarrow"
        raise VectorError(msg) from exc

    try:
        parquet_file = pq.ParquetFile(source)
    except Exception as exc:
        msg = f"Failed to read Parquet file '{source}': {exc}"
        raise VectorError(msg) from exc

    arrow_schema = parquet_file.schema_arrow
    footer = parquet_file.metadata
    row_total = footer.num_rows
    row_group_total = footer.num_row_groups

    # The codec is sampled from the first column chunk of the first row group.
    codec: str | None = None
    if row_group_total > 0 and arrow_schema:
        try:
            first_group = footer.row_group(0)
            if first_group.num_columns > 0:
                codec = first_group.column(0).compression
        except Exception:  # noqa: S110 — best-effort metadata extraction
            pass

    # GeoParquet section
    geo = _parse_geo_metadata(arrow_schema.metadata or {})
    primary = geo.get("primary_column")
    geom_names: set[str] = set()
    geom_types: list[str] = []
    crs_text: str | None = None
    bounds: list[float] | None = None
    geom_encoding: str | None = None

    if "columns" in geo:
        per_column = geo["columns"]
        if primary:
            geom_names.add(primary)
            details = per_column.get(primary, {})
            geom_types = details.get("geometry_types", [])
            geom_encoding = details.get("encoding")

            raw_bounds = details.get("bbox")
            if isinstance(raw_bounds, list) and len(raw_bounds) == 4:
                bounds = [float(coord) for coord in raw_bounds]

            raw_crs = details.get("crs")
            if isinstance(raw_crs, str):
                crs_text = raw_crs
            elif isinstance(raw_crs, dict):
                # PROJJSON object: reduce it to a short identifier.
                crs_text = _extract_crs_string(raw_crs)
        # Every column listed in the geo metadata counts as a geometry column.
        geom_names.update(per_column.keys())

    column_details = [
        ColumnInfo(
            name=arrow_field.name,
            type=str(arrow_field.type),
            is_geometry=arrow_field.name in geom_names,
        )
        for arrow_field in arrow_schema
    ]

    # File size is optional; stat failures are ignored.
    size_on_disk: int | None = None
    try:
        size_on_disk = Path(source).stat().st_size
    except OSError:
        pass

    return VectorInfo(
        source=source,
        format="geoparquet" if primary else "parquet",
        row_count=row_total,
        num_columns=len(arrow_schema),
        columns=column_details,
        geometry_column=primary,
        geometry_types=geom_types,
        crs=crs_text,
        bbox=bounds,
        encoding=geom_encoding,
        num_row_groups=row_group_total,
        compression=codec,
        file_size_bytes=size_on_disk,
    )
180
+
181
+
182
+ def _parse_geo_metadata(metadata: dict[bytes, bytes]) -> dict[str, Any]:
183
+ """Parse the ``geo`` key from Parquet file metadata.
184
+
185
+ Parameters:
186
+ metadata: Raw Parquet schema metadata (bytes keys and values).
187
+
188
+ Returns:
189
+ Parsed GeoParquet metadata dict, or empty dict if not present.
190
+ """
191
+ geo_bytes = metadata.get(b"geo")
192
+ if geo_bytes is None:
193
+ return {}
194
+ try:
195
+ result: dict[str, Any] = json.loads(geo_bytes)
196
+ return result
197
+ except (json.JSONDecodeError, UnicodeDecodeError):
198
+ return {}
199
+
200
+
201
+ def _extract_crs_string(crs_obj: dict[str, Any]) -> str:
202
+ """Extract a human-readable CRS identifier from PROJJSON.
203
+
204
+ Tries ``id.code`` (e.g. ``"EPSG:4326"``), then ``name``, then falls
205
+ back to a truncated JSON representation.
206
+
207
+ Parameters:
208
+ crs_obj: PROJJSON CRS object.
209
+
210
+ Returns:
211
+ CRS identifier string.
212
+ """
213
+ # Try EPSG-style authority:code
214
+ crs_id = crs_obj.get("id", {})
215
+ if isinstance(crs_id, dict):
216
+ authority = crs_id.get("authority")
217
+ code = crs_id.get("code")
218
+ if authority and code:
219
+ return f"{authority}:{code}"
220
+
221
+ # Fall back to name
222
+ name = crs_obj.get("name")
223
+ if isinstance(name, str):
224
+ return name
225
+
226
+ return json.dumps(crs_obj)[:100]
227
+
228
+
229
async def inspect_vector(source: str) -> VectorInfo:
    """Inspect a vector file and return structured metadata.

    The blocking pyarrow read (``_read_parquet_info``) is dispatched to the
    running loop's default executor so the event loop stays responsive. The
    file is always read as Parquet/GeoParquet.

    Parameters:
        source: Path to a vector file.

    Returns:
        Structured vector metadata.

    Raises:
        VectorError: If the file cannot be read.
    """
    reader = partial(_read_parquet_info, source)
    running_loop = asyncio.get_running_loop()
    info = await running_loop.run_in_executor(None, reader)
    return info
@@ -0,0 +1,384 @@
1
+ """Spatial and attribute queries against GeoParquet files.
2
+
3
+ Leverages pyarrow's row-group-level statistics and predicate pushdown to
4
+ read only the data that matches the query — critical for large files where
5
+ reading the full dataset would be impractical.
6
+
7
+ For bbox queries, the filter is applied against the ``bbox`` column covering
8
+ structure embedded in GeoParquet metadata. If per-row bounding box columns
9
+ (``bbox.xmin``, ``bbox.ymin``, etc.) are present, pyarrow can skip entire
10
+ row groups whose spatial extent doesn't intersect the query box.
11
+
12
+ Usage::
13
+
14
+ from earthforge.vector.query import query_features
15
+
16
+ result = await query_features("buildings.parquet", bbox=[-85, 37, -84, 38])
17
+ print(result.feature_count)
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import asyncio
23
+ import json
24
+ from functools import partial
25
+ from typing import Any
26
+
27
+ from pydantic import BaseModel, Field
28
+
29
+ from earthforge.vector.errors import VectorError
30
+
31
+
32
class QueryResult(BaseModel):
    """Structured result from a vector spatial/attribute query.

    ``source``, ``feature_count``, ``columns`` and ``total_rows`` are
    required. ``features`` defaults to an empty list and the remaining fields
    default to ``None``.

    Attributes:
        source: The file that was queried.
        feature_count: Number of features matching the query.
        columns: Column names in the result.
        bbox_filter: The bounding box filter applied, if any.
        features: List of feature dicts (geometry as WKT if available).
        total_rows: Total rows in the source file (before filtering).
        row_groups_scanned: Number of Parquet row groups actually read.
        row_groups_total: Total row groups in the file.
    """

    source: str = Field(title="Source")
    feature_count: int = Field(title="Features")
    columns: list[str] = Field(title="Columns")
    bbox_filter: list[float] | None = Field(default=None, title="BBox Filter")
    features: list[dict[str, Any]] = Field(default_factory=list, title="Features")
    total_rows: int = Field(title="Total Rows")
    # Row-group statistics; ``None`` when the value is not reported.
    row_groups_scanned: int | None = Field(default=None, title="Row Groups Scanned")
    row_groups_total: int | None = Field(default=None, title="Row Groups Total")
54
+
55
+
56
+ def _parse_geo_metadata(metadata: dict[bytes, bytes]) -> dict[str, Any]:
57
+ """Parse the GeoParquet ``geo`` metadata key.
58
+
59
+ Parameters:
60
+ metadata: Raw Parquet file-level metadata.
61
+
62
+ Returns:
63
+ Parsed geo metadata dict, or empty dict.
64
+ """
65
+ geo_bytes = metadata.get(b"geo")
66
+ if geo_bytes is None:
67
+ return {}
68
+ try:
69
+ result: dict[str, Any] = json.loads(geo_bytes)
70
+ return result
71
+ except (json.JSONDecodeError, UnicodeDecodeError):
72
+ return {}
73
+
74
+
75
+ def _build_bbox_filter(
76
+ geo_meta: dict[str, Any],
77
+ bbox: list[float],
78
+ schema_names: set[str],
79
+ ) -> Any:
80
+ """Build a pyarrow filter expression for a bounding box query.
81
+
82
+ Checks for GeoParquet bbox column covering (``covering.bbox``), which
83
+ provides per-row min/max coordinates that pyarrow can use for predicate
84
+ pushdown at the row-group level.
85
+
86
+ Falls back to no filter if bbox covering is not present — the full scan
87
+ result will then be post-filtered in Python.
88
+
89
+ Parameters:
90
+ geo_meta: Parsed GeoParquet metadata.
91
+ bbox: Query bounding box ``[west, south, east, north]``.
92
+ schema_names: Set of column names present in the file schema.
93
+
94
+ Returns:
95
+ A pyarrow compute expression, or ``None`` if no pushdown is possible.
96
+ """
97
+ import pyarrow.compute as pc
98
+
99
+ west, south, east, north = bbox
100
+
101
+ # Check for GeoParquet bbox covering metadata
102
+ primary_col = geo_meta.get("primary_column", "geometry")
103
+ col_meta = geo_meta.get("columns", {}).get(primary_col, {})
104
+ covering = col_meta.get("covering", {})
105
+ bbox_covering = covering.get("bbox", {})
106
+
107
+ xmin_col = bbox_covering.get("xmin")
108
+ ymin_col = bbox_covering.get("ymin")
109
+ xmax_col = bbox_covering.get("xmax")
110
+ ymax_col = bbox_covering.get("ymax")
111
+
112
+ if all([xmin_col, ymin_col, xmax_col, ymax_col]):
113
+ # Use covering columns for pushdown: feature bbox intersects query bbox
114
+ return (
115
+ (pc.field(xmin_col) <= east)
116
+ & (pc.field(xmax_col) >= west)
117
+ & (pc.field(ymin_col) <= north)
118
+ & (pc.field(ymax_col) >= south)
119
+ )
120
+
121
+ # Check for common bbox struct columns (Overture Maps pattern)
122
+ for col_set in [
123
+ ("bbox.xmin", "bbox.ymin", "bbox.xmax", "bbox.ymax"),
124
+ ("bbox.minx", "bbox.miny", "bbox.maxx", "bbox.maxy"),
125
+ ]:
126
+ if all(c in schema_names for c in col_set):
127
+ xmin_c, ymin_c, xmax_c, ymax_c = col_set
128
+ return (
129
+ (pc.field(xmin_c) <= east)
130
+ & (pc.field(xmax_c) >= west)
131
+ & (pc.field(ymin_c) <= north)
132
+ & (pc.field(ymax_c) >= south)
133
+ )
134
+
135
+ # No pushdown columns available — caller will post-filter with geometry
136
+ return None
137
+
138
+
139
+ def _geometry_intersects_bbox(
140
+ wkb: bytes,
141
+ west: float,
142
+ south: float,
143
+ east: float,
144
+ north: float,
145
+ ) -> bool:
146
+ """Check if a WKB geometry's envelope intersects a bounding box.
147
+
148
+ Uses shapely for full geometry intersection if available. Falls back to
149
+ a minimal WKB point parser that checks if the point lies within the bbox.
150
+
151
+ Parameters:
152
+ wkb: Well-Known Binary geometry bytes.
153
+ west: Query bbox west.
154
+ south: Query bbox south.
155
+ east: Query bbox east.
156
+ north: Query bbox north.
157
+
158
+ Returns:
159
+ True if the geometry intersects the query bbox.
160
+ """
161
+ try:
162
+ from shapely import from_wkb
163
+ from shapely.geometry import box
164
+
165
+ geom = from_wkb(wkb)
166
+ query_box = box(west, south, east, north)
167
+ return bool(geom.intersects(query_box))
168
+ except ImportError:
169
+ pass
170
+
171
+ # Fallback: parse WKB point coordinates for simple containment check
172
+ import struct
173
+
174
+ if len(wkb) >= 21:
175
+ try:
176
+ byte_order = wkb[0]
177
+ fmt = "<" if byte_order == 1 else ">"
178
+ wkb_type = struct.unpack(f"{fmt}I", wkb[1:5])[0]
179
+ if wkb_type == 1: # Point
180
+ x, y = struct.unpack(f"{fmt}dd", wkb[5:21])
181
+ return bool(west <= x <= east and south <= y <= north)
182
+ except (struct.error, IndexError):
183
+ pass
184
+
185
+ # For non-point geometries without shapely, include conservatively
186
+ return True
187
+
188
+
189
def _wkb_to_wkt(wkb: bytes) -> str | None:
    """Render WKB bytes as a WKT string for output.

    shapely does the conversion when it can be imported. If it cannot, the
    minimal point-only parser (``_parse_wkb_point``) is used instead.

    Parameters:
        wkb: Well-Known Binary geometry.

    Returns:
        WKT string, or None if conversion fails.
    """
    try:
        from shapely import from_wkb

        return str(from_wkb(wkb).wkt)
    except ImportError:
        # shapely unavailable: fall back to the minimal Point parser.
        return _parse_wkb_point(wkb)
    except Exception:
        return None
213
+
214
+
215
+ def _parse_wkb_point(wkb: bytes) -> str | None:
216
+ """Parse a WKB Point geometry to WKT without shapely.
217
+
218
+ Parameters:
219
+ wkb: Well-Known Binary bytes.
220
+
221
+ Returns:
222
+ WKT string if it's a Point, None otherwise.
223
+ """
224
+ import struct
225
+
226
+ if len(wkb) < 21:
227
+ return None
228
+ try:
229
+ byte_order = wkb[0]
230
+ fmt = "<" if byte_order == 1 else ">"
231
+ wkb_type = struct.unpack(f"{fmt}I", wkb[1:5])[0]
232
+ if wkb_type == 1: # Point
233
+ x, y = struct.unpack(f"{fmt}dd", wkb[5:21])
234
+ return f"POINT ({x} {y})"
235
+ except (struct.error, IndexError):
236
+ pass
237
+ return None
238
+
239
+
240
def _query_features_sync(
    source: str,
    *,
    bbox: list[float] | None = None,
    columns: list[str] | None = None,
    limit: int | None = None,
    include_geometry: bool = True,
) -> QueryResult:
    """Execute a spatial/attribute query synchronously.

    When the file exposes bbox covering columns, the bbox is pushed down as a
    pyarrow filter. Otherwise the geometry column is read and each geometry is
    tested in Python; the geometry column is fetched for that purpose even if
    the caller did not ask for it, and is left out of the output when
    ``include_geometry`` is false.

    Parameters:
        source: Path to a GeoParquet/Parquet file.
        bbox: Bounding box filter ``[west, south, east, north]`` in the file's CRS.
        columns: Columns to include in results. ``None`` returns all columns.
        limit: Maximum number of features to return.
        include_geometry: Whether to include geometry as WKT in results.

    Returns:
        Structured query result with matching features.

    Raises:
        VectorError: If the file cannot be read, the bbox does not have four
            values, or the query fails.
    """
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError as exc:
        raise VectorError(
            "pyarrow is required for vector queries: pip install earthforge[vector]"
        ) from exc

    # An empty bbox keeps meaning "no filter"; any other length is an error
    # that would otherwise surface as a bare unpacking ValueError.
    if bbox and len(bbox) != 4:
        raise VectorError(
            f"bbox must contain exactly 4 values [west, south, east, north], got {len(bbox)}"
        )

    try:
        pf = pq.ParquetFile(source)
    except Exception as exc:
        raise VectorError(f"Failed to open '{source}': {exc}") from exc

    schema = pf.schema_arrow
    geo_meta = _parse_geo_metadata(schema.metadata or {})
    primary_geom = geo_meta.get("primary_column", "geometry")
    total_rows = pf.metadata.num_rows
    num_row_groups = pf.metadata.num_row_groups
    schema_names = set(schema.names)
    has_geometry = primary_geom in schema_names

    # Build pyarrow filter for pushdown
    pa_filter = _build_bbox_filter(geo_meta, bbox, schema_names) if bbox else None
    needs_post_filter = bool(bbox) and pa_filter is None and has_geometry

    # Determine columns to read. The geometry column is added when it will
    # be emitted or when the Python-side bbox test needs it, but only if the
    # file actually has one.
    read_columns = None if columns is None else list(columns)
    if (
        read_columns
        and has_geometry
        and primary_geom not in read_columns
        and (include_geometry or needs_post_filter)
    ):
        read_columns.append(primary_geom)

    # Read with filter pushdown via read_table (supports filters, unlike ParquetFile.read)
    try:
        table = pq.read_table(source, columns=read_columns, filters=pa_filter)
    except Exception as exc:
        raise VectorError(f"Query failed on '{source}': {exc}") from exc

    # Post-filter with geometry intersection if bbox provided but no pushdown
    if needs_post_filter and primary_geom in table.column_names:
        west, south, east, north = bbox
        mask = []
        for raw in table.column(primary_geom).to_pylist():
            if raw is None:
                # A missing geometry cannot intersect the query box.
                mask.append(False)
            elif isinstance(raw, bytes):
                mask.append(_geometry_intersects_bbox(raw, west, south, east, north))
            else:
                # Non-WKB encodings cannot be tested here; keep the row.
                mask.append(True)
        # Explicit type: an empty mask would otherwise be inferred as null.
        table = table.filter(pa.array(mask, type=pa.bool_()))

    # Apply limit
    if limit is not None and len(table) > limit:
        table = table.slice(0, limit)

    # Convert to feature dicts
    result_columns = table.column_names
    features: list[dict[str, Any]] = []
    for row in table.to_pylist():
        feature: dict[str, Any] = {}
        for col_name, val in row.items():
            if col_name == primary_geom:
                if not include_geometry:
                    continue
                if isinstance(val, bytes):
                    feature["geometry_wkt"] = _wkb_to_wkt(val) or "(binary)"
                    continue
            feature[col_name] = val
        features.append(feature)

    return QueryResult(
        source=source,
        feature_count=len(features),
        columns=[c for c in result_columns if c != primary_geom or include_geometry],
        bbox_filter=bbox,
        features=features,
        total_rows=total_rows,
        # With pushdown the number of row groups actually read is not known.
        row_groups_scanned=num_row_groups if pa_filter is None else None,
        row_groups_total=num_row_groups,
    )
343
+
344
+
345
async def query_features(
    source: str,
    *,
    bbox: list[float] | None = None,
    columns: list[str] | None = None,
    limit: int | None = None,
    include_geometry: bool = True,
) -> QueryResult:
    """Query features from a GeoParquet file without blocking the event loop.

    The blocking query (``_query_features_sync``) is handed to the running
    loop's default executor and awaited.

    Parameters:
        source: Path to a GeoParquet/Parquet file.
        bbox: Bounding box filter ``[west, south, east, north]``.
        columns: Columns to include. ``None`` returns all.
        limit: Maximum features to return.
        include_geometry: Include geometry as WKT in results.

    Returns:
        Structured query result.

    Raises:
        VectorError: If the file cannot be read or query fails.
    """
    job = partial(
        _query_features_sync,
        source,
        bbox=bbox,
        columns=columns,
        limit=limit,
        include_geometry=include_geometry,
    )
    running_loop = asyncio.get_running_loop()
    result = await running_loop.run_in_executor(None, job)
    return result
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: earthforge-vector
3
+ Version: 0.1.0
4
+ Summary: EarthForge vector format inspection (GeoParquet, FlatGeobuf, GeoJSON).
5
+ License-Expression: GPL-3.0-or-later
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: earthforge-core>=0.1.0
8
+ Requires-Dist: pyarrow>=14.0
9
+ Description-Content-Type: text/markdown
10
+
11
+ # earthforge-vector
12
+
13
+ Vector format inspection for EarthForge. Part of the [EarthForge](../../README.md) toolkit.
@@ -0,0 +1,8 @@
1
+ earthforge/vector/__init__.py,sha256=GtTAK7s61ZFZDhvoJGyZFmo598-m1vQJLscZvZ1ARcw,249
2
+ earthforge/vector/convert.py,sha256=bCx8-MxmmfWasYxsrv0-z9a8QinxmTuxB1Hqvi8m41Q,11255
3
+ earthforge/vector/errors.py,sha256=meQNnjghRSbRAW6askDiZJ8uewQPvXCeDXOhuqheVTo,606
4
+ earthforge/vector/info.py,sha256=H5ltP-SfbOYlKEapy8JEvyySGkTmBHjxuAd0XhZi_Jw,8154
5
+ earthforge/vector/query.py,sha256=Kw0ZMBr_ygHhuXwShByGBvzj8OH8nBic5OUxjqxay1o,12532
6
+ earthforge_vector-0.1.0.dist-info/METADATA,sha256=8mO9tQYkRVV8LCwIMUfcEZAzoMXmZJd6o0QCGkksH9U,423
7
+ earthforge_vector-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
8
+ earthforge_vector-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any