earthforge-vector 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ *.egg
6
+ dist/
7
+ build/
8
+ *.whl
9
+
10
+ # Virtual environments
11
+ .venv/
12
+ venv/
13
+
14
+ # IDE
15
+ .vscode/
16
+ .idea/
17
+ *.swp
18
+ *.swo
19
+
20
+ # Testing
21
+ .pytest_cache/
22
+ .coverage
23
+ htmlcov/
24
+ .mypy_cache/
25
+
26
+ # OS
27
+ .DS_Store
28
+ Thumbs.db
29
+
30
+ # Claude
31
+ .claude/
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: earthforge-vector
3
+ Version: 0.1.0
4
+ Summary: EarthForge vector format inspection (GeoParquet, FlatGeobuf, GeoJSON).
5
+ License-Expression: GPL-3.0-or-later
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: earthforge-core>=0.1.0
8
+ Requires-Dist: pyarrow>=14.0
9
+ Description-Content-Type: text/markdown
10
+
11
+ # earthforge-vector
12
+
13
+ Vector format inspection for EarthForge. Part of the [EarthForge](../../README.md) toolkit.
@@ -0,0 +1,3 @@
1
+ # earthforge-vector
2
+
3
+ Vector format inspection for EarthForge. Part of the [EarthForge](../../README.md) toolkit.
@@ -0,0 +1,18 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "earthforge-vector"
7
+ version = "0.1.0"
8
+ description = "EarthForge vector format inspection (GeoParquet, FlatGeobuf, GeoJSON)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = "GPL-3.0-or-later"
12
+ dependencies = [
13
+ "earthforge-core>=0.1.0",
14
+ "pyarrow>=14.0",
15
+ ]
16
+
17
+ [tool.hatch.build.targets.wheel]
18
+ packages = ["src/earthforge"]
@@ -0,0 +1,6 @@
1
+ """EarthForge vector format inspection.
2
+
3
+ Provides deep metadata extraction for vector geospatial formats including
4
+ GeoParquet, FlatGeobuf, and GeoJSON. Uses pyarrow for Parquet/GeoParquet
5
+ introspection without loading full datasets into memory.
6
+ """
@@ -0,0 +1,356 @@
1
+ """Vector format conversion.
2
+
3
+ Converts between vector geospatial formats with a focus on producing valid
4
+ GeoParquet output. Supports Shapefile, GeoJSON, and other OGR-readable
5
+ formats as input. Writes GeoParquet with proper ``geo`` metadata including
6
+ CRS, geometry types, encoding, and bounding box.
7
+
8
+ Uses GDAL/OGR for reading source formats and pyarrow for writing Parquet.
9
+ Falls back to geopandas if available, but does not require it.
10
+
11
+ Usage::
12
+
13
+ from earthforge.vector.convert import convert_vector
14
+
15
+ result = await convert_vector("buildings.shp", output="buildings.parquet")
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import asyncio
21
+ import json
22
+ from functools import partial
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ from pydantic import BaseModel, Field
27
+
28
+ from earthforge.vector.errors import VectorError
29
+
30
+
31
class ConvertResult(BaseModel):
    """Structured result from a vector format conversion.

    Attributes:
        source: Input file path.
        output: Output file path.
        input_format: Source format name (e.g. ``"ESRI Shapefile"``).
        output_format: Target format (e.g. ``"geoparquet"``).
        feature_count: Number of features converted.
        geometry_type: Geometry type (e.g. ``"Polygon"``).
        crs: CRS identifier string.
        bbox: Bounding box ``[west, south, east, north]``.
        file_size_bytes: Output file size in bytes.
    """

    # Required fields — always populated by the converter.
    source: str = Field(title="Source")
    output: str = Field(title="Output")
    input_format: str = Field(title="Input Format")
    output_format: str = Field(title="Output Format")
    feature_count: int = Field(title="Features")
    # Optional fields — left as None when the source does not provide
    # the information (e.g. no geometry column, unknown CRS).
    geometry_type: str | None = Field(default=None, title="Geometry Type")
    crs: str | None = Field(default=None, title="CRS")
    bbox: list[float] | None = Field(default=None, title="BBox")
    file_size_bytes: int | None = Field(default=None, title="Size (bytes)")
55
+
56
+
57
def _ogr_type_to_arrow(ogr_type: int) -> str:
    """Map an OGR field type to a pyarrow type string.

    Date, datetime, and any unrecognized OGR type all degrade to the
    Arrow ``"string"`` type.

    Parameters:
        ogr_type: OGR field type constant.

    Returns:
        Arrow type name string.
    """
    from osgeo import ogr

    known_pairs = (
        (ogr.OFTInteger, "int32"),
        (ogr.OFTInteger64, "int64"),
        (ogr.OFTReal, "float64"),
        (ogr.OFTString, "string"),
        (ogr.OFTDate, "string"),
        (ogr.OFTDateTime, "string"),
        (ogr.OFTBinary, "binary"),
    )
    for ogr_constant, arrow_name in known_pairs:
        if ogr_type == ogr_constant:
            return arrow_name
    return "string"
78
+
79
+
80
def _ogr_geom_type_name(ogr_geom_type: int) -> str:
    """Convert OGR geometry type constant to human-readable name.

    2.5D (Z-coordinate) variants collapse to their flat 2D names; any
    unrecognized constant maps to ``"Unknown"``.

    Parameters:
        ogr_geom_type: OGR geometry type constant.

    Returns:
        Geometry type name.
    """
    from osgeo import ogr

    flat_names = {
        ogr.wkbPoint: "Point",
        ogr.wkbLineString: "LineString",
        ogr.wkbPolygon: "Polygon",
        ogr.wkbMultiPoint: "MultiPoint",
        ogr.wkbMultiLineString: "MultiLineString",
        ogr.wkbMultiPolygon: "MultiPolygon",
        ogr.wkbGeometryCollection: "GeometryCollection",
    }
    # 2.5D constants report the same name as their 2D counterpart.
    z_names = {
        ogr.wkbPoint25D: "Point",
        ogr.wkbLineString25D: "LineString",
        ogr.wkbPolygon25D: "Polygon",
        ogr.wkbMultiPoint25D: "MultiPoint",
        ogr.wkbMultiLineString25D: "MultiLineString",
        ogr.wkbMultiPolygon25D: "MultiPolygon",
    }
    try:
        return flat_names[ogr_geom_type]
    except KeyError:
        return z_names.get(ogr_geom_type, "Unknown")
107
+
108
+
109
+ def _extract_crs_info(spatial_ref: Any) -> tuple[str | None, dict[str, Any] | None]:
110
+ """Extract CRS identifier and PROJJSON from an OGR SpatialReference.
111
+
112
+ Parameters:
113
+ spatial_ref: OGR SpatialReference object.
114
+
115
+ Returns:
116
+ Tuple of (crs_string, projjson_dict).
117
+ """
118
+ if spatial_ref is None:
119
+ return None, None
120
+
121
+ # Try to get authority:code
122
+ auth_name = spatial_ref.GetAuthorityName(None)
123
+ auth_code = spatial_ref.GetAuthorityCode(None)
124
+ crs_string = f"{auth_name}:{auth_code}" if auth_name and auth_code else None
125
+
126
+ # Build PROJJSON for GeoParquet metadata
127
+ projjson: dict[str, Any] | None = None
128
+ try:
129
+ projjson_str = spatial_ref.ExportToPROJJSON()
130
+ if projjson_str:
131
+ projjson = json.loads(projjson_str)
132
+ except Exception:
133
+ # Fall back to building minimal PROJJSON
134
+ if crs_string:
135
+ projjson = {
136
+ "type": "GeographicCRS" if spatial_ref.IsGeographic() else "ProjectedCRS",
137
+ "name": spatial_ref.GetName() or crs_string,
138
+ "id": {"authority": auth_name, "code": int(auth_code) if auth_code else 0},
139
+ }
140
+
141
+ if crs_string is None and spatial_ref.GetName():
142
+ crs_string = spatial_ref.GetName()
143
+
144
+ return crs_string, projjson
145
+
146
+
147
def _convert_vector_sync(
    source: str,
    *,
    output: str | None = None,
    target_format: str = "geoparquet",
    compression: str = "snappy",
) -> ConvertResult:
    """Convert a vector file to GeoParquet synchronously.

    Parameters:
        source: Path to the input vector file (Shapefile, GeoJSON, etc.).
        output: Output file path. If ``None``, derives from source name.
        target_format: Target format (currently only ``"geoparquet"``).
        compression: Parquet compression codec (``"snappy"``, ``"zstd"``, ``"gzip"``).

    Returns:
        Structured conversion result.

    Raises:
        VectorError: If the source cannot be read or conversion fails.
    """
    if target_format != "geoparquet":
        raise VectorError(f"Unsupported target format: {target_format}")

    try:
        from osgeo import ogr
    except ImportError as exc:
        raise VectorError(
            "GDAL/OGR is required for vector conversion: install GDAL Python bindings"
        ) from exc

    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError as exc:
        raise VectorError(
            "pyarrow is required for GeoParquet output: pip install earthforge[vector]"
        ) from exc

    # Open source. ogr.Open may either raise or return None on failure
    # depending on GDAL's exception configuration — handle both.
    try:
        ds = ogr.Open(source)
    except RuntimeError as exc:
        raise VectorError(f"Failed to open vector file '{source}'") from exc
    if ds is None:
        raise VectorError(f"Failed to open vector file '{source}'")

    layer = ds.GetLayer(0)
    if layer is None:
        raise VectorError(f"No layers found in '{source}'")

    input_format = ds.GetDriver().GetName()
    layer_defn = layer.GetLayerDefn()
    geom_type = _ogr_geom_type_name(layer.GetGeomType())

    # Extract CRS
    spatial_ref = layer.GetSpatialRef()
    crs_string, projjson = _extract_crs_info(spatial_ref)

    # OGR reports (xmin, xmax, ymin, ymax); GeoParquet bbox wants
    # [west, south, east, north], hence the reordering.
    extent = layer.GetExtent()
    bbox = [extent[0], extent[2], extent[1], extent[3]] if extent else None

    # Build field definitions
    field_names: list[str] = []
    field_types: list[str] = []
    for i in range(layer_defn.GetFieldCount()):
        field_def = layer_defn.GetFieldDefn(i)
        field_names.append(field_def.GetName())
        field_types.append(_ogr_type_to_arrow(field_def.GetType()))

    # Read all features into column-oriented Python lists.
    arrays: dict[str, list[Any]] = {name: [] for name in field_names}
    geometries: list[bytes] = []

    layer.ResetReading()
    feature = layer.GetNextFeature()
    actual_count = 0
    while feature is not None:
        # Read attribute fields, dispatching on the precomputed Arrow type.
        for i, name in enumerate(field_names):
            if not feature.IsFieldSet(i) or feature.IsFieldNull(i):
                arrays[name].append(None)
            elif field_types[i] == "int32":
                arrays[name].append(feature.GetFieldAsInteger(i))
            elif field_types[i] == "int64":
                arrays[name].append(feature.GetFieldAsInteger64(i))
            elif field_types[i] == "float64":
                arrays[name].append(feature.GetFieldAsDouble(i))
            else:
                arrays[name].append(feature.GetFieldAsString(i))

        # Read geometry as WKB. Features without geometry get an empty
        # byte string rather than None.
        geom = feature.GetGeometryRef()
        if geom is not None:
            geometries.append(bytes(geom.ExportToWkb()))
        else:
            geometries.append(b"")

        actual_count += 1
        feature = layer.GetNextFeature()

    ds = None  # Close OGR dataset

    # Build pyarrow table. The type map is loop-invariant, so build it once
    # rather than once per column.
    type_map = {
        "int32": pa.int32(),
        "int64": pa.int64(),
        "float64": pa.float64(),
        "string": pa.string(),
        "binary": pa.binary(),
    }
    pa_columns: dict[str, Any] = {}
    for name, arrow_type in zip(field_names, field_types, strict=True):
        pa_type = type_map.get(arrow_type, pa.string())
        pa_columns[name] = pa.array(arrays[name], type=pa_type)

    pa_columns["geometry"] = pa.array(geometries, type=pa.binary())
    table = pa.table(pa_columns)

    # Build GeoParquet metadata
    geo_metadata: dict[str, Any] = {
        "version": "1.1.0",
        "primary_column": "geometry",
        "columns": {
            "geometry": {
                "encoding": "WKB",
                "geometry_types": [geom_type],
            }
        },
    }

    if bbox:
        geo_metadata["columns"]["geometry"]["bbox"] = bbox
    if projjson:
        geo_metadata["columns"]["geometry"]["crs"] = projjson

    # Attach geo metadata to schema (preserving any existing metadata keys).
    existing = table.schema.metadata or {}
    existing[b"geo"] = json.dumps(geo_metadata).encode("utf-8")
    table = table.replace_schema_metadata(existing)

    # Determine output path
    if output is None:
        output = str(Path(source).with_suffix(".parquet"))

    # Write GeoParquet
    try:
        pq.write_table(table, output, compression=compression)
    except Exception as exc:
        raise VectorError(f"Failed to write GeoParquet '{output}': {exc}") from exc

    # Best-effort file size — None if the stat fails.
    file_size: int | None = None
    try:
        file_size = Path(output).stat().st_size
    except OSError:
        pass

    return ConvertResult(
        source=source,
        output=output,
        input_format=input_format,
        output_format="geoparquet",
        feature_count=actual_count,
        geometry_type=geom_type,
        crs=crs_string,
        bbox=bbox,
        file_size_bytes=file_size,
    )
320
+
321
+
322
async def convert_vector(
    source: str,
    *,
    output: str | None = None,
    target_format: str = "geoparquet",
    compression: str = "snappy",
) -> ConvertResult:
    """Convert a vector file to GeoParquet.

    Reads the source using GDAL/OGR and writes GeoParquet with proper ``geo``
    metadata. Supports Shapefile, GeoJSON, GPKG, and any OGR-supported format.
    The blocking conversion runs in the default thread-pool executor so the
    event loop stays responsive.

    Parameters:
        source: Path to the input vector file.
        output: Output file path. If ``None``, replaces extension with ``.parquet``.
        target_format: Target format (default: ``"geoparquet"``).
        compression: Parquet compression codec (default: ``"snappy"``).

    Returns:
        Structured conversion result.

    Raises:
        VectorError: If the conversion fails.
    """

    def _run_blocking() -> ConvertResult:
        return _convert_vector_sync(
            source,
            output=output,
            target_format=target_format,
            compression=compression,
        )

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _run_blocking)
@@ -0,0 +1,21 @@
1
+ """Vector-specific error types.
2
+
3
+ All exceptions inherit from :class:`~earthforge.core.errors.EarthForgeError`
4
+ so the CLI can catch them uniformly and map to appropriate exit codes.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from earthforge.core.errors import EarthForgeError
10
+
11
+
12
class VectorError(EarthForgeError):
    """Base error for vector operations.

    Parameters:
        message: Human-readable error description.
        exit_code: Process exit code (default: 20).
    """

    def __init__(self, message: str, *, exit_code: int = 20) -> None:
        # Default exit code 20 marks vector-domain failures so the CLI can
        # distinguish them from other EarthForge error families.
        super().__init__(message, exit_code=exit_code)
@@ -0,0 +1,245 @@
1
+ """Deep metadata extraction for vector geospatial formats.
2
+
3
+ Reads Parquet/GeoParquet file metadata via pyarrow without loading data into
4
+ memory. Extracts schema, row counts, geometry columns, CRS, bounding box, and
5
+ encoding information from GeoParquet ``geo`` metadata.
6
+
7
+ For non-Parquet vector formats (GeoJSON, FlatGeobuf), provides basic file-level
8
+ metadata. Deep inspection of those formats may be added in later milestones.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import json
15
+ from functools import partial
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from pydantic import BaseModel, Field
20
+
21
+ from earthforge.vector.errors import VectorError
22
+
23
+
24
class ColumnInfo(BaseModel):
    """Metadata for a single column in a vector dataset.

    Attributes:
        name: Column name.
        type: Arrow type string (e.g. ``"int64"``, ``"binary"``).
        is_geometry: Whether this column contains geometry data.
    """

    name: str = Field(title="Column")
    type: str = Field(title="Type")
    # True when the column appears in the GeoParquet "geo" metadata.
    is_geometry: bool = Field(default=False, title="Geometry")
36
+
37
+
38
class VectorInfo(BaseModel):
    """Structured metadata for a vector geospatial file.

    Attributes:
        source: The file path that was inspected.
        format: Detected vector format (e.g. ``"geoparquet"``, ``"parquet"``).
        row_count: Total number of rows/features.
        num_columns: Total number of columns.
        columns: Per-column metadata.
        geometry_column: Name of the primary geometry column, if any.
        geometry_types: List of geometry types found (e.g. ``["Point"]``).
        crs: CRS string from GeoParquet metadata, if available.
        bbox: Bounding box ``[west, south, east, north]``, if available.
        encoding: Geometry encoding (e.g. ``"WKB"``), if available.
        num_row_groups: Number of Parquet row groups.
        compression: Parquet compression codec, if applicable.
        file_size_bytes: File size in bytes.
    """

    # Always populated for any readable Parquet file.
    source: str = Field(title="Source")
    format: str = Field(title="Format")
    row_count: int = Field(title="Rows")
    num_columns: int = Field(title="Columns")
    columns: list[ColumnInfo] = Field(title="Column Details")
    # GeoParquet-specific fields — None/empty for plain (non-geo) Parquet.
    geometry_column: str | None = Field(default=None, title="Geometry Column")
    geometry_types: list[str] = Field(default_factory=list, title="Geometry Types")
    crs: str | None = Field(default=None, title="CRS")
    bbox: list[float] | None = Field(default=None, title="Bounding Box")
    encoding: str | None = Field(default=None, title="Encoding")
    # Physical-layout details — best-effort, may be None.
    num_row_groups: int | None = Field(default=None, title="Row Groups")
    compression: str | None = Field(default=None, title="Compression")
    file_size_bytes: int | None = Field(default=None, title="Size (bytes)")
70
+
71
+
72
def _read_parquet_info(source: str) -> VectorInfo:
    """Read metadata from a Parquet/GeoParquet file synchronously.

    Uses pyarrow to read only the file metadata and schema — no row data
    is loaded into memory. Parses the ``geo`` metadata key for GeoParquet-
    specific information (geometry column, CRS, bbox, encoding).

    Parameters:
        source: Path to a Parquet file.

    Returns:
        Structured vector metadata.

    Raises:
        VectorError: If the file cannot be read or is not a valid Parquet file.
    """
    try:
        import pyarrow.parquet as pq
    except ImportError as exc:
        msg = "pyarrow is required for Parquet inspection: pip install pyarrow"
        raise VectorError(msg) from exc

    try:
        pf = pq.ParquetFile(source)
    except Exception as exc:
        msg = f"Failed to read Parquet file '{source}': {exc}"
        raise VectorError(msg) from exc

    schema = pf.schema_arrow
    metadata = schema.metadata or {}
    num_rows = pf.metadata.num_rows
    num_row_groups = pf.metadata.num_row_groups

    # Detect compression from the first row group's first column chunk
    # (best-effort: mixed codecs across chunks are not reported).
    compression: str | None = None
    if num_row_groups > 0 and schema:
        try:
            rg = pf.metadata.row_group(0)
            if rg.num_columns > 0:
                compression = rg.column(0).compression
        except Exception:  # noqa: S110 — best-effort metadata extraction
            pass

    # Parse GeoParquet metadata
    geo_meta = _parse_geo_metadata(metadata)
    geometry_column = geo_meta.get("primary_column")
    geometry_columns: set[str] = set()
    geometry_types: list[str] = []
    crs: str | None = None
    bbox: list[float] | None = None
    encoding: str | None = None

    # Pull per-column details only for the primary geometry column.
    if geometry_column and "columns" in geo_meta:
        geometry_columns.add(geometry_column)
        col_meta = geo_meta["columns"].get(geometry_column, {})
        geometry_types = col_meta.get("geometry_types", [])
        encoding = col_meta.get("encoding")
        bbox_raw = col_meta.get("bbox")
        if isinstance(bbox_raw, list) and len(bbox_raw) == 4:
            bbox = [float(v) for v in bbox_raw]

        crs_obj = col_meta.get("crs")
        if isinstance(crs_obj, dict):
            # GeoParquet stores CRS as PROJJSON — extract the name or ID
            crs = _extract_crs_string(crs_obj)
        elif isinstance(crs_obj, str):
            crs = crs_obj

    # Also check for additional geometry columns: every column listed in
    # the "geo" metadata is flagged as geometry in the column details.
    if "columns" in geo_meta:
        geometry_columns.update(geo_meta["columns"].keys())

    # Build column info
    columns: list[ColumnInfo] = []
    for i in range(len(schema)):
        field = schema.field(i)
        columns.append(
            ColumnInfo(
                name=field.name,
                type=str(field.type),
                is_geometry=field.name in geometry_columns,
            )
        )

    # File size (best-effort — left as None if stat fails)
    file_size: int | None = None
    try:
        file_size = Path(source).stat().st_size
    except OSError:
        pass

    # Presence of a primary geometry column is what distinguishes
    # GeoParquet from plain Parquet here.
    fmt = "geoparquet" if geometry_column else "parquet"

    return VectorInfo(
        source=source,
        format=fmt,
        row_count=num_rows,
        num_columns=len(schema),
        columns=columns,
        geometry_column=geometry_column,
        geometry_types=geometry_types,
        crs=crs,
        bbox=bbox,
        encoding=encoding,
        num_row_groups=num_row_groups,
        compression=compression,
        file_size_bytes=file_size,
    )
180
+
181
+
182
+ def _parse_geo_metadata(metadata: dict[bytes, bytes]) -> dict[str, Any]:
183
+ """Parse the ``geo`` key from Parquet file metadata.
184
+
185
+ Parameters:
186
+ metadata: Raw Parquet schema metadata (bytes keys and values).
187
+
188
+ Returns:
189
+ Parsed GeoParquet metadata dict, or empty dict if not present.
190
+ """
191
+ geo_bytes = metadata.get(b"geo")
192
+ if geo_bytes is None:
193
+ return {}
194
+ try:
195
+ result: dict[str, Any] = json.loads(geo_bytes)
196
+ return result
197
+ except (json.JSONDecodeError, UnicodeDecodeError):
198
+ return {}
199
+
200
+
201
+ def _extract_crs_string(crs_obj: dict[str, Any]) -> str:
202
+ """Extract a human-readable CRS identifier from PROJJSON.
203
+
204
+ Tries ``id.code`` (e.g. ``"EPSG:4326"``), then ``name``, then falls
205
+ back to a truncated JSON representation.
206
+
207
+ Parameters:
208
+ crs_obj: PROJJSON CRS object.
209
+
210
+ Returns:
211
+ CRS identifier string.
212
+ """
213
+ # Try EPSG-style authority:code
214
+ crs_id = crs_obj.get("id", {})
215
+ if isinstance(crs_id, dict):
216
+ authority = crs_id.get("authority")
217
+ code = crs_id.get("code")
218
+ if authority and code:
219
+ return f"{authority}:{code}"
220
+
221
+ # Fall back to name
222
+ name = crs_obj.get("name")
223
+ if isinstance(name, str):
224
+ return name
225
+
226
+ return json.dumps(crs_obj)[:100]
227
+
228
+
229
async def inspect_vector(source: str) -> VectorInfo:
    """Inspect a vector file and return structured metadata.

    The synchronous pyarrow read happens in the default thread-pool
    executor so the event loop is never blocked. Currently supports
    Parquet and GeoParquet files.

    Parameters:
        source: Path to a vector file.

    Returns:
        Structured vector metadata.

    Raises:
        VectorError: If the file cannot be read or format is unsupported.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _read_parquet_info, source)