geometamaker-0.1.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,529 @@
+ import functools
+ import hashlib
+ import logging
+ import os
+ import requests
+ from collections import defaultdict
+ from datetime import datetime, timezone
+
+ import frictionless
+ import fsspec
+ import numpy
+ import pygeoprocessing
+ import yaml
+ from osgeo import gdal
+ from osgeo import osr
+ from pydantic import ValidationError
+
+ from . import models
+ from .config import Config
+
+
+ LOGGER = logging.getLogger(__name__)
+
+ # URI schemes we support. A subset of fsspec.available_protocols()
+ PROTOCOLS = [
+     'file',
+     'http',
+     'https',
+ ]
+
+ DT_FMT = '%Y-%m-%d %H:%M:%S %Z'
+
+
+ # TODO: In the future we can remove these exception managers in favor of the
+ # builtin gdal.ExceptionMgr. It was released in 3.7.0 and debugged in 3.9.1.
+ # https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md#gdalogr-391-release-notes
+ class _OSGEOUseExceptions:
+     """Context manager that enables GDAL/OSR exceptions and restores state after."""
+
+     def __init__(self):
+         pass
+
+     def __enter__(self):
+         self.currentGDALUseExceptions = gdal.GetUseExceptions()
+         self.currentOSRUseExceptions = osr.GetUseExceptions()
+         gdal.UseExceptions()
+         osr.UseExceptions()
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         # The error-handlers are in a stack, so
+         # these must be called from the top down.
+         if self.currentOSRUseExceptions == 0:
+             osr.DontUseExceptions()
+         if self.currentGDALUseExceptions == 0:
+             gdal.DontUseExceptions()
+
+
+ def _osgeo_use_exceptions(func):
+     """Decorator that enables GDAL/OSR exceptions and restores state after.
+
+     Args:
+         func (callable): function to call with GDAL/OSR exceptions enabled
+
+     Returns:
+         Wrapper function that calls ``func`` with GDAL/OSR exceptions enabled
+     """
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         with _OSGEOUseExceptions():
+             return func(*args, **kwargs)
+     return wrapper
+
+
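For illustration, a minimal sketch of how this decorator is intended to be used. The wrapped function below is hypothetical, not part of the package:

    @_osgeo_use_exceptions
    def _open_raster(path):
        # With exceptions enabled, gdal.Open raises RuntimeError on failure
        # instead of returning None; the prior exception state is restored
        # when the call returns.
        return gdal.Open(path)
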
+ def _vsi_path(filepath, scheme):
+     """Construct a GDAL virtual file system path.
+
+     Args:
+         filepath (str): path to a file to be opened by GDAL
+         scheme (str): the protocol prefix of the filepath
+
+     Returns:
+         str
+
+     """
+     if scheme.startswith('http'):
+         filepath = f'/vsicurl/{filepath}'
+     return filepath
+
+
+ def _wkt_to_epsg_units_string(wkt_string):
+     crs_string = 'unknown'
+     units_string = 'unknown'
+     try:
+         srs = osr.SpatialReference(wkt_string)
+         srs.AutoIdentifyEPSG()
+         crs_string = (
+             f"{srs.GetAttrValue('AUTHORITY', 0)}:"
+             f"{srs.GetAttrValue('AUTHORITY', 1)}")
+         units_string = srs.GetAttrValue('UNIT', 0)
+     except RuntimeError:
+         LOGGER.warning(
+             f'{wkt_string} cannot be interpreted as a coordinate reference system')
+     return crs_string, units_string
+
+
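As a sketch of the expected behavior, a WGS84 WKT string should resolve to its EPSG authority code and unit name (exact strings depend on the installed GDAL/PROJ version):

    srs = osr.SpatialReference()
    srs.ImportFromEPSG(4326)
    crs, units = _wkt_to_epsg_units_string(srs.ExportToWkt())
    # typically crs == 'EPSG:4326' and units == 'degree'
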
+ def detect_file_type(filepath, scheme):
+     """Detect the type of resource contained in the file.
+
+     Args:
+         filepath (str): path to a file to be opened by GDAL or frictionless
+         scheme (str): the protocol prefix of the filepath
+
+     Returns:
+         str
+
+     Raises:
+         ValueError on unsupported file formats.
+
+     """
+     # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters.
+     # We'll likely want a different data model for multi-dimensional arrays.
+
+     # Frictionless supports a wide range of formats. The quickest way to
+     # determine if a file is recognized as a table or archive is to call list.
+     info = frictionless.list(filepath)[0]
+     if info.type == 'table':
+         return 'table'
+     if info.compression:
+         return 'archive'
+     # GDAL considers CSV a vector, so check against frictionless first.
+     try:
+         gis_type = pygeoprocessing.get_gis_type(_vsi_path(filepath, scheme))
+     except ValueError:
+         raise ValueError(
+             f'{filepath} does not appear to be one of '
+             f'(archive, table, raster, vector)')
+     if gis_type == pygeoprocessing.VECTOR_TYPE:
+         return 'vector'
+     if gis_type == pygeoprocessing.RASTER_TYPE:
+         return 'raster'
+     raise ValueError(
+         f'{filepath} contains both raster and vector data. '
+         'Such files are not supported by GeoMetaMaker. '
+         'If you wish to see support for these files, please '
+         'submit a feature request and share your dataset: '
+         'https://github.com/natcap/geometamaker/issues ')
+
+
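A usage sketch of the detection logic above; the file paths are hypothetical:

    detect_file_type('table.csv', 'file')      # -> 'table' (frictionless)
    detect_file_type('data.zip', 'file')       # -> 'archive' (frictionless)
    detect_file_type('landcover.tif', 'file')  # -> 'raster' (GDAL)
    detect_file_type('parcels.shp', 'file')    # -> 'vector' (GDAL)
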
+ def describe_file(source_dataset_path, scheme):
+     """Describe basic properties of a file.
+
+     Args:
+         source_dataset_path (str): path to a file.
+         scheme (str): the protocol prefix of the filepath
+
+     Returns:
+         dict
+
+     """
+     description = frictionless.describe(source_dataset_path).to_dict()
+
+     # If we want to support more file protocols in the future, it may
+     # make sense to use fsspec to access file info in a protocol-agnostic way.
+     # But not all protocols are equally supported yet.
+     # https://github.com/fsspec/filesystem_spec/issues/526
+     if scheme.startswith('http'):
+         info = requests.head(source_dataset_path).headers
+         description['bytes'] = info['Content-Length']
+         description['last_modified'] = datetime.strptime(
+             info['Last-Modified'], '%a, %d %b %Y %H:%M:%S %Z').strftime(DT_FMT)
+     else:
+         info = os.stat(source_dataset_path)
+         description['bytes'] = info.st_size
+         description['last_modified'] = datetime.fromtimestamp(
+             info.st_mtime, tz=timezone.utc).strftime(DT_FMT)
+
+     hash_func = hashlib.new('sha256')
+     hash_func.update(
+         f'{description["bytes"]}{description["last_modified"]}\
+         {description["path"]}'.encode('ascii'))
+     description['uid'] = f'sizetimestamp:{hash_func.hexdigest()}'
+
+     # We don't have a use for including these attributes in our metadata:
+     description.pop('mediatype', None)
+     description.pop('name', None)
+     return description
+
+
+ def describe_archive(source_dataset_path, scheme):
+     """Describe file properties of a compressed file.
+
+     Args:
+         source_dataset_path (str): path to a file.
+         scheme (str): the protocol prefix of the filepath
+
+     Returns:
+         dict
+
+     """
+     description = describe_file(source_dataset_path, scheme)
+     # innerpath is from frictionless and not useful because
+     # it does not include all the files contained in the zip
+     description.pop('innerpath', None)
+
+     ZFS = fsspec.get_filesystem_class('zip')
+     zfs = ZFS(source_dataset_path)
+     file_list = []
+     for dirpath, _, files in zfs.walk(zfs.root_marker):
+         for f in files:
+             file_list.append(os.path.join(dirpath, f))
+     description['sources'] = file_list
+     return description
+
+
+ def describe_vector(source_dataset_path, scheme):
+     """Describe properties of a GDAL vector file.
+
+     Args:
+         source_dataset_path (str): path to a GDAL vector.
+
+     Returns:
+         dict
+
+     """
+     description = describe_file(source_dataset_path, scheme)
+
+     if 'http' in scheme:
+         source_dataset_path = f'/vsicurl/{source_dataset_path}'
+     vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR)
+     layer = vector.GetLayer()
+     fields = []
+     description['n_features'] = layer.GetFeatureCount()
+     for fld in layer.schema:
+         fields.append(
+             models.FieldSchema(name=fld.name, type=fld.GetTypeName()))
+     vector = layer = None
+     description['data_model'] = models.TableSchema(fields=fields)
+
+     info = pygeoprocessing.get_vector_info(source_dataset_path)
+     bbox = models.BoundingBox(*info['bounding_box'])
+     epsg_string, units_string = _wkt_to_epsg_units_string(
+         info['projection_wkt'])
+     description['spatial'] = models.SpatialSchema(
+         bounding_box=bbox,
+         crs=epsg_string,
+         crs_units=units_string)
+     description['sources'] = info['file_list']
+     return description
+
+
+ def describe_raster(source_dataset_path, scheme):
+     """Describe properties of a GDAL raster file.
+
+     Args:
+         source_dataset_path (str): path to a GDAL raster.
+
+     Returns:
+         dict
+
+     """
+     description = describe_file(source_dataset_path, scheme)
+     if 'http' in scheme:
+         source_dataset_path = f'/vsicurl/{source_dataset_path}'
+     info = pygeoprocessing.get_raster_info(source_dataset_path)
+     bands = []
+     for i in range(info['n_bands']):
+         b = i + 1
+         bands.append(models.BandSchema(
+             index=b,
+             gdal_type=gdal.GetDataTypeName(info['datatype']),
+             numpy_type=numpy.dtype(info['numpy_type']).name,
+             nodata=info['nodata'][i]))
+     description['data_model'] = models.RasterSchema(
+         bands=bands,
+         pixel_size=info['pixel_size'],
+         raster_size={'width': info['raster_size'][0],
+                      'height': info['raster_size'][1]})
+     # Some values of raster info are numpy types, which the
+     # yaml dumper doesn't know how to represent.
+     bbox = models.BoundingBox(*[float(x) for x in info['bounding_box']])
+     epsg_string, units_string = _wkt_to_epsg_units_string(
+         info['projection_wkt'])
+     description['spatial'] = models.SpatialSchema(
+         bounding_box=bbox,
+         crs=epsg_string,
+         crs_units=units_string)
+     description['sources'] = info['file_list']
+     return description
+
+
+ def describe_table(source_dataset_path, scheme):
+     """Describe properties of a tabular dataset.
+
+     Args:
+         source_dataset_path (str): path to a file representing a table.
+         scheme (str): the protocol prefix of the filepath
+
+     Returns:
+         dict
+
+     """
+     description = describe_file(source_dataset_path, scheme)
+     description['data_model'] = models.TableSchema(**description['schema'])
+     del description['schema']  # we forbid extra args in our Pydantic models
+     return description
+
+
+ DESCRIBE_FUNCS = {
+     'archive': describe_archive,
+     'table': describe_table,
+     'vector': describe_vector,
+     'raster': describe_raster
+ }
+
+ RESOURCE_MODELS = {
+     'archive': models.ArchiveResource,
+     'table': models.TableResource,
+     'vector': models.VectorResource,
+     'raster': models.RasterResource
+ }
+
+
+ @_osgeo_use_exceptions
+ def describe(source_dataset_path, profile=None):
+     """Create a metadata resource instance with properties of the dataset.
+
+     Properties of the dataset are used to populate as many metadata
+     properties as possible. Default/placeholder
+     values are used for properties that require user input.
+
+     Args:
+         source_dataset_path (string): path or URL to dataset to which the
+             metadata applies
+         profile (geometamaker.models.Profile): a profile object from
+             which to populate some metadata attributes
+
+     Returns:
+         geometamaker.models.Resource: a metadata object
+
+     """
+     config = Config()
+     user_profile = config.profile
+     if profile is not None:
+         user_profile = user_profile.replace(profile)
+
+     metadata_path = f'{source_dataset_path}.yml'
+
+     # Despite naming, this does not open a file that must be closed
+     of = fsspec.open(source_dataset_path)
+     if not of.fs.exists(source_dataset_path):
+         raise FileNotFoundError(f'{source_dataset_path} does not exist')
+
+     protocol = fsspec.utils.get_protocol(source_dataset_path)
+     if protocol not in PROTOCOLS:
+         raise ValueError(
+             f'Cannot describe {source_dataset_path}. {protocol} '
+             f'is not one of the supported file protocols: {PROTOCOLS}')
+     resource_type = detect_file_type(source_dataset_path, protocol)
+     description = DESCRIBE_FUNCS[resource_type](
+         source_dataset_path, protocol)
+     description['type'] = resource_type
+
+     # Load existing metadata file
+     try:
+         existing_resource = RESOURCE_MODELS[resource_type].load(metadata_path)
+         if 'data_model' in description:
+             if isinstance(description['data_model'], models.RasterSchema):
+                 # If existing band metadata still matches data_model of the file
+                 # carry over existing metadata because it could include
+                 # human-defined properties.
+                 new_bands = []
+                 for band in description['data_model'].bands:
+                     try:
+                         eband = existing_resource.get_band_description(band.index)
+                         # TODO: rewrite this as __eq__ of BandSchema?
+                         if (band.numpy_type, band.gdal_type, band.nodata) == (
+                                 eband.numpy_type, eband.gdal_type, eband.nodata):
+                             updated_dict = band.model_dump() | eband.model_dump()
+                             band = models.BandSchema(**updated_dict)
+                     except IndexError:
+                         pass
+                     new_bands.append(band)
+                 description['data_model'].bands = new_bands
+             if isinstance(description['data_model'], models.TableSchema):
+                 # If existing field metadata still matches data_model of the file
+                 # carry over existing metadata because it could include
+                 # human-defined properties.
+                 new_fields = []
+                 for field in description['data_model'].fields:
+                     try:
+                         efield = existing_resource.get_field_description(
+                             field.name)
+                         # TODO: rewrite this as __eq__ of FieldSchema?
+                         if field.type == efield.type:
+                             updated_dict = field.model_dump() | efield.model_dump()
+                             field = models.FieldSchema(**updated_dict)
+                     except KeyError:
+                         pass
+                     new_fields.append(field)
+                 description['data_model'].fields = new_fields
+         # overwrite properties that are intrinsic to the dataset
+         updated_dict = existing_resource.model_dump() | description
+         resource = RESOURCE_MODELS[resource_type](**updated_dict)
+
+     # Common path: metadata file does not already exist
+     # Or less common, ValueError if it exists but is incompatible
+     except (FileNotFoundError, ValueError):
+         resource = RESOURCE_MODELS[resource_type](**description)
+
+     resource = resource.replace(user_profile)
+     return resource
+
+
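For context, a minimal usage sketch of this function, assuming describe is re-exported at the package level; the dataset path is hypothetical:

    import geometamaker

    resource = geometamaker.describe('data/landcover.tif')
    resource.write()  # write the YAML metadata document (e.g. data/landcover.tif.yml, per metadata_path above)
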
+ def validate(filepath):
+     """Validate a YAML metadata document.
+
+     Validation includes type-checking of property values and
+     checking for the presence of required properties.
+
+     Args:
+         filepath (string): path to a YAML file
+
+     Returns:
+         pydantic.ValidationError
+
+     Raises:
+         ValueError if the YAML document is not a geometamaker metadata doc.
+
+     """
+     with fsspec.open(filepath, 'r') as file:
+         yaml_string = file.read()
+     yaml_dict = yaml.safe_load(yaml_string)
+     if not yaml_dict or ('metadata_version' not in yaml_dict
+                          and 'geometamaker_version' not in yaml_dict):
+         message = (f'{filepath} exists but is not compatible with '
+                    f'geometamaker.')
+         raise ValueError(message)
+
+     try:
+         RESOURCE_MODELS[yaml_dict['type']](**yaml_dict)
+     except ValidationError as error:
+         return error
+
+
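A usage sketch; the metadata path is hypothetical:

    error = validate('data/landcover.tif.yml')
    if error is None:
        print('valid')
    else:
        print(error)  # a pydantic.ValidationError describing the failures
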
+ def validate_dir(directory, recursive=False):
+     """Validate all compatible yml documents in the directory.
+
+     Args:
+         directory (string): path to a directory
+         recursive (bool): whether or not to validate files
+             in all subdirectories
+
+     Returns:
+         tuple (list, list): a list of the filepaths that were validated and
+             an equal-length list of the validation messages.
+
+     """
+     file_list = []
+     if recursive:
+         for path, dirs, files in os.walk(directory):
+             for file in files:
+                 file_list.append(os.path.join(path, file))
+     else:
+         file_list.extend(
+             [os.path.join(directory, path)
+              for path in os.listdir(directory)
+              if os.path.isfile(os.path.join(directory, path))])
+
+     messages = []
+     yaml_files = []
+     for filepath in file_list:
+         if filepath.endswith('.yml'):
+             yaml_files.append(filepath)
+             try:
+                 error = validate(filepath)
+                 if error:
+                     messages.append(error)
+                 else:
+                     messages.append('')
+             except ValueError:
+                 messages.append(
+                     'does not appear to be a geometamaker document')
+
+     return (yaml_files, messages)
+
+
+ def describe_dir(directory, recursive=False):
+     """Describe all compatible datasets in the directory.
+
+     Take care to describe multifile datasets, such as
+     ESRI Shapefiles, only once.
+
+     Args:
+         directory (string): path to a directory
+         recursive (bool): whether or not to describe files
+             in all subdirectories
+
+     Returns:
+         None
+
+     """
+     root_set = set()
+     root_ext_map = defaultdict(set)
+     for path, dirs, files in os.walk(directory):
+         for file in files:
+             full_path = os.path.join(path, file)
+             root, ext = os.path.splitext(full_path)
+             # tracking which files share a root name
+             # so we can check if these comprise a shapefile
+             root_ext_map[root].add(ext)
+             root_set.add(root)
+         if not recursive:
+             break
+
+     for root in root_set:
+         extensions = root_ext_map[root]
+         if '.shp' in extensions:
+             # if we're dealing with a shapefile, we do not want to describe
+             # any of these other files with the same root name
+             extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf'])
+         for ext in extensions:
+             filepath = f'{root}{ext}'
+             try:
+                 resource = describe(filepath)
+             except ValueError as error:
+                 LOGGER.debug(error)
+                 continue
+             resource.write()
+             LOGGER.info(f'{filepath} described')
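
A sketch of the directory-level helpers, again assuming package-level exports; the directory name is hypothetical:

    geometamaker.describe_dir('my_data', recursive=True)
    yml_files, messages = geometamaker.validate_dir('my_data', recursive=True)
    for filepath, message in zip(yml_files, messages):
        print(filepath, message or 'valid')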