geometamaker-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geometamaker/__init__.py +13 -0
- geometamaker/cli.py +158 -0
- geometamaker/config.py +65 -0
- geometamaker/geometamaker.py +529 -0
- geometamaker/models.py +653 -0
- geometamaker/utils.py +30 -0
- geometamaker-0.1.0.dist-info/LICENSE.txt +202 -0
- geometamaker-0.1.0.dist-info/METADATA +400 -0
- geometamaker-0.1.0.dist-info/RECORD +12 -0
- geometamaker-0.1.0.dist-info/WHEEL +5 -0
- geometamaker-0.1.0.dist-info/entry_points.txt +2 -0
- geometamaker-0.1.0.dist-info/top_level.txt +1 -0
geometamaker/geometamaker.py
@@ -0,0 +1,529 @@
import functools
import hashlib
import logging
import os
import requests
from collections import defaultdict
from datetime import datetime, timezone

import frictionless
import fsspec
import numpy
import pygeoprocessing
import yaml
from osgeo import gdal
from osgeo import osr
from pydantic import ValidationError

from . import models
from .config import Config


LOGGER = logging.getLogger(__name__)

# URI schemes we support. A subset of fsspec.available_protocols()
PROTOCOLS = [
    'file',
    'http',
    'https',
]

DT_FMT = '%Y-%m-%d %H:%M:%S %Z'


# TODO: In the future we can remove these exception managers in favor of the
# builtin gdal.ExceptionMgr. It was released in 3.7.0 and debugged in 3.9.1.
# https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md#gdalogr-391-release-notes
class _OSGEOUseExceptions:
    """Context manager that enables GDAL/OSR exceptions and restores state after."""

    def __init__(self):
        pass

    def __enter__(self):
        self.currentGDALUseExceptions = gdal.GetUseExceptions()
        self.currentOSRUseExceptions = osr.GetUseExceptions()
        gdal.UseExceptions()
        osr.UseExceptions()

    def __exit__(self, exc_type, exc_val, exc_tb):
        # The error-handlers are in a stack, so
        # these must be called from the top down.
        if self.currentOSRUseExceptions == 0:
            osr.DontUseExceptions()
        if self.currentGDALUseExceptions == 0:
            gdal.DontUseExceptions()


def _osgeo_use_exceptions(func):
    """Decorator that enables GDAL/OSR exceptions and restores state after.

    Args:
        func (callable): function to call with GDAL/OSR exceptions enabled

    Returns:
        Wrapper function that calls ``func`` with GDAL/OSR exceptions enabled
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        with _OSGEOUseExceptions():
            return func(*args, **kwargs)
    return wrapper


def _vsi_path(filepath, scheme):
    """Construct a GDAL virtual file system path.

    Args:
        filepath (str): path to a file to be opened by GDAL
        scheme (str): the protocol prefix of the filepath

    Returns:
        str

    """
    if scheme.startswith('http'):
        filepath = f'/vsicurl/{filepath}'
    return filepath


def _wkt_to_epsg_units_string(wkt_string):
    crs_string = 'unknown'
    units_string = 'unknown'
    try:
        srs = osr.SpatialReference(wkt_string)
        srs.AutoIdentifyEPSG()
        crs_string = (
            f"{srs.GetAttrValue('AUTHORITY', 0)}:"
            f"{srs.GetAttrValue('AUTHORITY', 1)}")
        units_string = srs.GetAttrValue('UNIT', 0)
    except RuntimeError:
        LOGGER.warning(
            f'{wkt_string} cannot be interpreted as a coordinate reference system')
    return crs_string, units_string


def detect_file_type(filepath, scheme):
    """Detect the type of resource contained in the file.

    Args:
        filepath (str): path to a file to be opened by GDAL or frictionless
        scheme (str): the protocol prefix of the filepath

    Returns:
        str

    Raises:
        ValueError on unsupported file formats.

    """
    # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters.
    # We'll likely want a different data model for multi-dimensional arrays.

    # Frictionless supports a wide range of formats. The quickest way to
    # determine if a file is recognized as a table or archive is to call list.
    info = frictionless.list(filepath)[0]
    if info.type == 'table':
        return 'table'
    if info.compression:
        return 'archive'
    # GDAL considers CSV a vector, so check against frictionless first.
    try:
        gis_type = pygeoprocessing.get_gis_type(_vsi_path(filepath, scheme))
    except ValueError:
        raise ValueError(
            f'{filepath} does not appear to be one of '
            f'(archive, table, raster, vector)')
    if gis_type == pygeoprocessing.VECTOR_TYPE:
        return 'vector'
    if gis_type == pygeoprocessing.RASTER_TYPE:
        return 'raster'
    raise ValueError(
        f'{filepath} contains both raster and vector data. '
        'Such files are not supported by GeoMetaMaker. '
        'If you wish to see support for these files, please '
        'submit a feature request and share your dataset: '
        'https://github.com/natcap/geometamaker/issues ')


def describe_file(source_dataset_path, scheme):
    """Describe basic properties of a file.

    Args:
        source_dataset_path (str): path to a file.
        scheme (str): the protocol prefix of the filepath

    Returns:
        dict

    """
    description = frictionless.describe(source_dataset_path).to_dict()

    # If we want to support more file protocols in the future, it may
    # make sense to use fsspec to access file info in a protocol-agnostic way.
    # But not all protocols are equally supported yet.
    # https://github.com/fsspec/filesystem_spec/issues/526
    if scheme.startswith('http'):
        info = requests.head(source_dataset_path).headers
        description['bytes'] = info['Content-Length']
        description['last_modified'] = datetime.strptime(
            info['Last-Modified'], '%a, %d %b %Y %H:%M:%S %Z').strftime(DT_FMT)
    else:
        info = os.stat(source_dataset_path)
        description['bytes'] = info.st_size
        description['last_modified'] = datetime.fromtimestamp(
            info.st_mtime, tz=timezone.utc).strftime(DT_FMT)

    hash_func = hashlib.new('sha256')
    hash_func.update(
        f'{description["bytes"]}{description["last_modified"]}\
        {description["path"]}'.encode('ascii'))
    description['uid'] = f'sizetimestamp:{hash_func.hexdigest()}'

    # We don't have a use for including these attributes in our metadata:
    description.pop('mediatype', None)
    description.pop('name', None)
    return description


def describe_archive(source_dataset_path, scheme):
    """Describe file properties of a compressed file.

    Args:
        source_dataset_path (str): path to a file.
        scheme (str): the protocol prefix of the filepath

    Returns:
        dict

    """
    description = describe_file(source_dataset_path, scheme)
    # innerpath is from frictionless and not useful because
    # it does not include all the files contained in the zip
    description.pop('innerpath', None)

    ZFS = fsspec.get_filesystem_class('zip')
    zfs = ZFS(source_dataset_path)
    file_list = []
    for dirpath, _, files in zfs.walk(zfs.root_marker):
        for f in files:
            file_list.append(os.path.join(dirpath, f))
    description['sources'] = file_list
    return description


def describe_vector(source_dataset_path, scheme):
    """Describe properties of a GDAL vector file.

    Args:
        source_dataset_path (str): path to a GDAL vector.

    Returns:
        dict

    """
    description = describe_file(source_dataset_path, scheme)

    if 'http' in scheme:
        source_dataset_path = f'/vsicurl/{source_dataset_path}'
    vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR)
    layer = vector.GetLayer()
    fields = []
    description['n_features'] = layer.GetFeatureCount()
    for fld in layer.schema:
        fields.append(
            models.FieldSchema(name=fld.name, type=fld.GetTypeName()))
    vector = layer = None
    description['data_model'] = models.TableSchema(fields=fields)

    info = pygeoprocessing.get_vector_info(source_dataset_path)
    bbox = models.BoundingBox(*info['bounding_box'])
    epsg_string, units_string = _wkt_to_epsg_units_string(
        info['projection_wkt'])
    description['spatial'] = models.SpatialSchema(
        bounding_box=bbox,
        crs=epsg_string,
        crs_units=units_string)
    description['sources'] = info['file_list']
    return description


def describe_raster(source_dataset_path, scheme):
    """Describe properties of a GDAL raster file.

    Args:
        source_dataset_path (str): path to a GDAL raster.

    Returns:
        dict

    """
    description = describe_file(source_dataset_path, scheme)
    if 'http' in scheme:
        source_dataset_path = f'/vsicurl/{source_dataset_path}'
    info = pygeoprocessing.get_raster_info(source_dataset_path)
    bands = []
    for i in range(info['n_bands']):
        b = i + 1
        bands.append(models.BandSchema(
            index=b,
            gdal_type=gdal.GetDataTypeName(info['datatype']),
            numpy_type=numpy.dtype(info['numpy_type']).name,
            nodata=info['nodata'][i]))
    description['data_model'] = models.RasterSchema(
        bands=bands,
        pixel_size=info['pixel_size'],
        raster_size={'width': info['raster_size'][0],
                     'height': info['raster_size'][1]})
    # Some values of raster info are numpy types, which the
    # yaml dumper doesn't know how to represent.
    bbox = models.BoundingBox(*[float(x) for x in info['bounding_box']])
    epsg_string, units_string = _wkt_to_epsg_units_string(
        info['projection_wkt'])
    description['spatial'] = models.SpatialSchema(
        bounding_box=bbox,
        crs=epsg_string,
        crs_units=units_string)
    description['sources'] = info['file_list']
    return description


def describe_table(source_dataset_path, scheme):
    """Describe properties of a tabular dataset.

    Args:
        source_dataset_path (str): path to a file representing a table.
        scheme (str): the protocol prefix of the filepath

    Returns:
        dict

    """
    description = describe_file(source_dataset_path, scheme)
    description['data_model'] = models.TableSchema(**description['schema'])
    del description['schema']  # we forbid extra args in our Pydantic models
    return description


DESRCIBE_FUNCS = {
    'archive': describe_archive,
    'table': describe_table,
    'vector': describe_vector,
    'raster': describe_raster
}

RESOURCE_MODELS = {
    'archive': models.ArchiveResource,
    'table': models.TableResource,
    'vector': models.VectorResource,
    'raster': models.RasterResource
}


@_osgeo_use_exceptions
def describe(source_dataset_path, profile=None):
    """Create a metadata resource instance with properties of the dataset.

    Properties of the dataset are used to populate as many metadata
    properties as possible. Default/placeholder
    values are used for properties that require user input.

    Args:
        source_dataset_path (string): path or URL to dataset to which the
            metadata applies
        profile (geometamaker.models.Profile): a profile object from
            which to populate some metadata attributes

    Returns:
        geometamaker.models.Resource: a metadata object

    """
    config = Config()
    user_profile = config.profile
    if profile is not None:
        user_profile = user_profile.replace(profile)

    metadata_path = f'{source_dataset_path}.yml'

    # Despite naming, this does not open a file that must be closed
    of = fsspec.open(source_dataset_path)
    if not of.fs.exists(source_dataset_path):
        raise FileNotFoundError(f'{source_dataset_path} does not exist')

    protocol = fsspec.utils.get_protocol(source_dataset_path)
    if protocol not in PROTOCOLS:
        raise ValueError(
            f'Cannot describe {source_dataset_path}. {protocol} '
            f'is not one of the suppored file protocols: {PROTOCOLS}')
    resource_type = detect_file_type(source_dataset_path, protocol)
    description = DESRCIBE_FUNCS[resource_type](
        source_dataset_path, protocol)
    description['type'] = resource_type

    # Load existing metadata file
    try:
        existing_resource = RESOURCE_MODELS[resource_type].load(metadata_path)
        if 'data_model' in description:
            if isinstance(description['data_model'], models.RasterSchema):
                # If existing band metadata still matches data_model of the file
                # carry over existing metadata because it could include
                # human-defined properties.
                new_bands = []
                for band in description['data_model'].bands:
                    try:
                        eband = existing_resource.get_band_description(band.index)
                        # TODO: rewrite this as __eq__ of BandSchema?
                        if (band.numpy_type, band.gdal_type, band.nodata) == (
                                eband.numpy_type, eband.gdal_type, eband.nodata):
                            updated_dict = band.model_dump() | eband.model_dump()
                            band = models.BandSchema(**updated_dict)
                    except IndexError:
                        pass
                    new_bands.append(band)
                description['data_model'].bands = new_bands
            if isinstance(description['data_model'], models.TableSchema):
                # If existing field metadata still matches data_model of the file
                # carry over existing metadata because it could include
                # human-defined properties.
                new_fields = []
                for field in description['data_model'].fields:
                    try:
                        efield = existing_resource.get_field_description(
                            field.name)
                        # TODO: rewrite this as __eq__ of FieldSchema?
                        if field.type == efield.type:
                            updated_dict = field.model_dump() | efield.model_dump()
                            field = models.FieldSchema(**updated_dict)
                    except KeyError:
                        pass
                    new_fields.append(field)
                description['data_model'].fields = new_fields
        # overwrite properties that are intrinsic to the dataset
        updated_dict = existing_resource.model_dump() | description
        resource = RESOURCE_MODELS[resource_type](**updated_dict)

    # Common path: metadata file does not already exist
    # Or less common, ValueError if it exists but is incompatible
    except FileNotFoundError:
        resource = RESOURCE_MODELS[resource_type](**description)

    resource = resource.replace(user_profile)
    return resource


def validate(filepath):
    """Validate a YAML metadata document.

    Validation includes type-checking of property values and
    checking for the presence of required properties.

    Args:
        directory (string): path to a YAML file

    Returns:
        pydantic.ValidationError

    Raises:
        ValueError if the YAML document is not a geometamaker metadata doc.

    """
    with fsspec.open(filepath, 'r') as file:
        yaml_string = file.read()
    yaml_dict = yaml.safe_load(yaml_string)
    if not yaml_dict or ('metadata_version' not in yaml_dict
                         and 'geometamaker_version' not in yaml_dict):
        message = (f'{filepath} exists but is not compatible with '
                   f'geometamaker.')
        raise ValueError(message)

    try:
        RESOURCE_MODELS[yaml_dict['type']](**yaml_dict)
    except ValidationError as error:
        return error


def validate_dir(directory, recursive=False):
    """Validate all compatible yml documents in the directory.

    Args:
        directory (string): path to a directory
        recursive (bool): whether or not to describe files
            in all subdirectories

    Returns:
        tuple (list, list): a list of the filepaths that were validated and
            an equal-length list of the validation messages.

    """
    file_list = []
    if recursive:
        for path, dirs, files in os.walk(directory):
            for file in files:
                file_list.append(os.path.join(path, file))
    else:
        file_list.extend(
            [os.path.join(directory, path)
             for path in os.listdir(directory)
             if os.path.isfile(os.path.join(directory, path))])

    messages = []
    yaml_files = []
    for filepath in file_list:
        if filepath.endswith('.yml'):
            yaml_files.append(filepath)
            try:
                error = validate(filepath)
                if error:
                    messages.append(error)
                else:
                    messages.append('')
            except ValueError:
                messages.append(
                    'does not appear to be a geometamaker document')

    return (yaml_files, messages)


def describe_dir(directory, recursive=False):
    """Describe all compatible datasets in the directory.

    Take special care to only describe multifile datasets,
    such as ESRI Shapefiles, one time.

    Args:
        directory (string): path to a directory
        recursive (bool): whether or not to describe files
            in all subdirectories

    Returns:
        None

    """
    root_set = set()
    root_ext_map = defaultdict(set)
    for path, dirs, files in os.walk(directory):
        for file in files:
            full_path = os.path.join(path, file)
            root, ext = os.path.splitext(full_path)
            # tracking which files share a root name
            # so we can check if these comprise a shapefile
            root_ext_map[root].add(ext)
            root_set.add(root)
        if not recursive:
            break

    for root in root_set:
        extensions = root_ext_map[root]
        if '.shp' in extensions:
            # if we're dealing with a shapefile, we do not want to describe any
            # of these other files with the same root name
            extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf'])
        for ext in extensions:
            filepath = f'{root}{ext}'
            try:
                resource = describe(filepath)
            except ValueError as error:
                LOGGER.debug(error)
                continue
            resource.write()
            LOGGER.info(f'{filepath} described')