geometamaker 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geometamaker/__init__.py +2 -2
- geometamaker/cli.py +137 -31
- geometamaker/config.py +3 -4
- geometamaker/geometamaker.py +357 -130
- geometamaker/models.py +317 -114
- {geometamaker-0.1.1.dist-info → geometamaker-0.2.0.dist-info}/METADATA +44 -43
- geometamaker-0.2.0.dist-info/RECORD +12 -0
- {geometamaker-0.1.1.dist-info → geometamaker-0.2.0.dist-info}/WHEEL +1 -1
- geometamaker-0.1.1.dist-info/RECORD +0 -12
- {geometamaker-0.1.1.dist-info → geometamaker-0.2.0.dist-info}/entry_points.txt +0 -0
- {geometamaker-0.1.1.dist-info → geometamaker-0.2.0.dist-info/licenses}/LICENSE.txt +0 -0
- {geometamaker-0.1.1.dist-info → geometamaker-0.2.0.dist-info}/top_level.txt +0 -0
geometamaker/geometamaker.py
CHANGED
@@ -2,6 +2,7 @@ import functools
 import hashlib
 import logging
 import os
+import re
 import requests
 from collections import defaultdict
 from datetime import datetime, timezone
@@ -13,13 +14,20 @@ import pygeoprocessing
 import yaml
 from osgeo import gdal
 from osgeo import osr
+from pathlib import Path
 from pydantic import ValidationError
+import tarfile

 from . import models
 from .config import Config

+logging.getLogger('chardet').setLevel(logging.INFO) # DEBUG is just too noisy

-LOGGER = logging.getLogger(
+LOGGER = logging.getLogger('geometamaker')
+_NOT_FOR_CLI = 'not_for_cli'
+_LOG_EXTRA_NOT_FOR_CLI = {
+    _NOT_FOR_CLI: True
+}

 # URI schemes we support. A subset of fsspec.available_protocols()
 PROTOCOLS = [
@@ -103,6 +111,80 @@ def _wkt_to_epsg_units_string(wkt_string):
     return crs_string, units_string


+def _list_files_with_depth(directory, depth, exclude_regex=None,
+                           exclude_hidden=True):
+    """List files in directory up to depth
+
+    Args:
+        directory (string): path to a directory
+        depth (int): maximum number of subdirectory levels to traverse when
+            walking through a directory. A value of 1 limits the walk to files
+            in the top-level ``directory`` only. A value of 2 allows
+            descending into immediate subdirectories, etc.
+        exclude_regex (str, optional): a regular expression to pattern-match
+            any files for which you do not want to create metadata.
+        exclude_hidden (bool, default True): whether to ignore hidden files
+
+    Returns:
+        list of relative filepaths in ``directory``
+
+    """
+    directory = Path(directory).resolve()
+    file_list = []
+
+    for path in directory.rglob("*"):
+        relative_path = path.relative_to(directory)
+        current_depth = len(relative_path.parts)
+        if current_depth > depth:
+            continue
+        if exclude_hidden and (
+                any(part.startswith('.') for part in relative_path.parts)):
+            continue
+        file_list.append(str(relative_path))
+
+    # remove excluded files based on regex
+    if exclude_regex is not None:
+        file_list = [f for f in file_list if not re.search(exclude_regex, f)]
+
+    return sorted(file_list)
+
+
+def _group_files_by_root(file_list):
+    """Get set of files (roots) and extensions by filename"""
+    root_set = set()
+    root_ext_map = defaultdict(set)
+    for filepath in file_list:
+        root, ext = os.path.splitext(filepath)
+        # tracking which files share a root name
+        # so we can check if these comprise a shapefile
+        root_ext_map[root].add(ext)
+        root_set.add(root)
+    return root_ext_map, sorted(list(root_set))
+
+
+def _get_collection_size_time_uid(directory):
+    """Get size of directory (in bytes), when it was last modified, and uid"""
+    total_bytes = 0
+    latest_mtime = 0
+
+    for root, _, files in os.walk(directory):
+        for file in files:
+            file_path = os.path.join(root, file)
+            stat = os.stat(file_path)
+            total_bytes += stat.st_size
+            latest_mtime = max(latest_mtime, stat.st_mtime)
+
+    last_modified = datetime.fromtimestamp(latest_mtime, tz=timezone.utc)
+    last_modified_str = last_modified.strftime('%Y-%m-%d %H:%M:%S %Z')
+
+    hash_func = hashlib.sha256()
+    hash_func.update(
+        f'{total_bytes}{last_modified_str}{directory}'.encode('utf-8'))
+    uid = f'sizetimestamp:{hash_func.hexdigest()}'
+
+    return total_bytes, last_modified_str, uid
+
+
 def detect_file_type(filepath, scheme):
     """Detect the type of resource contained in the file.

@@ -119,13 +201,13 @@ def detect_file_type(filepath, scheme):
     """
     # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters.
     # We'll likely want a different data model for multi-dimensional arrays.
-
    # Frictionless supports a wide range of formats. The quickest way to
     # determine if a file is recognized as a table or archive is to call list.
     info = frictionless.list(filepath)[0]
     if info.type == 'table':
         return 'table'
-    if info.compression:
+    # Frictionless doesn't recognize .tgz compression (but does recognize .tar.gz)
+    if info.compression or info.format == "tgz":
         return 'archive'
     # GDAL considers CSV a vector, so check against frictionless first.
     try:
@@ -186,37 +268,64 @@ def describe_file(source_dataset_path, scheme):
     return description


-def describe_archive(source_dataset_path, scheme):
+def describe_archive(source_dataset_path, scheme, **kwargs):
     """Describe file properties of a compressed file.

     Args:
         source_dataset_path (str): path to a file.
         scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset.

     Returns:
         dict

     """
+    def _list_tgz_contents(path):
+        """List contents of a .tar, .tgz, or .tar.gz archive."""
+        file_list = []
+        with fsspec.open(path, 'rb') as fobj:
+            with tarfile.open(fileobj=fobj, mode='r:*') as tar:
+                file_list = [member.name for member in tar.getmembers()
+                             if member.isfile()]
+        return file_list
+
+    def _list_zip_contents(path):
+        """List contents of a zip archive"""
+        file_list = []
+        ZFS = fsspec.get_filesystem_class('zip')
+        zfs = ZFS(path)
+        for dirpath, _, files in zfs.walk(zfs.root_marker):
+            for f in files:
+                file_list.append(os.path.join(dirpath, f))
+        return file_list
+
     description = describe_file(source_dataset_path, scheme)
     # innerpath is from frictionless and not useful because
     # it does not include all the files contained in the zip
     description.pop('innerpath', None)

+    if description.get("compression") == "zip":
+        file_list = _list_zip_contents(source_dataset_path)
+    elif description.get("format") in ["tgz", "tar"]:
+        file_list = _list_tgz_contents(source_dataset_path)
+        # 'compression' attr not auto-added by frictionless.describe for .tgz
+        # (but IS added for .tar.gz)
+        if source_dataset_path.endswith((".tgz")):
+            description["compression"] = "gz"
+    else:
+        raise ValueError(f"Unsupported archive format: {source_dataset_path}")
+
     description['sources'] = file_list
     return description


-def describe_vector(source_dataset_path, scheme):
+def describe_vector(source_dataset_path, scheme, **kwargs):
     """Describe properties of a GDAL vector file.

     Args:
         source_dataset_path (str): path to a GDAL vector.
+        scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset.

     Returns:
         dict
|
|
|
229
338
|
vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR)
|
|
230
339
|
layer = vector.GetLayer()
|
|
231
340
|
fields = []
|
|
232
|
-
description['n_features'] = layer.GetFeatureCount()
|
|
233
341
|
for fld in layer.schema:
|
|
234
342
|
fields.append(
|
|
235
343
|
models.FieldSchema(name=fld.name, type=fld.GetTypeName()))
|
|
344
|
+
layer_schema = models.LayerSchema(
|
|
345
|
+
name=layer.GetName(),
|
|
346
|
+
n_features=layer.GetFeatureCount(),
|
|
347
|
+
table=models.TableSchema(fields=fields),
|
|
348
|
+
gdal_metadata=layer.GetMetadata())
|
|
349
|
+
description['data_model'] = models.VectorSchema(
|
|
350
|
+
layers=[layer_schema],
|
|
351
|
+
gdal_metadata=vector.GetMetadata())
|
|
236
352
|
vector = layer = None
|
|
237
|
-
description['data_model'] = models.TableSchema(fields=fields)
|
|
238
353
|
|
|
239
354
|
info = pygeoprocessing.get_vector_info(source_dataset_path)
|
|
240
355
|
bbox = models.BoundingBox(*info['bounding_box'])
|
|
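Note: a hedged sketch (not from the diff) of how the restructured vector data model reads from the caller's side, assuming geometamaker 0.2.0 and a hypothetical GeoPackage path.

    import geometamaker

    resource = geometamaker.describe('watersheds.gpkg')

    # Layers are now first-class in the data model: each carries its name,
    # feature count, GDAL metadata, and a nested table schema of fields.
    layer = resource.data_model.layers[0]
    print(layer.name, layer.n_features)
    print([field.name for field in layer.table.fields])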
@@ -248,33 +363,59 @@ def describe_vector(source_dataset_path, scheme):
     return description


-def describe_raster(source_dataset_path, scheme):
+def describe_raster(source_dataset_path, scheme, **kwargs):
     """Describe properties of a GDAL raster file.

     Args:
         source_dataset_path (str): path to a GDAL raster.
+        scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset:
+            * ``'compute_stats'`` (bool): whether to compute statistics
+                for each band in the raster. Default is False.

     Returns:
         dict

     """
+    compute_stats = kwargs.get('compute_stats', False)
     description = describe_file(source_dataset_path, scheme)
     if 'http' in scheme:
         source_dataset_path = f'/vsicurl/{source_dataset_path}'
     info = pygeoprocessing.get_raster_info(source_dataset_path)
+    raster = gdal.OpenEx(source_dataset_path)
+    raster_gdal_metadata = raster.GetMetadata()
     bands = []
     for i in range(info['n_bands']):
         b = i + 1
+        band = raster.GetRasterBand(b)
+        if compute_stats:
+            try:
+                # 0=do not approximate stats, 1=calculate if they don't exist
+                # If exact stats exist they will be retrieved without
+                # computing them, otherwise, this forces computation.
+                # https://github.com/OSGeo/gdal/blob/master/gcore/gdalrasterband.cpp
+                _ = band.GetStatistics(0, 1)
+            except RuntimeError as e:
+                LOGGER.warning(
+                    f'Could not compute statistics for band {b} of '
+                    f'{source_dataset_path}: {e}')
+        band_gdal_metadata = band.GetMetadata()
+
         bands.append(models.BandSchema(
             index=b,
             gdal_type=gdal.GetDataTypeName(info['datatype']),
             numpy_type=numpy.dtype(info['numpy_type']).name,
-            nodata=info['nodata'][i]
+            nodata=info['nodata'][i],
+            gdal_metadata=band_gdal_metadata))
+        band = None
+    raster = None
+
     description['data_model'] = models.RasterSchema(
         bands=bands,
         pixel_size=info['pixel_size'],
         raster_size={'width': info['raster_size'][0],
-                     'height': info['raster_size'][1]}
+                     'height': info['raster_size'][1]},
+        gdal_metadata=raster_gdal_metadata)
     # Some values of raster info are numpy types, which the
     # yaml dumper doesn't know how to represent.
     bbox = models.BoundingBox(*[float(x) for x in info['bounding_box']])
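Note: a hedged sketch of the new per-band statistics option, assuming geometamaker 0.2.0; the raster path is hypothetical.

    import geometamaker

    # Band statistics are skipped by default; opt in per call.
    resource = geometamaker.describe('dem.tif', compute_stats=True)

    # Band-level GDAL metadata is now captured alongside the dataset-level
    # metadata; after forcing statistics, GDAL typically records them there
    # as STATISTICS_* items.
    print(resource.data_model.bands[0].gdal_metadata)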
@@ -288,12 +429,13 @@ def describe_raster(source_dataset_path, scheme):
     return description


-def describe_table(source_dataset_path, scheme):
+def describe_table(source_dataset_path, scheme, **kwargs):
     """Describe properties of a tabular dataset.

     Args:
         source_dataset_path (str): path to a file representing a table.
         scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset.

     Returns:
         dict
@@ -305,7 +447,139 @@ def describe_table(source_dataset_path, scheme):
     return description


-
+def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max,
+                        exclude_regex=None, exclude_hidden=True,
+                        describe_files=False, backup=True, **kwargs):
+    """Create a single metadata document to describe a collection of files.
+
+    Describe all the files within a directory as members of a "collection".
+    The resulting metadata resource should include a list of all the files
+    included in the collection along with a description and metadata filepath
+    (or placeholder). Optionally create individual metadata files for each
+    supported file in a directory.
+
+    Args:
+        directory (str): path to collection
+        depth (int, optional): maximum number of subdirectory levels to
+            traverse when walking through ``directory`` to find files included
+            in the collection. A value of 1 limits the walk to files in the
+            top-level ``directory`` only. A value of 2 allows descending into
+            immediate subdirectories, etc. All files in all subdirectories in
+            the collection will be included by default.
+        exclude_regex (str, optional): a regular expression to pattern-match
+            any files you do not want included in the output metadata yml.
+        exclude_hidden (bool, default True): whether to exclude hidden files
+            (files that start with ".").
+        describe_files (bool, default False): whether to ``describe`` all
+            files, i.e., create individual metadata files for each supported
+            resource in the collection.
+        backup (bool): whether to write a backup of a pre-existing metadata
+            file before ovewriting it in cases where that file is not a valid
+            geometamaker document.
+        kwargs (dict): optional keyward arguments accepted by ``describe``.
+
+    Returns:
+        Collection metadata
+    """
+    directory = str(Path(directory).resolve())
+
+    file_list = _list_files_with_depth(directory, depth, exclude_regex,
+                                       exclude_hidden)
+
+    root_ext_map, root_list = _group_files_by_root(file_list)
+
+    items = []
+
+    for root in root_list:
+        extensions = root_ext_map[root]
+        if '.shp' in extensions:
+            # if we're dealing with a shapefile, we do not want to describe any
+            # of these other files with the same root name
+            extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf', '.cpg'])
+        # Only drop .yml if its sidecar file, i.e. the corresponding data file
+        # (root) exists on disk
+        if '.yml' in extensions and os.path.exists(root):
+            extensions.discard('.yml')
+        for ext in extensions:
+            filepath = os.path.join(directory, f'{root}{ext}')
+            try:
+                this_desc = describe(filepath, **kwargs)
+            except ValueError:
+                # if file type isn't supported by geometamaker, e.g. pdf
+                # or if trying to describe a dir
+                this_desc = None
+
+            if describe_files and this_desc:
+                this_desc.write(backup=backup)
+
+            if ext and os.path.exists(filepath + '.yml'):
+                metadata_yml = f'{root}{ext}' + '.yml'
+            else:
+                metadata_yml = ''
+
+            this_resource = models.CollectionItemSchema(
+                path=f'{root}{ext}',
+                description=this_desc.description if this_desc else '',
+                metadata=metadata_yml
+            )
+            items.append(this_resource)
+
+    total_bytes, last_modified, uid = _get_collection_size_time_uid(directory)
+
+    resource = models.CollectionResource(
+        path=directory,
+        type='collection',
+        format='directory',
+        scheme=fsspec.utils.get_protocol(directory),
+        bytes=total_bytes,
+        last_modified=last_modified,
+        items=items,
+        uid=uid
+    )
+
+    # Check if there is existing metadata for the collection
+    try:
+        metadata_path = f'{directory}-metadata.yml'
+        existing_metadata = models.CollectionResource.load(metadata_path)
+
+        # Copy any existing item descriptions from existing yml to new metadata
+        # Note that descriptions in individual resources' ymls will take
+        # priority over item descriptions from preexisting collection metadata
+        for item in resource.items:
+            # Existing metadata's item desc will overwrite new metadata item
+            # desc if new item desc is ''
+            existing_item_desc = [
+                i.description for i in existing_metadata.items if (
+                    i.path == item.path)]
+            if item.description == '' and len(existing_item_desc) > 0:
+                item.description = existing_item_desc[0]
+
+        # Replace fields in existing yml if new metadata has existing value
+        resource = existing_metadata.replace(resource)
+
+    except (ValueError, ValidationError) as error:
+        LOGGER.warning(error)
+        LOGGER.warning(
+            f'Ignoring an existing YAML document: {metadata_path} because it'
+            f' is invalid or incompatible.')
+        LOGGER.warning(
+            'A subsequent call to `.write()` will replace this file, but it'
+            f' will be backed up to {metadata_path}.bak.\n'
+            f'Use `.write(backup=False)` to skip the backup.\n',
+            extra=_LOG_EXTRA_NOT_FOR_CLI)
+        resource._would_overwrite = True
+
+    except FileNotFoundError:
+        pass
+
+    # Add profile metadata
+    config = Config()
+    resource = resource.replace(config.profile)
+
+    return resource
+
+
 DESCRIBE_FUNCS = {
     'archive': describe_archive,
     'table': describe_table,
     'vector': describe_vector,
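Note: a hedged usage sketch of the new collection workflow, not part of the diff; the directory name and regex are hypothetical and geometamaker 0.2.0 is assumed.

    import geometamaker

    # Describe everything directly under 'watershed_data/' (depth=1), skip
    # rasters whose names end in '_old.tif', and also write a sidecar .yml
    # for each supported file in the collection.
    collection = geometamaker.describe_collection(
        'watershed_data',
        depth=1,
        exclude_regex=r'_old\.tif$',
        describe_files=True)

    # Per the '{directory}-metadata.yml' convention shown above, this should
    # land at 'watershed_data-metadata.yml'.
    collection.write()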
@@ -321,7 +595,7 @@ RESOURCE_MODELS = {


 @_osgeo_use_exceptions
-def describe(source_dataset_path, profile=None):
+def describe(source_dataset_path, compute_stats=False):
     """Create a metadata resource instance with properties of the dataset.

     Properties of the dataset are used to populate as many metadata
@@ -331,20 +605,23 @@ def describe(source_dataset_path, profile=None):
     Args:
         source_dataset_path (string): path or URL to dataset to which the
             metadata applies
+        compute_stats (bool): whether to compute statistics
+            for each band in a raster.

     Returns:
         geometamaker.models.Resource: a metadata object

     """
-    config = Config()
-    user_profile = config.profile
-    if profile is not None:
-        user_profile = user_profile.replace(profile)

     metadata_path = f'{source_dataset_path}.yml'

+    if os.path.isdir(source_dataset_path):
+        raise ValueError(
+            f"Cannot `describe` {source_dataset_path} as it is a directory, "
+            "not a dataset. \nIf you are trying to create metadata for the "
+            "files within a directory and/or the directory itself, please use "
+            "`geometamaker.describe_collection` instead.")
+
     # Despite naming, this does not open a file that must be closed
     of = fsspec.open(source_dataset_path)
     if not of.fs.exists(source_dataset_path):
@@ -356,58 +633,63 @@ def describe(source_dataset_path, profile=None):
             f'Cannot describe {source_dataset_path}. {protocol} '
             f'is not one of the suppored file protocols: {PROTOCOLS}')
     resource_type = detect_file_type(source_dataset_path, protocol)
-    description =
-        source_dataset_path, protocol)
+    description = DESCRIBE_FUNCS[resource_type](
+        source_dataset_path, protocol, compute_stats=compute_stats)
     description['type'] = resource_type
+    resource = RESOURCE_MODELS[resource_type](**description)

     # Load existing metadata file
     try:
+        # For the data model, use heuristic to decide if the new resource
+        # should inherit values from the existing resource.
+        # After that, take all non-empty values from the new resource
+        # and update the existing resource.
         existing_resource = RESOURCE_MODELS[resource_type].load(metadata_path)
-        if '
-        resource =
-    # Common path: metadata file does not already exist
-    # Or less common, ValueError if it exists but is incompatible
+        if resource_type == 'raster':
+            for band in resource.data_model.bands:
+                try:
+                    eband = existing_resource.get_band_description(band.index)
+                except IndexError:
+                    continue
+                if (band.numpy_type, band.gdal_type, band.nodata) == (
+                        eband.numpy_type, eband.gdal_type, eband.nodata):
+                    resource.set_band_description(
+                        band.index,
+                        title=eband.title,
+                        description=eband.description,
+                        units=eband.units)
+        if resource_type in ('vector', 'table'):
+            for field in resource._get_fields():
+                try:
+                    efield = existing_resource.get_field_description(field.name)
+                except KeyError:
+                    continue
+                if field.type == efield.type:
+                    resource.set_field_description(
+                        field.name,
+                        title=efield.title,
+                        description=efield.description,
+                        units=efield.units)
+        resource = existing_resource.replace(resource)
+
+    except (ValueError, ValidationError) as error:
+        LOGGER.warning(error)
+        LOGGER.warning(
+            f'Ignoring an existing YAML document: {metadata_path} because it'
+            f' is invalid or incompatible.')
+        LOGGER.warning(
+            'A subsequent call to `.write()` will replace this file, but it'
+            ' will be backed up to {metadata_path}.bak.\n'
+            f'Use `.write(backup=False)` to skip the backup.\n',
+            extra=_LOG_EXTRA_NOT_FOR_CLI)
+        resource._would_overwrite = True
+
     except FileNotFoundError:
+        # Common path: metadata file does not already exist
+        pass

+    config = Config()
+    resource = resource.replace(config.profile)
     return resource

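Note: a hedged sketch of the inheritance heuristic above; human-written descriptions survive a re-describe as long as the field (or band) type is unchanged. The file name is hypothetical and geometamaker 0.2.0 is assumed.

    import geometamaker

    resource = geometamaker.describe('sites.csv')
    resource.set_field_description(
        'site_id', description='unique monitoring site identifier')
    resource.write()

    # Later, after the table changes, describe it again. Because 'site_id'
    # still has the same type, the description above is carried over from
    # sites.csv.yml into the new resource.
    updated = geometamaker.describe('sites.csv')
    print(updated.get_field_description('site_id').description)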
@@ -442,30 +724,20 @@ def validate(filepath):
     return error


-def validate_dir(directory, recursive=False):
+def validate_dir(directory, depth=numpy.iinfo(numpy.int16).max):
     """Validate all compatible yml documents in the directory.

     Args:
         directory (string): path to a directory
+        depth (int): maximum number of subdirectory levels to
+            traverse when walking through ``directory``.

     Returns:
         tuple (list, list): a list of the filepaths that were validated and
             an equal-length list of the validation messages.

     """
-    file_list =
-    if recursive:
-        for path, dirs, files in os.walk(directory):
-            for file in files:
-                file_list.append(os.path.join(path, file))
-    else:
-        file_list.extend(
-            [os.path.join(directory, path)
-             for path in os.listdir(directory)
-             if os.path.isfile(os.path.join(directory, path))])
+    file_list = _list_files_with_depth(directory, depth)
     messages = []
     yaml_files = []
     for filepath in file_list:
@@ -473,7 +745,7 @@ def validate_dir(directory, recursive=False):
             yaml_files.append(filepath)
             msg = ''
             try:
-                error = validate(filepath)
+                error = validate(os.path.join(directory, filepath))
                 if error:
                     msg = error
             except ValueError:
@@ -484,48 +756,3 @@ def validate_dir(directory, recursive=False):
             messages.append(msg)

     return (yaml_files, messages)
-
-
-def describe_dir(directory, recursive=False):
-    """Describe all compatible datasets in the directory.
-
-    Take special care to only describe multifile datasets,
-    such as ESRI Shapefiles, one time.
-
-    Args:
-        directory (string): path to a directory
-        recursive (bool): whether or not to describe files
-            in all subdirectories
-
-    Returns:
-        None
-
-    """
-    root_set = set()
-    root_ext_map = defaultdict(set)
-    for path, dirs, files in os.walk(directory):
-        for file in files:
-            full_path = os.path.join(path, file)
-            root, ext = os.path.splitext(full_path)
-            # tracking which files share a root name
-            # so we can check if these comprise a shapefile
-            root_ext_map[root].add(ext)
-            root_set.add(root)
-        if not recursive:
-            break
-
-    for root in root_set:
-        extensions = root_ext_map[root]
-        if '.shp' in extensions:
-            # if we're dealing with a shapefile, we do not want to describe any
-            # of these other files with the same root name
-            extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf'])
-        for ext in extensions:
-            filepath = f'{root}{ext}'
-            try:
-                resource = describe(filepath)
-            except ValueError as error:
-                LOGGER.debug(error)
-                continue
-            resource.write()
-            LOGGER.info(f'{filepath} described')