geometamaker 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
@@ -2,6 +2,7 @@ import functools
  import hashlib
  import logging
  import os
+ import re
  import requests
  from collections import defaultdict
  from datetime import datetime, timezone
@@ -13,13 +14,20 @@ import pygeoprocessing
  import yaml
  from osgeo import gdal
  from osgeo import osr
+ from pathlib import Path
  from pydantic import ValidationError
+ import tarfile

  from . import models
  from .config import Config

+ logging.getLogger('chardet').setLevel(logging.INFO) # DEBUG is just too noisy

- LOGGER = logging.getLogger(__name__)
+ LOGGER = logging.getLogger('geometamaker')
+ _NOT_FOR_CLI = 'not_for_cli'
+ _LOG_EXTRA_NOT_FOR_CLI = {
+     _NOT_FOR_CLI: True
+ }

  # URI schemes we support. A subset of fsspec.available_protocols()
  PROTOCOLS = [
@@ -31,6 +39,12 @@ PROTOCOLS = [
  DT_FMT = '%Y-%m-%d %H:%M:%S %Z'


+ def _gdal_progress_callback(complete, message, data):
+     percentage = complete * 100
+     if (percentage > 0) & (percentage % 5 == 0):
+         LOGGER.info(f'{message} {percentage}%')
+
+
  # TODO: In the future we can remove these exception managers in favor of the
  # builtin gdal.ExceptionMgr. It was released in 3.7.0 and debugged in 3.9.1.
  # https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md#gdalogr-391-release-notes
@@ -103,6 +117,80 @@ def _wkt_to_epsg_units_string(wkt_string):
      return crs_string, units_string


+ def _list_files_with_depth(directory, depth, exclude_regex=None,
+                            exclude_hidden=True):
+     """List files in directory up to depth
+
+     Args:
+         directory (string): path to a directory
+         depth (int): maximum number of subdirectory levels to traverse when
+             walking through a directory. A value of 1 limits the walk to files
+             in the top-level ``directory`` only. A value of 2 allows
+             descending into immediate subdirectories, etc.
+         exclude_regex (str, optional): a regular expression to pattern-match
+             any files for which you do not want to create metadata.
+         exclude_hidden (bool, default True): whether to ignore hidden files
+
+     Returns:
+         list of relative filepaths in ``directory``
+
+     """
+     directory = Path(directory).resolve()
+     file_list = []
+
+     for path in directory.rglob("*"):
+         relative_path = path.relative_to(directory)
+         current_depth = len(relative_path.parts)
+         if current_depth > depth:
+             continue
+         if exclude_hidden and (
+                 any(part.startswith('.') for part in relative_path.parts)):
+             continue
+         file_list.append(str(relative_path))
+
+     # remove excluded files based on regex
+     if exclude_regex is not None:
+         file_list = [f for f in file_list if not re.search(exclude_regex, f)]
+
+     return sorted(file_list)
+
+
+ def _group_files_by_root(file_list):
+     """Get set of files (roots) and extensions by filename"""
+     root_set = set()
+     root_ext_map = defaultdict(set)
+     for filepath in file_list:
+         root, ext = os.path.splitext(filepath)
+         # tracking which files share a root name
+         # so we can check if these comprise a shapefile
+         root_ext_map[root].add(ext)
+         root_set.add(root)
+     return root_ext_map, sorted(list(root_set))
+
+
+ def _get_collection_size_time_uid(directory):
+     """Get size of directory (in bytes), when it was last modified, and uid"""
+     total_bytes = 0
+     latest_mtime = 0
+
+     for root, _, files in os.walk(directory):
+         for file in files:
+             file_path = os.path.join(root, file)
+             stat = os.stat(file_path)
+             total_bytes += stat.st_size
+             latest_mtime = max(latest_mtime, stat.st_mtime)
+
+     last_modified = datetime.fromtimestamp(latest_mtime, tz=timezone.utc)
+     last_modified_str = last_modified.strftime('%Y-%m-%d %H:%M:%S %Z')
+
+     hash_func = hashlib.sha256()
+     hash_func.update(
+         f'{total_bytes}{last_modified_str}{directory}'.encode('utf-8'))
+     uid = f'sizetimestamp:{hash_func.hexdigest()}'
+
+     return total_bytes, last_modified_str, uid
+
+
  def detect_file_type(filepath, scheme):
      """Detect the type of resource contained in the file.

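
The three private helpers added above do the legwork for the new collection features later in this diff: `_list_files_with_depth` walks a directory to a fixed depth, `_group_files_by_root` groups files that share a root name (so shapefile sidecars can later be treated as one dataset), and `_get_collection_size_time_uid` derives a collection's size, timestamp, and uid. A minimal sketch of how they compose; it assumes it runs inside this module (the helpers are private) and uses a hypothetical `data/` directory:

    # Hypothetical directory; depth=2 descends one level of subdirectories and
    # the regex skips GDAL .aux.xml sidecar files. Assumes the module namespace.
    files = _list_files_with_depth('data', depth=2, exclude_regex=r'\.aux\.xml$')
    root_ext_map, roots = _group_files_by_root(files)
    total_bytes, last_modified, uid = _get_collection_size_time_uid('data')
    print(len(files), total_bytes, last_modified, uid)
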
@@ -119,13 +207,16 @@ def detect_file_type(filepath, scheme):
      """
      # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters.
      # We'll likely want a different data model for multi-dimensional arrays.
-
      # Frictionless supports a wide range of formats. The quickest way to
      # determine if a file is recognized as a table or archive is to call list.
-     info = frictionless.list(filepath)[0]
+     try:
+         info = frictionless.list(filepath)[0]
+     except frictionless.FrictionlessException:
+         raise RuntimeError(f'Cannot detect file type of "{filepath}"')
      if info.type == 'table':
          return 'table'
-     if info.compression:
+     # Frictionless doesn't recognize .tgz compression (but does recognize .tar.gz)
+     if info.compression or info.format == "tgz":
          return 'archive'
      # GDAL considers CSV a vector, so check against frictionless first.
      try:
@@ -177,7 +268,7 @@ def describe_file(source_dataset_path, scheme):
      hash_func = hashlib.new('sha256')
      hash_func.update(
          f'{description["bytes"]}{description["last_modified"]}\
-         {description["path"]}'.encode('ascii'))
+         {description["path"]}'.encode('utf-8'))
      description['uid'] = f'sizetimestamp:{hash_func.hexdigest()}'

      # We don't have a use for including these attributes in our metadata:
@@ -186,37 +277,64 @@ def describe_file(source_dataset_path, scheme):
      return description


- def describe_archive(source_dataset_path, scheme):
+ def describe_archive(source_dataset_path, scheme, **kwargs):
      """Describe file properties of a compressed file.

      Args:
          source_dataset_path (str): path to a file.
          scheme (str): the protocol prefix of the filepath
+         kwargs (dict): additional options when describing a dataset.

      Returns:
          dict

      """
+     def _list_tgz_contents(path):
+         """List contents of a .tar, .tgz, or .tar.gz archive."""
+         file_list = []
+         with fsspec.open(path, 'rb') as fobj:
+             with tarfile.open(fileobj=fobj, mode='r:*') as tar:
+                 file_list = [member.name for member in tar.getmembers()
+                              if member.isfile()]
+         return file_list
+
+     def _list_zip_contents(path):
+         """List contents of a zip archive"""
+         file_list = []
+         ZFS = fsspec.get_filesystem_class('zip')
+         zfs = ZFS(path)
+         for dirpath, _, files in zfs.walk(zfs.root_marker):
+             for f in files:
+                 file_list.append(os.path.join(dirpath, f))
+         return file_list
+
      description = describe_file(source_dataset_path, scheme)
      # innerpath is from frictionless and not useful because
      # it does not include all the files contained in the zip
      description.pop('innerpath', None)

-     ZFS = fsspec.get_filesystem_class('zip')
-     zfs = ZFS(source_dataset_path)
-     file_list = []
-     for dirpath, _, files in zfs.walk(zfs.root_marker):
-         for f in files:
-             file_list.append(os.path.join(dirpath, f))
+     if description.get("compression") == "zip":
+         file_list = _list_zip_contents(source_dataset_path)
+     elif description.get("format") in ["tgz", "tar"]:
+         file_list = _list_tgz_contents(source_dataset_path)
+         # 'compression' attr not auto-added by frictionless.describe for .tgz
+         # (but IS added for .tar.gz)
+         if source_dataset_path.endswith((".tgz")):
+             description["compression"] = "gz"
+     else:
+         raise ValueError(f"Unsupported archive format: {source_dataset_path}")
+
      description['sources'] = file_list
      return description


- def describe_vector(source_dataset_path, scheme):
+ def describe_vector(source_dataset_path, scheme, **kwargs):
      """Describe properties of a GDAL vector file.

      Args:
          source_dataset_path (str): path to a GDAL vector.
+         scheme (str): the protocol prefix of the filepath
+         kwargs (dict): additional options when describing a dataset.

      Returns:
          dict
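
Taken together, the changes above teach `detect_file_type` and `describe_archive` to handle `.tar` and `.tgz` archives in addition to zip, listing members with `tarfile` instead of fsspec's zip filesystem. A hedged usage sketch; the archive path is hypothetical, and the `sources` attribute is assumed to mirror the `description['sources']` key set above:

    import geometamaker

    # A .tgz is now classified as 'archive' even though frictionless reports
    # no 'compression' for it; the code above patches in 'gz'.
    resource = geometamaker.describe('data/landcover.tgz')
    print(resource.sources)   # regular files found inside the archive
    resource.write()          # expected to write data/landcover.tgz.yml
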
@@ -229,12 +347,18 @@ def describe_vector(source_dataset_path, scheme):
      vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR)
      layer = vector.GetLayer()
      fields = []
-     description['n_features'] = layer.GetFeatureCount()
      for fld in layer.schema:
          fields.append(
              models.FieldSchema(name=fld.name, type=fld.GetTypeName()))
+     layer_schema = models.LayerSchema(
+         name=layer.GetName(),
+         n_features=layer.GetFeatureCount(),
+         table=models.TableSchema(fields=fields),
+         gdal_metadata=layer.GetMetadata())
+     description['data_model'] = models.VectorSchema(
+         layers=[layer_schema],
+         gdal_metadata=vector.GetMetadata())
      vector = layer = None
-     description['data_model'] = models.TableSchema(fields=fields)

      info = pygeoprocessing.get_vector_info(source_dataset_path)
      bbox = models.BoundingBox(*info['bounding_box'])
@@ -248,33 +372,65 @@ def describe_vector(source_dataset_path, scheme):
      return description


- def describe_raster(source_dataset_path, scheme):
+ def describe_raster(source_dataset_path, scheme, **kwargs):
      """Describe properties of a GDAL raster file.

      Args:
          source_dataset_path (str): path to a GDAL raster.
+         scheme (str): the protocol prefix of the filepath
+         kwargs (dict): additional options when describing a dataset:
+             * ``'compute_stats'`` (bool): whether to compute statistics
+               for each band in the raster. Default is False.

      Returns:
          dict

      """
+     compute_stats = kwargs.get('compute_stats', False)
      description = describe_file(source_dataset_path, scheme)
      if 'http' in scheme:
          source_dataset_path = f'/vsicurl/{source_dataset_path}'
      info = pygeoprocessing.get_raster_info(source_dataset_path)
+     raster = gdal.OpenEx(source_dataset_path)
+     raster_gdal_metadata = raster.GetMetadata()
      bands = []
      for i in range(info['n_bands']):
          b = i + 1
+         band = raster.GetRasterBand(b)
+         band_gdal_metadata = band.GetMetadata()
+         if compute_stats:
+             try:
+                 if not 'STATISTICS_VALID_PERCENT' in band_gdal_metadata:
+                     # Sometimes some stats exist, but not all. If this one doesn't,
+                     # it's important enough that we want to force computation.
+                     _ = band.ComputeStatistics(0, callback=_gdal_progress_callback)
+                 else:
+                     # 0=do not approximate stats, 1=calculate if they don't exist
+                     # If exact stats exist they will be retrieved without
+                     # computing them, otherwise, this forces computation.
+                     # https://github.com/OSGeo/gdal/blob/master/gcore/gdalrasterband.cpp
+                     _ = band.GetStatistics(0, 1)
+                 band_gdal_metadata = band.GetMetadata()
+             except RuntimeError as e:
+                 LOGGER.warning(
+                     f'Could not compute statistics for band {b} of '
+                     f'{source_dataset_path}: {e}')
+
          bands.append(models.BandSchema(
              index=b,
              gdal_type=gdal.GetDataTypeName(info['datatype']),
              numpy_type=numpy.dtype(info['numpy_type']).name,
-             nodata=info['nodata'][i]))
+             nodata=info['nodata'][i],
+             gdal_metadata=band_gdal_metadata))
+         band = None
+     raster = None
+
      description['data_model'] = models.RasterSchema(
          bands=bands,
          pixel_size=info['pixel_size'],
          raster_size={'width': info['raster_size'][0],
-                      'height': info['raster_size'][1]})
+                      'height': info['raster_size'][1]},
+         gdal_metadata=raster_gdal_metadata)
      # Some values of raster info are numpy types, which the
      # yaml dumper doesn't know how to represent.
      bbox = models.BoundingBox(*[float(x) for x in info['bounding_box']])
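
The raster changes above thread a new `compute_stats` option down to GDAL: when statistics are missing, `band.ComputeStatistics` runs (reporting progress through `_gdal_progress_callback`), otherwise `GetStatistics(0, 1)` refreshes them, and the values end up in each band's `gdal_metadata`. A sketch of the intended call, with a hypothetical local GeoTIFF; the `STATISTICS_*` keys are standard GDAL band metadata:

    import geometamaker

    # 'dem.tif' is a placeholder path.
    resource = geometamaker.describe('dem.tif', compute_stats=True)
    band = resource.get_band_description(1)
    print(band.gdal_metadata.get('STATISTICS_MAXIMUM'))
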
@@ -288,12 +444,13 @@ def describe_raster(source_dataset_path, scheme):
      return description


- def describe_table(source_dataset_path, scheme):
+ def describe_table(source_dataset_path, scheme, **kwargs):
      """Describe properties of a tabular dataset.

      Args:
          source_dataset_path (str): path to a file representing a table.
          scheme (str): the protocol prefix of the filepath
+         kwargs (dict): additional options when describing a dataset.

      Returns:
          dict
@@ -305,7 +462,139 @@ def describe_table(source_dataset_path, scheme):
      return description


- DESRCIBE_FUNCS = {
+ def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max,
+                         exclude_regex=None, exclude_hidden=True,
+                         describe_files=False, backup=True, **kwargs):
+     """Create a single metadata document to describe a collection of files.
+
+     Describe all the files within a directory as members of a "collection".
+     The resulting metadata resource should include a list of all the files
+     included in the collection along with a description and metadata filepath
+     (or placeholder). Optionally create individual metadata files for each
+     supported file in a directory.
+
+     Args:
+         directory (str): path to collection
+         depth (int, optional): maximum number of subdirectory levels to
+             traverse when walking through ``directory`` to find files included
+             in the collection. A value of 1 limits the walk to files in the
+             top-level ``directory`` only. A value of 2 allows descending into
+             immediate subdirectories, etc. All files in all subdirectories in
+             the collection will be included by default.
+         exclude_regex (str, optional): a regular expression to pattern-match
+             any files you do not want included in the output metadata yml.
+         exclude_hidden (bool, default True): whether to exclude hidden files
+             (files that start with ".").
+         describe_files (bool, default False): whether to ``describe`` all
+             files, i.e., create individual metadata files for each supported
+             resource in the collection.
+         backup (bool): whether to write a backup of a pre-existing metadata
+             file before ovewriting it in cases where that file is not a valid
+             geometamaker document.
+         kwargs (dict): optional keyward arguments accepted by ``describe``.
+
+     Returns:
+         Collection metadata
+     """
+     directory = str(Path(directory).resolve())
+
+     file_list = _list_files_with_depth(directory, depth, exclude_regex,
+                                        exclude_hidden)
+
+     root_ext_map, root_list = _group_files_by_root(file_list)
+
+     items = []
+
+     for root in root_list:
+         extensions = root_ext_map[root]
+         if '.shp' in extensions:
+             # if we're dealing with a shapefile, we do not want to describe any
+             # of these other files with the same root name
+             extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf', '.cpg'])
+         # Only drop .yml if its sidecar file, i.e. the corresponding data file
+         # (root) exists on disk
+         if '.yml' in extensions and os.path.exists(root):
+             extensions.discard('.yml')
+         for ext in extensions:
+             filepath = os.path.join(directory, f'{root}{ext}')
+             try:
+                 this_desc = describe(filepath, **kwargs)
+             except ValueError:
+                 # if file type isn't supported by geometamaker, e.g. pdf
+                 # or if trying to describe a dir
+                 this_desc = None
+
+             if describe_files and this_desc:
+                 this_desc.write(backup=backup)
+
+             if ext and os.path.exists(filepath + '.yml'):
+                 metadata_yml = f'{root}{ext}' + '.yml'
+             else:
+                 metadata_yml = ''
+
+             this_resource = models.CollectionItemSchema(
+                 path=f'{root}{ext}',
+                 description=this_desc.description if this_desc else '',
+                 metadata=metadata_yml
+             )
+             items.append(this_resource)
+
+     total_bytes, last_modified, uid = _get_collection_size_time_uid(directory)
+
+     resource = models.CollectionResource(
+         path=directory,
+         type='collection',
+         format='directory',
+         scheme=fsspec.utils.get_protocol(directory),
+         bytes=total_bytes,
+         last_modified=last_modified,
+         items=items,
+         uid=uid
+     )
+
+     # Check if there is existing metadata for the collection
+     try:
+         metadata_path = f'{directory}-metadata.yml'
+         existing_metadata = models.CollectionResource.load(metadata_path)
+
+         # Copy any existing item descriptions from existing yml to new metadata
+         # Note that descriptions in individual resources' ymls will take
+         # priority over item descriptions from preexisting collection metadata
+         for item in resource.items:
+             # Existing metadata's item desc will overwrite new metadata item
+             # desc if new item desc is ''
+             existing_item_desc = [
+                 i.description for i in existing_metadata.items if (
+                     i.path == item.path)]
+             if item.description == '' and len(existing_item_desc) > 0:
+                 item.description = existing_item_desc[0]
+
+         # Replace fields in existing yml if new metadata has existing value
+         resource = existing_metadata.replace(resource)
+
+     except (ValueError, ValidationError) as error:
+         LOGGER.warning(error)
+         LOGGER.warning(
+             f'Ignoring an existing YAML document: {metadata_path} because it'
+             f' is invalid or incompatible.')
+         LOGGER.warning(
+             'A subsequent call to `.write()` will replace this file, but it'
+             f' will be backed up to {metadata_path}.bak.\n'
+             f'Use `.write(backup=False)` to skip the backup.\n',
+             extra=_LOG_EXTRA_NOT_FOR_CLI)
+         resource._would_overwrite = True
+
+     except FileNotFoundError:
+         pass
+
+     # Add profile metadata
+     config = Config()
+     resource = resource.replace(config.profile)
+
+     return resource
+
+
+ DESCRIBE_FUNCS = {
      'archive': describe_archive,
      'table': describe_table,
      'vector': describe_vector,
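
`describe_collection` replaces the removed `describe_dir` (see the final hunk): it builds one `CollectionResource` for a directory, optionally writes per-file metadata, and merges any existing `<directory>-metadata.yml`. A sketch with a hypothetical project folder; writing to `<directory>-metadata.yml` is inferred from the `metadata_path` used above:

    import geometamaker

    # depth=1 stays in the top level; exclude_regex skips zip archives;
    # describe_files=True also writes one .yml per supported dataset.
    collection = geometamaker.describe_collection(
        'my_project_data', depth=1, exclude_regex=r'\.zip$',
        describe_files=True)
    collection.write()  # expected output: my_project_data-metadata.yml
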
@@ -321,7 +610,7 @@ RESOURCE_MODELS = {


  @_osgeo_use_exceptions
- def describe(source_dataset_path, profile=None):
+ def describe(source_dataset_path, compute_stats=False):
      """Create a metadata resource instance with properties of the dataset.

      Properties of the dataset are used to populate as many metadata
@@ -331,20 +620,23 @@ def describe(source_dataset_path, profile=None):
      Args:
          source_dataset_path (string): path or URL to dataset to which the
              metadata applies
-         profile (geometamaker.models.Profile): a profile object from
-             which to populate some metadata attributes
+         compute_stats (bool): whether to compute statistics
+             for each band in a raster.

      Returns:
          geometamaker.models.Resource: a metadata object

      """
-     config = Config()
-     user_profile = config.profile
-     if profile is not None:
-         user_profile = user_profile.replace(profile)

      metadata_path = f'{source_dataset_path}.yml'

+     if os.path.isdir(source_dataset_path):
+         raise ValueError(
+             f"Cannot `describe` {source_dataset_path} as it is a directory, "
+             "not a dataset. \nIf you are trying to create metadata for the "
+             "files within a directory and/or the directory itself, please use "
+             "`geometamaker.describe_collection` instead.")
+
      # Despite naming, this does not open a file that must be closed
      of = fsspec.open(source_dataset_path)
      if not of.fs.exists(source_dataset_path):
@@ -356,58 +648,63 @@ def describe(source_dataset_path, profile=None):
              f'Cannot describe {source_dataset_path}. {protocol} '
              f'is not one of the suppored file protocols: {PROTOCOLS}')
      resource_type = detect_file_type(source_dataset_path, protocol)
-     description = DESRCIBE_FUNCS[resource_type](
-         source_dataset_path, protocol)
+     description = DESCRIBE_FUNCS[resource_type](
+         source_dataset_path, protocol, compute_stats=compute_stats)
      description['type'] = resource_type
+     resource = RESOURCE_MODELS[resource_type](**description)

      # Load existing metadata file
      try:
+         # For the data model, use heuristic to decide if the new resource
+         # should inherit values from the existing resource.
+         # After that, take all non-empty values from the new resource
+         # and update the existing resource.
          existing_resource = RESOURCE_MODELS[resource_type].load(metadata_path)
-         if 'data_model' in description:
-             if isinstance(description['data_model'], models.RasterSchema):
-                 # If existing band metadata still matches data_model of the file
-                 # carry over existing metadata because it could include
-                 # human-defined properties.
-                 new_bands = []
-                 for band in description['data_model'].bands:
-                     try:
-                         eband = existing_resource.get_band_description(band.index)
-                         # TODO: rewrite this as __eq__ of BandSchema?
-                         if (band.numpy_type, band.gdal_type, band.nodata) == (
-                                 eband.numpy_type, eband.gdal_type, eband.nodata):
-                             updated_dict = band.model_dump() | eband.model_dump()
-                             band = models.BandSchema(**updated_dict)
-                     except IndexError:
-                         pass
-                     new_bands.append(band)
-                 description['data_model'].bands = new_bands
-             if isinstance(description['data_model'], models.TableSchema):
-                 # If existing field metadata still matches data_model of the file
-                 # carry over existing metadata because it could include
-                 # human-defined properties.
-                 new_fields = []
-                 for field in description['data_model'].fields:
-                     try:
-                         efield = existing_resource.get_field_description(
-                             field.name)
-                         # TODO: rewrite this as __eq__ of FieldSchema?
-                         if field.type == efield.type:
-                             updated_dict = field.model_dump() | efield.model_dump()
-                             field = models.FieldSchema(**updated_dict)
-                     except KeyError:
-                         pass
-                     new_fields.append(field)
-                 description['data_model'].fields = new_fields
-         # overwrite properties that are intrinsic to the dataset
-         updated_dict = existing_resource.model_dump() | description
-         resource = RESOURCE_MODELS[resource_type](**updated_dict)
-
-     # Common path: metadata file does not already exist
-     # Or less common, ValueError if it exists but is incompatible
+         if resource_type == 'raster':
+             for band in resource.data_model.bands:
+                 try:
+                     eband = existing_resource.get_band_description(band.index)
+                 except IndexError:
+                     continue
+                 if (band.numpy_type, band.gdal_type, band.nodata) == (
+                         eband.numpy_type, eband.gdal_type, eband.nodata):
+                     resource.set_band_description(
+                         band.index,
+                         title=eband.title,
+                         description=eband.description,
+                         units=eband.units)
+         if resource_type in ('vector', 'table'):
+             for field in resource._get_fields():
+                 try:
+                     efield = existing_resource.get_field_description(field.name)
+                 except KeyError:
+                     continue
+                 if field.type == efield.type:
+                     resource.set_field_description(
+                         field.name,
+                         title=efield.title,
+                         description=efield.description,
+                         units=efield.units)
+         resource = existing_resource.replace(resource)
+
+     except (ValueError, ValidationError) as error:
+         LOGGER.warning(error)
+         LOGGER.warning(
+             f'Ignoring an existing YAML document: {metadata_path} because it'
+             f' is invalid or incompatible.')
+         LOGGER.warning(
+             'A subsequent call to `.write()` will replace this file, but it'
+             ' will be backed up to {metadata_path}.bak.\n'
+             f'Use `.write(backup=False)` to skip the backup.\n',
+             extra=_LOG_EXTRA_NOT_FOR_CLI)
+         resource._would_overwrite = True
+
      except FileNotFoundError:
-         resource = RESOURCE_MODELS[resource_type](**description)
+         # Common path: metadata file does not already exist
+         pass

-     resource = resource.replace(user_profile)
+     config = Config()
+     resource = resource.replace(config.profile)
      return resource


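
The `describe` hunks above change the public signature: the `profile` argument is gone (the user profile is now always read from `Config` and merged via `resource.replace(config.profile)`), `compute_stats` is new, and directories are rejected with a pointer to `describe_collection`. A before/after sketch with a hypothetical raster path:

    import geometamaker

    # 0.1.2: geometamaker.describe('dem.tif', profile=my_profile)
    # 0.2.1: the profile comes from Config; compute_stats is the new option.
    resource = geometamaker.describe('dem.tif', compute_stats=True)

    # Directories now raise ValueError and point to describe_collection:
    # geometamaker.describe('my_project_data')  ->  ValueError
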
@@ -442,30 +739,20 @@ def validate(filepath):
      return error


- def validate_dir(directory, recursive=False):
+ def validate_dir(directory, depth=numpy.iinfo(numpy.int16).max):
      """Validate all compatible yml documents in the directory.

      Args:
          directory (string): path to a directory
-         recursive (bool): whether or not to describe files
-             in all subdirectories
+         depth (int): maximum number of subdirectory levels to
+             traverse when walking through ``directory``.

      Returns:
          tuple (list, list): a list of the filepaths that were validated and
              an equal-length list of the validation messages.

      """
-     file_list = []
-     if recursive:
-         for path, dirs, files in os.walk(directory):
-             for file in files:
-                 file_list.append(os.path.join(path, file))
-     else:
-         file_list.extend(
-             [os.path.join(directory, path)
-              for path in os.listdir(directory)
-              if os.path.isfile(os.path.join(directory, path))])
-
+     file_list = _list_files_with_depth(directory, depth)
      messages = []
      yaml_files = []
      for filepath in file_list:
@@ -473,7 +760,7 @@ def validate_dir(directory, recursive=False):
              yaml_files.append(filepath)
              msg = ''
              try:
-                 error = validate(filepath)
+                 error = validate(os.path.join(directory, filepath))
                  if error:
                      msg = error
              except ValueError:
@@ -484,48 +771,3 @@ def validate_dir(directory, recursive=False):
              messages.append(msg)

      return (yaml_files, messages)
-
-
- def describe_dir(directory, recursive=False):
-     """Describe all compatible datasets in the directory.
-
-     Take special care to only describe multifile datasets,
-     such as ESRI Shapefiles, one time.
-
-     Args:
-         directory (string): path to a directory
-         recursive (bool): whether or not to describe files
-             in all subdirectories
-
-     Returns:
-         None
-
-     """
-     root_set = set()
-     root_ext_map = defaultdict(set)
-     for path, dirs, files in os.walk(directory):
-         for file in files:
-             full_path = os.path.join(path, file)
-             root, ext = os.path.splitext(full_path)
-             # tracking which files share a root name
-             # so we can check if these comprise a shapefile
-             root_ext_map[root].add(ext)
-             root_set.add(root)
-         if not recursive:
-             break
-
-     for root in root_set:
-         extensions = root_ext_map[root]
-         if '.shp' in extensions:
-             # if we're dealing with a shapefile, we do not want to describe any
-             # of these other files with the same root name
-             extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf'])
-         for ext in extensions:
-             filepath = f'{root}{ext}'
-             try:
-                 resource = describe(filepath)
-             except ValueError as error:
-                 LOGGER.debug(error)
-                 continue
-             resource.write()
-             LOGGER.info(f'{filepath} described')
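
Finally, `validate_dir` swaps its `recursive` flag for a `depth` limit and now returns paths relative to `directory`, while `describe_dir` is removed in favor of `describe_collection(describe_files=True)`. A closing sketch, assuming both functions are exported at the package level and `data/` is a hypothetical directory:

    import geometamaker

    yaml_files, messages = geometamaker.validate_dir('data', depth=2)
    for path, msg in zip(yaml_files, messages):
        print(path, msg or 'valid')

    # Replacement for the removed describe_dir(directory, recursive=True):
    geometamaker.describe_collection('data', describe_files=True)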