geometamaker 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import functools
  import hashlib
  import logging
  import os
+ import re
  import requests
  from collections import defaultdict
  from datetime import datetime, timezone
@@ -13,13 +14,20 @@ import pygeoprocessing
  import yaml
  from osgeo import gdal
  from osgeo import osr
+ from pathlib import Path
  from pydantic import ValidationError
+ import tarfile

  from . import models
  from .config import Config

+ logging.getLogger('chardet').setLevel(logging.INFO)  # DEBUG is just too noisy

- LOGGER = logging.getLogger(__name__)
+ LOGGER = logging.getLogger('geometamaker')
+ _NOT_FOR_CLI = 'not_for_cli'
+ _LOG_EXTRA_NOT_FOR_CLI = {
+     _NOT_FOR_CLI: True
+ }

  # URI schemes we support. A subset of fsspec.available_protocols()
  PROTOCOLS = [
@@ -103,6 +111,80 @@ def _wkt_to_epsg_units_string(wkt_string):
      return crs_string, units_string


+ def _list_files_with_depth(directory, depth, exclude_regex=None,
+                            exclude_hidden=True):
+     """List files in directory up to depth
+
+     Args:
+         directory (string): path to a directory
+         depth (int): maximum number of subdirectory levels to traverse when
+             walking through a directory. A value of 1 limits the walk to files
+             in the top-level ``directory`` only. A value of 2 allows
+             descending into immediate subdirectories, etc.
+         exclude_regex (str, optional): a regular expression to pattern-match
+             any files for which you do not want to create metadata.
+         exclude_hidden (bool, default True): whether to ignore hidden files
+
+     Returns:
+         list of relative filepaths in ``directory``
+
+     """
+     directory = Path(directory).resolve()
+     file_list = []
+
+     for path in directory.rglob("*"):
+         relative_path = path.relative_to(directory)
+         current_depth = len(relative_path.parts)
+         if current_depth > depth:
+             continue
+         if exclude_hidden and (
+                 any(part.startswith('.') for part in relative_path.parts)):
+             continue
+         file_list.append(str(relative_path))
+
+     # remove excluded files based on regex
+     if exclude_regex is not None:
+         file_list = [f for f in file_list if not re.search(exclude_regex, f)]
+
+     return sorted(file_list)
+
+
+ def _group_files_by_root(file_list):
+     """Get set of files (roots) and extensions by filename"""
+     root_set = set()
+     root_ext_map = defaultdict(set)
+     for filepath in file_list:
+         root, ext = os.path.splitext(filepath)
+         # tracking which files share a root name
+         # so we can check if these comprise a shapefile
+         root_ext_map[root].add(ext)
+         root_set.add(root)
+     return root_ext_map, sorted(list(root_set))
+
+
+ def _get_collection_size_time_uid(directory):
+     """Get size of directory (in bytes), when it was last modified, and uid"""
+     total_bytes = 0
+     latest_mtime = 0
+
+     for root, _, files in os.walk(directory):
+         for file in files:
+             file_path = os.path.join(root, file)
+             stat = os.stat(file_path)
+             total_bytes += stat.st_size
+             latest_mtime = max(latest_mtime, stat.st_mtime)
+
+     last_modified = datetime.fromtimestamp(latest_mtime, tz=timezone.utc)
+     last_modified_str = last_modified.strftime('%Y-%m-%d %H:%M:%S %Z')
+
+     hash_func = hashlib.sha256()
+     hash_func.update(
+         f'{total_bytes}{last_modified_str}{directory}'.encode('utf-8'))
+     uid = f'sizetimestamp:{hash_func.hexdigest()}'
+
+     return total_bytes, last_modified_str, uid
+
+
  def detect_file_type(filepath, scheme):
      """Detect the type of resource contained in the file.

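A minimal, standalone sketch of the depth rule documented in _list_files_with_depth above, using a throwaway temp directory (the layout and the re-implemented helper here are illustrative only, not part of the package):

from pathlib import Path
import tempfile

def list_with_depth(directory, depth):
    # Keep any path whose relative part count is <= depth, mirroring the rule above.
    directory = Path(directory).resolve()
    return sorted(
        str(p.relative_to(directory)) for p in directory.rglob('*')
        if len(p.relative_to(directory).parts) <= depth)

root = Path(tempfile.mkdtemp())
(root / 'a.tif').touch()
(root / 'sub').mkdir()
(root / 'sub' / 'b.csv').touch()

print(list_with_depth(root, 1))  # ['a.tif', 'sub'] -- top level only
print(list_with_depth(root, 2))  # ['a.tif', 'sub', 'sub/b.csv']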
@@ -119,13 +201,13 @@ def detect_file_type(filepath, scheme):
      """
      # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters.
      # We'll likely want a different data model for multi-dimensional arrays.
-
      # Frictionless supports a wide range of formats. The quickest way to
      # determine if a file is recognized as a table or archive is to call list.
      info = frictionless.list(filepath)[0]
      if info.type == 'table':
          return 'table'
-     if info.compression:
+     # Frictionless doesn't recognize .tgz compression (but does recognize .tar.gz)
+     if info.compression or info.format == "tgz":
          return 'archive'
      # GDAL considers CSV a vector, so check against frictionless first.
      try:
@@ -186,37 +268,64 @@ def describe_file(source_dataset_path, scheme):
      return description


- def describe_archive(source_dataset_path, scheme):
+ def describe_archive(source_dataset_path, scheme, **kwargs):
      """Describe file properties of a compressed file.

      Args:
          source_dataset_path (str): path to a file.
          scheme (str): the protocol prefix of the filepath
+         kwargs (dict): additional options when describing a dataset.

      Returns:
          dict

      """
+     def _list_tgz_contents(path):
+         """List contents of a .tar, .tgz, or .tar.gz archive."""
+         file_list = []
+         with fsspec.open(path, 'rb') as fobj:
+             with tarfile.open(fileobj=fobj, mode='r:*') as tar:
+                 file_list = [member.name for member in tar.getmembers()
+                              if member.isfile()]
+         return file_list
+
+     def _list_zip_contents(path):
+         """List contents of a zip archive"""
+         file_list = []
+         ZFS = fsspec.get_filesystem_class('zip')
+         zfs = ZFS(path)
+         for dirpath, _, files in zfs.walk(zfs.root_marker):
+             for f in files:
+                 file_list.append(os.path.join(dirpath, f))
+         return file_list
+
      description = describe_file(source_dataset_path, scheme)
      # innerpath is from frictionless and not useful because
      # it does not include all the files contained in the zip
      description.pop('innerpath', None)

-     ZFS = fsspec.get_filesystem_class('zip')
-     zfs = ZFS(source_dataset_path)
-     file_list = []
-     for dirpath, _, files in zfs.walk(zfs.root_marker):
-         for f in files:
-             file_list.append(os.path.join(dirpath, f))
+     if description.get("compression") == "zip":
+         file_list = _list_zip_contents(source_dataset_path)
+     elif description.get("format") in ["tgz", "tar"]:
+         file_list = _list_tgz_contents(source_dataset_path)
+         # 'compression' attr not auto-added by frictionless.describe for .tgz
+         # (but IS added for .tar.gz)
+         if source_dataset_path.endswith((".tgz")):
+             description["compression"] = "gz"
+     else:
+         raise ValueError(f"Unsupported archive format: {source_dataset_path}")
+
      description['sources'] = file_list
      return description


- def describe_vector(source_dataset_path, scheme):
+ def describe_vector(source_dataset_path, scheme, **kwargs):
      """Describe properties of a GDAL vector file.

      Args:
          source_dataset_path (str): path to a GDAL vector.
+         scheme (str): the protocol prefix of the filepath
+         kwargs (dict): additional options when describing a dataset.

      Returns:
          dict
@@ -229,12 +338,18 @@ def describe_vector(source_dataset_path, scheme):
      vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR)
      layer = vector.GetLayer()
      fields = []
-     description['n_features'] = layer.GetFeatureCount()
      for fld in layer.schema:
          fields.append(
              models.FieldSchema(name=fld.name, type=fld.GetTypeName()))
+     layer_schema = models.LayerSchema(
+         name=layer.GetName(),
+         n_features=layer.GetFeatureCount(),
+         table=models.TableSchema(fields=fields),
+         gdal_metadata=layer.GetMetadata())
+     description['data_model'] = models.VectorSchema(
+         layers=[layer_schema],
+         gdal_metadata=vector.GetMetadata())
      vector = layer = None
-     description['data_model'] = models.TableSchema(fields=fields)

      info = pygeoprocessing.get_vector_info(source_dataset_path)
      bbox = models.BoundingBox(*info['bounding_box'])
@@ -248,33 +363,59 @@ def describe_vector(source_dataset_path, scheme):
      return description


- def describe_raster(source_dataset_path, scheme):
+ def describe_raster(source_dataset_path, scheme, **kwargs):
      """Describe properties of a GDAL raster file.

      Args:
          source_dataset_path (str): path to a GDAL raster.
+         scheme (str): the protocol prefix of the filepath
+         kwargs (dict): additional options when describing a dataset:
+             * ``'compute_stats'`` (bool): whether to compute statistics
+               for each band in the raster. Default is False.

      Returns:
          dict

      """
+     compute_stats = kwargs.get('compute_stats', False)
      description = describe_file(source_dataset_path, scheme)
      if 'http' in scheme:
          source_dataset_path = f'/vsicurl/{source_dataset_path}'
      info = pygeoprocessing.get_raster_info(source_dataset_path)
+     raster = gdal.OpenEx(source_dataset_path)
+     raster_gdal_metadata = raster.GetMetadata()
      bands = []
      for i in range(info['n_bands']):
          b = i + 1
+         band = raster.GetRasterBand(b)
+         if compute_stats:
+             try:
+                 # 0=do not approximate stats, 1=calculate if they don't exist
+                 # If exact stats exist they will be retrieved without
+                 # computing them, otherwise, this forces computation.
+                 # https://github.com/OSGeo/gdal/blob/master/gcore/gdalrasterband.cpp
+                 _ = band.GetStatistics(0, 1)
+             except RuntimeError as e:
+                 LOGGER.warning(
+                     f'Could not compute statistics for band {b} of '
+                     f'{source_dataset_path}: {e}')
+         band_gdal_metadata = band.GetMetadata()
+
          bands.append(models.BandSchema(
              index=b,
              gdal_type=gdal.GetDataTypeName(info['datatype']),
              numpy_type=numpy.dtype(info['numpy_type']).name,
-             nodata=info['nodata'][i]))
+             nodata=info['nodata'][i],
+             gdal_metadata=band_gdal_metadata))
+         band = None
+     raster = None
+
      description['data_model'] = models.RasterSchema(
          bands=bands,
          pixel_size=info['pixel_size'],
          raster_size={'width': info['raster_size'][0],
-                      'height': info['raster_size'][1]})
+                      'height': info['raster_size'][1]},
+         gdal_metadata=raster_gdal_metadata)
      # Some values of raster info are numpy types, which the
      # yaml dumper doesn't know how to represent.
      bbox = models.BoundingBox(*[float(x) for x in info['bounding_box']])
@@ -288,12 +429,13 @@ def describe_raster(source_dataset_path, scheme):
      return description


- def describe_table(source_dataset_path, scheme):
+ def describe_table(source_dataset_path, scheme, **kwargs):
      """Describe properties of a tabular dataset.

      Args:
          source_dataset_path (str): path to a file representing a table.
          scheme (str): the protocol prefix of the filepath
+         kwargs (dict): additional options when describing a dataset.

      Returns:
          dict
@@ -305,7 +447,139 @@ def describe_table(source_dataset_path, scheme):
      return description


- DESRCIBE_FUNCS = {
+ def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max,
+                         exclude_regex=None, exclude_hidden=True,
+                         describe_files=False, backup=True, **kwargs):
+     """Create a single metadata document to describe a collection of files.
+
+     Describe all the files within a directory as members of a "collection".
+     The resulting metadata resource should include a list of all the files
+     included in the collection along with a description and metadata filepath
+     (or placeholder). Optionally create individual metadata files for each
+     supported file in a directory.
+
+     Args:
+         directory (str): path to collection
+         depth (int, optional): maximum number of subdirectory levels to
+             traverse when walking through ``directory`` to find files included
+             in the collection. A value of 1 limits the walk to files in the
+             top-level ``directory`` only. A value of 2 allows descending into
+             immediate subdirectories, etc. All files in all subdirectories in
+             the collection will be included by default.
+         exclude_regex (str, optional): a regular expression to pattern-match
+             any files you do not want included in the output metadata yml.
+         exclude_hidden (bool, default True): whether to exclude hidden files
+             (files that start with ".").
+         describe_files (bool, default False): whether to ``describe`` all
+             files, i.e., create individual metadata files for each supported
+             resource in the collection.
+         backup (bool): whether to write a backup of a pre-existing metadata
+             file before overwriting it in cases where that file is not a valid
+             geometamaker document.
+         kwargs (dict): optional keyword arguments accepted by ``describe``.
+
+     Returns:
+         Collection metadata
+     """
+     directory = str(Path(directory).resolve())
+
+     file_list = _list_files_with_depth(directory, depth, exclude_regex,
+                                        exclude_hidden)
+
+     root_ext_map, root_list = _group_files_by_root(file_list)
+
+     items = []
+
+     for root in root_list:
+         extensions = root_ext_map[root]
+         if '.shp' in extensions:
+             # if we're dealing with a shapefile, we do not want to describe any
+             # of these other files with the same root name
+             extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf', '.cpg'])
+         # Only drop .yml if it's a sidecar file, i.e. the corresponding data file
+         # (root) exists on disk
+         if '.yml' in extensions and os.path.exists(root):
+             extensions.discard('.yml')
+         for ext in extensions:
+             filepath = os.path.join(directory, f'{root}{ext}')
+             try:
+                 this_desc = describe(filepath, **kwargs)
+             except ValueError:
+                 # if file type isn't supported by geometamaker, e.g. pdf
+                 # or if trying to describe a dir
+                 this_desc = None
+
+             if describe_files and this_desc:
+                 this_desc.write(backup=backup)
+
+             if ext and os.path.exists(filepath + '.yml'):
+                 metadata_yml = f'{root}{ext}' + '.yml'
+             else:
+                 metadata_yml = ''
+
+             this_resource = models.CollectionItemSchema(
+                 path=f'{root}{ext}',
+                 description=this_desc.description if this_desc else '',
+                 metadata=metadata_yml
+             )
+             items.append(this_resource)
+
+     total_bytes, last_modified, uid = _get_collection_size_time_uid(directory)
+
+     resource = models.CollectionResource(
+         path=directory,
+         type='collection',
+         format='directory',
+         scheme=fsspec.utils.get_protocol(directory),
+         bytes=total_bytes,
+         last_modified=last_modified,
+         items=items,
+         uid=uid
+     )
+
+     # Check if there is existing metadata for the collection
+     try:
+         metadata_path = f'{directory}-metadata.yml'
+         existing_metadata = models.CollectionResource.load(metadata_path)
+
+         # Copy any existing item descriptions from existing yml to new metadata
+         # Note that descriptions in individual resources' ymls will take
+         # priority over item descriptions from preexisting collection metadata
+         for item in resource.items:
+             # Existing metadata's item desc will overwrite new metadata item
+             # desc if new item desc is ''
+             existing_item_desc = [
+                 i.description for i in existing_metadata.items if (
+                     i.path == item.path)]
+             if item.description == '' and len(existing_item_desc) > 0:
+                 item.description = existing_item_desc[0]
+
+         # Replace fields in existing yml if new metadata has existing value
+         resource = existing_metadata.replace(resource)
+
+     except (ValueError, ValidationError) as error:
+         LOGGER.warning(error)
+         LOGGER.warning(
+             f'Ignoring an existing YAML document: {metadata_path} because it'
+             f' is invalid or incompatible.')
+         LOGGER.warning(
+             'A subsequent call to `.write()` will replace this file, but it'
+             f' will be backed up to {metadata_path}.bak.\n'
+             f'Use `.write(backup=False)` to skip the backup.\n',
+             extra=_LOG_EXTRA_NOT_FOR_CLI)
+         resource._would_overwrite = True
+
+     except FileNotFoundError:
+         pass
+
+     # Add profile metadata
+     config = Config()
+     resource = resource.replace(config.profile)
+
+     return resource
+
+
+ DESCRIBE_FUNCS = {
      'archive': describe_archive,
      'table': describe_table,
      'vector': describe_vector,
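A minimal usage sketch of the new describe_collection API (the 'data' directory, depth, and regex are hypothetical; the collection's .write() behavior is inferred from the log messages and the '<directory>-metadata.yml' convention above):

import geometamaker

# Describe everything under 'data' at most two directory levels deep,
# skip README files, and also write an individual sidecar .yml for each
# supported resource in the collection.
collection = geometamaker.describe_collection(
    'data', depth=2, exclude_regex=r'README', describe_files=True)
print([item.path for item in collection.items])
collection.write()  # expected to persist the '<directory>-metadata.yml' document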
@@ -321,7 +595,7 @@ RESOURCE_MODELS = {


  @_osgeo_use_exceptions
- def describe(source_dataset_path, profile=None):
+ def describe(source_dataset_path, compute_stats=False):
      """Create a metadata resource instance with properties of the dataset.

      Properties of the dataset are used to populate as many metadata
@@ -331,20 +605,23 @@ def describe(source_dataset_path, profile=None):
      Args:
          source_dataset_path (string): path or URL to dataset to which the
              metadata applies
-         profile (geometamaker.models.Profile): a profile object from
-             which to populate some metadata attributes
+         compute_stats (bool): whether to compute statistics
+             for each band in a raster.

      Returns:
          geometamaker.models.Resource: a metadata object

      """
-     config = Config()
-     user_profile = config.profile
-     if profile is not None:
-         user_profile = user_profile.replace(profile)

      metadata_path = f'{source_dataset_path}.yml'

+     if os.path.isdir(source_dataset_path):
+         raise ValueError(
+             f"Cannot `describe` {source_dataset_path} as it is a directory, "
+             "not a dataset. \nIf you are trying to create metadata for the "
+             "files within a directory and/or the directory itself, please use "
+             "`geometamaker.describe_collection` instead.")
+
      # Despite naming, this does not open a file that must be closed
      of = fsspec.open(source_dataset_path)
      if not of.fs.exists(source_dataset_path):
@@ -356,58 +633,63 @@ def describe(source_dataset_path, profile=None):
              f'Cannot describe {source_dataset_path}. {protocol} '
              f'is not one of the suppored file protocols: {PROTOCOLS}')
      resource_type = detect_file_type(source_dataset_path, protocol)
-     description = DESRCIBE_FUNCS[resource_type](
-         source_dataset_path, protocol)
+     description = DESCRIBE_FUNCS[resource_type](
+         source_dataset_path, protocol, compute_stats=compute_stats)
      description['type'] = resource_type
+     resource = RESOURCE_MODELS[resource_type](**description)

      # Load existing metadata file
      try:
+         # For the data model, use heuristic to decide if the new resource
+         # should inherit values from the existing resource.
+         # After that, take all non-empty values from the new resource
+         # and update the existing resource.
          existing_resource = RESOURCE_MODELS[resource_type].load(metadata_path)
-         if 'data_model' in description:
-             if isinstance(description['data_model'], models.RasterSchema):
-                 # If existing band metadata still matches data_model of the file
-                 # carry over existing metadata because it could include
-                 # human-defined properties.
-                 new_bands = []
-                 for band in description['data_model'].bands:
-                     try:
-                         eband = existing_resource.get_band_description(band.index)
-                         # TODO: rewrite this as __eq__ of BandSchema?
-                         if (band.numpy_type, band.gdal_type, band.nodata) == (
-                                 eband.numpy_type, eband.gdal_type, eband.nodata):
-                             updated_dict = band.model_dump() | eband.model_dump()
-                             band = models.BandSchema(**updated_dict)
-                     except IndexError:
-                         pass
-                     new_bands.append(band)
-                 description['data_model'].bands = new_bands
-             if isinstance(description['data_model'], models.TableSchema):
-                 # If existing field metadata still matches data_model of the file
-                 # carry over existing metadata because it could include
-                 # human-defined properties.
-                 new_fields = []
-                 for field in description['data_model'].fields:
-                     try:
-                         efield = existing_resource.get_field_description(
-                             field.name)
-                         # TODO: rewrite this as __eq__ of FieldSchema?
-                         if field.type == efield.type:
-                             updated_dict = field.model_dump() | efield.model_dump()
-                             field = models.FieldSchema(**updated_dict)
-                     except KeyError:
-                         pass
-                     new_fields.append(field)
-                 description['data_model'].fields = new_fields
-         # overwrite properties that are intrinsic to the dataset
-         updated_dict = existing_resource.model_dump() | description
-         resource = RESOURCE_MODELS[resource_type](**updated_dict)
-
-     # Common path: metadata file does not already exist
-     # Or less common, ValueError if it exists but is incompatible
+         if resource_type == 'raster':
+             for band in resource.data_model.bands:
+                 try:
+                     eband = existing_resource.get_band_description(band.index)
+                 except IndexError:
+                     continue
+                 if (band.numpy_type, band.gdal_type, band.nodata) == (
+                         eband.numpy_type, eband.gdal_type, eband.nodata):
+                     resource.set_band_description(
+                         band.index,
+                         title=eband.title,
+                         description=eband.description,
+                         units=eband.units)
+         if resource_type in ('vector', 'table'):
+             for field in resource._get_fields():
+                 try:
+                     efield = existing_resource.get_field_description(field.name)
+                 except KeyError:
+                     continue
+                 if field.type == efield.type:
+                     resource.set_field_description(
+                         field.name,
+                         title=efield.title,
+                         description=efield.description,
+                         units=efield.units)
+         resource = existing_resource.replace(resource)
+
+     except (ValueError, ValidationError) as error:
+         LOGGER.warning(error)
+         LOGGER.warning(
+             f'Ignoring an existing YAML document: {metadata_path} because it'
+             f' is invalid or incompatible.')
+         LOGGER.warning(
+             'A subsequent call to `.write()` will replace this file, but it'
+             f' will be backed up to {metadata_path}.bak.\n'
+             f'Use `.write(backup=False)` to skip the backup.\n',
+             extra=_LOG_EXTRA_NOT_FOR_CLI)
+         resource._would_overwrite = True
+
      except FileNotFoundError:
-         resource = RESOURCE_MODELS[resource_type](**description)
+         # Common path: metadata file does not already exist
+         pass

-     resource = resource.replace(user_profile)
+     config = Config()
+     resource = resource.replace(config.profile)
      return resource


@@ -442,30 +724,20 @@ def validate(filepath):
      return error


- def validate_dir(directory, recursive=False):
+ def validate_dir(directory, depth=numpy.iinfo(numpy.int16).max):
      """Validate all compatible yml documents in the directory.

      Args:
          directory (string): path to a directory
-         recursive (bool): whether or not to describe files
-             in all subdirectories
+         depth (int): maximum number of subdirectory levels to
+             traverse when walking through ``directory``.

      Returns:
          tuple (list, list): a list of the filepaths that were validated and
              an equal-length list of the validation messages.

      """
-     file_list = []
-     if recursive:
-         for path, dirs, files in os.walk(directory):
-             for file in files:
-                 file_list.append(os.path.join(path, file))
-     else:
-         file_list.extend(
-             [os.path.join(directory, path)
-              for path in os.listdir(directory)
-              if os.path.isfile(os.path.join(directory, path))])
-
+     file_list = _list_files_with_depth(directory, depth)
      messages = []
      yaml_files = []
      for filepath in file_list:
@@ -473,7 +745,7 @@ def validate_dir(directory, recursive=False):
              yaml_files.append(filepath)
              msg = ''
              try:
-                 error = validate(filepath)
+                 error = validate(os.path.join(directory, filepath))
                  if error:
                      msg = error
              except ValueError:
@@ -484,48 +756,3 @@ def validate_dir(directory, recursive=False):
              messages.append(msg)

      return (yaml_files, messages)
-
-
- def describe_dir(directory, recursive=False):
-     """Describe all compatible datasets in the directory.
-
-     Take special care to only describe multifile datasets,
-     such as ESRI Shapefiles, one time.
-
-     Args:
-         directory (string): path to a directory
-         recursive (bool): whether or not to describe files
-             in all subdirectories
-
-     Returns:
-         None
-
-     """
-     root_set = set()
-     root_ext_map = defaultdict(set)
-     for path, dirs, files in os.walk(directory):
-         for file in files:
-             full_path = os.path.join(path, file)
-             root, ext = os.path.splitext(full_path)
-             # tracking which files share a root name
-             # so we can check if these comprise a shapefile
-             root_ext_map[root].add(ext)
-             root_set.add(root)
-         if not recursive:
-             break
-
-     for root in root_set:
-         extensions = root_ext_map[root]
-         if '.shp' in extensions:
-             # if we're dealing with a shapefile, we do not want to describe any
-             # of these other files with the same root name
-             extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf'])
-         for ext in extensions:
-             filepath = f'{root}{ext}'
-             try:
-                 resource = describe(filepath)
-             except ValueError as error:
-                 LOGGER.debug(error)
-                 continue
-             resource.write()
-             LOGGER.info(f'{filepath} described')
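Taken together, these hunks change the public API: describe_dir is removed in favor of describe_collection, describe trades its profile argument for compute_stats (the user profile now always comes from Config), and validate_dir replaces recursive with a depth limit. A hedged migration sketch, with hypothetical paths:

import geometamaker

# 0.1.1: geometamaker.describe_dir('data', recursive=True)
# 0.2.0:
geometamaker.describe_collection('data', describe_files=True).write()

# 0.1.1: geometamaker.describe('data/dem.tif', profile=my_profile)
# 0.2.0: the profile is read from Config; compute_stats is new.
resource = geometamaker.describe('data/dem.tif', compute_stats=True)
resource.write()

# 0.1.1: geometamaker.validate_dir('data', recursive=True)
# 0.2.0:
yaml_files, messages = geometamaker.validate_dir('data', depth=2)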