geometamaker 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geometamaker/__init__.py +2 -2
- geometamaker/cli.py +137 -31
- geometamaker/config.py +3 -4
- geometamaker/geometamaker.py +374 -132
- geometamaker/models.py +317 -114
- {geometamaker-0.1.2.dist-info → geometamaker-0.2.1.dist-info}/METADATA +34 -44
- geometamaker-0.2.1.dist-info/RECORD +12 -0
- {geometamaker-0.1.2.dist-info → geometamaker-0.2.1.dist-info}/WHEEL +1 -1
- geometamaker-0.1.2.dist-info/RECORD +0 -12
- {geometamaker-0.1.2.dist-info → geometamaker-0.2.1.dist-info}/entry_points.txt +0 -0
- {geometamaker-0.1.2.dist-info → geometamaker-0.2.1.dist-info/licenses}/LICENSE.txt +0 -0
- {geometamaker-0.1.2.dist-info → geometamaker-0.2.1.dist-info}/top_level.txt +0 -0
geometamaker/geometamaker.py
CHANGED
@@ -2,6 +2,7 @@ import functools
 import hashlib
 import logging
 import os
+import re
 import requests
 from collections import defaultdict
 from datetime import datetime, timezone
@@ -13,13 +14,20 @@ import pygeoprocessing
 import yaml
 from osgeo import gdal
 from osgeo import osr
+from pathlib import Path
 from pydantic import ValidationError
+import tarfile

 from . import models
 from .config import Config

+logging.getLogger('chardet').setLevel(logging.INFO)  # DEBUG is just too noisy

-LOGGER = logging.getLogger(
+LOGGER = logging.getLogger('geometamaker')
+_NOT_FOR_CLI = 'not_for_cli'
+_LOG_EXTRA_NOT_FOR_CLI = {
+    _NOT_FOR_CLI: True
+}

 # URI schemes we support. A subset of fsspec.available_protocols()
 PROTOCOLS = [
@@ -31,6 +39,12 @@ PROTOCOLS = [
 DT_FMT = '%Y-%m-%d %H:%M:%S %Z'


+def _gdal_progress_callback(complete, message, data):
+    percentage = complete * 100
+    if percentage > 0 and percentage % 5 == 0:
+        LOGGER.info(f'{message} {percentage}%')
+
+
 # TODO: In the future we can remove these exception managers in favor of the
 # builtin gdal.ExceptionMgr. It was released in 3.7.0 and debugged in 3.9.1.
 # https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md#gdalogr-391-release-notes
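Note: the new `_gdal_progress_callback` follows GDAL's standard progress-callback protocol, in which GDAL repeatedly invokes the callback with `complete` in [0.0, 1.0], an optional message, and opaque user data. Because `complete * 100` is a float, the `% 5 == 0` gate only fires when GDAL reports an exact multiple of 5%. A minimal, self-contained sketch of that protocol against an in-memory raster (all names below are illustrative, not part of geometamaker):

    import numpy
    from osgeo import gdal

    def progress(complete, message, data):
        # GDAL calls this repeatedly; `complete` runs from 0.0 to 1.0.
        print(f'{message or "working"}: {complete * 100:.0f}%')

    # A small in-memory raster keeps the sketch runnable anywhere.
    ds = gdal.GetDriverByName('MEM').Create('', 256, 256, 1, gdal.GDT_Float32)
    ds.GetRasterBand(1).WriteArray(numpy.random.rand(256, 256))

    # ComputeStatistics accepts the same (complete, message, data) callback.
    stats = ds.GetRasterBand(1).ComputeStatistics(0, callback=progress)
    print(stats)  # [min, max, mean, stddev]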
@@ -103,6 +117,80 @@ def _wkt_to_epsg_units_string(wkt_string):
     return crs_string, units_string


+def _list_files_with_depth(directory, depth, exclude_regex=None,
+                           exclude_hidden=True):
+    """List files in a directory, up to a given depth.
+
+    Args:
+        directory (string): path to a directory
+        depth (int): maximum number of subdirectory levels to traverse when
+            walking through a directory. A value of 1 limits the walk to files
+            in the top-level ``directory`` only. A value of 2 allows
+            descending into immediate subdirectories, etc.
+        exclude_regex (str, optional): a regular expression to pattern-match
+            any files for which you do not want to create metadata.
+        exclude_hidden (bool, default True): whether to ignore hidden files
+
+    Returns:
+        list of relative filepaths in ``directory``
+
+    """
+    directory = Path(directory).resolve()
+    file_list = []
+
+    for path in directory.rglob("*"):
+        relative_path = path.relative_to(directory)
+        current_depth = len(relative_path.parts)
+        if current_depth > depth:
+            continue
+        if exclude_hidden and (
+                any(part.startswith('.') for part in relative_path.parts)):
+            continue
+        file_list.append(str(relative_path))
+
+    # remove excluded files based on regex
+    if exclude_regex is not None:
+        file_list = [f for f in file_list if not re.search(exclude_regex, f)]
+
+    return sorted(file_list)
+
+
+def _group_files_by_root(file_list):
+    """Get set of files (roots) and extensions by filename."""
+    root_set = set()
+    root_ext_map = defaultdict(set)
+    for filepath in file_list:
+        root, ext = os.path.splitext(filepath)
+        # tracking which files share a root name
+        # so we can check if these comprise a shapefile
+        root_ext_map[root].add(ext)
+        root_set.add(root)
+    return root_ext_map, sorted(list(root_set))
+
+
+def _get_collection_size_time_uid(directory):
+    """Get size of directory (in bytes), when it was last modified, and uid."""
+    total_bytes = 0
+    latest_mtime = 0
+
+    for root, _, files in os.walk(directory):
+        for file in files:
+            file_path = os.path.join(root, file)
+            stat = os.stat(file_path)
+            total_bytes += stat.st_size
+            latest_mtime = max(latest_mtime, stat.st_mtime)
+
+    last_modified = datetime.fromtimestamp(latest_mtime, tz=timezone.utc)
+    last_modified_str = last_modified.strftime('%Y-%m-%d %H:%M:%S %Z')
+
+    hash_func = hashlib.sha256()
+    hash_func.update(
+        f'{total_bytes}{last_modified_str}{directory}'.encode('utf-8'))
+    uid = f'sizetimestamp:{hash_func.hexdigest()}'
+
+    return total_bytes, last_modified_str, uid
+
+
 def detect_file_type(filepath, scheme):
     """Detect the type of resource contained in the file.

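Note: the `sizetimestamp` uid produced by `_get_collection_size_time_uid` (and by `describe_file` below) is just a sha256 digest over the concatenated total size, last-modified string, and path, so it changes whenever any member file changes size or mtime. A standalone sketch with made-up values:

    import hashlib

    total_bytes = 1024                          # hypothetical values
    last_modified = '2025-01-01 00:00:00 UTC'
    directory = '/data/my_collection'

    digest = hashlib.sha256(
        f'{total_bytes}{last_modified}{directory}'.encode('utf-8')).hexdigest()
    print(f'sizetimestamp:{digest}')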
@@ -119,13 +207,16 @@ def detect_file_type(filepath, scheme):
     """
     # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters.
     # We'll likely want a different data model for multi-dimensional arrays.
-
     # Frictionless supports a wide range of formats. The quickest way to
     # determine if a file is recognized as a table or archive is to call list.
-    info = frictionless.list(filepath)[0]
+    try:
+        info = frictionless.list(filepath)[0]
+    except frictionless.FrictionlessException:
+        raise RuntimeError(f'Cannot detect file type of "{filepath}"')
     if info.type == 'table':
         return 'table'
-    if info.compression:
+    # Frictionless doesn't recognize .tgz compression (but does recognize .tar.gz)
+    if info.compression or info.format == "tgz":
         return 'archive'
     # GDAL considers CSV a vector, so check against frictionless first.
     try:
@@ -177,7 +268,7 @@ def describe_file(source_dataset_path, scheme):
     hash_func = hashlib.new('sha256')
     hash_func.update(
         f'{description["bytes"]}{description["last_modified"]}\
-{description["path"]}'.encode('
+{description["path"]}'.encode('utf-8'))
     description['uid'] = f'sizetimestamp:{hash_func.hexdigest()}'

     # We don't have a use for including these attributes in our metadata:
@@ -186,37 +277,64 @@ def describe_file(source_dataset_path, scheme):
     return description


-def describe_archive(source_dataset_path, scheme):
+def describe_archive(source_dataset_path, scheme, **kwargs):
     """Describe file properties of a compressed file.

     Args:
         source_dataset_path (str): path to a file.
         scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset.

     Returns:
         dict

     """
+    def _list_tgz_contents(path):
+        """List contents of a .tar, .tgz, or .tar.gz archive."""
+        file_list = []
+        with fsspec.open(path, 'rb') as fobj:
+            with tarfile.open(fileobj=fobj, mode='r:*') as tar:
+                file_list = [member.name for member in tar.getmembers()
+                             if member.isfile()]
+        return file_list
+
+    def _list_zip_contents(path):
+        """List contents of a zip archive."""
+        file_list = []
+        ZFS = fsspec.get_filesystem_class('zip')
+        zfs = ZFS(path)
+        for dirpath, _, files in zfs.walk(zfs.root_marker):
+            for f in files:
+                file_list.append(os.path.join(dirpath, f))
+        return file_list
+
     description = describe_file(source_dataset_path, scheme)
     # innerpath is from frictionless and not useful because
     # it does not include all the files contained in the zip
     description.pop('innerpath', None)

-    [6 lines removed; content not rendered in this diff view]
+    if description.get("compression") == "zip":
+        file_list = _list_zip_contents(source_dataset_path)
+    elif description.get("format") in ["tgz", "tar"]:
+        file_list = _list_tgz_contents(source_dataset_path)
+        # 'compression' attr not auto-added by frictionless.describe for .tgz
+        # (but IS added for .tar.gz)
+        if source_dataset_path.endswith(".tgz"):
+            description["compression"] = "gz"
+    else:
+        raise ValueError(f"Unsupported archive format: {source_dataset_path}")
+
     description['sources'] = file_list
     return description


-def describe_vector(source_dataset_path, scheme):
+def describe_vector(source_dataset_path, scheme, **kwargs):
     """Describe properties of a GDAL vector file.

     Args:
         source_dataset_path (str): path to a GDAL vector.
+        scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset.

     Returns:
         dict
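Note: the tar branch above leans on `tarfile`'s `'r:*'` mode to auto-detect compression, which is why one helper covers `.tar`, `.tgz`, and `.tar.gz` alike. A self-contained sketch of the same approach (`data.tgz` is a hypothetical path):

    import tarfile

    import fsspec

    def list_tar_members(path):
        # 'r:*' lets tarfile detect gzip/bz2/xz compression automatically.
        with fsspec.open(path, 'rb') as fobj:
            with tarfile.open(fileobj=fobj, mode='r:*') as tar:
                return [m.name for m in tar.getmembers() if m.isfile()]

    # print(list_tar_members('data.tgz'))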
@@ -229,12 +347,18 @@ def describe_vector(source_dataset_path, scheme):
     vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR)
     layer = vector.GetLayer()
     fields = []
-    description['n_features'] = layer.GetFeatureCount()
     for fld in layer.schema:
         fields.append(
             models.FieldSchema(name=fld.name, type=fld.GetTypeName()))
+    layer_schema = models.LayerSchema(
+        name=layer.GetName(),
+        n_features=layer.GetFeatureCount(),
+        table=models.TableSchema(fields=fields),
+        gdal_metadata=layer.GetMetadata())
+    description['data_model'] = models.VectorSchema(
+        layers=[layer_schema],
+        gdal_metadata=vector.GetMetadata())
     vector = layer = None
-    description['data_model'] = models.TableSchema(fields=fields)

     info = pygeoprocessing.get_vector_info(source_dataset_path)
     bbox = models.BoundingBox(*info['bounding_box'])
@@ -248,33 +372,65 @@ def describe_vector(source_dataset_path, scheme):
     return description


-def describe_raster(source_dataset_path, scheme):
+def describe_raster(source_dataset_path, scheme, **kwargs):
     """Describe properties of a GDAL raster file.

     Args:
         source_dataset_path (str): path to a GDAL raster.
+        scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset:
+            * ``'compute_stats'`` (bool): whether to compute statistics
+              for each band in the raster. Default is False.

     Returns:
         dict

     """
+    compute_stats = kwargs.get('compute_stats', False)
     description = describe_file(source_dataset_path, scheme)
     if 'http' in scheme:
         source_dataset_path = f'/vsicurl/{source_dataset_path}'
     info = pygeoprocessing.get_raster_info(source_dataset_path)
+    raster = gdal.OpenEx(source_dataset_path)
+    raster_gdal_metadata = raster.GetMetadata()
     bands = []
     for i in range(info['n_bands']):
         b = i + 1
+        band = raster.GetRasterBand(b)
+        band_gdal_metadata = band.GetMetadata()
+        if compute_stats:
+            try:
+                if 'STATISTICS_VALID_PERCENT' not in band_gdal_metadata:
+                    # Sometimes some stats exist, but not all. If this one doesn't,
+                    # it's important enough that we want to force computation.
+                    _ = band.ComputeStatistics(0, callback=_gdal_progress_callback)
+                else:
+                    # 0=do not approximate stats, 1=calculate if they don't exist
+                    # If exact stats exist they will be retrieved without
+                    # computing them, otherwise, this forces computation.
+                    # https://github.com/OSGeo/gdal/blob/master/gcore/gdalrasterband.cpp
+                    _ = band.GetStatistics(0, 1)
+                band_gdal_metadata = band.GetMetadata()
+            except RuntimeError as e:
+                LOGGER.warning(
+                    f'Could not compute statistics for band {b} of '
+                    f'{source_dataset_path}: {e}')
+
         bands.append(models.BandSchema(
             index=b,
             gdal_type=gdal.GetDataTypeName(info['datatype']),
             numpy_type=numpy.dtype(info['numpy_type']).name,
-            nodata=info['nodata'][i]))
+            nodata=info['nodata'][i],
+            gdal_metadata=band_gdal_metadata))
+    band = None
+    raster = None
+
     description['data_model'] = models.RasterSchema(
         bands=bands,
         pixel_size=info['pixel_size'],
         raster_size={'width': info['raster_size'][0],
-                     'height': info['raster_size'][1]})
+                     'height': info['raster_size'][1]},
+        gdal_metadata=raster_gdal_metadata)
     # Some values of raster info are numpy types, which the
     # yaml dumper doesn't know how to represent.
     bbox = models.BoundingBox(*[float(x) for x in info['bounding_box']])
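Note: with `compute_stats=True`, band statistics end up in each band's `gdal_metadata` under GDAL's `STATISTICS_*` keys. A usage sketch, assuming a raster at the hypothetical path `dem.tif` and the `get_band_description` accessor this diff uses elsewhere:

    import geometamaker

    resource = geometamaker.describe('dem.tif', compute_stats=True)
    band = resource.get_band_description(1)
    print(band.gdal_metadata.get('STATISTICS_MEAN'))
    print(band.gdal_metadata.get('STATISTICS_VALID_PERCENT'))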
@@ -288,12 +444,13 @@ def describe_raster(source_dataset_path, scheme):
     return description


-def describe_table(source_dataset_path, scheme):
+def describe_table(source_dataset_path, scheme, **kwargs):
     """Describe properties of a tabular dataset.

     Args:
         source_dataset_path (str): path to a file representing a table.
         scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset.

     Returns:
         dict
@@ -305,7 +462,139 @@ def describe_table(source_dataset_path, scheme):
     return description


-DESCRIBE_FUNCS = {
+def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max,
+                        exclude_regex=None, exclude_hidden=True,
+                        describe_files=False, backup=True, **kwargs):
+    """Create a single metadata document to describe a collection of files.
+
+    Describe all the files within a directory as members of a "collection".
+    The resulting metadata resource should include a list of all the files
+    included in the collection along with a description and metadata filepath
+    (or placeholder). Optionally create individual metadata files for each
+    supported file in a directory.
+
+    Args:
+        directory (str): path to collection
+        depth (int, optional): maximum number of subdirectory levels to
+            traverse when walking through ``directory`` to find files included
+            in the collection. A value of 1 limits the walk to files in the
+            top-level ``directory`` only. A value of 2 allows descending into
+            immediate subdirectories, etc. All files in all subdirectories in
+            the collection will be included by default.
+        exclude_regex (str, optional): a regular expression to pattern-match
+            any files you do not want included in the output metadata yml.
+        exclude_hidden (bool, default True): whether to exclude hidden files
+            (files that start with ".").
+        describe_files (bool, default False): whether to ``describe`` all
+            files, i.e., create individual metadata files for each supported
+            resource in the collection.
+        backup (bool): whether to write a backup of a pre-existing metadata
+            file before overwriting it in cases where that file is not a valid
+            geometamaker document.
+        kwargs (dict): optional keyword arguments accepted by ``describe``.
+
+    Returns:
+        Collection metadata
+    """
+    directory = str(Path(directory).resolve())
+
+    file_list = _list_files_with_depth(directory, depth, exclude_regex,
+                                       exclude_hidden)
+
+    root_ext_map, root_list = _group_files_by_root(file_list)
+
+    items = []
+
+    for root in root_list:
+        extensions = root_ext_map[root]
+        if '.shp' in extensions:
+            # if we're dealing with a shapefile, we do not want to describe any
+            # of these other files with the same root name
+            extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf', '.cpg'])
+        # Only drop .yml if it's a sidecar file, i.e. the corresponding data file
+        # (root) exists on disk
+        if '.yml' in extensions and os.path.exists(root):
+            extensions.discard('.yml')
+        for ext in extensions:
+            filepath = os.path.join(directory, f'{root}{ext}')
+            try:
+                this_desc = describe(filepath, **kwargs)
+            except ValueError:
+                # if file type isn't supported by geometamaker, e.g. pdf
+                # or if trying to describe a dir
+                this_desc = None
+
+            if describe_files and this_desc:
+                this_desc.write(backup=backup)
+
+            if ext and os.path.exists(filepath + '.yml'):
+                metadata_yml = f'{root}{ext}' + '.yml'
+            else:
+                metadata_yml = ''
+
+            this_resource = models.CollectionItemSchema(
+                path=f'{root}{ext}',
+                description=this_desc.description if this_desc else '',
+                metadata=metadata_yml
+            )
+            items.append(this_resource)
+
+    total_bytes, last_modified, uid = _get_collection_size_time_uid(directory)
+
+    resource = models.CollectionResource(
+        path=directory,
+        type='collection',
+        format='directory',
+        scheme=fsspec.utils.get_protocol(directory),
+        bytes=total_bytes,
+        last_modified=last_modified,
+        items=items,
+        uid=uid
+    )
+
+    # Check if there is existing metadata for the collection
+    try:
+        metadata_path = f'{directory}-metadata.yml'
+        existing_metadata = models.CollectionResource.load(metadata_path)
+
+        # Copy any existing item descriptions from existing yml to new metadata
+        # Note that descriptions in individual resources' ymls will take
+        # priority over item descriptions from preexisting collection metadata
+        for item in resource.items:
+            # Existing metadata's item desc will overwrite new metadata item
+            # desc if new item desc is ''
+            existing_item_desc = [
+                i.description for i in existing_metadata.items if (
+                    i.path == item.path)]
+            if item.description == '' and len(existing_item_desc) > 0:
+                item.description = existing_item_desc[0]
+
+        # Replace fields in existing yml if new metadata has existing value
+        resource = existing_metadata.replace(resource)
+
+    except (ValueError, ValidationError) as error:
+        LOGGER.warning(error)
+        LOGGER.warning(
+            f'Ignoring an existing YAML document: {metadata_path} because it'
+            f' is invalid or incompatible.')
+        LOGGER.warning(
+            'A subsequent call to `.write()` will replace this file, but it'
+            f' will be backed up to {metadata_path}.bak.\n'
+            f'Use `.write(backup=False)` to skip the backup.\n',
+            extra=_LOG_EXTRA_NOT_FOR_CLI)
+        resource._would_overwrite = True
+
+    except FileNotFoundError:
+        pass
+
+    # Add profile metadata
+    config = Config()
+    resource = resource.replace(config.profile)
+
+    return resource
+
+
+DESCRIBE_FUNCS = {
     'archive': describe_archive,
     'table': describe_table,
     'vector': describe_vector,
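Note: a usage sketch for the new collection API (the directory path and arguments are hypothetical; the `{directory}-metadata.yml` output location is taken from the code above, and `write()` is the same method the warning messages reference):

    import geometamaker

    collection = geometamaker.describe_collection(
        'data/my_collection',
        depth=2,                  # top-level files plus one subdirectory level
        exclude_regex=r'\.log$',  # skip files matching this pattern
        describe_files=True)      # also write a sidecar yml per dataset
    collection.write()            # writes data/my_collection-metadata.yml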
@@ -321,7 +610,7 @@ RESOURCE_MODELS = {


 @_osgeo_use_exceptions
-def describe(source_dataset_path, profile=None):
+def describe(source_dataset_path, compute_stats=False):
     """Create a metadata resource instance with properties of the dataset.

     Properties of the dataset are used to populate as many metadata
@@ -331,20 +620,23 @@ def describe(source_dataset_path, profile=None):
     Args:
         source_dataset_path (string): path or URL to dataset to which the
             metadata applies
-        [2 lines removed; content not rendered in this diff view]
+        compute_stats (bool): whether to compute statistics
+            for each band in a raster.

     Returns:
         geometamaker.models.Resource: a metadata object

     """
-    config = Config()
-    user_profile = config.profile
-    if profile is not None:
-        user_profile = user_profile.replace(profile)

     metadata_path = f'{source_dataset_path}.yml'

+    if os.path.isdir(source_dataset_path):
+        raise ValueError(
+            f"Cannot `describe` {source_dataset_path} as it is a directory, "
+            "not a dataset. \nIf you are trying to create metadata for the "
+            "files within a directory and/or the directory itself, please use "
+            "`geometamaker.describe_collection` instead.")
+
     # Despite naming, this does not open a file that must be closed
     of = fsspec.open(source_dataset_path)
     if not of.fs.exists(source_dataset_path):
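Note: because `describe` now raises `ValueError` for directories, callers that previously pointed it at a folder need to dispatch themselves; a sketch (this helper is hypothetical, not part of geometamaker):

    import os

    import geometamaker

    def describe_any(path, **kwargs):
        # Route directories to the collection API, everything else to describe.
        if os.path.isdir(path):
            return geometamaker.describe_collection(path, **kwargs)
        return geometamaker.describe(path, **kwargs)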
@@ -356,58 +648,63 @@ def describe(source_dataset_path, profile=None):
         f'Cannot describe {source_dataset_path}. {protocol} '
         f'is not one of the supported file protocols: {PROTOCOLS}')
     resource_type = detect_file_type(source_dataset_path, protocol)
-    description = DESCRIBE_FUNCS[resource_type](
-        source_dataset_path, protocol)
+    description = DESCRIBE_FUNCS[resource_type](
+        source_dataset_path, protocol, compute_stats=compute_stats)
     description['type'] = resource_type
+    resource = RESOURCE_MODELS[resource_type](**description)

     # Load existing metadata file
     try:
+        # For the data model, use heuristic to decide if the new resource
+        # should inherit values from the existing resource.
+        # After that, take all non-empty values from the new resource
+        # and update the existing resource.
         existing_resource = RESOURCE_MODELS[resource_type].load(metadata_path)
-        if '
-        [36 lines removed; content not rendered in this diff view]
-        resource =
-
-    # Common path: metadata file does not already exist
-    # Or less common, ValueError if it exists but is incompatible
+        if resource_type == 'raster':
+            for band in resource.data_model.bands:
+                try:
+                    eband = existing_resource.get_band_description(band.index)
+                except IndexError:
+                    continue
+                if (band.numpy_type, band.gdal_type, band.nodata) == (
+                        eband.numpy_type, eband.gdal_type, eband.nodata):
+                    resource.set_band_description(
+                        band.index,
+                        title=eband.title,
+                        description=eband.description,
+                        units=eband.units)
+        if resource_type in ('vector', 'table'):
+            for field in resource._get_fields():
+                try:
+                    efield = existing_resource.get_field_description(field.name)
+                except KeyError:
+                    continue
+                if field.type == efield.type:
+                    resource.set_field_description(
+                        field.name,
+                        title=efield.title,
+                        description=efield.description,
+                        units=efield.units)
+        resource = existing_resource.replace(resource)
+
+    except (ValueError, ValidationError) as error:
+        LOGGER.warning(error)
+        LOGGER.warning(
+            f'Ignoring an existing YAML document: {metadata_path} because it'
+            f' is invalid or incompatible.')
+        LOGGER.warning(
+            'A subsequent call to `.write()` will replace this file, but it'
+            f' will be backed up to {metadata_path}.bak.\n'
+            f'Use `.write(backup=False)` to skip the backup.\n',
+            extra=_LOG_EXTRA_NOT_FOR_CLI)
+        resource._would_overwrite = True

     except FileNotFoundError:
-        pass
+        # Common path: metadata file does not already exist
+        pass

-    [1 line removed; content not rendered in this diff view]
+    config = Config()
+    resource = resource.replace(config.profile)
     return resource
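Note: the practical effect of the inheritance heuristic above is that human-entered titles, descriptions, and units survive a re-describe as long as the band's (or field's) types and nodata are unchanged. A round-trip sketch (`dem.tif` is hypothetical):

    import geometamaker

    resource = geometamaker.describe('dem.tif')
    resource.set_band_description(1, title='Elevation', units='meters')
    resource.write()  # writes dem.tif.yml

    # Later, even after pixel values change:
    resource = geometamaker.describe('dem.tif')
    # 'meters', provided the band's types and nodata still match
    print(resource.get_band_description(1).units)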
@@ -442,30 +739,20 @@ def validate(filepath):
     return error


-def validate_dir(directory, recursive=False):
+def validate_dir(directory, depth=numpy.iinfo(numpy.int16).max):
     """Validate all compatible yml documents in the directory.

     Args:
         directory (string): path to a directory
-        [2 lines removed; content not rendered in this diff view]
+        depth (int): maximum number of subdirectory levels to
+            traverse when walking through ``directory``.

     Returns:
         tuple (list, list): a list of the filepaths that were validated and
            an equal-length list of the validation messages.

     """
-    file_list = []
-    if recursive:
-        for path, dirs, files in os.walk(directory):
-            for file in files:
-                file_list.append(os.path.join(path, file))
-    else:
-        file_list.extend(
-            [os.path.join(directory, path)
-             for path in os.listdir(directory)
-             if os.path.isfile(os.path.join(directory, path))])
-
+    file_list = _list_files_with_depth(directory, depth)
     messages = []
     yaml_files = []
     for filepath in file_list:
@@ -473,7 +760,7 @@ def validate_dir(directory, recursive=False):
         yaml_files.append(filepath)
         msg = ''
         try:
-            error = validate(filepath)
+            error = validate(os.path.join(directory, filepath))
             if error:
                 msg = error
         except ValueError:
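Note: a usage sketch for the reworked `validate_dir`, assuming it is re-exported at the package level as the CLI suggests; the two returned lists are parallel:

    import geometamaker

    yml_files, messages = geometamaker.validate_dir('data/my_collection', depth=1)
    for path, message in zip(yml_files, messages):
        print(path, message or 'valid')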
@@ -484,48 +771,3 @@ def validate_dir(directory, recursive=False):
         messages.append(msg)

     return (yaml_files, messages)
-
-
-def describe_dir(directory, recursive=False):
-    """Describe all compatible datasets in the directory.
-
-    Take special care to only describe multifile datasets,
-    such as ESRI Shapefiles, one time.
-
-    Args:
-        directory (string): path to a directory
-        recursive (bool): whether or not to describe files
-            in all subdirectories
-
-    Returns:
-        None
-
-    """
-    root_set = set()
-    root_ext_map = defaultdict(set)
-    for path, dirs, files in os.walk(directory):
-        for file in files:
-            full_path = os.path.join(path, file)
-            root, ext = os.path.splitext(full_path)
-            # tracking which files share a root name
-            # so we can check if these comprise a shapefile
-            root_ext_map[root].add(ext)
-            root_set.add(root)
-        if not recursive:
-            break
-
-    for root in root_set:
-        extensions = root_ext_map[root]
-        if '.shp' in extensions:
-            # if we're dealing with a shapefile, we do not want to describe any
-            # of these other files with the same root name
-            extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf'])
-        for ext in extensions:
-            filepath = f'{root}{ext}'
-            try:
-                resource = describe(filepath)
-            except ValueError as error:
-                LOGGER.debug(error)
-                continue
-            resource.write()
-            LOGGER.info(f'{filepath} described')