geometamaker 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geometamaker/__init__.py +2 -2
- geometamaker/cli.py +137 -31
- geometamaker/config.py +3 -4
- geometamaker/geometamaker.py +374 -132
- geometamaker/models.py +317 -114
- {geometamaker-0.1.2.dist-info → geometamaker-0.2.1.dist-info}/METADATA +34 -44
- geometamaker-0.2.1.dist-info/RECORD +12 -0
- {geometamaker-0.1.2.dist-info → geometamaker-0.2.1.dist-info}/WHEEL +1 -1
- geometamaker-0.1.2.dist-info/RECORD +0 -12
- {geometamaker-0.1.2.dist-info → geometamaker-0.2.1.dist-info}/entry_points.txt +0 -0
- {geometamaker-0.1.2.dist-info → geometamaker-0.2.1.dist-info/licenses}/LICENSE.txt +0 -0
- {geometamaker-0.1.2.dist-info → geometamaker-0.2.1.dist-info}/top_level.txt +0 -0
geometamaker/geometamaker.py
CHANGED
@@ -2,6 +2,7 @@ import functools
 import hashlib
 import logging
 import os
+import re
 import requests
 from collections import defaultdict
 from datetime import datetime, timezone
@@ -13,13 +14,20 @@ import pygeoprocessing
 import yaml
 from osgeo import gdal
 from osgeo import osr
+from pathlib import Path
 from pydantic import ValidationError
+import tarfile

 from . import models
 from .config import Config

+logging.getLogger('chardet').setLevel(logging.INFO)  # DEBUG is just too noisy

-LOGGER = logging.getLogger(
+LOGGER = logging.getLogger('geometamaker')
+_NOT_FOR_CLI = 'not_for_cli'
+_LOG_EXTRA_NOT_FOR_CLI = {
+    _NOT_FOR_CLI: True
+}

 # URI schemes we support. A subset of fsspec.available_protocols()
 PROTOCOLS = [
@@ -31,6 +39,12 @@ PROTOCOLS = [
 DT_FMT = '%Y-%m-%d %H:%M:%S %Z'


+def _gdal_progress_callback(complete, message, data):
+    percentage = complete * 100
+    if percentage > 0 and percentage % 5 == 0:
+        LOGGER.info(f'{message} {percentage}%')
+
+
 # TODO: In the future we can remove these exception managers in favor of the
 # builtin gdal.ExceptionMgr. It was released in 3.7.0 and debugged in 3.9.1.
 # https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md#gdalogr-391-release-notes
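Note: the new `_gdal_progress_callback` follows GDAL's standard progress-callback protocol, in which GDAL repeatedly invokes the callback with `complete` in [0.0, 1.0], an optional message, and opaque user data. Because `complete * 100` is a float, the `% 5 == 0` gate only fires when GDAL reports an exact multiple of 5%. A minimal, self-contained sketch of that protocol against an in-memory raster (all names below are illustrative, not part of geometamaker):

    import numpy
    from osgeo import gdal

    def progress(complete, message, data):
        # GDAL calls this repeatedly; `complete` runs from 0.0 to 1.0.
        print(f'{message or "working"}: {complete * 100:.0f}%')

    # A small in-memory raster keeps the sketch runnable anywhere.
    ds = gdal.GetDriverByName('MEM').Create('', 256, 256, 1, gdal.GDT_Float32)
    ds.GetRasterBand(1).WriteArray(numpy.random.rand(256, 256))

    # ComputeStatistics accepts the same (complete, message, data) callback.
    stats = ds.GetRasterBand(1).ComputeStatistics(0, callback=progress)
    print(stats)  # [min, max, mean, stddev]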
@@ -103,6 +117,80 @@ def _wkt_to_epsg_units_string(wkt_string):
     return crs_string, units_string


+def _list_files_with_depth(directory, depth, exclude_regex=None,
+                           exclude_hidden=True):
+    """List files in a directory, up to a given depth.
+
+    Args:
+        directory (string): path to a directory
+        depth (int): maximum number of subdirectory levels to traverse when
+            walking through a directory. A value of 1 limits the walk to files
+            in the top-level ``directory`` only. A value of 2 allows
+            descending into immediate subdirectories, etc.
+        exclude_regex (str, optional): a regular expression to pattern-match
+            any files for which you do not want to create metadata.
+        exclude_hidden (bool, default True): whether to ignore hidden files
+
+    Returns:
+        list of relative filepaths in ``directory``
+
+    """
+    directory = Path(directory).resolve()
+    file_list = []
+
+    for path in directory.rglob("*"):
+        relative_path = path.relative_to(directory)
+        current_depth = len(relative_path.parts)
+        if current_depth > depth:
+            continue
+        if exclude_hidden and (
+                any(part.startswith('.') for part in relative_path.parts)):
+            continue
+        file_list.append(str(relative_path))
+
+    # remove excluded files based on regex
+    if exclude_regex is not None:
+        file_list = [f for f in file_list if not re.search(exclude_regex, f)]
+
+    return sorted(file_list)
+
+
+def _group_files_by_root(file_list):
+    """Get set of files (roots) and extensions by filename."""
+    root_set = set()
+    root_ext_map = defaultdict(set)
+    for filepath in file_list:
+        root, ext = os.path.splitext(filepath)
+        # tracking which files share a root name
+        # so we can check if these comprise a shapefile
+        root_ext_map[root].add(ext)
+        root_set.add(root)
+    return root_ext_map, sorted(list(root_set))
+
+
+def _get_collection_size_time_uid(directory):
+    """Get size of directory (in bytes), when it was last modified, and uid."""
+    total_bytes = 0
+    latest_mtime = 0
+
+    for root, _, files in os.walk(directory):
+        for file in files:
+            file_path = os.path.join(root, file)
+            stat = os.stat(file_path)
+            total_bytes += stat.st_size
+            latest_mtime = max(latest_mtime, stat.st_mtime)
+
+    last_modified = datetime.fromtimestamp(latest_mtime, tz=timezone.utc)
+    last_modified_str = last_modified.strftime('%Y-%m-%d %H:%M:%S %Z')
+
+    hash_func = hashlib.sha256()
+    hash_func.update(
+        f'{total_bytes}{last_modified_str}{directory}'.encode('utf-8'))
+    uid = f'sizetimestamp:{hash_func.hexdigest()}'
+
+    return total_bytes, last_modified_str, uid
+
+
 def detect_file_type(filepath, scheme):
     """Detect the type of resource contained in the file.

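Note: the `sizetimestamp` uid produced by `_get_collection_size_time_uid` (and by `describe_file` below) is just a sha256 digest over the concatenated total size, last-modified string, and path, so it changes whenever any member file changes size or mtime. A standalone sketch with made-up values:

    import hashlib

    total_bytes = 1024                          # hypothetical values
    last_modified = '2025-01-01 00:00:00 UTC'
    directory = '/data/my_collection'

    digest = hashlib.sha256(
        f'{total_bytes}{last_modified}{directory}'.encode('utf-8')).hexdigest()
    print(f'sizetimestamp:{digest}')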
@@ -119,13 +207,16 @@ def detect_file_type(filepath, scheme):
     """
     # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters.
     # We'll likely want a different data model for multi-dimensional arrays.
-
     # Frictionless supports a wide range of formats. The quickest way to
     # determine if a file is recognized as a table or archive is to call list.
-    info = frictionless.list(filepath)[0]
+    try:
+        info = frictionless.list(filepath)[0]
+    except frictionless.FrictionlessException:
+        raise RuntimeError(f'Cannot detect file type of "{filepath}"')
     if info.type == 'table':
         return 'table'
-    if info.compression:
+    # Frictionless doesn't recognize .tgz compression (but does recognize .tar.gz)
+    if info.compression or info.format == "tgz":
         return 'archive'
     # GDAL considers CSV a vector, so check against frictionless first.
     try:
@@ -177,7 +268,7 @@ def describe_file(source_dataset_path, scheme):
     hash_func = hashlib.new('sha256')
     hash_func.update(
         f'{description["bytes"]}{description["last_modified"]}\
-{description["path"]}'.encode('
+{description["path"]}'.encode('utf-8'))
     description['uid'] = f'sizetimestamp:{hash_func.hexdigest()}'

     # We don't have a use for including these attributes in our metadata:
@@ -186,37 +277,64 @@ def describe_file(source_dataset_path, scheme):
     return description


-def describe_archive(source_dataset_path, scheme):
+def describe_archive(source_dataset_path, scheme, **kwargs):
     """Describe file properties of a compressed file.

     Args:
         source_dataset_path (str): path to a file.
         scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset.

     Returns:
         dict

     """
+    def _list_tgz_contents(path):
+        """List contents of a .tar, .tgz, or .tar.gz archive."""
+        file_list = []
+        with fsspec.open(path, 'rb') as fobj:
+            with tarfile.open(fileobj=fobj, mode='r:*') as tar:
+                file_list = [member.name for member in tar.getmembers()
+                             if member.isfile()]
+        return file_list
+
+    def _list_zip_contents(path):
+        """List contents of a zip archive."""
+        file_list = []
+        ZFS = fsspec.get_filesystem_class('zip')
+        zfs = ZFS(path)
+        for dirpath, _, files in zfs.walk(zfs.root_marker):
+            for f in files:
+                file_list.append(os.path.join(dirpath, f))
+        return file_list
+
     description = describe_file(source_dataset_path, scheme)
     # innerpath is from frictionless and not useful because
     # it does not include all the files contained in the zip
     description.pop('innerpath', None)

-    [6 lines removed; content not rendered in this diff view]
+    if description.get("compression") == "zip":
+        file_list = _list_zip_contents(source_dataset_path)
+    elif description.get("format") in ["tgz", "tar"]:
+        file_list = _list_tgz_contents(source_dataset_path)
+        # 'compression' attr not auto-added by frictionless.describe for .tgz
+        # (but IS added for .tar.gz)
+        if source_dataset_path.endswith(".tgz"):
+            description["compression"] = "gz"
+    else:
+        raise ValueError(f"Unsupported archive format: {source_dataset_path}")
+
     description['sources'] = file_list
     return description


-def describe_vector(source_dataset_path, scheme):
+def describe_vector(source_dataset_path, scheme, **kwargs):
     """Describe properties of a GDAL vector file.

     Args:
         source_dataset_path (str): path to a GDAL vector.
+        scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset.

     Returns:
         dict
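Note: the tar branch above leans on `tarfile`'s `'r:*'` mode to auto-detect compression, which is why one helper covers `.tar`, `.tgz`, and `.tar.gz` alike. A self-contained sketch of the same approach (`data.tgz` is a hypothetical path):

    import tarfile

    import fsspec

    def list_tar_members(path):
        # 'r:*' lets tarfile detect gzip/bz2/xz compression automatically.
        with fsspec.open(path, 'rb') as fobj:
            with tarfile.open(fileobj=fobj, mode='r:*') as tar:
                return [m.name for m in tar.getmembers() if m.isfile()]

    # print(list_tar_members('data.tgz'))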
@@ -229,12 +347,18 @@ def describe_vector(source_dataset_path, scheme):
     vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR)
     layer = vector.GetLayer()
     fields = []
-    description['n_features'] = layer.GetFeatureCount()
     for fld in layer.schema:
         fields.append(
             models.FieldSchema(name=fld.name, type=fld.GetTypeName()))
+    layer_schema = models.LayerSchema(
+        name=layer.GetName(),
+        n_features=layer.GetFeatureCount(),
+        table=models.TableSchema(fields=fields),
+        gdal_metadata=layer.GetMetadata())
+    description['data_model'] = models.VectorSchema(
+        layers=[layer_schema],
+        gdal_metadata=vector.GetMetadata())
     vector = layer = None
-    description['data_model'] = models.TableSchema(fields=fields)

     info = pygeoprocessing.get_vector_info(source_dataset_path)
     bbox = models.BoundingBox(*info['bounding_box'])
@@ -248,33 +372,65 @@ def describe_vector(source_dataset_path, scheme):
     return description


-def describe_raster(source_dataset_path, scheme):
+def describe_raster(source_dataset_path, scheme, **kwargs):
     """Describe properties of a GDAL raster file.

     Args:
         source_dataset_path (str): path to a GDAL raster.
+        scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset:
+            * ``'compute_stats'`` (bool): whether to compute statistics
+              for each band in the raster. Default is False.

     Returns:
         dict

     """
+    compute_stats = kwargs.get('compute_stats', False)
     description = describe_file(source_dataset_path, scheme)
     if 'http' in scheme:
         source_dataset_path = f'/vsicurl/{source_dataset_path}'
     info = pygeoprocessing.get_raster_info(source_dataset_path)
+    raster = gdal.OpenEx(source_dataset_path)
+    raster_gdal_metadata = raster.GetMetadata()
     bands = []
     for i in range(info['n_bands']):
         b = i + 1
+        band = raster.GetRasterBand(b)
+        band_gdal_metadata = band.GetMetadata()
+        if compute_stats:
+            try:
+                if 'STATISTICS_VALID_PERCENT' not in band_gdal_metadata:
+                    # Sometimes some stats exist, but not all. If this one doesn't,
+                    # it's important enough that we want to force computation.
+                    _ = band.ComputeStatistics(0, callback=_gdal_progress_callback)
+                else:
+                    # 0=do not approximate stats, 1=calculate if they don't exist
+                    # If exact stats exist they will be retrieved without
+                    # computing them, otherwise, this forces computation.
+                    # https://github.com/OSGeo/gdal/blob/master/gcore/gdalrasterband.cpp
+                    _ = band.GetStatistics(0, 1)
+                band_gdal_metadata = band.GetMetadata()
+            except RuntimeError as e:
+                LOGGER.warning(
+                    f'Could not compute statistics for band {b} of '
+                    f'{source_dataset_path}: {e}')
+
         bands.append(models.BandSchema(
             index=b,
             gdal_type=gdal.GetDataTypeName(info['datatype']),
             numpy_type=numpy.dtype(info['numpy_type']).name,
-            nodata=info['nodata'][i]))
+            nodata=info['nodata'][i],
+            gdal_metadata=band_gdal_metadata))
+    band = None
+    raster = None
+
     description['data_model'] = models.RasterSchema(
         bands=bands,
         pixel_size=info['pixel_size'],
         raster_size={'width': info['raster_size'][0],
-                     'height': info['raster_size'][1]})
+                     'height': info['raster_size'][1]},
+        gdal_metadata=raster_gdal_metadata)
     # Some values of raster info are numpy types, which the
     # yaml dumper doesn't know how to represent.
     bbox = models.BoundingBox(*[float(x) for x in info['bounding_box']])
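Note: with `compute_stats=True`, band statistics end up in each band's `gdal_metadata` under GDAL's `STATISTICS_*` keys. A usage sketch, assuming a raster at the hypothetical path `dem.tif` and the `get_band_description` accessor this diff uses elsewhere:

    import geometamaker

    resource = geometamaker.describe('dem.tif', compute_stats=True)
    band = resource.get_band_description(1)
    print(band.gdal_metadata.get('STATISTICS_MEAN'))
    print(band.gdal_metadata.get('STATISTICS_VALID_PERCENT'))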
@@ -288,12 +444,13 @@ def describe_raster(source_dataset_path, scheme):
     return description


-def describe_table(source_dataset_path, scheme):
+def describe_table(source_dataset_path, scheme, **kwargs):
     """Describe properties of a tabular dataset.

     Args:
         source_dataset_path (str): path to a file representing a table.
         scheme (str): the protocol prefix of the filepath
+        kwargs (dict): additional options when describing a dataset.

     Returns:
         dict
@@ -305,7 +462,139 @@ def describe_table(source_dataset_path, scheme):
     return description


-DESCRIBE_FUNCS = {
+def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max,
+                        exclude_regex=None, exclude_hidden=True,
+                        describe_files=False, backup=True, **kwargs):
+    """Create a single metadata document to describe a collection of files.
+
+    Describe all the files within a directory as members of a "collection".
+    The resulting metadata resource should include a list of all the files
+    included in the collection along with a description and metadata filepath
+    (or placeholder). Optionally create individual metadata files for each
+    supported file in a directory.
+
+    Args:
+        directory (str): path to collection
+        depth (int, optional): maximum number of subdirectory levels to
+            traverse when walking through ``directory`` to find files included
+            in the collection. A value of 1 limits the walk to files in the
+            top-level ``directory`` only. A value of 2 allows descending into
+            immediate subdirectories, etc. All files in all subdirectories in
+            the collection will be included by default.
+        exclude_regex (str, optional): a regular expression to pattern-match
+            any files you do not want included in the output metadata yml.
+        exclude_hidden (bool, default True): whether to exclude hidden files
+            (files that start with ".").
+        describe_files (bool, default False): whether to ``describe`` all
+            files, i.e., create individual metadata files for each supported
+            resource in the collection.
+        backup (bool): whether to write a backup of a pre-existing metadata
+            file before overwriting it in cases where that file is not a valid
+            geometamaker document.
+        kwargs (dict): optional keyword arguments accepted by ``describe``.
+
+    Returns:
+        Collection metadata
+    """
+    directory = str(Path(directory).resolve())
+
+    file_list = _list_files_with_depth(directory, depth, exclude_regex,
+                                       exclude_hidden)
+
+    root_ext_map, root_list = _group_files_by_root(file_list)
+
+    items = []
+
+    for root in root_list:
+        extensions = root_ext_map[root]
+        if '.shp' in extensions:
+            # if we're dealing with a shapefile, we do not want to describe any
+            # of these other files with the same root name
+            extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf', '.cpg'])
+        # Only drop .yml if it's a sidecar file, i.e. the corresponding data file
+        # (root) exists on disk
+        if '.yml' in extensions and os.path.exists(root):
+            extensions.discard('.yml')
+        for ext in extensions:
+            filepath = os.path.join(directory, f'{root}{ext}')
+            try:
+                this_desc = describe(filepath, **kwargs)
+            except ValueError:
+                # if file type isn't supported by geometamaker, e.g. pdf
+                # or if trying to describe a dir
+                this_desc = None
+
+            if describe_files and this_desc:
+                this_desc.write(backup=backup)
+
+            if ext and os.path.exists(filepath + '.yml'):
+                metadata_yml = f'{root}{ext}' + '.yml'
+            else:
+                metadata_yml = ''
+
+            this_resource = models.CollectionItemSchema(
+                path=f'{root}{ext}',
+                description=this_desc.description if this_desc else '',
+                metadata=metadata_yml
+            )
+            items.append(this_resource)
+
+    total_bytes, last_modified, uid = _get_collection_size_time_uid(directory)
+
+    resource = models.CollectionResource(
+        path=directory,
+        type='collection',
+        format='directory',
+        scheme=fsspec.utils.get_protocol(directory),
+        bytes=total_bytes,
+        last_modified=last_modified,
+        items=items,
+        uid=uid
+    )
+
+    # Check if there is existing metadata for the collection
+    try:
+        metadata_path = f'{directory}-metadata.yml'
+        existing_metadata = models.CollectionResource.load(metadata_path)
+
+        # Copy any existing item descriptions from existing yml to new metadata
+        # Note that descriptions in individual resources' ymls will take
+        # priority over item descriptions from preexisting collection metadata
+        for item in resource.items:
+            # Existing metadata's item desc will overwrite new metadata item
+            # desc if new item desc is ''
+            existing_item_desc = [
+                i.description for i in existing_metadata.items if (
+                    i.path == item.path)]
+            if item.description == '' and len(existing_item_desc) > 0:
+                item.description = existing_item_desc[0]
+
+        # Replace fields in existing yml if new metadata has existing value
+        resource = existing_metadata.replace(resource)
+
+    except (ValueError, ValidationError) as error:
+        LOGGER.warning(error)
+        LOGGER.warning(
+            f'Ignoring an existing YAML document: {metadata_path} because it'
+            f' is invalid or incompatible.')
+        LOGGER.warning(
+            'A subsequent call to `.write()` will replace this file, but it'
+            f' will be backed up to {metadata_path}.bak.\n'
+            f'Use `.write(backup=False)` to skip the backup.\n',
+            extra=_LOG_EXTRA_NOT_FOR_CLI)
+        resource._would_overwrite = True
+
+    except FileNotFoundError:
+        pass
+
+    # Add profile metadata
+    config = Config()
+    resource = resource.replace(config.profile)
+
+    return resource
+
+
+DESCRIBE_FUNCS = {
     'archive': describe_archive,
     'table': describe_table,
     'vector': describe_vector,
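Note: a usage sketch for the new collection API (the directory path and arguments are hypothetical; the `{directory}-metadata.yml` output location is taken from the code above, and `write()` is the same method the warning messages reference):

    import geometamaker

    collection = geometamaker.describe_collection(
        'data/my_collection',
        depth=2,                  # top-level files plus one subdirectory level
        exclude_regex=r'\.log$',  # skip files matching this pattern
        describe_files=True)      # also write a sidecar yml per dataset
    collection.write()            # writes data/my_collection-metadata.yml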
@@ -321,7 +610,7 @@ RESOURCE_MODELS = {


 @_osgeo_use_exceptions
-def describe(source_dataset_path, profile=None):
+def describe(source_dataset_path, compute_stats=False):
     """Create a metadata resource instance with properties of the dataset.

     Properties of the dataset are used to populate as many metadata
@@ -331,20 +620,23 @@ def describe(source_dataset_path, profile=None):
     Args:
         source_dataset_path (string): path or URL to dataset to which the
             metadata applies
-        [2 lines removed; content not rendered in this diff view]
+        compute_stats (bool): whether to compute statistics
+            for each band in a raster.

     Returns:
         geometamaker.models.Resource: a metadata object

     """
-    config = Config()
-    user_profile = config.profile
-    if profile is not None:
-        user_profile = user_profile.replace(profile)

     metadata_path = f'{source_dataset_path}.yml'

+    if os.path.isdir(source_dataset_path):
+        raise ValueError(
+            f"Cannot `describe` {source_dataset_path} as it is a directory, "
+            "not a dataset. \nIf you are trying to create metadata for the "
+            "files within a directory and/or the directory itself, please use "
+            "`geometamaker.describe_collection` instead.")
+
     # Despite naming, this does not open a file that must be closed
     of = fsspec.open(source_dataset_path)
     if not of.fs.exists(source_dataset_path):
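Note: because `describe` now raises `ValueError` for directories, callers that previously pointed it at a folder need to dispatch themselves; a sketch (this helper is hypothetical, not part of geometamaker):

    import os

    import geometamaker

    def describe_any(path, **kwargs):
        # Route directories to the collection API, everything else to describe.
        if os.path.isdir(path):
            return geometamaker.describe_collection(path, **kwargs)
        return geometamaker.describe(path, **kwargs)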
@@ -356,58 +648,63 @@ def describe(source_dataset_path, profile=None):
         f'Cannot describe {source_dataset_path}. {protocol} '
         f'is not one of the supported file protocols: {PROTOCOLS}')
     resource_type = detect_file_type(source_dataset_path, protocol)
-    description = DESCRIBE_FUNCS[resource_type](
-        source_dataset_path, protocol)
+    description = DESCRIBE_FUNCS[resource_type](
+        source_dataset_path, protocol, compute_stats=compute_stats)
     description['type'] = resource_type
+    resource = RESOURCE_MODELS[resource_type](**description)

     # Load existing metadata file
     try:
+        # For the data model, use heuristic to decide if the new resource
+        # should inherit values from the existing resource.
+        # After that, take all non-empty values from the new resource
+        # and update the existing resource.
         existing_resource = RESOURCE_MODELS[resource_type].load(metadata_path)
-        if '
-        [36 lines removed; content not rendered in this diff view]
-        resource =
-
-    # Common path: metadata file does not already exist
-    # Or less common, ValueError if it exists but is incompatible
+        if resource_type == 'raster':
+            for band in resource.data_model.bands:
+                try:
+                    eband = existing_resource.get_band_description(band.index)
+                except IndexError:
+                    continue
+                if (band.numpy_type, band.gdal_type, band.nodata) == (
+                        eband.numpy_type, eband.gdal_type, eband.nodata):
+                    resource.set_band_description(
+                        band.index,
+                        title=eband.title,
+                        description=eband.description,
+                        units=eband.units)
+        if resource_type in ('vector', 'table'):
+            for field in resource._get_fields():
+                try:
+                    efield = existing_resource.get_field_description(field.name)
+                except KeyError:
+                    continue
+                if field.type == efield.type:
+                    resource.set_field_description(
+                        field.name,
+                        title=efield.title,
+                        description=efield.description,
+                        units=efield.units)
+        resource = existing_resource.replace(resource)
+
+    except (ValueError, ValidationError) as error:
+        LOGGER.warning(error)
+        LOGGER.warning(
+            f'Ignoring an existing YAML document: {metadata_path} because it'
+            f' is invalid or incompatible.')
+        LOGGER.warning(
+            'A subsequent call to `.write()` will replace this file, but it'
+            f' will be backed up to {metadata_path}.bak.\n'
+            f'Use `.write(backup=False)` to skip the backup.\n',
+            extra=_LOG_EXTRA_NOT_FOR_CLI)
+        resource._would_overwrite = True

     except FileNotFoundError:
-        pass
+        # Common path: metadata file does not already exist
+        pass

-    [1 line removed; content not rendered in this diff view]
+    config = Config()
+    resource = resource.replace(config.profile)
     return resource
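Note: the practical effect of the inheritance heuristic above is that human-entered titles, descriptions, and units survive a re-describe as long as the band's (or field's) types and nodata are unchanged. A round-trip sketch (`dem.tif` is hypothetical):

    import geometamaker

    resource = geometamaker.describe('dem.tif')
    resource.set_band_description(1, title='Elevation', units='meters')
    resource.write()  # writes dem.tif.yml

    # Later, even after pixel values change:
    resource = geometamaker.describe('dem.tif')
    # 'meters', provided the band's types and nodata still match
    print(resource.get_band_description(1).units)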
@@ -442,30 +739,20 @@ def validate(filepath):
     return error


-def validate_dir(directory, recursive=False):
+def validate_dir(directory, depth=numpy.iinfo(numpy.int16).max):
     """Validate all compatible yml documents in the directory.

     Args:
         directory (string): path to a directory
-        [2 lines removed; content not rendered in this diff view]
+        depth (int): maximum number of subdirectory levels to
+            traverse when walking through ``directory``.

     Returns:
         tuple (list, list): a list of the filepaths that were validated and
            an equal-length list of the validation messages.

     """
-    file_list = []
-    if recursive:
-        for path, dirs, files in os.walk(directory):
-            for file in files:
-                file_list.append(os.path.join(path, file))
-    else:
-        file_list.extend(
-            [os.path.join(directory, path)
-             for path in os.listdir(directory)
-             if os.path.isfile(os.path.join(directory, path))])
-
+    file_list = _list_files_with_depth(directory, depth)
     messages = []
     yaml_files = []
     for filepath in file_list:
@@ -473,7 +760,7 @@ def validate_dir(directory, recursive=False):
         yaml_files.append(filepath)
         msg = ''
         try:
-            error = validate(filepath)
+            error = validate(os.path.join(directory, filepath))
             if error:
                 msg = error
         except ValueError:
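Note: a usage sketch for the reworked `validate_dir`, assuming it is re-exported at the package level as the CLI suggests; the two returned lists are parallel:

    import geometamaker

    yml_files, messages = geometamaker.validate_dir('data/my_collection', depth=1)
    for path, message in zip(yml_files, messages):
        print(path, message or 'valid')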
@@ -484,48 +771,3 @@ def validate_dir(directory, recursive=False):
         messages.append(msg)

     return (yaml_files, messages)
-
-
-def describe_dir(directory, recursive=False):
-    """Describe all compatible datasets in the directory.
-
-    Take special care to only describe multifile datasets,
-    such as ESRI Shapefiles, one time.
-
-    Args:
-        directory (string): path to a directory
-        recursive (bool): whether or not to describe files
-            in all subdirectories
-
-    Returns:
-        None
-
-    """
-    root_set = set()
-    root_ext_map = defaultdict(set)
-    for path, dirs, files in os.walk(directory):
-        for file in files:
-            full_path = os.path.join(path, file)
-            root, ext = os.path.splitext(full_path)
-            # tracking which files share a root name
-            # so we can check if these comprise a shapefile
-            root_ext_map[root].add(ext)
-            root_set.add(root)
-        if not recursive:
-            break
-
-    for root in root_set:
-        extensions = root_ext_map[root]
-        if '.shp' in extensions:
-            # if we're dealing with a shapefile, we do not want to describe any
-            # of these other files with the same root name
-            extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf'])
-        for ext in extensions:
-            filepath = f'{root}{ext}'
-            try:
-                resource = describe(filepath)
-            except ValueError as error:
-                LOGGER.debug(error)
-                continue
-            resource.write()
-            LOGGER.info(f'{filepath} described')