geometamaker 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geometamaker/models.py CHANGED
@@ -1,25 +1,47 @@
  from __future__ import annotations
+ import collections
  import logging
+ import numbers
  import os
  import warnings
- from typing import List, Union
+ from typing import Union

  import fsspec
  import yaml
- from pydantic import BaseModel, ConfigDict, Field
+ from pydantic import BaseModel, ConfigDict, Field, ValidationError
  from pydantic.dataclasses import dataclass

  import geometamaker
  from . import utils


- LOGGER = logging.getLogger(__name__)
+ LOGGER = logging.getLogger('geometamaker')
+
+
+ def _deep_update_dict(self_dict, other_dict):
+     """Update values in self_dict.
+
+     Only keys that exist in ``self_dict`` will exist in the
+     returned dict. Only values that are not empty in ``other_dict``
+     will be used to replace values in ``self_dict``.
+
+     """
+     for k, v in other_dict.items():
+         if k in self_dict:
+             if isinstance(v, collections.abc.Mapping):
+                 self_dict[k] = _deep_update_dict(self_dict[k], v)
+             else:
+                 if v is not None and (v or isinstance(v, numbers.Number)):
+                     self_dict[k] = v
+     return self_dict


  class Parent(BaseModel):
      """Parent class on which to configure validation."""

-     model_config = ConfigDict(validate_assignment=True, extra='forbid')
+     model_config = ConfigDict(validate_assignment=True,
+                               extra='forbid',
+                               use_attribute_docstrings=True)


  # dataclass allows positional args, BaseModel does not.
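Editor's note: a minimal sketch of the merge semantics of the new `_deep_update_dict` helper above, assuming the private helper is imported directly from `geometamaker.models`; the dictionaries are invented for illustration.

    from geometamaker.models import _deep_update_dict

    base = {'title': 'Old title', 'keywords': ['water'],
            'contact': {'email': 'a@b.org', 'organization': ''}}
    update = {'title': '', 'keywords': [],
              'contact': {'email': '', 'organization': 'NatCap'},
              'extra': 'ignored'}

    # Empty strings and lists in `update` are skipped, nested dicts are merged
    # recursively, and keys absent from `base` (like 'extra') are dropped.
    merged = _deep_update_dict(base, update)
    assert merged == {'title': 'Old title', 'keywords': ['water'],
                      'contact': {'email': 'a@b.org', 'organization': 'NatCap'}}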
@@ -39,12 +61,15 @@ class SpatialSchema(Parent):
      """Class for keeping track of spatial info."""

      bounding_box: BoundingBox
+     """Spatial extent [xmin, ymin, xmax, ymax]."""
      crs: str
+     """Coordinate Reference System."""
      crs_units: str
+     """Units of measure for coordinates in the CRS."""


  class ContactSchema(Parent):
-     """Class for keeping track of contact info."""
+     """Class for storing contact information of data author."""

      email: str = ''
      organization: str = ''
@@ -53,15 +78,13 @@ class ContactSchema(Parent):


  class LicenseSchema(Parent):
-     """Class for storing license info."""
+     """Class for storing data license information."""

-     # https://datapackage.org/profiles/2.0/dataresource.json
-     # This profile also includes `name`, described as:
-     # "MUST be an Open Definition license identifier",
-     # see http://licenses.opendefinition.org/"
-     # I don't think that's useful to us yet.
+     # Loosely follows https://datapackage.org/profiles/2.0/dataresource.json
      path: str = ''
+     """URL that describes the license."""
      title: str = ''
+     """Name of a license, such as one from http://licenses.opendefinition.org/"""


  class FieldSchema(Parent):
@@ -69,40 +92,127 @@ class FieldSchema(Parent):

      # https://datapackage.org/standard/table-schema/
      name: str
+     """The name used to uniquely identify the field."""
      type: str
+     """Datatype of the content of the field."""
      description: str = ''
+     """A description of the field."""
      title: str = ''
+     """A human-readable title for the field."""
      units: str = ''
+     """Unit of measurement for values in the field."""


  class TableSchema(Parent):
      """Class for metadata for tables."""

      # https://datapackage.org/standard/table-schema/
-     fields: List[FieldSchema]
+     fields: list[FieldSchema]
+     """A list of ``FieldSchema`` objects."""
      missingValues: list = Field(default_factory=list)
-     primaryKey: list = Field(default_factory=list)
-     foreignKeys: list = Field(default_factory=list)
+     """A list of values that represent missing data."""
+     primaryKey: list[str] = Field(default_factory=list)
+     """A field or list of fields that uniquely identifies each row in the table."""
+     foreignKeys: list[str] = Field(default_factory=list)
+     """A field or list of fields that can be used to join another table.
+
+     See https://datapackage.org/standard/table-schema/#foreignKeys
+     """
+
+     def _get_field(self, name):
+         """Get an attribute by its name property.
+
+         Args:
+             name (string): to match the value of the 'name' key in a dict
+
+         Returns:
+             tuple of (list index of the matching attribute, the attribute
+                 dict)
+
+         Raises:
+             KeyError if no attributes exist in the resource or if the named
+                 attribute does not exist.
+
+         """
+         if len(self.fields) == 0:
+             raise KeyError(
+                 f'{self} has no fields')
+         for idx, field in enumerate(self.fields):
+             if field.name == name:
+                 return idx, field
+         raise KeyError(
+             f'{self} has no field named {name}')
+
+     def set_field_description(self, name, title=None, description=None,
+                               units=None, type=None):
+         """Define metadata for a tabular field.
+
+         Args:
+             name (str): name and unique identifier of the field
+             title (str): title for the field
+             description (str): description of the field
+             units (str): unit of measurement for the field's values
+             type (str): datatype of values in the field
+
+         """
+         idx, field = self._get_field(name)
+
+         if title is not None:
+             field.title = title
+         if description is not None:
+             field.description = description
+         if units is not None:
+             field.units = units
+         if type is not None:
+             field.type = type
+
+         self.fields[idx] = field
+
+     def get_field_description(self, name):
+         """Get the attribute metadata for a field.
+
+         Args:
+             name (str): name and unique identifier of the field
+
+         Returns:
+             FieldSchema
+         """
+         idx, field = self._get_field(name)
+         return field


  class BandSchema(Parent):
      """Class for metadata for a raster band."""

      index: int
+     """The index of the band of a GDAL raster, starting at 1."""
      gdal_type: str
+     """The GDAL data type of the band."""
      numpy_type: str
+     """The numpy data type of the band."""
      nodata: Union[int, float, None]
+     """The pixel value that represents no data in the band."""
      description: str = ''
+     """A description of the band."""
      title: str = ''
+     """A human-readable title for the band."""
      units: str = ''
+     """Unit of measurement for the pixel values."""
+     gdal_metadata: dict = {}
+     """Metadata key:value pairs stored in the GDAL band object."""


  class RasterSchema(Parent):
      """Class for metadata for raster bands."""

-     bands: List[BandSchema]
-     pixel_size: list
+     bands: list[BandSchema]
+     """A list of ``BandSchema`` objects."""
+     pixel_size: tuple[Union[int, float], Union[int, float]]
+     """The width and height of a pixel measured in ``SpatialSchema.crs_units``."""
      raster_size: Union[dict, list]
+     """The width and height of the raster measured in number of pixels."""
+     gdal_metadata: dict = {}
+     """Metadata key:value pairs stored in the GDAL raster object."""

      def model_post_init(self, __context):
          # Migrate from previous model where we stored this as a list
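Editor's note: a short usage sketch of the `set_field_description` and `get_field_description` methods that moved onto `TableSchema` above; the field names and units are invented for illustration.

    from geometamaker.models import FieldSchema, TableSchema

    schema = TableSchema(fields=[
        FieldSchema(name='ws_id', type='integer'),
        FieldSchema(name='precip_mm', type='number')])

    # Attach metadata to a field, addressed by its name.
    schema.set_field_description(
        'precip_mm', description='mean annual precipitation', units='mm')

    field = schema.get_field_description('precip_mm')
    assert field.units == 'mm'
    # A name with no matching field raises KeyError (via _get_field).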
@@ -111,13 +221,37 @@ class RasterSchema(Parent):
                                  'height': self.raster_size[1]}


+ class LayerSchema(Parent):
+     """Class for metadata for a GDAL vector's layer."""
+
+     name: str
+     """The layer name."""
+     table: TableSchema = Field(default_factory=TableSchema)
+     """A ``models.TableSchema`` object for describing fields in a layer's table."""
+     gdal_metadata: dict = {}
+     """Metadata key:value pairs stored in the GDAL layer object."""
+     n_features: int
+     """Number of features in the layer."""
+
+
+ class VectorSchema(Parent):
+
+     layers: list[LayerSchema]
+     """A list of layers in the vector.
+
+     Geometamaker currently only supports vectors with one layer.
+     """
+     gdal_metadata: dict = {}
+     """Metadata key:value pairs stored in the GDAL vector object."""
+
+
  class BaseMetadata(Parent):
      """A class for the things shared by Resource and Profile."""

-     # These default to None in order to facilitate the logic
-     # in ``replace`` where we only replace values that are not None.
-     contact: Union[ContactSchema, None] = Field(default_factory=ContactSchema)
-     license: Union[LicenseSchema, None] = Field(default_factory=LicenseSchema)
+     contact: ContactSchema = Field(default_factory=ContactSchema)
+     """Contact information for the data author."""
+     license: LicenseSchema = Field(default_factory=LicenseSchema)
+     """Data license information."""

      def set_contact(self, organization=None, individual_name=None,
                      position_name=None, email=None):
@@ -186,8 +320,8 @@ class BaseMetadata(Parent):
          """Replace attribute values with those from another instance.

          Only attributes that exist in ``self`` will exist in the
-         returned instance. Only attribute values that are not None will be used
-         to replace existing attribute values in ``self``.
+         returned instance. Only attribute values that are not empty
+         in ``other`` will be used to replace values in ``self``.

          Args:
              other (BaseMetadata)
@@ -200,9 +334,13 @@ class BaseMetadata(Parent):

          """
          if isinstance(other, BaseMetadata):
-             updated_dict = self.model_dump() | {
-                 k: v for k, v in other.__dict__.items() if v is not None}
-             return self.__class__(**updated_dict)
+             updated_dict = _deep_update_dict(
+                 self.model_dump(), other.model_dump())
+             obj = self.__class__(**updated_dict)
+             # Private attributes are not pydantic fields.
+             # They were excluded in model_dump so set them again
+             obj._would_overwrite = self._would_overwrite
+             return obj
          raise TypeError(f'{type(other)} is not an instance of BaseMetadata')

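Editor's note: a minimal sketch of the reworked `replace` behavior, assuming `ContactSchema` has the `individual_name` field implied by the `set_contact` signature earlier in this class; with `_deep_update_dict`, only non-empty values from `other` overwrite values in `self`.

    from geometamaker.models import ContactSchema, Profile, Resource

    resource = Resource(
        title='Annual precipitation',
        contact=ContactSchema(individual_name='A. Author', email='old@example.org'))
    profile = Profile(contact=ContactSchema(email='new@example.org'))

    updated = resource.replace(profile)
    assert updated.title == 'Annual precipitation'          # key absent from profile
    assert updated.contact.individual_name == 'A. Author'   # empty value ignored
    assert updated.contact.email == 'new@example.org'       # non-empty value applied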
@@ -214,11 +352,6 @@ class Profile(BaseMetadata):

      """

-     # For a Profile, default these to None so that they do not replace
-     # values in a Resource
-     contact: Union[ContactSchema, None] = None
-     license: Union[LicenseSchema, None] = None
-
      @classmethod
      def load(cls, filepath):
          """Load metadata document from a yaml file.
@@ -246,7 +379,7 @@ class Profile(BaseMetadata):
              file.write(utils.yaml_dump(self.model_dump()))


- class Resource(BaseMetadata):
+ class BaseResource(BaseMetadata):
      """Base class for metadata for a resource.

      https://datapackage.org/standard/data-resource/
@@ -259,43 +392,49 @@ class Resource(BaseMetadata):
      with which to complete later.

      """
-
-     # A version string we can use to identify geometamaker compliant documents
-     geometamaker_version: str = ''
+     _would_overwrite: bool = False
      metadata_path: str = ''
+     geometamaker_version: str = ''
+     """The version of geometamaker used to create this metadata resource."""

-     # These are populated geometamaker.describe()
+     # These are populated by geometamaker.describe()
      bytes: int = 0
-     encoding: str = ''
+     """File size of the resource in bytes."""
      format: str = ''
+     """File format of the resource."""
      uid: str = ''
+     """Unique identifier for the resource."""
      path: str = ''
+     """Path to the resource being described."""
      scheme: str = ''
+     """File protocol for opening the resource."""
      type: str = ''
+     """The type of resource being described."""
      last_modified: str = ''
-     # DataPackage includes `sources` as a list of source files
-     # with some amount of metadata for each item. For our
-     # use-case, I think a list of filenames is good enough.
-     sources: list = Field(default_factory=list)
+     """Last modified time of the file at ``path``."""

      # These are not populated by geometamaker.describe(),
      # and should have setters & getters
      citation: str = ''
+     """A citation for the resource."""
      description: str = ''
+     """A text description of the resource."""
      doi: str = ''
+     """A digital object identifier for the resource."""
      edition: str = ''
-     keywords: list = Field(default_factory=list)
+     """A string representing the edition, or version, of the resource."""
+     keywords: list[str] = Field(default_factory=list)
+     """A list of keywords that describe the subject-matter of the resource."""
      lineage: str = ''
-     placenames: list = Field(default_factory=list)
+     """A text description of how the resource was created."""
+     placenames: list[str] = Field(default_factory=list)
+     """A list of geographic places associated with the resource."""
      purpose: str = ''
+     """The author's stated purpose for the resource."""
      title: str = ''
+     """The title of the resource."""
      url: str = ''
-
-     def model_post_init(self, __context):
-         self.metadata_path = f'{self.path}.yml'
-         self.geometamaker_version: str = geometamaker.__version__
-         self.path = self.path.replace('\\', '/')
-         self.sources = [x.replace('\\', '/') for x in self.sources]
+     """A URL where the resource is available."""

      @classmethod
      def load(cls, filepath):
@@ -322,27 +461,29 @@ class Resource(BaseMetadata):
                f'geometamaker.')
            raise ValueError(message)

-         deprecated_attrs = ['metadata_version', 'mediatype', 'name']
-         for attr in deprecated_attrs:
-             if attr in yaml_dict:
-                 warnings.warn(
-                     f'"{attr}" exists in {filepath} but is no longer part of '
-                     f'the geometamaker specification. "{attr}" will be '
-                     f'removed from this document. In the future, presence '
-                     f' of "{attr}" will raise a ValidationError',
-                     category=FutureWarning)
-                 del yaml_dict[attr]
-
-         # migrate from 'schema' to 'data_model', if needed.
-         if 'schema' in yaml_dict:
-             warnings.warn(
-                 "'schema' has been replaced with 'data_model' as an attribute "
-                 "name. In the future, the presence of a 'schema' attribute "
-                 "will raise a ValidationError",
-                 category=FutureWarning)
-             yaml_dict['data_model'] = yaml_dict['schema']
-             del yaml_dict['schema']
-         return cls(**yaml_dict)
+         try:
+             return cls(**yaml_dict)
+         except ValidationError as validation_error:
+             for e in validation_error.errors():
+                 # Migrate vector metadata that pre-dates 'layers'
+                 if e['type'] == 'missing' and e['loc'] == ('data_model', 'layers'):
+                     warnings.warn(
+                         "A vector 'data_model' must include 'layers'. "
+                         "In the future, the absence of a 'layers' attribute "
+                         "will raise a ValidationError",
+                         category=FutureWarning)
+                     # In the context of `describe`, these layer attributes will
+                     # be updated on the resource after this document is loaded.
+                     layer = {
+                         'name': '',
+                         'table': yaml_dict['data_model'],
+                         'n_features': yaml_dict['n_features']
+                     }
+                     del yaml_dict['data_model']
+                     del yaml_dict['n_features']
+                     yaml_dict['data_model'] = {'layers': [layer]}
+                     return cls(**yaml_dict)
+             raise validation_error

      def set_title(self, title):
          """Add a title for the dataset.
@@ -500,7 +641,7 @@ class Resource(BaseMetadata):
          """Get the url for the dataset."""
          return self.url

-     def write(self, workspace=None):
+     def write(self, workspace=None, backup=True):
          """Write datapackage yaml to disk.

          This creates sidecar files with '.yml'
@@ -515,6 +656,9 @@ class Resource(BaseMetadata):
                  to write files. They will still be named to match the source
                  filename. Use this option if the source data is not on the local
                  filesystem.
+             backup (bool): whether to write a backup of a pre-existing metadata
+                 file before overwriting it in cases where that file is not a valid
+                 geometamaker document.

          """
          if workspace is None:
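Editor's note: a minimal usage sketch of the new `backup` flag, assuming a hypothetical local raster 'dem.tif' whose existing 'dem.tif.yml' sidecar is not a valid geometamaker document, and that `geometamaker.describe()` marks such a resource as one it would overwrite.

    import geometamaker

    resource = geometamaker.describe('dem.tif')

    # With backup=True (the default), the pre-existing, non-geometamaker
    # 'dem.tif.yml' is renamed to 'dem.tif.yml.bak' before writing.
    resource.write()

    # Overwrite in place instead:
    resource.write(backup=False)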
@@ -523,42 +667,63 @@ class Resource(BaseMetadata):
          target_path = os.path.join(
              workspace, os.path.basename(self.metadata_path))

+         if self._would_overwrite and backup and os.path.exists(target_path):
+             backup_path = f'{target_path}.bak'
+             LOGGER.info(
+                 f'Backing up existing metadata file to {backup_path}')
+             os.rename(target_path, backup_path)
+
          with open(target_path, 'w', encoding='utf-8') as file:
-             file.write(utils.yaml_dump(
-                 self.model_dump(exclude=['metadata_path'])))
+             file.write(utils.yaml_dump(self._dump_for_write()))
+
+     def _dump_for_write(self):
+         return self.model_dump(exclude={'metadata_path'})
+

-     def to_string(self):
-         pass
+ class Resource(BaseResource):
+     """
+     Metadata class for general-purpose resources.
+
+     This class extends `BaseResource` and provides metadata for a single file
+     or dataset, including encoding and source file references. It serves as a
+     base for more specific resource types (e.g., table, raster, vector,
+     archive) and is typically initialized by `describe()`.
+     """
+
+     encoding: str = ''
+     """File encoding of the resource."""
+     sources: list[str] = Field(default_factory=list)
+     """A list of files which comprise the dataset or resource."""
+
+     def model_post_init(self, __context):
+         self.metadata_path = self._default_metadata_path()
+         self.geometamaker_version: str = geometamaker.__version__
+         self.path = self.path.replace('\\', '/')
+         self.sources = [x.replace('\\', '/') for x in self.sources]
+
+     def _default_metadata_path(self):
+         return f'{self.path}.yml'


  class TableResource(Resource):
      """Class for metadata for a table resource."""

      data_model: TableSchema = Field(default_factory=TableSchema)
+     """A ``models.TableSchema`` object for describing fields."""

-     def _get_field(self, name):
-         """Get an attribute by its name property.
+     def _get_fields(self):
+         return self.data_model.fields
+
+     def get_field_description(self, name):
+         """Get the attribute metadata for a field.

          Args:
-             name (string): to match the value of the 'name' key in a dict
+             name (str): name and unique identifier of the field

          Returns:
-             tuple of (list index of the matching attribute, the attribute
-                 dict)
-
-         Raises:
-             KeyError if no attributes exist in the resource or if the named
-                 attribute does not exist.
-
+             FieldSchema
          """
-         if len(self.data_model.fields) == 0:
-             raise KeyError(
-                 f'{self.data_model} has no fields')
-         for idx, field in enumerate(self.data_model.fields):
-             if field.name == name:
-                 return idx, field
-         raise KeyError(
-             f'{self.data_model} has no field named {name}')
+         return self.data_model.get_field_description(name)

      def set_field_description(self, name, title=None, description=None,
                                units=None, type=None):
@@ -572,18 +737,53 @@ class TableResource(Resource):
              type (str): datatype of values in the field

          """
-         idx, field = self._get_field(name)
+         self.data_model.set_field_description(
+             name, title, description, units, type)

-         if title is not None:
-             field.title = title
-         if description is not None:
-             field.description = description
-         if units is not None:
-             field.units = units
-         if type is not None:
-             field.type = type

-         self.data_model.fields[idx] = field
+ class ArchiveResource(Resource):
+     """Class for metadata for an archive resource."""
+
+     compression: str = ''
+     """The compression method used to create the archive."""
+
+
+ class CollectionItemSchema(Parent):
+     """Class for metadata for collection items."""
+     path: str = ''
+     """Path to the resource being described."""
+     description: str = ''
+     """A text description of the resource."""
+     metadata: str = ''
+     """Path to metadata document describing resource"""
+
+
+ class CollectionResource(BaseResource):
+     """Class for metadata for a collection resource."""
+
+     items: list[CollectionItemSchema] = Field(default_factory=list)
+     """Files in collection."""
+
+     def model_post_init(self, __context):
+         self.metadata_path = self._default_metadata_path()
+         self.geometamaker_version: str = geometamaker.__version__
+         self.path = self.path.replace('\\', '/')
+
+     def _default_metadata_path(self):
+         """Add -metadata tag"""
+         return f'{self.path}-metadata.yml'
+
+
+ class VectorResource(Resource):
+     """Class for metadata for a vector resource."""
+
+     data_model: VectorSchema
+     """An object for describing vector properties and layers."""
+     spatial: SpatialSchema
+     """An object for describing spatial properties of a GDAL dataset."""
+
+     def _get_fields(self):
+         return self.data_model.layers[0].table.fields

      def get_field_description(self, name):
          """Get the attribute metadata for a field.
@@ -594,28 +794,31 @@ class TableResource(Resource):
          Returns:
              FieldSchema
          """
-         idx, field = self._get_field(name)
-         return field
-
-
- class ArchiveResource(Resource):
-     """Class for metadata for an archive resource."""
-
-     compression: str
+         return self.data_model.layers[0].table.get_field_description(name)

+     def set_field_description(self, name, title=None, description=None,
+                               units=None, type=None):
+         """Define metadata for a tabular field.

- class VectorResource(TableResource):
-     """Class for metadata for a vector resource."""
+         Args:
+             name (str): name and unique identifier of the field
+             title (str): title for the field
+             description (str): description of the field
+             units (str): unit of measurement for the field's values
+             type (str): datatype of values in the field

-     n_features: int
-     spatial: SpatialSchema
+         """
+         self.data_model.layers[0].table.set_field_description(
+             name, title, description, units, type)


  class RasterResource(Resource):
      """Class for metadata for a raster resource."""

      data_model: RasterSchema
+     """An object for describing raster properties and bands."""
      spatial: SpatialSchema
+     """An object for describing spatial properties of a GDAL dataset."""

      def set_band_description(self, band_number, title=None,
                               description=None, units=None):