geometamaker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geometamaker/models.py ADDED
@@ -0,0 +1,653 @@
1
+ from __future__ import annotations
2
+ import logging
3
+ import os
4
+ import warnings
5
+ from typing import List, Union
6
+
7
+ import fsspec
8
+ import yaml
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+ from pydantic.dataclasses import dataclass
11
+
12
+ import geometamaker
13
+ from . import utils
14
+
15
+
16
+ LOGGER = logging.getLogger(__name__)
17
+
18
+
19
class Parent(BaseModel):
    """Common base model that fixes the validation rules for all schemas.

    ``validate_assignment=True`` re-validates a field whenever it is
    assigned after construction, and ``extra='forbid'`` rejects names
    that are not declared on the model.
    """

    model_config = ConfigDict(extra='forbid', validate_assignment=True)
23
+
24
+
25
# A pydantic dataclass (rather than a ``BaseModel`` subclass) is used here
# because it accepts positional arguments, which is convenient when
# constructing a box as ``BoundingBox(xmin, ymin, xmax, ymax)``.
# Switching to ``BaseModel`` would be more consistent with the other models.
@dataclass(frozen=True)
class BoundingBox:
    """Immutable spatial extent given by its min/max x and y coordinates."""

    xmin: float
    ymin: float
    xmax: float
    ymax: float
36
+
37
+
38
class SpatialSchema(Parent):
    """Spatial properties of a dataset: its extent and reference system."""

    # Extent of the dataset.
    bounding_box: BoundingBox
    # Coordinate reference system and its units, both stored as plain
    # strings; no particular format is enforced by this model.
    crs: str
    crs_units: str
44
+
45
+
46
class ContactSchema(Parent):
    """Contact details for the party responsible for a dataset.

    Every attribute is optional and defaults to an empty string.
    """

    email: str = ''
    organization: str = ''
    individual_name: str = ''
    position_name: str = ''
53
+
54
+
55
class LicenseSchema(Parent):
    """License information for a dataset.

    Both attributes are optional and default to an empty string.
    """

    # Based on https://datapackage.org/profiles/2.0/dataresource.json
    # That profile also defines ``name``, which "MUST be an Open Definition
    # license identifier" (see http://licenses.opendefinition.org/).
    # It is intentionally left out here because it is not useful to us yet.
    path: str = ''
    title: str = ''
65
+
66
+
67
class FieldSchema(Parent):
    """Description of a single field (column) of a table.

    ``name`` and ``type`` are required; the remaining attributes are
    optional and default to an empty string.
    """

    # Modeled on https://datapackage.org/standard/table-schema/
    name: str
    type: str
    description: str = ''
    title: str = ''
    units: str = ''
76
+
77
+
78
class TableSchema(Parent):
    """Description of a table: its fields plus key/missing-value info.

    ``fields`` is required; the other attributes default to empty lists.
    """

    # Modeled on https://datapackage.org/standard/table-schema/
    # The camelCase attribute names below are kept as-is because they are
    # the keys written to and read from metadata documents.
    fields: List[FieldSchema]
    missingValues: list = Field(default_factory=list)
    primaryKey: list = Field(default_factory=list)
    foreignKeys: list = Field(default_factory=list)
86
+
87
+
88
class BandSchema(Parent):
    """Metadata for a single raster band.

    ``index``, ``gdal_type``, ``numpy_type`` and ``nodata`` are required;
    the descriptive attributes are optional and default to an empty string.
    """

    index: int
    gdal_type: str
    numpy_type: str
    # ``None`` is accepted so that a band which does not define a nodata
    # value can still be described. The attribute remains required: it must
    # be passed explicitly, even when it is ``None``.
    nodata: Union[int, float, None]
    description: str = ''
    title: str = ''
    units: str = ''
98
+
99
+
100
class RasterSchema(Parent):
    """Metadata for a raster: its bands and its pixel/raster dimensions."""

    bands: List[BandSchema]
    pixel_size: list
    # A dict with 'width' and 'height' keys. A two-item list is also accepted
    # for documents written with the previous model; see ``model_post_init``.
    raster_size: Union[dict, list]

    def model_post_init(self, __context):
        """Convert a legacy list-style ``raster_size`` into a dict."""
        legacy_size = self.raster_size
        if not isinstance(legacy_size, list):
            return
        # Previous model stored this as a list: [width, height].
        width = legacy_size[0]
        height = legacy_size[1]
        self.raster_size = {'width': width, 'height': height}
112
+
113
+
114
class BaseMetadata(Parent):
    """A class for the things shared by Resource and Profile."""

    # Both attributes accept None so that a subclass can use None to mean
    # "not specified"; ``replace`` skips values that are None.
    # On this base class they default to empty schema instances.
    contact: Union[ContactSchema, None] = Field(default_factory=ContactSchema)
    license: Union[LicenseSchema, None] = Field(default_factory=LicenseSchema)

    def set_contact(self, organization=None, individual_name=None,
                    position_name=None, email=None):
        """Add a contact section.

        Only the arguments that are not None are applied; other contact
        attributes keep their current values.

        Args:
            organization (str): name of the responsible organization
            individual_name (str): name of the responsible person
            position_name (str): role or position of the responsible person
            email (str): address of the responsible organization or individual

        """
        if self.contact is None:
            self.contact = ContactSchema()
        if organization is not None:
            self.contact.organization = organization
        if individual_name is not None:
            self.contact.individual_name = individual_name
        if position_name is not None:
            self.contact.position_name = position_name
        if email is not None:
            self.contact.email = email

    def get_contact(self):
        """Get metadata from a contact section.

        Returns:
            ContactSchema

        """
        return self.contact

    def set_license(self, title=None, path=None):
        """Add a license for the dataset.

        Either or both title and path are required if there is a license.
        Call with no arguments to remove license info.

        Args:
            title (str): human-readable title of the license
            path (str): url for the license

        """
        # The license is always replaced wholesale, so there is no need to
        # create a placeholder first when the current value is None.
        # TODO: DataPackage/Resource allows for a list of licenses.
        # So far we only support one license per resource.
        self.license = LicenseSchema(title=title or '', path=path or '')

    def get_license(self):
        """Get ``license`` for the dataset.

        Returns:
            models.LicenseSchema

        """
        # TODO: DataPackage/Resource allows for a list of licenses.
        # So far we only support one license per resource.
        return self.license

    def replace(self, other):
        """Replace attribute values with those from another instance.

        Only attributes that exist in ``self`` will exist in the
        returned instance. Only attribute values that are not None will be used
        to replace existing attribute values in ``self``.

        Args:
            other (BaseMetadata)

        Returns:
            an instance of same type as ``self``

        Raises:
            TypeError if ``other`` is not an instance of BaseMetadata.

        """
        if not isinstance(other, BaseMetadata):
            raise TypeError(
                f'{type(other)} is not an instance of BaseMetadata')

        own_fields = type(self).model_fields
        # ``model_dump`` (rather than ``__dict__``) turns nested models into
        # plain dicts, so the returned instance gets its own nested objects
        # instead of sharing them with ``other``.
        # Names that ``self`` does not declare are dropped; passing them on
        # would be rejected by the ``extra='forbid'`` configuration.
        replacements = {
            name: value for name, value in other.model_dump().items()
            if value is not None and name in own_fields}
        return self.__class__(**(self.model_dump() | replacements))
207
+
208
+
209
class Profile(BaseMetadata):
    """Class for a metadata profile.

    A Profile can store metadata properties that are likely to apply
    to more than one resource, such as ``contact`` and ``license``.

    """

    # For a Profile, default these to None so that they do not replace
    # values in a Resource
    contact: Union[ContactSchema, None] = None
    license: Union[LicenseSchema, None] = None

    @classmethod
    def load(cls, filepath):
        """Load metadata document from a yaml file.

        An empty document yields a Profile with all default values.

        Args:
            filepath (str): path to yaml file

        Returns:
            instance of the class

        """
        with fsspec.open(filepath, 'r') as file:
            yaml_string = file.read()
        # ``safe_load`` returns None for an empty document, which cannot be
        # unpacked as keyword arguments.
        yaml_dict = yaml.safe_load(yaml_string) or {}
        return cls(**yaml_dict)

    def write(self, target_path):
        """Write profile data to a yaml file.

        Args:
            target_path (str): path to a yaml file to be written

        """
        # The yaml text may contain non-ASCII characters (it is dumped with
        # ``allow_unicode=True``), so do not rely on the locale's encoding.
        with open(target_path, 'w', encoding='utf-8') as file:
            file.write(utils.yaml_dump(self.model_dump()))
247
+
248
+
249
class Resource(BaseMetadata):
    """Base class for metadata for a resource.

    https://datapackage.org/standard/data-resource/
    This class borrows from the Data Package - Resource
    specification. But we have some additional properties
    that are important to us.

    All attributes are keyword-only so that we can init
    with default values, allowing the user to get a template
    with which to complete later.

    """

    # A version string we can use to identify geometamaker compliant documents
    geometamaker_version: str = ''
    metadata_path: str = ''

    # These are populated geometamaker.describe()
    bytes: int = 0
    encoding: str = ''
    format: str = ''
    uid: str = ''
    path: str = ''
    scheme: str = ''
    type: str = ''
    last_modified: str = ''
    # DataPackage includes `sources` as a list of source files
    # with some amount of metadata for each item. For our
    # use-case, I think a list of filenames is good enough.
    sources: list = Field(default_factory=list)

    # These are not populated by geometamaker.describe(),
    # and should have setters & getters
    citation: str = ''
    description: str = ''
    doi: str = ''
    edition: str = ''
    keywords: list = Field(default_factory=list)
    lineage: str = ''
    placenames: list = Field(default_factory=list)
    purpose: str = ''
    title: str = ''
    url: str = ''

    def model_post_init(self, __context):
        """Set derived attributes once the model has been validated.

        ``metadata_path`` and ``geometamaker_version`` are always
        overwritten here, regardless of the values passed to the constructor.
        """
        # The sidecar path is derived from ``path`` as given, i.e. before
        # the backslashes are normalized below.
        self.metadata_path = f'{self.path}.yml'
        self.geometamaker_version = geometamaker.__version__
        self.path = self.path.replace('\\', '/')
        self.sources = [x.replace('\\', '/') for x in self.sources]

    @classmethod
    def load(cls, filepath):
        """Load metadata document from a yaml file.

        Deprecated attributes found in the document are dropped with a
        ``FutureWarning``, and a legacy ``schema`` attribute is renamed to
        ``data_model``.

        Args:
            filepath (str): path to yaml file

        Returns:
            instance of the class

        Raises:
            FileNotFoundError if filepath does not exist
            ValueError if the metadata is found to be incompatible with
                geometamaker.

        """
        with fsspec.open(filepath, 'r') as file:
            yaml_string = file.read()
        yaml_dict = yaml.safe_load(yaml_string)
        # A compatible document is a non-empty mapping that carries one of
        # the version keys. Anything else (empty file, list, scalar) is
        # reported as incompatible rather than failing later on unpacking.
        if (not yaml_dict
                or not isinstance(yaml_dict, dict)
                or ('metadata_version' not in yaml_dict
                    and 'geometamaker_version' not in yaml_dict)):
            message = (f'{filepath} exists but is not compatible with '
                       f'geometamaker.')
            raise ValueError(message)

        deprecated_attrs = ['metadata_version', 'mediatype', 'name']
        for attr in deprecated_attrs:
            if attr in yaml_dict:
                warnings.warn(
                    f'"{attr}" exists in {filepath} but is no longer part of '
                    f'the geometamaker specification. "{attr}" will be '
                    f'removed from this document. In the future, presence '
                    f' of "{attr}" will raise a ValidationError',
                    category=FutureWarning)
                del yaml_dict[attr]

        # migrate from 'schema' to 'data_model', if needed.
        if 'schema' in yaml_dict:
            warnings.warn(
                "'schema' has been replaced with 'data_model' as an attribute "
                "name. In the future, the presence of a 'schema' attribute "
                "will raise a ValidationError",
                category=FutureWarning)
            yaml_dict['data_model'] = yaml_dict.pop('schema')
        return cls(**yaml_dict)

    def set_title(self, title):
        """Add a title for the dataset.

        Args:
            title (str)

        """
        self.title = title

    def get_title(self):
        """Get the title for the dataset."""
        return self.title

    def set_description(self, description):
        """Add a description for the dataset.

        Args:
            description (str)

        """
        self.description = description

    def get_description(self):
        """Get the description for the dataset."""
        return self.description

    def set_citation(self, citation):
        """Add a citation string for the dataset.

        Args:
            citation (str)

        """
        self.citation = citation

    def get_citation(self):
        """Get the citation for the dataset."""
        return self.citation

    def set_doi(self, doi):
        """Add a doi string for the dataset.

        Args:
            doi (str)

        """
        self.doi = doi

    def get_doi(self):
        """Get the doi for the dataset."""
        return self.doi

    def set_edition(self, edition):
        """Set the edition for the dataset.

        Args:
            edition (str): version of the cited resource

        """
        self.edition = edition

    def get_edition(self):
        """Get the edition of the dataset.

        Returns:
            str (an empty string if no edition has been set)

        """
        return self.edition

    def set_keywords(self, keywords):
        """Describe a dataset with a list of keywords.

        Args:
            keywords (list): sequence of strings

        """
        self.keywords = keywords

    def get_keywords(self):
        """Get the keywords describing the dataset.

        Returns:
            list

        """
        return self.keywords

    def set_lineage(self, statement):
        """Set the lineage statement for the dataset.

        Args:
            statement (str): general explanation describing the lineage or
                provenance of the dataset

        """
        self.lineage = statement

    def get_lineage(self):
        """Get the lineage statement of the dataset.

        Returns:
            str

        """
        return self.lineage

    def set_placenames(self, placenames):
        """Describe the geography of a dataset with a list of placenames.

        Args:
            placenames (list): sequence of strings

        """
        self.placenames = placenames

    def get_placenames(self):
        """Get the placenames describing the dataset.

        Returns:
            list

        """
        return self.placenames

    def set_purpose(self, purpose):
        """Add a purpose for the dataset.

        Args:
            purpose (str): description of the purpose of the source dataset

        """
        self.purpose = purpose

    def get_purpose(self):
        """Get ``purpose`` for the dataset.

        Returns:
            str

        """
        return self.purpose

    def set_url(self, url):
        """Add a url for the dataset.

        Args:
            url (str)

        """
        self.url = url

    def get_url(self):
        """Get the url for the dataset."""
        return self.url

    def write(self, workspace=None):
        """Write datapackage yaml to disk.

        This creates sidecar files with '.yml'
        appended to the full filename of the data source. For example,

        - 'myraster.tif'
        - 'myraster.tif.yml'

        ``metadata_path`` itself is not written to the document.

        Args:
            workspace (str): if ``None``, files write to the same location
                as the source data. If not ``None``, a path to a local directory
                to write files. They will still be named to match the source
                filename. Use this option if the source data is not on the local
                filesystem.

        """
        if workspace is None:
            target_path = self.metadata_path
        else:
            target_path = os.path.join(
                workspace, os.path.basename(self.metadata_path))

        # The yaml text may contain non-ASCII characters (it is dumped with
        # ``allow_unicode=True``), so do not rely on the locale's encoding.
        with open(target_path, 'w', encoding='utf-8') as file:
            file.write(utils.yaml_dump(
                self.model_dump(exclude={'metadata_path'})))

    def to_string(self):
        """Placeholder; not implemented yet and currently returns None."""
        pass
532
+
533
+
534
class TableResource(Resource):
    """Class for metadata for a table resource."""

    # ``TableSchema.fields`` is required, so the bare class cannot serve as
    # a default factory; start from a schema with no fields instead.
    data_model: TableSchema = Field(
        default_factory=lambda: TableSchema(fields=[]))

    def _get_field(self, name):
        """Get an attribute by its name property.

        Args:
            name (string): to match the value of the ``name`` attribute of
                a field

        Returns:
            tuple of (list index of the matching field, the FieldSchema)

        Raises:
            KeyError if no attributes exist in the resource or if the named
                attribute does not exist.

        """
        if not self.data_model.fields:
            raise KeyError(
                f'{self.data_model} has no fields')
        for idx, field in enumerate(self.data_model.fields):
            if field.name == name:
                return idx, field
        raise KeyError(
            f'{self.data_model} has no field named {name}')

    def set_field_description(self, name, title=None, description=None,
                              units=None, type=None):
        """Define metadata for a tabular field.

        Only the arguments that are not None are applied.

        Args:
            name (str): name and unique identifier of the field
            title (str): title for the field
            description (str): description of the field
            units (str): unit of measurement for the field's values
            type (str): datatype of values in the field

        Raises:
            KeyError if the resource has no field named ``name``.

        """
        # The field is updated in place, so it does not need to be stored
        # back into the list afterwards.
        _, field = self._get_field(name)

        if title is not None:
            field.title = title
        if description is not None:
            field.description = description
        if units is not None:
            field.units = units
        if type is not None:
            field.type = type

    def get_field_description(self, name):
        """Get the attribute metadata for a field.

        Args:
            name (str): name and unique identifier of the field

        Returns:
            FieldSchema

        Raises:
            KeyError if the resource has no field named ``name``.
        """
        _, field = self._get_field(name)
        return field
599
+
600
+
601
class ArchiveResource(Resource):
    """Metadata for an archive resource.

    Adds the required ``compression`` attribute to ``Resource``.
    """

    compression: str
605
+
606
+
607
class VectorResource(TableResource):
    """Metadata for a vector resource.

    Extends ``TableResource`` with two required attributes: the number of
    features and the spatial properties of the dataset.
    """

    n_features: int
    spatial: SpatialSchema
612
+
613
+
614
class RasterResource(Resource):
    """Class for metadata for a raster resource."""

    data_model: RasterSchema
    spatial: SpatialSchema

    def _get_band(self, band_number):
        """Return the band for a 1-based ``band_number``.

        Raises:
            IndexError if ``band_number`` is not between 1 and the number
                of bands.

        """
        bands = self.data_model.bands
        # Band numbers start at 1. Without this check, 0 or a negative
        # number would silently select a band counted from the end of the
        # list.
        if not 1 <= band_number <= len(bands):
            raise IndexError(
                f'band_number {band_number} is out of range; '
                f'expected a value from 1 to {len(bands)}')
        return bands[band_number - 1]

    def set_band_description(self, band_number, title=None,
                             description=None, units=None):
        """Define metadata for a raster band.

        Only the arguments that are not None are applied.

        Args:
            band_number (int): a raster band index, starting at 1
            title (str): title for the raster band
            description (str): description of the raster band
            units (str): unit of measurement for the band's pixel values

        Raises:
            IndexError if ``band_number`` does not identify a band.

        """
        # The band is updated in place, so it does not need to be stored
        # back into the list afterwards.
        band = self._get_band(band_number)

        if title is not None:
            band.title = title
        if description is not None:
            band.description = description
        if units is not None:
            band.units = units

    def get_band_description(self, band_number):
        """Get the attribute metadata for a band.

        Args:
            band_number (int): a raster band index, starting at 1

        Returns:
            BandSchema

        Raises:
            IndexError if ``band_number`` does not identify a band.

        """
        return self._get_band(band_number)
geometamaker/utils.py ADDED
@@ -0,0 +1,30 @@
1
+ import yaml
2
+
3
+
4
def _represent_str(dumper, data):
    """Represent a string, using literal block style if it spans lines.

    Args:
        dumper: the yaml dumper doing the representation
        data (str): the string to represent

    Returns:
        the scalar node produced by the default safe string representer,
        with its style set to ``'|'`` when ``data`` has more than one line

    """
    node = yaml.representer.SafeRepresenter.represent_str(dumper, data)
    is_multiline = len(data.splitlines()) > 1
    if is_multiline:
        # literal style, newline chars will be new lines
        node.style = '|'
    return node
9
+
10
+
11
class _SafeDumper(yaml.SafeDumper):
    """A ``SafeDumper`` that writes human-readable yaml.

    Strings containing newlines are written in literal block style and
    anchors/aliases are never emitted.
    """

    # https://stackoverflow.com/questions/13518819/avoid-references-in-pyyaml
    def ignore_aliases(self, data):
        """Keep the yaml human-readable by avoiding anchors and aliases."""
        return True


# Patch the default string representer to use a literal block
# style when the data contain newline characters.
# ``add_representer`` is a classmethod that updates the class-level registry,
# so it is registered once here rather than on every instantiation.
_SafeDumper.add_representer(str, _represent_str)
23
+
24
+
25
def yaml_dump(data):
    """Serialize ``data`` to a yaml string.

    Unicode characters are written as-is, keys keep their existing order,
    and the module's ``_SafeDumper`` is used for representation.

    Args:
        data: the object to serialize

    Returns:
        str

    """
    dump_options = {
        'allow_unicode': True,
        'sort_keys': False,
        'Dumper': _SafeDumper,
    }
    return yaml.dump(data, **dump_options)