metadata_crawler-2510.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of metadata-crawler has been flagged as potentially problematic.
Files changed (35)
  1. metadata_crawler/__init__.py +263 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +831 -0
  7. metadata_crawler/api/drs_config.toml +440 -0
  8. metadata_crawler/api/index.py +151 -0
  9. metadata_crawler/api/metadata_stores.py +755 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +140 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +547 -0
  22. metadata_crawler/data_collector.py +278 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +206 -0
  25. metadata_crawler/ingester/solr.py +282 -0
  26. metadata_crawler/logger.py +153 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +419 -0
  29. metadata_crawler/utils/__init__.py +482 -0
  30. metadata_crawler/utils/cftime_utils.py +207 -0
  31. metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
  32. metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
  33. metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
  34. metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
  35. metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/api/config.py
@@ -0,0 +1,831 @@
+"""API for loading crawler configuration."""
+
+from __future__ import annotations
+
+import os
+import re
+import textwrap
+from copy import deepcopy
+from datetime import datetime
+from enum import Enum, StrEnum
+from fnmatch import fnmatch
+from pathlib import Path
+from typing import (
+    Annotated,
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)
+from urllib.parse import urlsplit
+from warnings import catch_warnings
+
+import tomli
+import tomlkit
+import xarray
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    ValidationError,
+    field_validator,
+    model_validator,
+)
+from tomlkit.container import OutOfOrderTableProxy
+from tomlkit.items import Table
+
+from ..utils import (
+    MetadataCrawlerException,
+    convert_str_to_timestamp,
+    load_plugins,
+)
+from ..utils.cftime_utils import infer_cmor_like_time_frequency
+from .mixin import TemplateMixin
+from .storage_backend import Metadata, MetadataType
+
+
+class BaseType(str, Enum):
+    """Basic types."""
+
+    string = "string"
+    integer = "integer"
+    float = "float"
+    timestamp = "timestamp"
+
+
+class Types(str, Enum):
+    """Types supported by the config."""
+
+    string = "string"
+    integer = "integer"
+    float = "float"
+    timestamp = "timestamp"
+    daterange = "timestamp[2]"
+    path = "string"
+    uri = "string"
+    dataset = "string"
+    fmt = "string"
+    storage = "string"
+    bbox = "float[4]"
+
+
+class SchemaField(BaseModel):
+    """BaseModel defining the metadata schema."""
+
+    key: str
+    type: str
+    required: bool = False
+    default: Optional[Any] = None
+    length: Optional[int] = None
+    base_type: BaseType = BaseType.string
+    multi_valued: bool = True
+    indexed: bool = True
+    name: Optional[str] = None
+    unique: bool = False
+
+    @field_validator("type")
+    @classmethod
+    def parse_type(cls, v: str) -> str:
+        """Parse the data types.
+
+        Accepts
+        ^^^^^^^
+        - 'string', 'integer', 'float', 'timestamp' -> length=None
+        - 'float[2]', 'int[5]' -> length=number
+        - 'string[]' -> length=None, multi_valued semantics
+        - 'daterange' [timestamp, timestamp]
+        """
+        f_type = getattr(getattr(Types, v, None), "value", v)
+        m = re.fullmatch(r"({})(\[(\d*)\])?".format("|".join(BaseType)), f_type)
+        if not m:
+            raise MetadataCrawlerException(f"invalid type spec {v!r}")
+        base, _, num = m.groups()
+        setattr(
+            cls,
+            "__parsed_type",
+            {"base": base, "length": int(num) if num else None},
+        )
+        return v
+
+    @model_validator(mode="after")
+    def _set_parsed(self) -> "SchemaField":
+        parsed = getattr(self, "__parsed_type")
+        self.base_type = BaseType(parsed["base"])
+        self.length = parsed["length"]
+        self.name = self.name or self.key
+        return self
+
+    @staticmethod
+    def get_time_range(
+        time_stamp: Optional[Union[str, List[str]]],
+    ) -> List[datetime]:
+        """Convert a from-to time range into begin and end time steps."""
+        time_stamp = time_stamp or ""
+        if isinstance(time_stamp, str):
+            start_str, _, end_str = (
+                time_stamp.replace(":", "").replace("_", "-").partition("-")
+            )
+            time_stamp = [start_str or "fx", end_str or "fx"]
+        for n, ts in enumerate(time_stamp):
+            if hasattr(ts, "isoformat"):
+                time_stamp[n] = ts.isoformat()
+            else:
+                time_stamp[n] = str(ts) or "fx"
+        start = convert_str_to_timestamp(
+            time_stamp[0], alternative="0001-01-01T00:00"
+        )
+        end = convert_str_to_timestamp(
+            time_stamp[-1], alternative="9999-12-31T23:59"
+        )
+        return [start, end]
+
+
+class MetadataSource(StrEnum):
+    """Representation of how the metadata should be retrieved."""
+
+    storage = "storage"  # via intake/fdb5/etc.
+    path = "path"  # parse via specs_dir/specs_file
+    data = "data"  # read attributes from the file itself
+
+
+class VarAttrRule(BaseModel):
+    """How to read attributes from variables."""
+
+    var: str
+    attr: str  # attribute name on DataArray.attrs
+    default: Any = None
+
+
+class StatRule(BaseModel):
+    """How to apply statistics."""
+
+    stat: Literal["min", "max", "minmax", "range", "bbox", "timedelta"]
+    var: Optional[str] = None  # for numeric stats on a single var
+    coords: Optional[Union[List[str], str]] = None  # for time range, etc.
+    lat: Optional[str] = None  # convenience keys for bbox
+    lon: Optional[str] = None
+    default: Any = None
+
+
+class ConfigMerger:
+    """Load the system and user TOML, merging user settings over the system ones.
+
+    Merging the config preserves comments/formatting, and lets you inspect or
+    write the result.
+    """
+
+    def __init__(
+        self,
+        user_path: Optional[
+            Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
+        ] = None,
+    ):
+        # parse both documents
+        system_path = Path(__file__).parent / "drs_config.toml"
+        self._system_doc = tomlkit.parse(system_path.read_text(encoding="utf-8"))
+        _config: str = ""
+        if user_path is not None:
+            if isinstance(user_path, (str, Path)) and os.path.isdir(user_path):
+                _config = (
+                    Path(user_path).expanduser().absolute() / "drs_config.toml"
+                ).read_text(encoding="utf-8")
+            elif isinstance(user_path, (str, Path)) and os.path.isfile(user_path):
+                _config = (
+                    Path(user_path)
+                    .expanduser()
+                    .absolute()
+                    .read_text(encoding="utf-8")
+                )
+            elif isinstance(user_path, (str, Path)):
+                _config = str(user_path)
+            else:
+                _config = tomlkit.dumps(user_path)
+        if _config:
+            try:
+                self._user_doc = tomlkit.parse(_config)
+            except Exception:
+                raise MetadataCrawlerException(
+                    "Could not load config path."
+                ) from None
+            self._merge_tables(self._system_doc, self._user_doc)
+
+    def _merge_tables(
+        self,
+        base: Union[tomlkit.TOMLDocument, Table, OutOfOrderTableProxy],
+        override: Union[Table, tomlkit.TOMLDocument, OutOfOrderTableProxy],
+    ) -> None:
+
+        for key, value in override.items():
+            if key not in base:
+                base[key] = value
+                continue
+            if isinstance(value, (Table, OutOfOrderTableProxy)):
+                self._merge_tables(cast(Table, base[key]), value)
+            else:
+                base[key] = value
+
+    @property
+    def merged_doc(self) -> tomlkit.TOMLDocument:
+        """Return the merged TOMLDocument."""
+        return self._system_doc
+
+    def dumps(self) -> str:
+        """Return the merged document as a TOML string."""
+        return tomlkit.dumps(self.merged_doc)
+
+
+def strip_protocol(inp: str | Path) -> Path:
+    """Extract the path from a given input file system."""
+    abs_path = Path(urlsplit(str(inp)).path).expanduser()
+    return Path(abs_path)
+
+
+class CrawlerSettings(BaseModel):
+    """Define the user input for a data crawler session."""
+
+    name: str
+    search_path: Union[str, Path]
+
+    def model_post_init(self, __context: Any = None) -> None:
+        """Apply rules after init."""
+        self.search_path = str(self.search_path)
+
+
+class PathSpecs(BaseModel):
+    """Implementation of the Directory reference syntax."""
+
+    dir_parts: Optional[List[str]] = None
+    file_parts: Optional[List[str]] = None
+    file_sep: str = "_"
+
+    def _get_metadata_from_dir(
+        self, data: Dict[str, Any], rel_path: Path
+    ) -> None:
+        dir_parts = rel_path.parent.parts
+
+        if self.dir_parts and len(dir_parts) == len(self.dir_parts):
+            data.update(
+                {
+                    k: v
+                    for (k, v) in zip(self.dir_parts, dir_parts)
+                    if k not in data
+                }
+            )
+        elif self.dir_parts:
+            raise MetadataCrawlerException(
+                (
+                    f"Number of dir parts for {rel_path.parent} do not match "
+                    f"- needs: {len(self.dir_parts)} has: {len(dir_parts)}"
+                )
+            ) from None
+
+    def _get_metadata_from_filename(
+        self, data: Dict[str, Any], rel_path: Path
+    ) -> None:
+        if self.file_parts is None:
+            return
+        file_parts = rel_path.name.split(self.file_sep)
+        _parts: Dict[str, str] = {}
+        if len(file_parts) == len(self.file_parts):
+            _parts = dict(zip(self.file_parts, file_parts))
+        elif (
+            len(file_parts) == len(self.file_parts) - 1 and "fx" in rel_path.name
+        ):
+            _parts = dict(zip(self.file_parts[:-1], file_parts))
+        else:
+            raise MetadataCrawlerException(
+                (
+                    f"Number of file parts for {rel_path.name} do not match "
+                    f"- needs: {len(self.file_parts)} has: {len(file_parts)}"
+                )
+            )
+        data.update({k: v for (k, v) in _parts.items() if k not in data})
+
+    def get_metadata_from_path(self, rel_path: Path) -> Dict[str, Any]:
+        """Read path encoded metadata from path specs."""
+        data: Dict[str, Any] = {}
+        self._get_metadata_from_dir(data, rel_path)
+        self._get_metadata_from_filename(data, rel_path)
+        data.pop("_", None)
+        return data
+
+
+class DataSpecs(BaseModel):
+    """Rules for extracting metadata from the data files themselves."""
+
+    globals: Dict[str, str] = Field(default_factory=dict)
+    var_attrs: Dict[str, VarAttrRule] = Field(default_factory=dict)
+    stats: Dict[str, StatRule] = Field(default_factory=dict)
+    read_kws: Dict[str, Any] = Field(default_factory=dict)
+
+    def _set_global_attributes(
+        self, dset: "xarray.Dataset", out: Dict[str, Any]
+    ) -> None:
+
+        for facet, attr in self.globals.items():
+            if attr == "__variable__":
+                out[facet] = list(getattr(dset, "data_vars", dset.variables))
+            else:
+                out[facet] = dset.attrs.get(attr)
+
+    def _set_variable_attributes(
+        self, dset: "xarray.Dataset", out: Dict[str, Any]
+    ) -> None:
+        data_vars = list(getattr(dset, "data_vars", dset.variables))
+
+        def get_val(
+            rule: VarAttrRule, vnames: Union[str, List[str]]
+        ) -> List[Any]:
+            if isinstance(vnames, str):
+                vnames = [dv for dv in data_vars if fnmatch(dv, vnames)]
+            attr_list: List[Any] = []
+            for vname in vnames:
+                default = (rule.default or "").replace("__name__", vname) or vname
+                attr = (rule.attr or "").replace("__name__", vname) or vname
+                if vname in dset:
+                    attr_list.append(dset[vname].attrs.get(attr, default))
+                else:
+                    attr_list.append(default)
+            return attr_list
+
+        for facet, rule in self.var_attrs.items():
+            resolved: Union[str, List[str]] = rule.var
+            vals = get_val(rule, resolved)
+            if len(vals) == 1:
+                out[facet] = vals[0]
+            else:
+                out[facet] = vals
+
+    def _apply_stats_rules(
+        self, dset: "xarray.Dataset", out: Dict[str, Any]
+    ) -> None:
+
+        for facet, rule in self.stats.items():
+            coords: Optional[List[str]] = None
+            if rule.coords:
+                coords = (
+                    rule.coords
+                    if isinstance(rule.coords, list)
+                    else [rule.coords]
+                )
+            match rule.stat:
+                case "bbox":
+                    lat = rule.lat or (coords[0] if coords else "lat")
+                    lon = rule.lon or (
+                        coords[1] if coords and len(coords) > 1 else "lon"
+                    )
+                    out[facet] = rule.default
+                    if lat in dset and lon in dset:
+                        latv = dset[lat].values
+                        lonv = dset[lon].values
+                        out[facet] = [
+                            float(lonv.min()),
+                            float(lonv.max()),
+                            float(latv.min()),
+                            float(latv.max()),
+                        ]
+
+                case "range":
+                    coord = coords[0] if coords else None
+                    out[facet] = rule.default
+                    if coord and coord in dset.coords:
+                        arr = dset.coords[coord].values
+                        out[facet] = [arr.min(), arr.max()]
+
+                case "min" | "max" | "minmax":
+
+                    coord = coords[0] if coords else None
+                    var_name = rule.var if rule.var else coord
+                    out[facet] = rule.default
+                    if var_name and var_name in dset:
+                        arr = dset[var_name].values
+                        if rule.stat == "min":
+                            out[facet] = arr.min()
+                        elif rule.stat == "max":
+                            out[facet] = arr.max()
+                        else:
+                            out[facet] = [arr.min(), arr.max()]
+                case "timedelta":
+                    coord = coords[0] if coords else None
+                    out[facet] = infer_cmor_like_time_frequency(
+                        dset, rule.var or coord
+                    )
+
+    def extract_from_data(self, dset: xarray.Dataset) -> Dict[str, Any]:
+        """Extract metadata from the data."""
+        data: Dict[str, Any] = {}
+        self._set_global_attributes(dset, data)
+        self._set_variable_attributes(dset, data)
+        self._apply_stats_rules(dset, data)
+        return data
+
+
+class Datasets(BaseModel):
+    """Definition of datasets that should be crawled."""
+
+    __pydantic_extra__: Dict[str, Any] = Field(init=False)
+    model_config = ConfigDict(extra="allow")
+    root_path: str | Path
+    drs_format: str = "freva"
+    fs_type: str = "posix"
+    defaults: Dict[str, Any] = Field(default_factory=dict)
+    storage_options: Dict[str, Any] = Field(default_factory=dict)
+    glob_pattern: str = "*.*"
+    inherits_from: str = Field(default_factory=str)
+
+    @field_validator("storage_options", mode="after")
+    @classmethod
+    def _render_storage_options(
+        cls, storage_options: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        tmpl = TemplateMixin()
+        return cast(Dict[str, Any], tmpl.render_templates(storage_options, {}))
+
+    def model_post_init(self, __context: Any = None) -> None:
+        """Apply rules after init."""
+        storage_plugins = load_plugins("metadata_crawler.storage")
+        try:
+            self.backend = storage_plugins[self.fs_type](**self.storage_options)
+        except KeyError:
+            raise NotImplementedError(
+                f"Backend not available. `{self.fs_type}` extension missing?"
+            ) from None
+
+
+class ConditionalRule(BaseModel):
+    """Define conditional rules."""
+
+    type: Literal["conditional"] = "conditional"
+    condition: str
+    true: Any
+    false: Any
+
+
+class CallRule(BaseModel):
+    """Define caller rules."""
+
+    type: Literal["call"] = "call"
+    call: str
+
+
+class LookupRule(BaseModel):
+    """Define lookup table rules."""
+
+    type: Literal["lookup"] = "lookup"
+    tree: List[str] = Field(default_factory=list)
+    attribute: str
+    standard: Optional[str] = None
+
+
+SpecialRule = Annotated[
+    Union[ConditionalRule, CallRule, LookupRule], Field(discriminator="type")
+]
+
+
+class Dialect(BaseModel):
+    """Settings for a DRS Format."""
+
+    facets: Dict[str, str | list[str]] = Field(default_factory=dict)
+    defaults: Dict[str, Any] = Field(default_factory=dict)
+    path_specs: PathSpecs = Field(default_factory=PathSpecs)
+    data_specs: DataSpecs = Field(default_factory=DataSpecs)
+    special: Dict[str, SpecialRule] = Field(default_factory=dict)
+    domains: Dict[str, List[float]] = Field(default_factory=dict)
+    sources: List[MetadataSource] = Field(
+        default_factory=lambda: [
+            MetadataSource.path,
+        ],
+        description="Priority list of where to retrieve metadata",
+    )
+    inherits_from: Optional[str] = None
+
+    @field_validator("sources", mode="after")
+    @classmethod
+    def _validate_sources(cls, srcs: List[str]) -> List[str]:
+        # ensure only allowed sources are present
+        names = {name.upper() for name in MetadataSource.__members__.keys()}
+        values = {m.value for m in MetadataSource}
+        invalid = [s for s in srcs if s.upper() not in names and s not in values]
+        if invalid:
+            allowed = sorted(values | {n.lower() for n in names})
+            raise MetadataCrawlerException(
+                f"Invalid metadata source(s): {invalid!r}. Allowed: {allowed}"
+            )
+        return srcs
+
+
+class DRSConfig(BaseModel, TemplateMixin):
+    """BaseModel for the entire user config."""
+
+    datasets: Dict[str, Datasets]
+    index_schema: Dict[str, SchemaField] = Field(...)
+    suffixes: List[str] = Field(default_factory=list)
+    storage_options: Dict[str, Any] = Field(default_factory=dict)
+    defaults: Dict[str, Any] = Field(default_factory=dict)
+    special: Dict[str, SpecialRule] = Field(default_factory=dict)
+    dialect: Dict[str, Dialect]
+
+    def model_post_init(self, __context: Any = None) -> None:
+        """Apply special rules after init."""
+        self._defaults: Dict[str, Any] = {}
+        self.suffixes = self.suffixes or [
+            ".zarr",
+            ".zar",
+            ".nc4",
+            ".nc",
+            ".tar",
+            ".hdf5",
+            ".h5",
+        ]
+        for key, dset in self.datasets.items():
+            self.dialect.setdefault(key, self.dialect[dset.drs_format])
+            dset.backend.suffixes = self.suffixes
+            for key, option in self.storage_options.items():
+                dset.backend.storage_options.setdefault(key, option)
+        for key, dset in self.datasets.items():
+            self._defaults.setdefault(key, {})
+            for k, _def in (dset.defaults or {}).items():
+                self._defaults[key].setdefault(k, _def)
+            for k, _def in self.dialect[dset.drs_format].defaults.items():
+                self._defaults[key].setdefault(k, _def)
+            for k, _def in self.defaults.items():
+                self._defaults[key].setdefault(k, _def)
+        self.prep_template_env()
+        for standard in self.dialect:
+            for key in self.special:
+                self.dialect[standard].special.setdefault(key, self.special[key])
+
+    @model_validator(mode="before")
+    @classmethod
+    def _dump_(cls, values: Any) -> Any:
+        setattr(cls, "_model_dict", values)
+        return values
+
+    @model_validator(mode="before")
+    def _resolve_inheritance(cls, values: Any) -> Any:
+        """Apply inheritance.
+
+        After loading raw TOML into dicts, but before model instantiation, merge
+        any dialects that declare `inherits_from`.
+        """
+        if not isinstance(values, dict):
+            return values  # pragma: no cover
+
+        def _deep_merge(a: Dict[str, Any], b: Dict[str, Any]) -> None:
+            for k, v in b.items():
+                if k in a and isinstance(a[k], dict) and isinstance(v, dict):
+                    if not v:
+                        a[k] = {}
+                    else:
+                        _deep_merge(a[k], v)
+                else:
+                    a[k] = v
+
+        for key in ("dialect", "datasets"):
+            raw = values.get(key, {})
+            merged = deepcopy(raw)
+            for name, cfg in raw.items():
+                parent = cfg.get("inherits_from")
+                if parent:
+                    if parent not in merged:
+                        raise MetadataCrawlerException(
+                            f"'{name}' inherits from unknown '{parent}'"
+                        )
+                    # take parent base, then overlay this dialect
+                    base = deepcopy(
+                        merged[parent]
+                    )  # deep copy of the parent raw dict
+                    # remove inherits_from to avoid cycles
+                    child = deepcopy(cfg)
+                    child.pop("inherits_from", None)
+                    # deep-merge child into base
+                    _deep_merge(base, child)
+                    base["inherits_from"] = parent
+                    merged[name] = base
+
+            values[key] = merged
+        return values
+
+    @model_validator(mode="before")
+    def _ensure_dialects(cls, values: Any) -> Any:
+        """Ensure every dialect is a Dialect model."""
+        if not isinstance(values, dict):
+            return values  # pragma: no cover
+
+        raw = values.get("dialect", {})
+        values["dialect"] = {k: v for k, v in raw.items()}
+        return values
+
+    def _apply_special_rules(
+        self,
+        standard: str,
+        drs_type: str,
+        inp: Metadata,
+        specials: Dict[str, SpecialRule],
+    ) -> None:
+        data = {**inp.metadata, **{"file": inp.path, "uri": inp.path}}
+
+        for facet, rule in specials.items():
+            result: Any = None
+            if inp.metadata.get(facet):
+                continue
+            match rule.type:
+                case "conditional":
+                    _rule = textwrap.dedent(rule.condition or "").strip()
+                    s_cond = self.render_templates(_rule, data)
+                    cond = eval(
+                        s_cond, {}, getattr(self, "_model_dict", {})
+                    )  # nosec
+                    result = rule.true if cond else rule.false
+                case "lookup":
+                    args = cast(List[str], self.render_templates(rule.tree, data))
+
+                    result = self.datasets[standard].backend.lookup(
+                        inp.path,
+                        self.render_templates(rule.attribute, data),
+                        rule.standard or drs_type,
+                        *args,
+                        **self.dialect[standard].data_specs.read_kws,
+                    )
+                case "call":
+                    _call = textwrap.dedent(rule.call or "").strip()
+                    result = eval(
+                        self.render_templates(_call, data),
+                        {},
+                        getattr(self, "_model_dict", {}),
+                    )  # nosec
+            if result:
+                inp.metadata[facet] = result
+
+    def _metadata_from_path(self, path: str, standard: str) -> Dict[str, Any]:
+        """Extract the metadata from the path."""
+        drs_type = self.datasets[standard].drs_format
+        root_path = strip_protocol(
+            self.datasets[standard].backend.path(
+                self.datasets[standard].root_path
+            )
+        )
+        _path = strip_protocol(self.datasets[standard].backend.path(path))
+        rel_path = _path.with_suffix("").relative_to(root_path)
+        return self.dialect[drs_type].path_specs.get_metadata_from_path(rel_path)
+
+    @classmethod
+    def load(
+        cls,
+        config_path: Optional[
+            Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
+        ] = None,
+    ) -> DRSConfig:
+        """Load a drs config from file."""
+        cfg = tomli.loads(ConfigMerger(config_path).dumps())
+        settings = cfg.pop("drs_settings")
+        try:
+            return cls(datasets=cfg, **settings)
+        except ValidationError as e:
+            msgs = []
+            for err in e.errors():
+                loc = ".".join(str(x) for x in err["loc"])
+                msgs.append(f"{loc}: {err['msg']}")
+            raise MetadataCrawlerException(
+                "DRSConfig validation failed:\n" + "\n".join(msgs)
+            ) from None
+
+    def max_directory_tree_level(
+        self, search_dir: str | Path, drs_type: str
+    ) -> Tuple[int, bool]:
+        """Get the maximum level for descending into directories.
+
+        When searching for files in a directory we can only traverse the directory
+        search tree until the version level is reached. This level is set as a hard
+        threshold. If the drs type has no version we can indeed go all the way down
+        to the file level.
+        """
+        root_path = strip_protocol(
+            self.datasets[drs_type].backend.path(
+                self.datasets[drs_type].root_path
+            )
+        )
+        search_dir = strip_protocol(
+            self.datasets[drs_type].backend.path(search_dir)
+        )
+        standard = self.datasets[drs_type].drs_format
+        version = cast(
+            str, self.dialect[standard].facets.get("version", "version")
+        )
+        is_versioned = True
+        dir_parts = self.dialect[standard].path_specs.dir_parts or []
+        try:
+            version_idx = dir_parts.index(version)
+        except ValueError:
+            # No version given
+            version_idx = len(dir_parts)
+            is_versioned = False
+        if root_path == search_dir:
+            current_pos = 0
+        else:
+            current_pos = len(search_dir.relative_to(root_path).parts)
+        return version_idx - current_pos, is_versioned
+
+    def is_complete(self, data: Dict[str, Any], standard: str) -> bool:
+        """Check if all metadata that can be collected was collected."""
+        if not data:
+            return False
+        complete = True
+        preset = {**self._defaults[standard], **self.dialect[standard].special}
+        facets = (
+            k for k, v in self.index_schema.items() if not v.key.startswith("__")
+        )
+        for facet in self.dialect[standard].facets or facets:
+            if facet not in data and facet not in preset:
+                complete = False
+        return complete
+
+    def _read_metadata(self, standard: str, inp: Metadata) -> Dict[str, Any]:
+        """Get the metadata from a store."""
+        drs_type = self.datasets[standard].drs_format
+        for source in self.dialect[drs_type].sources:
+            if self.is_complete(inp.metadata, standard) is True:
+                break
+            match source:
+                case MetadataSource.path:
+                    inp.metadata.update(
+                        self._metadata_from_path(inp.path, standard)
+                    )
+                case MetadataSource.data:
+                    with catch_warnings(action="ignore", category=RuntimeWarning):
+                        with self.datasets[standard].backend.open_dataset(
+                            inp.path, **self.dialect[standard].data_specs.read_kws
+                        ) as ds:
+                            inp.metadata.update(
+                                self.dialect[
+                                    standard
+                                ].data_specs.extract_from_data(ds)
+                            )
+        self._apply_special_rules(
+            standard, drs_type, inp, self.dialect[standard].special
+        )
+        return self._translate(standard, inp)
+
+    def read_metadata(self, standard: str, inp: MetadataType) -> Dict[str, Any]:
+        """Get the metadata for a given file path."""
+        return self._read_metadata(
+            standard,
+            Metadata(path=inp["path"], metadata=inp["metadata"].copy()),
+        )
+
+    def _translate(self, standard: str, inp: Metadata) -> Dict[str, Any]:
+        out: Dict[str, Any] = {}
+        # locals to cut attribute lookups
+        defs = self._defaults[standard]
+        dia = self.dialect[standard]
+        facets_get = dia.facets.get
+        backend = self.datasets[standard].backend
+        mget = inp.metadata.get
+        defs_get = defs.get
+        path = inp.path
+        fmt = path.rsplit(".", 1)[1] if "." in path else ""
+
+        precomputed: Dict[str, Any] = {
+            "path": backend.path(path),
+            "uri": backend.uri(path),
+            "storage": backend.fs_type(path),
+            "dataset": standard,
+            "fmt": fmt,
+        }
+        val: Any = ""
+        out_set = out.__setitem__
+        for field, schema in self.index_schema.items():
+            if schema.indexed is False:
+                continue
+
+            stype = schema.type
+
+            # Fast path for simple, precomputed types
+            if stype in precomputed and stype != "daterange":
+                val = precomputed[stype]
+
+            elif stype == "daterange":
+                src = mget(field) or defs_get(field)
+                val = schema.get_time_range(src)
+
+            else:
+                # Resolve metadata key via facets once; default to field name
+                key = cast(str, facets_get(schema.key, field))
+                val = mget(key) or defs_get(key)
+
+            # Fall back to schema.default for falsy values
+            val = val or schema.default
+
+            # Multi-valued normalization
+            if (
+                (schema.multi_valued or schema.length)
+                and val
+                and not isinstance(val, list)
+            ):
+                val = [val]
+
+            out_set(field, val)
+        return out
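
For orientation, the two entry points most callers of this module touch are DRSConfig.load() (which merges an optional user TOML over the packaged drs_config.toml) and DRSConfig.read_metadata() (which turns a file path plus any pre-collected metadata into a record keyed by the configured index schema). The sketch below only illustrates the call shapes visible in the diff above; the dataset key "obs" and the example file path are hypothetical placeholders, not values shipped with the package.

# Illustrative sketch only: the dataset key "obs" and the file path are
# hypothetical; the call signatures follow metadata_crawler/api/config.py above.
from metadata_crawler.api.config import DRSConfig

# With no argument, load() parses the packaged drs_config.toml; a str/Path,
# dict, or TOML document passed here is merged over those defaults.
config = DRSConfig.load()

# read_metadata() takes the dataset key and a mapping with "path" and
# "metadata" entries, and returns a flat dict keyed by the index schema.
record = config.read_metadata(
    "obs",  # hypothetical dataset defined as a top-level [obs] table in the config
    {"path": "/data/obs/tas_day_20000101-20001231.nc", "metadata": {}},
)
print(record)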