metadata-crawler 2509.0.0 (metadata_crawler-2509.0.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metadata-crawler might be problematic.

Files changed (34)
  1. metadata_crawler/__init__.py +248 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +801 -0
  7. metadata_crawler/api/drs_config.toml +439 -0
  8. metadata_crawler/api/index.py +132 -0
  9. metadata_crawler/api/metadata_stores.py +749 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +136 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +539 -0
  22. metadata_crawler/data_collector.py +258 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +193 -0
  25. metadata_crawler/ingester/solr.py +152 -0
  26. metadata_crawler/logger.py +142 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +373 -0
  29. metadata_crawler/utils.py +411 -0
  30. metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
  31. metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
  32. metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
  33. metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
  34. metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/api/config.py
@@ -0,0 +1,801 @@
1
+ """API for loading crawler configuration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import textwrap
8
+ from copy import deepcopy
9
+ from datetime import datetime
10
+ from enum import Enum, StrEnum
11
+ from fnmatch import fnmatch
12
+ from pathlib import Path
13
+ from typing import (
14
+ Annotated,
15
+ Any,
16
+ Dict,
17
+ List,
18
+ Literal,
19
+ Optional,
20
+ Union,
21
+ cast,
22
+ )
23
+ from urllib.parse import urlsplit
24
+ from warnings import catch_warnings
25
+
26
+ import tomli
27
+ import tomlkit
28
+ import xarray
29
+ from pydantic import (
30
+ BaseModel,
31
+ ConfigDict,
32
+ Field,
33
+ ValidationError,
34
+ field_validator,
35
+ model_validator,
36
+ )
37
+ from tomlkit.container import OutOfOrderTableProxy
38
+ from tomlkit.items import Table
39
+
40
+ from ..utils import (
41
+ MetadataCrawlerException,
42
+ convert_str_to_timestamp,
43
+ load_plugins,
44
+ )
45
+ from .mixin import TemplateMixin
46
+ from .storage_backend import Metadata, MetadataType
47
+
48
+
49
+ class BaseType(str, Enum):
50
+ """Basic types."""
51
+
52
+ string = "string"
53
+ integer = "integer"
54
+ float = "float"
55
+ timestamp = "timestamp"
56
+
57
+
58
+ class Types(str, Enum):
59
+ """Types supported by the config."""
60
+
61
+ string = "string"
62
+ integer = "integer"
63
+ float = "float"
64
+ timestamp = "timestamp"
65
+ daterange = "timestamp[2]"
66
+ path = "string"
67
+ uri = "string"
68
+ dataset = "string"
69
+ fmt = "string"
70
+ storage = "string"
71
+ bbox = "float[4]"
72
+
73
+
74
+ class SchemaField(BaseModel):
75
+ """BaseModel defining the metadata schema."""
76
+
77
+ key: str
78
+ type: str
79
+ required: bool = False
80
+ default: Optional[Any] = None
81
+ length: Optional[int] = None
82
+ base_type: BaseType = BaseType.string
83
+ multi_valued: bool = True
84
+ indexed: bool = True
85
+ name: Optional[str] = None
86
+ unique: bool = False
87
+
88
+ @field_validator("type")
89
+ @classmethod
90
+ def parse_type(cls, v: str) -> str:
91
+ """Parse the data types.
92
+
93
+ Accepts
94
+ ^^^^^^^
95
+ - 'string', 'integer', 'float', 'timestamp' -> length=None
96
+ - 'float[2]', 'integer[5]' -> length=number
97
+ - 'string[]' -> length=None, multi_valued semantics
98
+ - 'daterange' -> [timestamp, timestamp]
99
+ """
100
+ f_type = getattr(getattr(Types, v, None), "value", v)
101
+ m = re.fullmatch(r"({})(\[(\d*)\])?".format("|".join(BaseType)), f_type)
102
+ if not m:
103
+ raise MetadataCrawlerException(f"invalid type spec {v!r}")
104
+ base, _, num = m.groups()
105
+ setattr(
106
+ cls,
107
+ "__parsed_type",
108
+ {"base": base, "length": int(num) if num else None},
109
+ )
110
+ return v
111
+
112
+ @model_validator(mode="after")
113
+ def _set_parsed(self) -> "SchemaField":
114
+ parsed = getattr(self, "__parsed_type")
115
+ self.base_type = BaseType(parsed["base"])
116
+ self.length = parsed["length"]
117
+ self.name = self.name or self.key
118
+ return self
119
+
120
+ @staticmethod
121
+ def get_time_range(
122
+ time_stamp: Optional[Union[str, List[str]]],
123
+ ) -> List[datetime]:
124
+ """Convert a from to time range to a begin or end time step."""
125
+ time_stamp = time_stamp or ""
126
+ if isinstance(time_stamp, str):
127
+ start_str, _, end_str = (
128
+ time_stamp.replace(":", "").replace("_", "-").partition("-")
129
+ )
130
+ time_stamp = [start_str or "fx", end_str or "fx"]
131
+ for n, ts in enumerate(time_stamp):
132
+ if hasattr(ts, "isoformat"):
133
+ ts = ts.isoformat()
134
+ time_stamp[n] = str(ts) or "fx"
135
+ start = convert_str_to_timestamp(
136
+ time_stamp[0], alternative="0001-01-01T00:00"
137
+ )
138
+ end = convert_str_to_timestamp(
139
+ time_stamp[-1], alternative="9999-12-31T23:59"
140
+ )
141
+ return [start, end]
142
+
143
+
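A brief sketch of how a schema field's type spec resolves, assuming the package is installed so that metadata_crawler.api.config is importable; the field names are illustrative:

    from metadata_crawler.api.config import BaseType, SchemaField

    field = SchemaField(key="bbox", type="bbox")            # alias for "float[4]"
    assert field.base_type is BaseType.float and field.length == 4

    time_field = SchemaField(key="time", type="daterange")  # alias for "timestamp[2]"
    # "200001-200512" is split on "-"; missing ends fall back to 0001-01-01 / 9999-12-31
    start, end = time_field.get_time_range("200001-200512")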
144
+ class MetadataSource(StrEnum):
145
+ """Representation of how the metadata should be retrieved."""
146
+
147
+ storage = "storage" # via intake/fdb5/etc.
148
+ path = "path" # parse via specs_dir/specs_file
149
+ data = "data" # read attributes from the file itself
150
+
151
+
152
+ class VarAttrRule(BaseModel):
153
+ """How to read attributes from variables."""
154
+
155
+ var: str
156
+ attr: str # attribute name on DataArray.attrs
157
+ default: Any = None
158
+
159
+
160
+ class StatRule(BaseModel):
161
+ """How to apply statistics."""
162
+
163
+ stat: Literal["min", "max", "minmax", "range", "bbox"]
164
+ var: Optional[str] = None # for numeric stats on a single var
165
+ coords: Optional[Union[List[str], str]] = None # for time range, etc.
166
+ lat: Optional[str] = None # convenience keys for bbox
167
+ lon: Optional[str] = None
168
+ default: Any = None
169
+
170
+
171
+ class ConfigMerger:
172
+ """Load the system and user TOML, merges user -> system under.
173
+
174
+ Merging the config preserves comments/formatting, and lets you inspect or
175
+ write the result.
176
+ """
177
+
178
+ def __init__(
179
+ self,
180
+ user_path: Optional[
181
+ Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
182
+ ] = None,
183
+ ):
184
+ # parse both documents
185
+ system_path = Path(__file__).parent / "drs_config.toml"
186
+ self._system_doc = tomlkit.parse(system_path.read_text(encoding="utf-8"))
187
+ _config: str = ""
188
+ if user_path is not None:
189
+ if isinstance(user_path, (str, Path)) and os.path.isdir(user_path):
190
+ _config = (
191
+ Path(user_path).expanduser().absolute() / "drs_config.toml"
192
+ ).read_text(encoding="utf-8")
193
+ elif isinstance(user_path, (str, Path)) and os.path.isfile(user_path):
194
+ _config = (
195
+ Path(user_path)
196
+ .expanduser()
197
+ .absolute()
198
+ .read_text(encoding="utf-8")
199
+ )
200
+ elif isinstance(user_path, (str, Path)):
201
+ _config = str(user_path)
202
+ else:
203
+ _config = tomlkit.dumps(user_path)
204
+ if _config:
205
+ try:
206
+ self._user_doc = tomlkit.parse(_config)
207
+ except Exception:
208
+ raise MetadataCrawlerException(
209
+ "Could not load config path."
210
+ ) from None
211
+ self._merge_tables(self._system_doc, self._user_doc)
212
+
213
+ def _merge_tables(
214
+ self,
215
+ base: Union[tomlkit.TOMLDocument, Table, OutOfOrderTableProxy],
216
+ override: Union[Table, tomlkit.TOMLDocument, OutOfOrderTableProxy],
217
+ ) -> None:
218
+
219
+ for key, value in override.items():
220
+ if key not in base:
221
+ base[key] = value
222
+ continue
223
+ if isinstance(value, (Table, OutOfOrderTableProxy)):
224
+ self._merge_tables(cast(Table, base[key]), value)
225
+ else:
226
+ base[key] = value
227
+
228
+ @property
229
+ def merged_doc(self) -> tomlkit.TOMLDocument:
230
+ """Return the merged TOMLDocument."""
231
+ return self._system_doc
232
+
233
+ def dumps(self) -> str:
234
+ """Return the merged document as a TOML string."""
235
+ return tomlkit.dumps(self.merged_doc)
236
+
237
+
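A minimal sketch of how the merger could be driven; the directory path is hypothetical and only needs to contain a drs_config.toml:

    from metadata_crawler.api.config import ConfigMerger

    merger = ConfigMerger("/path/to/project")  # dir or file path, TOML string, dict, or TOMLDocument
    print(merger.dumps())                      # bundled defaults with the user values merged on top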
238
+ def strip_protocol(inp: str | Path) -> Path:
239
+ """Extract the path from a given input file system."""
240
+ abs_path = Path(urlsplit(str(inp)).path).expanduser()
241
+ return Path(abs_path)
242
+
243
+
244
+ class CrawlerSettings(BaseModel):
245
+ """Define the user input for a data crawler session."""
246
+
247
+ name: str
248
+ search_path: Union[str, Path]
249
+
250
+ def model_post_init(self, __context: Any = None) -> None:
251
+ """Apply rules after init."""
252
+ self.search_path = str(self.search_path)
253
+
254
+
255
+ class PathSpecs(BaseModel):
256
+ """Implementation of the Directory reference syntax."""
257
+
258
+ dir_parts: List[str] = Field(default_factory=list)
259
+ file_parts: List[str] = Field(default_factory=list)
260
+ file_sep: str = "_"
261
+
262
+ def get_metadata_from_path(self, rel_path: Path) -> Dict[str, Any]:
263
+ """Read path encoded metadata from path specs."""
264
+ dir_parts = rel_path.parent.parts
265
+ file_parts = rel_path.name.split(self.file_sep)
266
+ if len(dir_parts) == len(self.dir_parts):
267
+ data: Dict[str, Any] = dict(zip(self.dir_parts, dir_parts))
268
+ else:
269
+ raise MetadataCrawlerException(
270
+ (
271
+ f"Number of dir parts for {rel_path.parent} do not match "
272
+ f"- needs: {len(self.dir_parts)} has: {len(dir_parts)}"
273
+ )
274
+ ) from None
275
+ if len(file_parts) == len(self.file_parts):
276
+ _parts = dict(zip(self.file_parts, file_parts))
277
+ elif (
278
+ len(file_parts) == len(self.file_parts) - 1 and "fx" in rel_path.name
279
+ ):
280
+ _parts = dict(zip(self.file_parts[:-1], file_parts))
281
+ else:
282
+ raise MetadataCrawlerException(
283
+ (
284
+ f"Number of file parts for {rel_path.name} do not match "
285
+ f"- needs: {len(self.file_parts)} has: {len(file_parts)})"
286
+ )
287
+ )
288
+ _parts.setdefault("time", "fx")
289
+ data.update({k: v for (k, v) in _parts.items() if k not in data})
290
+ data.pop("_", None)
291
+ return data
292
+
293
+
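A sketch of the path-parsing rules above, using hypothetical dir_parts/file_parts; note that callers such as _metadata_from_path strip the file suffix before handing the relative path over:

    from pathlib import Path
    from metadata_crawler.api.config import PathSpecs

    specs = PathSpecs(
        dir_parts=["project", "model", "experiment", "variable"],
        file_parts=["variable", "model", "experiment", "time"],
    )
    rel = Path("cmip/mpi-esm/historical/tas/tas_mpi-esm_historical_200001-200512")
    specs.get_metadata_from_path(rel)
    # {'project': 'cmip', 'model': 'mpi-esm', 'experiment': 'historical',
    #  'variable': 'tas', 'time': '200001-200512'}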
294
+ class DataSpecs(BaseModel):
295
+ """BaseModel for the configuration."""
296
+
297
+ globals: Dict[str, str] = Field(default_factory=dict)
298
+ var_attrs: Dict[str, VarAttrRule] = Field(default_factory=dict)
299
+ stats: Dict[str, StatRule] = Field(default_factory=dict)
300
+ read_kws: Dict[str, Any] = Field(default_factory=dict)
301
+
302
+ def _set_global_attributes(
303
+ self, dset: "xarray.Dataset", out: Dict[str, Any]
304
+ ) -> None:
305
+
306
+ for facet, attr in self.globals.items():
307
+ if attr == "__variable__":
308
+ out[facet] = list(getattr(dset, "data_vars", dset.variables))
309
+ else:
310
+ out[facet] = dset.attrs.get(attr)
311
+
312
+ def _set_variable_attributes(
313
+ self, dset: "xarray.Dataset", out: Dict[str, Any]
314
+ ) -> None:
315
+ data_vars = list(getattr(dset, "data_vars", dset.variables))
316
+
317
+ def get_val(
318
+ rule: VarAttrRule, vnames: Union[str, List[str]]
319
+ ) -> List[Any]:
320
+ if isinstance(vnames, str):
321
+ vnames = [dv for dv in data_vars if fnmatch(dv, vnames)]
322
+ attr_list: List[Any] = []
323
+ for vname in vnames:
324
+ default = (rule.default or "").replace("__name__", vname) or vname
325
+ attr = (rule.attr or "").replace("__name__", vname) or vname
326
+ if vname in dset:
327
+ attr_list.append(dset[vname].attrs.get(attr, default))
328
+ else:
329
+ attr_list.append(default)
330
+ return attr_list
331
+
332
+ for facet, rule in self.var_attrs.items():
333
+ resolved: Union[str, List[str]] = rule.var
334
+ vals = get_val(rule, resolved)
335
+ if len(vals) == 1:
336
+ out[facet] = vals[0]
337
+ else:
338
+ out[facet] = vals
339
+
340
+ def _apply_stats_rules(
341
+ self, dset: "xarray.Dataset", out: Dict[str, Any]
342
+ ) -> None:
343
+
344
+ for facet, rule in self.stats.items():
345
+ coords: Optional[List[str]] = None
346
+ if rule.coords:
347
+ coords = (
348
+ rule.coords
349
+ if isinstance(rule.coords, list)
350
+ else [rule.coords]
351
+ )
352
+ match rule.stat:
353
+ case "bbox":
354
+ lat = rule.lat or (coords[0] if coords else "lat")
355
+ lon = rule.lon or (
356
+ coords[1] if coords and len(coords) > 1 else "lon"
357
+ )
358
+ out[facet] = rule.default
359
+ if lat in dset and lon in dset:
360
+ latv = dset[lat].values
361
+ lonv = dset[lon].values
362
+ out[facet] = [
363
+ float(lonv.min()),
364
+ float(lonv.max()),
365
+ float(latv.min()),
366
+ float(latv.max()),
367
+ ]
368
+
369
+ case "range":
370
+ coord = coords[0] if coords else None
371
+ out[facet] = rule.default
372
+ if coord and coord in dset.coords:
373
+ arr = dset.coords[coord].values
374
+ out[facet] = [arr.min(), arr.max()]
375
+
376
+ case "min" | "max" | "minmax":
377
+
378
+ coord = coords[0] if coords else None
379
+ var_name = rule.var if rule.var else coord
380
+ out[facet] = rule.default
381
+ if var_name and var_name in dset:
382
+ arr = dset[var_name].values
383
+ if rule.stat == "min":
384
+ out[facet] = arr.min()
385
+ elif rule.stat == "max":
386
+ out[facet] = arr.max()
387
+ else:
388
+ out[facet] = [arr.min(), arr.max()]
389
+
390
+ def extract_from_data(self, dset: xarray.Dataset) -> Dict[str, Any]:
391
+ """Extract metadata from the data."""
392
+ data: Dict[str, Any] = {}
393
+ self._set_global_attributes(dset, data)
394
+ self._set_variable_attributes(dset, data)
395
+ self._apply_stats_rules(dset, data)
396
+ return data
397
+
398
+
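A sketch of how the extraction rules above could be declared; the attribute names and the file are hypothetical:

    import xarray
    from metadata_crawler.api.config import DataSpecs, StatRule, VarAttrRule

    specs = DataSpecs(
        globals={"project": "project_id", "variable": "__variable__"},
        var_attrs={"units": VarAttrRule(var="*", attr="units")},
        stats={"bbox": StatRule(stat="bbox", lat="lat", lon="lon")},
    )
    with xarray.open_dataset("tas_example.nc") as dset:   # hypothetical file
        metadata = specs.extract_from_data(dset)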
399
+ class Datasets(BaseModel):
400
+ """Definition of datasets that should be crawled."""
401
+
402
+ __pydantic_extra__: Dict[str, str] = Field(init=False)
403
+ model_config = ConfigDict(extra="allow")
404
+ root_path: str | Path
405
+ drs_format: str = "freva"
406
+ fs_type: str = "posix"
407
+ defaults: Dict[str, Any] = Field(default_factory=dict)
408
+ storage_options: Dict[str, Any] = Field(default_factory=dict)
409
+ glob_pattern: str = "*.*"
410
+ inherits_from: str = Field(default_factory=str)
411
+
412
+ @field_validator("storage_options", mode="after")
413
+ @classmethod
414
+ def _render_storage_options(
415
+ cls, storage_options: Dict[str, Any]
416
+ ) -> Dict[str, Any]:
417
+ tmpl = TemplateMixin()
418
+ return cast(Dict[str, Any], tmpl.render_templates(storage_options, {}))
419
+
420
+ def model_post_init(self, __context: Any = None) -> None:
421
+ """Apply rules after init."""
422
+ storage_plugins = load_plugins("metadata_crawler.storage")
423
+ try:
424
+ self.backend = storage_plugins[self.fs_type](**self.storage_options)
425
+ except KeyError:
426
+ raise NotImplementedError(
427
+ f"Backend not available. `{self.fs_type}` extension missing?"
428
+ ) from None
429
+
430
+
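A sketch of a dataset entry as it would be built from the datasets tables of the TOML config; the bucket, format name and options are hypothetical, and the chosen fs_type must have a matching metadata_crawler.storage plugin installed:

    from metadata_crawler.api.config import Datasets

    dset = Datasets(
        root_path="s3://my-bucket/cmip6",
        drs_format="cmip6",
        fs_type="s3",
        storage_options={"anon": True},
        glob_pattern="*.nc",
    )
    print(dset.backend)   # storage backend resolved in model_post_init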
431
+ class ConditionalRule(BaseModel):
432
+ """Define conditional rules."""
433
+
434
+ type: Literal["conditional"] = "conditional"
435
+ condition: str
436
+ true: Any
437
+ false: Any
438
+
439
+
440
+ class CallRule(BaseModel):
441
+ """Define caller rules."""
442
+
443
+ type: Literal["call"] = "call"
444
+ call: str
445
+
446
+
447
+ class LookupRule(BaseModel):
448
+ """Define lookup table rules."""
449
+
450
+ type: Literal["lookup"] = "lookup"
451
+ tree: List[str] = Field(default_factory=list)
452
+ attribute: str
453
+ standard: Optional[str] = None
454
+
455
+
456
+ SpecialRule = Annotated[
457
+ Union[ConditionalRule, CallRule, LookupRule], Field(discriminator="type")
458
+ ]
459
+
460
+
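A sketch of the three special-rule flavours defined above; the facet names and the template placeholders (rendered by TemplateMixin.render_templates before evaluation) are illustrative assumptions:

    from metadata_crawler.api.config import CallRule, ConditionalRule, LookupRule

    special = {
        "realm": ConditionalRule(
            condition="'{{ experiment }}' == 'historical'", true="atmos", false="unknown"
        ),
        "institute": LookupRule(attribute="institution_id", tree=["{{ project }}", "{{ model }}"]),
        "fmt": CallRule(call="'{{ file }}'.rsplit('.', 1)[-1]"),
    }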
461
+ class Dialect(BaseModel):
462
+ """Settings for a DRS Format."""
463
+
464
+ facets: Dict[str, str | list[str]] = Field(default_factory=dict)
465
+ defaults: Dict[str, Any] = Field(default_factory=dict)
466
+ path_specs: PathSpecs = Field(default_factory=PathSpecs)
467
+ data_specs: DataSpecs = Field(default_factory=DataSpecs)
468
+ special: Dict[str, SpecialRule] = Field(default_factory=dict)
469
+ domains: Dict[str, List[float]] = Field(default_factory=dict)
470
+ sources: List[MetadataSource] = Field(
471
+ default_factory=lambda: [
472
+ MetadataSource.path,
473
+ ],
474
+ description="Priority list of where to retrieve metadata",
475
+ )
476
+ inherits_from: Optional[str] = None
477
+
478
+ @field_validator("sources", mode="after")
479
+ @classmethod
480
+ def _validate_sources(cls, srcs: List[str]) -> List[str]:
481
+ # ensure only allowed sources are present
482
+ names = {name.upper() for name in MetadataSource.__members__.keys()}
483
+ values = {m.value for m in MetadataSource}
484
+ invalid = [s for s in srcs if s.upper() not in names and s not in values]
485
+ if invalid:
486
+ allowed = sorted(values | {n.lower() for n in names})
487
+ raise MetadataCrawlerException(
488
+ f"Invalid metadata source(s): {invalid!r}. Allowed: {allowed}"
489
+ )
490
+ return srcs
491
+
492
+
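Because sources is a priority list, a dialect can try cheap path parsing first and open data files only when facets are still missing; a small sketch with hypothetical facet names:

    from metadata_crawler.api.config import Dialect, MetadataSource, PathSpecs

    dialect = Dialect(
        facets={"variable": "variable_id", "version": "version"},
        path_specs=PathSpecs(dir_parts=["project", "variable", "version"]),
        sources=[MetadataSource.path, MetadataSource.data],
    )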
493
+ class DRSConfig(BaseModel, TemplateMixin):
494
+ """BaseModel model for the entire user config."""
495
+
496
+ datasets: Dict[str, Datasets]
497
+ index_schema: Dict[str, SchemaField] = Field(...)
498
+ suffixes: List[str] = Field(default_factory=list)
499
+ storage_options: Dict[str, Any] = Field(default_factory=dict)
500
+ defaults: Dict[str, Any] = Field(default_factory=dict)
501
+ special: Dict[str, SpecialRule] = Field(default_factory=dict)
502
+ dialect: Dict[str, Dialect]
503
+
504
+ def model_post_init(self, __context: Any = None) -> None:
505
+ """Apply special rules after init."""
506
+ self._defaults: Dict[str, Any] = {}
507
+ self.suffixes = self.suffixes or [
508
+ ".zarr",
509
+ ".zar",
510
+ ".nc4",
511
+ ".nc",
512
+ ".tar",
513
+ ".hdf5",
514
+ ".h5",
515
+ ]
516
+ for key, dset in self.datasets.items():
517
+ self.dialect.setdefault(key, self.dialect[dset.drs_format])
518
+ dset.backend.suffixes = self.suffixes
519
+ for key, option in self.storage_options.items():
520
+ dset.backend.storage_options.setdefault(key, option)
521
+ for key, dset in self.datasets.items():
522
+ self._defaults.setdefault(key, {})
523
+ for k, _def in (dset.defaults or {}).items():
524
+ self._defaults[key].setdefault(k, _def)
525
+ for k, _def in self.dialect[dset.drs_format].defaults.items():
526
+ self._defaults[key].setdefault(k, _def)
527
+ for k, _def in self.defaults.items():
528
+ self._defaults[key].setdefault(k, _def)
529
+ self.prep_template_env()
530
+ for standard in self.dialect:
531
+ for key in self.special:
532
+ self.dialect[standard].special.setdefault(key, self.special[key])
533
+
534
+ @model_validator(mode="before")
535
+ @classmethod
536
+ def _dump_(cls, values: Any) -> Any:
537
+ setattr(cls, "_model_dict", values)
538
+ return values
539
+
540
+ @model_validator(mode="before")
541
+ def _resolve_inheritance(cls, values: Any) -> Any:
542
+ """Apply inheritance.
543
+
544
+ After loading raw TOML into dicts, but before model instantiation, merge
545
+ any dialect or dataset entries that declare `inherits_from`.
546
+ """
547
+ if not isinstance(values, dict):
548
+ return values # pragma: no cover
549
+
550
+ def _deep_merge(a: Dict[str, Any], b: Dict[str, Any]) -> None:
551
+ for k, v in b.items():
552
+ if k in a and isinstance(a[k], dict) and isinstance(v, dict):
553
+ if not v:
554
+ a[k] = {}
555
+ else:
556
+ _deep_merge(a[k], v)
557
+ else:
558
+ a[k] = v
559
+
560
+ for key in ("dialect", "datasets"):
561
+ raw = values.get(key, {})
562
+ merged = deepcopy(raw)
563
+ for name, cfg in raw.items():
564
+ parent = cfg.get("inherits_from")
565
+ if parent:
566
+ if parent not in merged:
567
+ raise MetadataCrawlerException(
568
+ f"'{name}' inherits from unknown " f"'{parent}'"
569
+ )
570
+ # take parent base, then overlay this dialect
571
+ base = deepcopy(
572
+ merged[parent]
573
+ ) # deep copy of the parent raw dict
574
+ # remove inherits_from to avoid cycles
575
+ child = deepcopy(cfg)
576
+ child.pop("inherits_from", None)
577
+ # deep-merge child into base
578
+ _deep_merge(base, child)
579
+ base["inherits_from"] = parent
580
+ merged[name] = base
581
+
582
+ values[key] = merged
583
+ return values
584
+
585
+ @model_validator(mode="before")
586
+ def _ensure_dialects(cls, values: Any) -> Any:
587
+ """Ensure every dialect is a Dialect model."""
588
+ if not isinstance(values, dict):
589
+ return values # pragma: no cover
590
+
591
+ raw = values.get("dialect", {})
592
+ values["dialect"] = {k: v for k, v in raw.items()}
593
+ return values
594
+
595
+ def _apply_special_rules(
596
+ self,
597
+ standard: str,
598
+ drs_type: str,
599
+ inp: Metadata,
600
+ specials: Dict[str, SpecialRule],
601
+ ) -> None:
602
+ data = {**inp.metadata, **{"file": inp.path, "uri": inp.path}}
603
+
604
+ for facet, rule in specials.items():
605
+ result: Any = None
606
+ if inp.metadata.get(facet):
607
+ continue
608
+ match rule.type:
609
+ case "conditional":
610
+ _rule = textwrap.dedent(rule.condition or "").strip()
611
+ s_cond = self.render_templates(_rule, data)
612
+ cond = eval(s_cond, {}, getattr(self, "_model_dict", {}))
613
+ result = rule.true if cond else rule.false
614
+ case "lookup":
615
+ args = cast(List[str], self.render_templates(rule.tree, data))
616
+
617
+ result = self.datasets[standard].backend.lookup(
618
+ inp.path,
619
+ self.render_templates(rule.attribute, data),
620
+ rule.standard or drs_type,
621
+ *args,
622
+ **self.dialect[standard].data_specs.read_kws,
623
+ )
624
+ case "call":
625
+ _call = textwrap.dedent(rule.call or "").strip()
626
+ result = eval(
627
+ self.render_templates(_call, data),
628
+ {},
629
+ getattr(self, "_model_dict", {}),
630
+ )
631
+ if result:
632
+ inp.metadata[facet] = result
633
+
634
+ def _metadata_from_path(self, path: str, standard: str) -> Dict[str, Any]:
635
+ """Extract the metadata from the path."""
636
+ drs_type = self.datasets[standard].drs_format
637
+ root_path = strip_protocol(
638
+ self.datasets[standard].backend.path(
639
+ self.datasets[standard].root_path
640
+ )
641
+ )
642
+ _path = strip_protocol(self.datasets[standard].backend.path(path))
643
+ rel_path = _path.with_suffix("").relative_to(root_path)
644
+ return self.dialect[drs_type].path_specs.get_metadata_from_path(rel_path)
645
+
646
+ @classmethod
647
+ def load(
648
+ cls,
649
+ config_path: Optional[
650
+ Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
651
+ ] = None,
652
+ ) -> DRSConfig:
653
+ """Load a drs config from file."""
654
+ cfg = tomli.loads(ConfigMerger(config_path).dumps())
655
+ settings = cfg.pop("drs_settings")
656
+ try:
657
+ return cls(datasets=cfg, **settings)
658
+ except ValidationError as e:
659
+ msgs = []
660
+ for err in e.errors():
661
+ loc = ".".join(str(x) for x in err["loc"])
662
+ msgs.append(f"{loc}: {err['msg']}")
663
+ raise MetadataCrawlerException(
664
+ "DRSConfig validation failed:\n" + "\n".join(msgs)
665
+ ) from None
666
+
667
+ def max_directory_tree_level(
668
+ self, search_dir: str | Path, drs_type: str
669
+ ) -> int:
670
+ """Get the maximum level for descending into directories.
671
+
672
+ When searching for files in a directory we can only traverse the directory
673
+ search tree until the version level is reached. This level is set as a hard
674
+ threshold. If the drs type has no version we can indeed go all the way down
675
+ to the file level.
676
+ """
677
+ root_path = strip_protocol(
678
+ self.datasets[drs_type].backend.path(
679
+ self.datasets[drs_type].root_path
680
+ )
681
+ )
682
+ search_dir = strip_protocol(
683
+ self.datasets[drs_type].backend.path(search_dir)
684
+ )
685
+ standard = self.datasets[drs_type].drs_format
686
+ version = cast(
687
+ str, self.dialect[standard].facets.get("version", "version")
688
+ )
689
+ try:
690
+ version_idx = self.dialect[standard].path_specs.dir_parts.index(
691
+ version
692
+ )
693
+ except ValueError:
694
+ # No version given
695
+ version_idx = len(self.dialect[standard].path_specs.dir_parts)
696
+ if root_path == search_dir:
697
+ current_pos = 0
698
+ else:
699
+ current_pos = len(search_dir.relative_to(root_path).parts)
700
+ return version_idx - current_pos
701
+
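A worked example of the arithmetic above, with hypothetical path specs:

    # dir_parts   = ["project", "model", "experiment", "variable", "version"]
    # root_path   = /data/cmip      search_dir = /data/cmip/cmip6/mpi-esm
    # version_idx = 4, current_pos = 2  ->  descend at most 4 - 2 = 2 more levels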
702
+ def is_complete(self, data: Dict[str, Any], standard: str) -> bool:
703
+ """Check if all metadata that can be collected was collected."""
704
+ if not data:
705
+ return False
706
+ complete = True
707
+ preset = {**self._defaults[standard], **self.dialect[standard].special}
708
+ facets = (
709
+ k for k, v in self.index_schema.items() if not v.key.startswith("__")
710
+ )
711
+ for facet in self.dialect[standard].facets or facets:
712
+ if facet not in data and facet not in preset:
713
+ complete = False
714
+ return complete
715
+
716
+ def _read_metadata(self, standard: str, inp: Metadata) -> Dict[str, Any]:
717
+ """Get the metadata from a store."""
718
+ drs_type = self.datasets[standard].drs_format
719
+ for source in self.dialect[drs_type].sources:
720
+ if self.is_complete(inp.metadata, standard) is True:
721
+ break
722
+ match source:
723
+ case MetadataSource.path:
724
+ inp.metadata.update(
725
+ self._metadata_from_path(inp.path, standard)
726
+ )
727
+ case MetadataSource.data:
728
+ with catch_warnings(action="ignore", category=RuntimeWarning):
729
+ with self.datasets[standard].backend.open_dataset(
730
+ inp.path, **self.dialect[standard].data_specs.read_kws
731
+ ) as ds:
732
+ inp.metadata.update(
733
+ self.dialect[
734
+ standard
735
+ ].data_specs.extract_from_data(ds)
736
+ )
737
+ self._apply_special_rules(
738
+ standard, drs_type, inp, self.dialect[standard].special
739
+ )
740
+ return self._translate(standard, inp)
741
+
742
+ def read_metadata(self, standard: str, inp: MetadataType) -> Dict[str, Any]:
743
+ """Get the meta data for a given file path."""
744
+ return self._read_metadata(
745
+ standard,
746
+ Metadata(path=inp["path"], metadata=inp["metadata"].copy()),
747
+ )
748
+
749
+ def _translate(self, standard: str, inp: Metadata) -> Dict[str, Any]:
750
+ out: Dict[str, Any] = {}
751
+ # locals to cut attribute lookups
752
+ defs = self._defaults[standard]
753
+ dia = self.dialect[standard]
754
+ facets_get = dia.facets.get
755
+ backend = self.datasets[standard].backend
756
+ mget = inp.metadata.get
757
+ defs_get = defs.get
758
+ path = inp.path
759
+ fmt = path.rsplit(".", 1)[1] if "." in path else ""
760
+
761
+ precomputed: Dict[str, Any] = {
762
+ "path": backend.path(path),
763
+ "uri": backend.uri(path),
764
+ "storage": backend.fs_type(path),
765
+ "dataset": standard,
766
+ "fmt": fmt,
767
+ }
768
+ val: Any = ""
769
+ out_set = out.__setitem__
770
+ for field, schema in self.index_schema.items():
771
+ if schema.indexed is False:
772
+ continue
773
+
774
+ stype = schema.type
775
+
776
+ # Fast path for simple, precomputed types
777
+ if stype in precomputed and stype != "daterange":
778
+ val = precomputed[stype]
779
+
780
+ elif stype == "daterange":
781
+ src = mget(field) or defs_get(field)
782
+ val = schema.get_time_range(src)
783
+
784
+ else:
785
+ # Resolve metadata key via facets once; default to field name
786
+ key = cast(str, facets_get(schema.key, field))
787
+ val = mget(key) or defs_get(key)
788
+
789
+ # Fall back to schema.default when the resolved value is falsy
790
+ val = val or schema.default
791
+
792
+ # Multi-valued normalization
793
+ if (
794
+ (schema.multi_valued or schema.length)
795
+ and val
796
+ and not isinstance(val, list)
797
+ ):
798
+ val = [val]
799
+
800
+ out_set(field, val)
801
+ return out
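Putting the pieces together, a sketch of how this configuration API could be driven end to end; the config path, dataset name and file path are hypothetical:

    from metadata_crawler.api.config import DRSConfig

    cfg = DRSConfig.load("/etc/metadata-crawler/drs_config.toml")   # merged over the bundled defaults
    depth = cfg.max_directory_tree_level("/data/cmip/cmip6", "my-dataset")
    record = cfg.read_metadata(
        "my-dataset",
        {"path": "/data/cmip/cmip6/tas/v1/tas_200001-200512.nc", "metadata": {}},
    )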