metadata-crawler 2510.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metadata_crawler/__init__.py +263 -0
- metadata_crawler/__main__.py +8 -0
- metadata_crawler/_version.py +1 -0
- metadata_crawler/api/__init__.py +1 -0
- metadata_crawler/api/cli.py +57 -0
- metadata_crawler/api/config.py +831 -0
- metadata_crawler/api/drs_config.toml +440 -0
- metadata_crawler/api/index.py +151 -0
- metadata_crawler/api/metadata_stores.py +755 -0
- metadata_crawler/api/mixin/__init__.py +7 -0
- metadata_crawler/api/mixin/lookup_mixin.py +112 -0
- metadata_crawler/api/mixin/lookup_tables.py +10010 -0
- metadata_crawler/api/mixin/path_mixin.py +46 -0
- metadata_crawler/api/mixin/template_mixin.py +145 -0
- metadata_crawler/api/storage_backend.py +277 -0
- metadata_crawler/backends/__init__.py +1 -0
- metadata_crawler/backends/intake.py +211 -0
- metadata_crawler/backends/posix.py +121 -0
- metadata_crawler/backends/s3.py +140 -0
- metadata_crawler/backends/swift.py +305 -0
- metadata_crawler/cli.py +547 -0
- metadata_crawler/data_collector.py +278 -0
- metadata_crawler/ingester/__init__.py +1 -0
- metadata_crawler/ingester/mongo.py +206 -0
- metadata_crawler/ingester/solr.py +282 -0
- metadata_crawler/logger.py +153 -0
- metadata_crawler/py.typed +0 -0
- metadata_crawler/run.py +419 -0
- metadata_crawler/utils/__init__.py +482 -0
- metadata_crawler/utils/cftime_utils.py +207 -0
- metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
- metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
- metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
- metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
- metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/api/config.py
@@ -0,0 +1,831 @@
"""API for loading crawler configuration."""

from __future__ import annotations

import os
import re
import textwrap
from copy import deepcopy
from datetime import datetime
from enum import Enum, StrEnum
from fnmatch import fnmatch
from pathlib import Path
from typing import (
    Annotated,
    Any,
    Dict,
    List,
    Literal,
    Optional,
    Tuple,
    Union,
    cast,
)
from urllib.parse import urlsplit
from warnings import catch_warnings

import tomli
import tomlkit
import xarray
from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    ValidationError,
    field_validator,
    model_validator,
)
from tomlkit.container import OutOfOrderTableProxy
from tomlkit.items import Table

from ..utils import (
    MetadataCrawlerException,
    convert_str_to_timestamp,
    load_plugins,
)
from ..utils.cftime_utils import infer_cmor_like_time_frequency
from .mixin import TemplateMixin
from .storage_backend import Metadata, MetadataType


class BaseType(str, Enum):
    """Basic types."""

    string = "string"
    integer = "integer"
    float = "float"
    timestamp = "timestamp"


class Types(str, Enum):
    """Types supported by the config."""

    string = "string"
    integer = "integer"
    float = "float"
    timestamp = "timestamp"
    daterange = "timestamp[2]"
    path = "string"
    uri = "string"
    dataset = "string"
    fmt = "string"
    storage = "string"
    bbox = "float[4]"


class SchemaField(BaseModel):
    """BaseModel defining the metadata schema."""

    key: str
    type: str
    required: bool = False
    default: Optional[Any] = None
    length: Optional[int] = None
    base_type: BaseType = BaseType.string
    multi_valued: bool = True
    indexed: bool = True
    name: Optional[str] = None
    unique: bool = False

    @field_validator("type")
    @classmethod
    def parse_type(cls, v: str) -> str:
        """Parse the data types.

        Accepts
        ^^^^^^^
        - 'string', 'integer', 'float', 'timestamp' -> length=None
        - 'float[2]', 'int[5]' -> length=number
        - 'string[]' -> length=None, multi_valued semantics
        - 'daterange' -> [timestamp, timestamp]
        """
        f_type = getattr(getattr(Types, v, None), "value", v)
        m = re.fullmatch(r"({})(\[(\d*)\])?".format("|".join(BaseType)), f_type)
        if not m:
            raise MetadataCrawlerException(f"invalid type spec {v!r}")
        base, _, num = m.groups()
        setattr(
            cls,
            "__parsed_type",
            {"base": base, "length": int(num) if num else None},
        )
        return v

    @model_validator(mode="after")
    def _set_parsed(self) -> "SchemaField":
        parsed = getattr(self, "__parsed_type")
        self.base_type = BaseType(parsed["base"])
        self.length = parsed["length"]
        self.name = self.name or self.key
        return self

    @staticmethod
    def get_time_range(
        time_stamp: Optional[Union[str, List[str]]],
    ) -> List[datetime]:
        """Convert a from/to time range into begin and end time steps."""
        time_stamp = time_stamp or ""
        if isinstance(time_stamp, str):
            start_str, _, end_str = (
                time_stamp.replace(":", "").replace("_", "-").partition("-")
            )
            time_stamp = [start_str or "fx", end_str or "fx"]
        for n, ts in enumerate(time_stamp):
            if hasattr(ts, "isoformat"):
                time_stamp[n] = ts.isoformat()
            else:
                # fall back to str() only when no isoformat is available
                time_stamp[n] = str(ts) or "fx"
        start = convert_str_to_timestamp(
            time_stamp[0], alternative="0001-01-01T00:00"
        )
        end = convert_str_to_timestamp(
            time_stamp[-1], alternative="9999-12-31T23:59"
        )
        return [start, end]
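As a quick illustration of the type machinery above (a hedged sketch; the field names are invented, not taken from the shipped schema): a type that is an alias in `Types` is first resolved to its base spec, and a bracketed length is split off into `length`.

    field = SchemaField(key="time", type="daterange")   # alias for "timestamp[2]"
    assert field.base_type is BaseType.timestamp and field.length == 2

    field = SchemaField(key="bbox", type="float[4]")
    assert field.base_type is BaseType.float and field.length == 4

    # A "YYYYMMDD-YYYYMMDD" range is split on "-"; a missing side falls back to
    # the configured minimum/maximum timestamp.
    begin, end = SchemaField.get_time_range("20000101-20101231")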


class MetadataSource(StrEnum):
    """Representation of how the metadata should be retrieved."""

    storage = "storage"  # via intake/fdb5/etc.
    path = "path"  # parse via specs_dir/specs_file
    data = "data"  # read attributes from the file itself


class VarAttrRule(BaseModel):
    """How to read attributes from variables."""

    var: str
    attr: str  # attribute name on DataArray.attrs
    default: Any = None


class StatRule(BaseModel):
    """How to apply statistics."""

    stat: Literal["min", "max", "minmax", "range", "bbox", "timedelta"]
    var: Optional[str] = None  # for numeric stats on a single var
    coords: Optional[Union[List[str], str]] = None  # for time range, etc.
    lat: Optional[str] = None  # convenience keys for bbox
    lon: Optional[str] = None
    default: Any = None


class ConfigMerger:
    """Load the system and user TOML and merge the user config into the system one.

    Merging the config preserves comments/formatting, and lets you inspect or
    write the result.
    """

    def __init__(
        self,
        user_path: Optional[
            Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
        ] = None,
    ):
        # parse both documents
        system_path = Path(__file__).parent / "drs_config.toml"
        self._system_doc = tomlkit.parse(system_path.read_text(encoding="utf-8"))
        _config: str = ""
        if user_path is not None:
            if isinstance(user_path, (str, Path)) and os.path.isdir(user_path):
                _config = (
                    Path(user_path).expanduser().absolute() / "drs_config.toml"
                ).read_text(encoding="utf-8")
            elif isinstance(user_path, (str, Path)) and os.path.isfile(user_path):
                _config = (
                    Path(user_path)
                    .expanduser()
                    .absolute()
                    .read_text(encoding="utf-8")
                )
            elif isinstance(user_path, (str, Path)):
                _config = str(user_path)
            else:
                _config = tomlkit.dumps(user_path)
        if _config:
            try:
                self._user_doc = tomlkit.parse(_config)
            except Exception:
                raise MetadataCrawlerException(
                    "Could not load config path."
                ) from None
            self._merge_tables(self._system_doc, self._user_doc)

    def _merge_tables(
        self,
        base: Union[tomlkit.TOMLDocument, Table, OutOfOrderTableProxy],
        override: Union[Table, tomlkit.TOMLDocument, OutOfOrderTableProxy],
    ) -> None:

        for key, value in override.items():
            if key not in base:
                base[key] = value
                continue
            if isinstance(value, (Table, OutOfOrderTableProxy)):
                self._merge_tables(cast(Table, base[key]), value)
            else:
                base[key] = value

    @property
    def merged_doc(self) -> tomlkit.TOMLDocument:
        """Return the merged TOMLDocument."""
        return self._system_doc

    def dumps(self) -> str:
        """Return the merged document as a TOML string."""
        return tomlkit.dumps(self.merged_doc)
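A short usage sketch for the merger (the override keys are illustrative): the user document, given as a path, a TOML string, or a plain dict, is overlaid table by table onto the packaged drs_config.toml, and the merged result keeps the system file's comments and ordering.

    # Overlay a user override (here a plain dict) onto the bundled defaults.
    merger = ConfigMerger({"drs_settings": {"suffixes": [".nc", ".zarr"]}})
    print(merger.dumps())          # merged TOML string, system comments preserved

    # With no argument the packaged drs_config.toml is simply round-tripped.
    print(ConfigMerger().dumps())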


def strip_protocol(inp: str | Path) -> Path:
    """Strip any protocol prefix and return the plain filesystem path."""
    abs_path = Path(urlsplit(str(inp)).path).expanduser()
    return Path(abs_path)


class CrawlerSettings(BaseModel):
    """Define the user input for a data crawler session."""

    name: str
    search_path: Union[str, Path]

    def model_post_init(self, __context: Any = None) -> None:
        """Apply rules after init."""
        self.search_path = str(self.search_path)


class PathSpecs(BaseModel):
    """Implementation of the Directory reference syntax."""

    dir_parts: Optional[List[str]] = None
    file_parts: Optional[List[str]] = None
    file_sep: str = "_"

    def _get_metadata_from_dir(
        self, data: Dict[str, Any], rel_path: Path
    ) -> None:
        dir_parts = rel_path.parent.parts

        if self.dir_parts and len(dir_parts) == len(self.dir_parts):
            data.update(
                {
                    k: v
                    for (k, v) in zip(self.dir_parts, dir_parts)
                    if k not in data
                }
            )
        elif self.dir_parts:
            raise MetadataCrawlerException(
                (
                    f"Number of dir parts for {rel_path.parent} do not match "
                    f"- needs: {len(self.dir_parts)} has: {len(dir_parts)}"
                )
            ) from None

    def _get_metadata_from_filename(
        self, data: Dict[str, Any], rel_path: Path
    ) -> None:
        if self.file_parts is None:
            return
        file_parts = rel_path.name.split(self.file_sep)
        _parts: Dict[str, str] = {}
        if len(file_parts) == len(self.file_parts):
            _parts = dict(zip(self.file_parts, file_parts))
        elif (
            len(file_parts) == len(self.file_parts) - 1 and "fx" in rel_path.name
        ):
            _parts = dict(zip(self.file_parts[:-1], file_parts))
        else:
            raise MetadataCrawlerException(
                (
                    f"Number of file parts for {rel_path.name} do not match "
                    f"- needs: {len(self.file_parts)} has: {len(file_parts)}"
                )
            )
        data.update({k: v for (k, v) in _parts.items() if k not in data})

    def get_metadata_from_path(self, rel_path: Path) -> Dict[str, Any]:
        """Read path encoded metadata from path specs."""
        data: Dict[str, Any] = {}
        self._get_metadata_from_dir(data, rel_path)
        self._get_metadata_from_filename(data, rel_path)
        data.pop("_", None)
        return data
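A minimal sketch of the path parsing (the parts below are made up and do not reflect the shipped dialects): directory components are matched one-to-one against `dir_parts`, the file name is split on `file_sep`, and placeholder keys named `_` are dropped from the result.

    specs = PathSpecs(
        dir_parts=["project", "model", "experiment", "version"],
        file_parts=["variable", "time"],
        file_sep="_",
    )
    # The relative path is passed with its suffix already stripped
    # (see DRSConfig._metadata_from_path below).
    meta = specs.get_metadata_from_path(
        Path("obs/model-a/historical/v20200101/tas_20000101-20091231")
    )
    # {'project': 'obs', 'model': 'model-a', 'experiment': 'historical',
    #  'version': 'v20200101', 'variable': 'tas', 'time': '20000101-20091231'}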


class DataSpecs(BaseModel):
    """Rules for reading metadata from the data files themselves."""

    globals: Dict[str, str] = Field(default_factory=dict)
    var_attrs: Dict[str, VarAttrRule] = Field(default_factory=dict)
    stats: Dict[str, StatRule] = Field(default_factory=dict)
    read_kws: Dict[str, Any] = Field(default_factory=dict)

    def _set_global_attributes(
        self, dset: "xarray.Dataset", out: Dict[str, Any]
    ) -> None:

        for facet, attr in self.globals.items():
            if attr == "__variable__":
                out[facet] = list(getattr(dset, "data_vars", dset.variables))
            else:
                out[facet] = dset.attrs.get(attr)

    def _set_variable_attributes(
        self, dset: "xarray.Dataset", out: Dict[str, Any]
    ) -> None:
        data_vars = list(getattr(dset, "data_vars", dset.variables))

        def get_val(
            rule: VarAttrRule, vnames: Union[str, List[str]]
        ) -> List[Any]:
            if isinstance(vnames, str):
                vnames = [dv for dv in data_vars if fnmatch(dv, vnames)]
            attr_list: List[Any] = []
            for vname in vnames:
                default = (rule.default or "").replace("__name__", vname) or vname
                attr = (rule.attr or "").replace("__name__", vname) or vname
                if vname in dset:
                    attr_list.append(dset[vname].attrs.get(attr, default))
                else:
                    attr_list.append(default)
            return attr_list

        for facet, rule in self.var_attrs.items():
            resolved: Union[str, List[str]] = rule.var
            vals = get_val(rule, resolved)
            if len(vals) == 1:
                out[facet] = vals[0]
            else:
                out[facet] = vals

    def _apply_stats_rules(
        self, dset: "xarray.Dataset", out: Dict[str, Any]
    ) -> None:

        for facet, rule in self.stats.items():
            coords: Optional[List[str]] = None
            if rule.coords:
                coords = (
                    rule.coords
                    if isinstance(rule.coords, list)
                    else [rule.coords]
                )
            match rule.stat:
                case "bbox":
                    lat = rule.lat or (coords[0] if coords else "lat")
                    lon = rule.lon or (
                        coords[1] if coords and len(coords) > 1 else "lon"
                    )
                    out[facet] = rule.default
                    if lat in dset and lon in dset:
                        latv = dset[lat].values
                        lonv = dset[lon].values
                        out[facet] = [
                            float(lonv.min()),
                            float(lonv.max()),
                            float(latv.min()),
                            float(latv.max()),
                        ]

                case "range":
                    coord = coords[0] if coords else None
                    out[facet] = rule.default
                    if coord and coord in dset.coords:
                        arr = dset.coords[coord].values
                        out[facet] = [arr.min(), arr.max()]

                case "min" | "max" | "minmax":

                    coord = coords[0] if coords else None
                    var_name = rule.var if rule.var else coord
                    out[facet] = rule.default
                    if var_name and var_name in dset:
                        arr = dset[var_name].values
                        if rule.stat == "min":
                            out[facet] = arr.min()
                        elif rule.stat == "max":
                            out[facet] = arr.max()
                        else:
                            out[facet] = [arr.min(), arr.max()]
                case "timedelta":
                    coord = coords[0] if coords else None
                    out[facet] = infer_cmor_like_time_frequency(
                        dset, rule.var or coord
                    )

    def extract_from_data(self, dset: xarray.Dataset) -> Dict[str, Any]:
        """Extract metadata from the data."""
        data: Dict[str, Any] = {}
        self._set_global_attributes(dset, data)
        self._set_variable_attributes(dset, data)
        self._apply_stats_rules(dset, data)
        return data
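A sketch of the data-driven extraction on a toy dataset (variable, attribute and facet names are invented): `globals` maps facets to global attributes, with `__variable__` expanding to the list of data variables, `var_attrs` reads per-variable attributes, and `stats` derives ranges and bounding boxes from coordinates.

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {"tas": (("time", "lat", "lon"), np.zeros((2, 3, 4)), {"units": "K"})},
        coords={
            "time": np.array(["2000-01-01", "2000-02-01"], dtype="datetime64[ns]"),
            "lat": [-10.0, 0.0, 10.0],
            "lon": [0.0, 90.0, 180.0, 270.0],
        },
        attrs={"project_id": "OBS"},
    )
    specs = DataSpecs(
        globals={"project": "project_id", "variable": "__variable__"},
        var_attrs={"units": VarAttrRule(var="*", attr="units")},
        stats={
            "time": StatRule(stat="range", coords="time"),
            "bbox": StatRule(stat="bbox", lat="lat", lon="lon"),
        },
    )
    meta = specs.extract_from_data(ds)
    # meta["project"] == "OBS"; meta["variable"] == ["tas"]; meta["units"] == "K"
    # meta["bbox"] == [0.0, 270.0, -10.0, 10.0]  (lon min/max, lat min/max)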


class Datasets(BaseModel):
    """Definition of datasets that should be crawled."""

    __pydantic_extra__: Dict[str, Any] = Field(init=False)
    model_config = ConfigDict(extra="allow")
    root_path: str | Path
    drs_format: str = "freva"
    fs_type: str = "posix"
    defaults: Dict[str, Any] = Field(default_factory=dict)
    storage_options: Dict[str, Any] = Field(default_factory=dict)
    glob_pattern: str = "*.*"
    inherits_from: str = Field(default_factory=str)

    @field_validator("storage_options", mode="after")
    @classmethod
    def _render_storage_options(
        cls, storage_options: Dict[str, Any]
    ) -> Dict[str, Any]:
        tmpl = TemplateMixin()
        return cast(Dict[str, Any], tmpl.render_templates(storage_options, {}))

    def model_post_init(self, __context: Any = None) -> None:
        """Apply rules after init."""
        storage_plugins = load_plugins("metadata_crawler.storage")
        try:
            self.backend = storage_plugins[self.fs_type](**self.storage_options)
        except KeyError:
            raise NotImplementedError(
                f"Backend not available. `{self.fs_type}` extension missing?"
            ) from None


class ConditionalRule(BaseModel):
    """Define conditional rules."""

    type: Literal["conditional"] = "conditional"
    condition: str
    true: Any
    false: Any


class CallRule(BaseModel):
    """Define caller rules."""

    type: Literal["call"] = "call"
    call: str


class LookupRule(BaseModel):
    """Define lookup table rules."""

    type: Literal["lookup"] = "lookup"
    tree: List[str] = Field(default_factory=list)
    attribute: str
    standard: Optional[str] = None


SpecialRule = Annotated[
    Union[ConditionalRule, CallRule, LookupRule], Field(discriminator="type")
]
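The `type` field is the discriminator, so a raw rule table is routed to the matching model. A small sketch using pydantic's TypeAdapter (the facet values are placeholders; no condition is evaluated here):

    from pydantic import TypeAdapter

    adapter = TypeAdapter(SpecialRule)

    rule = adapter.validate_python(
        {"type": "lookup", "attribute": "institution_id", "tree": ["activity", "source"]}
    )
    assert isinstance(rule, LookupRule)

    rule = adapter.validate_python(
        {"type": "conditional", "condition": "realm == 'ocean'",
         "true": "ocean", "false": "land"}
    )
    assert isinstance(rule, ConditionalRule)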


class Dialect(BaseModel):
    """Settings for a DRS Format."""

    facets: Dict[str, str | list[str]] = Field(default_factory=dict)
    defaults: Dict[str, Any] = Field(default_factory=dict)
    path_specs: PathSpecs = Field(default_factory=PathSpecs)
    data_specs: DataSpecs = Field(default_factory=DataSpecs)
    special: Dict[str, SpecialRule] = Field(default_factory=dict)
    domains: Dict[str, List[float]] = Field(default_factory=dict)
    sources: List[MetadataSource] = Field(
        default_factory=lambda: [
            MetadataSource.path,
        ],
        description="Priority list of where to retrieve metadata",
    )
    inherits_from: Optional[str] = None

    @field_validator("sources", mode="after")
    @classmethod
    def _validate_sources(cls, srcs: List[str]) -> List[str]:
        # ensure only allowed sources are present
        names = {name.upper() for name in MetadataSource.__members__.keys()}
        values = {m.value for m in MetadataSource}
        invalid = [s for s in srcs if s.upper() not in names and s not in values]
        if invalid:
            allowed = sorted(values | {n.lower() for n in names})
            raise MetadataCrawlerException(
                f"Invalid metadata source(s): {invalid!r}. Allowed: {allowed}"
            )
        return srcs
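A dialect that first parses the path and only opens the files when facets are still missing could look like this (facet and part names are placeholders); string sources are coerced to MetadataSource members and validated against the allowed values:

    dialect = Dialect(
        facets={"variable": "variable", "time": "time"},
        path_specs=PathSpecs(
            dir_parts=["project", "model", "version"],
            file_parts=["variable", "time"],
        ),
        sources=["path", "data"],   # priority order
    )
    assert dialect.sources == [MetadataSource.path, MetadataSource.data]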


class DRSConfig(BaseModel, TemplateMixin):
    """Top-level model for the entire user config."""

    datasets: Dict[str, Datasets]
    index_schema: Dict[str, SchemaField] = Field(...)
    suffixes: List[str] = Field(default_factory=list)
    storage_options: Dict[str, Any] = Field(default_factory=dict)
    defaults: Dict[str, Any] = Field(default_factory=dict)
    special: Dict[str, SpecialRule] = Field(default_factory=dict)
    dialect: Dict[str, Dialect]

    def model_post_init(self, __context: Any = None) -> None:
        """Apply special rules after init."""
        self._defaults: Dict[str, Any] = {}
        self.suffixes = self.suffixes or [
            ".zarr",
            ".zar",
            ".nc4",
            ".nc",
            ".tar",
            ".hdf5",
            ".h5",
        ]
        for key, dset in self.datasets.items():
            self.dialect.setdefault(key, self.dialect[dset.drs_format])
            dset.backend.suffixes = self.suffixes
            for key, option in self.storage_options.items():
                dset.backend.storage_options.setdefault(key, option)
        for key, dset in self.datasets.items():
            self._defaults.setdefault(key, {})
            for k, _def in (dset.defaults or {}).items():
                self._defaults[key].setdefault(k, _def)
            for k, _def in self.dialect[dset.drs_format].defaults.items():
                self._defaults[key].setdefault(k, _def)
            for k, _def in self.defaults.items():
                self._defaults[key].setdefault(k, _def)
        self.prep_template_env()
        for standard in self.dialect:
            for key in self.special:
                self.dialect[standard].special.setdefault(key, self.special[key])

    @model_validator(mode="before")
    @classmethod
    def _dump_(cls, values: Any) -> Any:
        setattr(cls, "_model_dict", values)
        return values

    @model_validator(mode="before")
    def _resolve_inheritance(cls, values: Any) -> Any:
        """Apply inheritance.

        After loading raw TOML into dicts, but before model instantiation, merge
        any dialects that declare `inherits_from`.
        """
        if not isinstance(values, dict):
            return values  # pragma: no cover

        def _deep_merge(a: Dict[str, Any], b: Dict[str, Any]) -> None:
            for k, v in b.items():
                if k in a and isinstance(a[k], dict) and isinstance(v, dict):
                    if not v:
                        a[k] = {}
                    else:
                        _deep_merge(a[k], v)
                else:
                    a[k] = v

        for key in ("dialect", "datasets"):
            raw = values.get(key, {})
            merged = deepcopy(raw)
            for name, cfg in raw.items():
                parent = cfg.get("inherits_from")
                if parent:
                    if parent not in merged:
                        raise MetadataCrawlerException(
                            f"'{name}' inherits from unknown '{parent}'"
                        )
                    # take parent base, then overlay this dialect
                    base = deepcopy(
                        merged[parent]
                    )  # deep copy of the parent raw dict
                    # remove inherits_from to avoid cycles
                    child = deepcopy(cfg)
                    child.pop("inherits_from", None)
                    # deep-merge child into base
                    _deep_merge(base, child)
                    base["inherits_from"] = parent
                    merged[name] = base

            values[key] = merged
        return values

    @model_validator(mode="before")
    def _ensure_dialects(cls, values: Any) -> Any:
        """Ensure every dialect is a Dialect model."""
        if not isinstance(values, dict):
            return values  # pragma: no cover

        raw = values.get("dialect", {})
        values["dialect"] = {k: v for k, v in raw.items()}
        return values

    def _apply_special_rules(
        self,
        standard: str,
        drs_type: str,
        inp: Metadata,
        specials: Dict[str, SpecialRule],
    ) -> None:
        data = {**inp.metadata, **{"file": inp.path, "uri": inp.path}}

        for facet, rule in specials.items():
            result: Any = None
            if inp.metadata.get(facet):
                continue
            match rule.type:
                case "conditional":
                    _rule = textwrap.dedent(rule.condition or "").strip()
                    s_cond = self.render_templates(_rule, data)
                    cond = eval(
                        s_cond, {}, getattr(self, "_model_dict", {})
                    )  # nosec
                    result = rule.true if cond else rule.false
                case "lookup":
                    args = cast(List[str], self.render_templates(rule.tree, data))

                    result = self.datasets[standard].backend.lookup(
                        inp.path,
                        self.render_templates(rule.attribute, data),
                        rule.standard or drs_type,
                        *args,
                        **self.dialect[standard].data_specs.read_kws,
                    )
                case "call":
                    _call = textwrap.dedent(rule.call or "").strip()
                    result = eval(
                        self.render_templates(_call, data),
                        {},
                        getattr(self, "_model_dict", {}),
                    )  # nosec
            if result:
                inp.metadata[facet] = result

    def _metadata_from_path(self, path: str, standard: str) -> Dict[str, Any]:
        """Extract the metadata from the path."""
        drs_type = self.datasets[standard].drs_format
        root_path = strip_protocol(
            self.datasets[standard].backend.path(
                self.datasets[standard].root_path
            )
        )
        _path = strip_protocol(self.datasets[standard].backend.path(path))
        rel_path = _path.with_suffix("").relative_to(root_path)
        return self.dialect[drs_type].path_specs.get_metadata_from_path(rel_path)

    @classmethod
    def load(
        cls,
        config_path: Optional[
            Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
        ] = None,
    ) -> DRSConfig:
        """Load a drs config from file."""
        cfg = tomli.loads(ConfigMerger(config_path).dumps())
        settings = cfg.pop("drs_settings")
        try:
            return cls(datasets=cfg, **settings)
        except ValidationError as e:
            msgs = []
            for err in e.errors():
                loc = ".".join(str(x) for x in err["loc"])
                msgs.append(f"{loc}: {err['msg']}")
            raise MetadataCrawlerException(
                "DRSConfig validation failed:\n" + "\n".join(msgs)
            ) from None
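Loading is therefore a two-step process: ConfigMerger overlays the user file onto the packaged defaults, and the merged TOML is then validated into the model, with every pydantic error collapsed into a single MetadataCrawlerException. A hedged usage sketch (the path is hypothetical; the file must provide a drs_settings table and at least one dataset section):

    try:
        cfg = DRSConfig.load("/etc/metadata-crawler/drs_config.toml")
    except MetadataCrawlerException as exc:
        print(f"config rejected: {exc}")
    else:
        print(sorted(cfg.datasets), sorted(cfg.dialect))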

    def max_directory_tree_level(
        self, search_dir: str | Path, drs_type: str
    ) -> Tuple[int, bool]:
        """Get the maximum level for descending into directories.

        When searching for files in a directory we can only traverse the directory
        search tree until the version level is reached. This level is set as a hard
        threshold. If the drs type has no version we can indeed go all the way down
        to the file level.
        """
        root_path = strip_protocol(
            self.datasets[drs_type].backend.path(
                self.datasets[drs_type].root_path
            )
        )
        search_dir = strip_protocol(
            self.datasets[drs_type].backend.path(search_dir)
        )
        standard = self.datasets[drs_type].drs_format
        version = cast(
            str, self.dialect[standard].facets.get("version", "version")
        )
        is_versioned = True
        dir_parts = self.dialect[standard].path_specs.dir_parts or []
        try:
            version_idx = dir_parts.index(version)
        except ValueError:
            # No version given
            version_idx = len(dir_parts)
            is_versioned = False
        if root_path == search_dir:
            current_pos = 0
        else:
            current_pos = len(search_dir.relative_to(root_path).parts)
        return version_idx - current_pos, is_versioned

    def is_complete(self, data: Dict[str, Any], standard: str) -> bool:
        """Check if all metadata that can be collected was collected."""
        if not data:
            return False
        complete = True
        preset = {**self._defaults[standard], **self.dialect[standard].special}
        facets = (
            k for k, v in self.index_schema.items() if not v.key.startswith("__")
        )
        for facet in self.dialect[standard].facets or facets:
            if facet not in data and facet not in preset:
                complete = False
        return complete

    def _read_metadata(self, standard: str, inp: Metadata) -> Dict[str, Any]:
        """Get the metadata from a store."""
        drs_type = self.datasets[standard].drs_format
        for source in self.dialect[drs_type].sources:
            if self.is_complete(inp.metadata, standard) is True:
                break
            match source:
                case MetadataSource.path:
                    inp.metadata.update(
                        self._metadata_from_path(inp.path, standard)
                    )
                case MetadataSource.data:
                    with catch_warnings(action="ignore", category=RuntimeWarning):
                        with self.datasets[standard].backend.open_dataset(
                            inp.path, **self.dialect[standard].data_specs.read_kws
                        ) as ds:
                            inp.metadata.update(
                                self.dialect[
                                    standard
                                ].data_specs.extract_from_data(ds)
                            )
        self._apply_special_rules(
            standard, drs_type, inp, self.dialect[standard].special
        )
        return self._translate(standard, inp)

    def read_metadata(self, standard: str, inp: MetadataType) -> Dict[str, Any]:
        """Get the metadata for a given file path."""
        return self._read_metadata(
            standard,
            Metadata(path=inp["path"], metadata=inp["metadata"].copy()),
        )

    def _translate(self, standard: str, inp: Metadata) -> Dict[str, Any]:
        out: Dict[str, Any] = {}
        # locals to cut attribute lookups
        defs = self._defaults[standard]
        dia = self.dialect[standard]
        facets_get = dia.facets.get
        backend = self.datasets[standard].backend
        mget = inp.metadata.get
        defs_get = defs.get
        path = inp.path
        fmt = path.rsplit(".", 1)[1] if "." in path else ""

        precomputed: Dict[str, Any] = {
            "path": backend.path(path),
            "uri": backend.uri(path),
            "storage": backend.fs_type(path),
            "dataset": standard,
            "fmt": fmt,
        }
        val: Any = ""
        out_set = out.__setitem__
        for field, schema in self.index_schema.items():
            if schema.indexed is False:
                continue

            stype = schema.type

            # Fast path for simple, precomputed types
            if stype in precomputed and stype != "daterange":
                val = precomputed[stype]

            elif stype == "daterange":
                src = mget(field) or defs_get(field)
                val = schema.get_time_range(src)

            else:
                # Resolve metadata key via facets once; default to field name
                key = cast(str, facets_get(schema.key, field))
                val = mget(key) or defs_get(key)

            # Fall back to schema.default on falsy values
            val = val or schema.default

            # Multi-valued normalization
            if (
                (schema.multi_valued or schema.length)
                and val
                and not isinstance(val, list)
            ):
                val = [val]

            out_set(field, val)
        return out
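Taken together, a crawler worker would feed each discovered file through read_metadata, which walks the dialect's sources in priority order, applies the special rules and finally translates the collected facets into the index schema. A sketch with made-up dataset and path names:

    cfg = DRSConfig.load()   # packaged defaults only
    levels, versioned = cfg.max_directory_tree_level("/data/obs", "obs-dataset")
    record = cfg.read_metadata(
        "obs-dataset",
        {
            "path": "/data/obs/model-a/historical/v20200101/tas_20000101-20091231.nc",
            "metadata": {},
        },
    )
    # record maps every indexed schema field (path, uri, dataset, fmt, time, ...)
    # to its resolved value, falling back to the configured defaults.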