metadata-crawler 2510.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metadata-crawler might be problematic. Click here for more details.

Files changed (35) hide show
  1. metadata_crawler/__init__.py +263 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +831 -0
  7. metadata_crawler/api/drs_config.toml +440 -0
  8. metadata_crawler/api/index.py +151 -0
  9. metadata_crawler/api/metadata_stores.py +755 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +140 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +547 -0
  22. metadata_crawler/data_collector.py +278 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +206 -0
  25. metadata_crawler/ingester/solr.py +282 -0
  26. metadata_crawler/logger.py +153 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +419 -0
  29. metadata_crawler/utils/__init__.py +482 -0
  30. metadata_crawler/utils/cftime_utils.py +207 -0
  31. metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
  32. metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
  33. metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
  34. metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
  35. metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,482 @@
1
+ """Random utility functions."""
2
+
3
+ import difflib
4
+ import logging
5
+ import multiprocessing as mp
6
+ import multiprocessing.context as mctx
7
+ import os
8
+ import sys
9
+ import time
10
+ from datetime import datetime, timedelta
11
+ from importlib.metadata import entry_points
12
+ from typing import (
13
+ IO,
14
+ Any,
15
+ AsyncIterator,
16
+ Callable,
17
+ Dict,
18
+ Iterable,
19
+ List,
20
+ Optional,
21
+ Protocol,
22
+ Set,
23
+ Tuple,
24
+ TypeAlias,
25
+ TypeVar,
26
+ Union,
27
+ )
28
+
29
+ import ciso8601
30
+ import orjson
31
+ import rich.console
32
+ import rich.spinner
33
+ from dateutil.parser import isoparse
34
+ from rich.live import Live
35
+ from rich.progress import Progress, TaskID
36
+
37
+ from ..logger import logger
38
+
39
+ T = TypeVar("T")
40
+ U = TypeVar("U")
41
+
42
+
43
class SimpleQueueLike(Protocol[T]):
    """Structural type for a minimal queue exposing only put/get.

    Any object with these two methods matches (e.g. a
    ``multiprocessing.SimpleQueue``).
    """

    def put(self, item: T) -> None:  # noqa
        ...

    def get(self) -> T:  # noqa
        ...
51
+
52
+
53
class QueueLike(Protocol[T]):
    """Structural type for a full queue: put/get with blocking and qsize.

    Matches ``queue.Queue`` / ``multiprocessing.Queue``-style objects.
    """

    def put(self, item: T) -> None:  # noqa
        ...

    def get(
        self, block: bool = True, timeout: Optional[float] = ...
    ) -> T:  # noqa
        ...

    def qsize(self) -> int:  # noqa
        ...
66
+
67
+
68
class EventLike(Protocol):
    """Structural type for an event flag (set/clear/is_set/wait).

    Matches ``threading.Event`` / ``multiprocessing.Event``-style objects.
    """

    def set(self) -> None:  # noqa
        ...

    def clear(self) -> None:  # noqa
        ...

    def is_set(self) -> bool:  # noqa
        ...

    def wait(self) -> None:  # noqa
        ...
82
+
83
+
84
class LockLike(Protocol):
    """Structural type for a lock usable directly or as a context manager.

    Matches ``threading.Lock`` / ``multiprocessing.Lock``-style objects.
    """

    def acquire(
        self, blocking: bool = ..., timeout: Optional[float] = ...
    ) -> bool:  # noqa
        ...

    def release(self) -> None:  # noqa
        ...

    def __enter__(self) -> "LockLike": ...
    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None: ...
97
+
98
+
99
class ValueLike(Protocol[U]):
    """Structural type for a shared value with an associated lock.

    Matches ``multiprocessing.Value``-style objects.
    """

    # The wrapped value itself.
    value: U

    def get_lock(self) -> "Any":  # noqa
        ...
106
+
107
+
108
class FilesystemLike(Protocol):
    """File-like opener protocol (e.g., fsspec).

    Any object with an ``open`` accepting a path plus mode/compression/
    encoding keyword arguments and returning a text file handle matches.
    """

    def open(
        self,
        path: str,
        mode: str = "rt",
        compression: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> IO[str]:  # noqa
        ...
120
+
121
+
122
# Shared integer counter (a ``ValueLike`` holding an ``int``).
Counter: TypeAlias = ValueLike[int]
# Process-level lock; presumably guards console printing — TODO confirm usage.
PrintLock = mp.Lock()
# Shared rich console writing to stderr; colours forced only for a real TTY.
Console = rich.console.Console(force_terminal=sys.stdout.isatty(), stderr=True)
125
+
126
+
127
class MetadataCrawlerException(Exception):
    """Base exception for all crawling errors raised by this package."""
129
+
130
+
131
class EmptyCrawl(MetadataCrawlerException):
    """Custom exception for a crawl that produced no results."""
133
+
134
+
135
async def create_async_iterator(itt: Iterable[Any]) -> AsyncIterator[Any]:
    """Create an async iterator from a synchronous iterable.

    Parameters
    ----------
    itt: Iterable[Any]
        The synchronous iterable to adapt.

    Yields
    ------
    Any:
        The items of ``itt``, unchanged.
    """
    for item in itt:
        yield item
139
+
140
+
141
def _parse_iso_datetime(s: str) -> datetime:
    """Parse an ISO-8601 string, fast path first.

    ``ciso8601.parse_datetime`` raises ``ValueError`` for input it cannot
    handle (it does not return ``None``), so the previous
    ``ciso8601.parse_datetime(s) or datetime.fromisoformat(s)`` form never
    reached the fallback.  Catch the error so ``datetime.fromisoformat``
    actually gets a chance at strings ciso8601 rejects.

    Parameters
    ----------
    s: str
        ISO-8601 timestamp string.

    Returns
    -------
    datetime:
        The parsed timestamp.

    Raises
    ------
    ValueError
        If neither parser accepts ``s``.
    """
    try:
        return ciso8601.parse_datetime(s)
    except ValueError:
        return datetime.fromisoformat(s)
143
+
144
+
145
def parse_batch(
    lines: List[str],
    timestamp_keys: Set[str],
) -> List[Dict[str, Any]]:
    """Decode a batch of NDJSON lines, converting timestamp fields.

    Parameters
    ----------
    lines: list of str
        Raw NDJSON lines, one JSON object per entry.
    timestamp_keys: set of str
        Keys whose string (or list-of-string) values are parsed as
        ``datetime`` objects.

    Returns
    -------
    list of dict:
        Decoded objects with timestamp fields converted to ``datetime``.
    """
    parsed: List[Dict[str, Any]] = []
    for raw in lines:
        record: Dict[str, Any] = orjson.loads(raw)
        for key in timestamp_keys:
            value = record.get(key)
            if isinstance(value, str):
                record[key] = _parse_iso_datetime(value)
            elif isinstance(value, list):
                # Mixed lists are allowed: only string entries are parsed.
                record[key] = [
                    _parse_iso_datetime(item) if isinstance(item, str) else item
                    for item in value
                ]
        parsed.append(record)
    return parsed
180
+
181
+
182
def convert_str_to_timestamp(
    time_str: str, alternative: str = "0001-01-01"
) -> datetime:
    """Convert a string representation of a time step to a timestamp.

    Parameters
    ----------
    time_str: str
        Representation of the time step in formats:
        - %Y%m%d%H%M%S%f (year, month, day, hour, minute, second, millisecond)
        - %Y%m%d%H%M (year, month, day, hour, minute)
        - %Y%m (year, month)
        - %Y%m%dT%H%M (year, month, day, hour, minute with T separator)
        - %Y%j (year and day of year, e.g. 2022203 for 22nd July 2022)
        - %Y (year only)
    alternative: str, default: "0001-01-01"
        ISO date string used to fill in missing components, and returned
        (parsed) as fallback when the conversion fails.

    Returns
    -------
    datetime:
        Timestamp parsed from ``time_str``; missing parts (month, day,
        time of day) are taken from ``alternative``.
    """
    # Defaults for any date/time component the input does not provide.
    _date = isoparse(alternative)
    _time = f"{_date.strftime('%H')}:{_date.strftime('%M')}"
    _day = _date.strftime("%d")
    _mon = _date.strftime("%m")
    # Remember the T separator position: the except-branch below retries
    # with an explicit date/time split when the digit heuristic fails.
    has_t_separator = "T" in time_str
    position_t = time_str.find("T") if has_t_separator else -1
    # Strip anything that's not a number from the string
    if not time_str:
        return _date
    # Not valid if time repr empty or starts with a letter, such as 'fx'
    digits = "".join(filter(str.isdigit, time_str))
    l_times = len(digits)
    if not l_times:
        return _date
    try:
        # Dispatch purely on the number of digits found.
        if l_times <= 4:
            # Suppose this is a year only
            return isoparse(f"{digits.zfill(4)}-{_mon}-{_day}T{_time}")
        if l_times <= 6:
            # Suppose this is %Y%m or %Y%e
            return isoparse(f"{digits[:4]}-{digits[4:].zfill(2)}-{_day}T{_time}")
        # Year and day of year
        if l_times == 7:
            # Suppose this is %Y%j
            year = int(digits[:4])
            day_of_year = int(digits[4:])
            date = datetime(year, 1, 1, _date.hour, _date.minute) + timedelta(
                days=day_of_year - 1
            )
            return date
        if l_times <= 8:
            # Suppose this is %Y%m%d
            return isoparse(
                f"{digits[:4]}-{digits[4:6]}-{digits[6:].zfill(2)}T{_time}"
            )

        # More than 8 digits: %Y%m%d followed by a time-of-day part.
        date_str = f"{digits[:4]}-{digits[4:6]}-{digits[6:8]}"
        time = digits[8:]
        if len(time) <= 2:
            time = time.zfill(2)
        else:
            # Always drop seconds
            time = time[:2] + ":" + time[2 : min(4, len(time))].zfill(2)
        return isoparse(f"{date_str}T{time}")

    except ValueError:
        # Heuristic retry: split on the original T separator and keep only
        # the hour from the time part.
        if has_t_separator and position_t > 0:
            date_part = time_str[:position_t]
            time_part = time_str[position_t + 1 :]

            date_digits = "".join(filter(str.isdigit, date_part))
            if len(date_digits) >= 8:
                return isoparse(
                    f"{date_digits[:4]}-{date_digits[4:6]}"
                    f"-{date_digits[6:8]}T{time_part[:2].zfill(2)}"
                )

        return _date
264
+
265
+
266
def find_closest(msg: str, target: str, options: Iterable[str]) -> str:
    """Append a "did you mean" hint to a message when a close match exists.

    Parameters
    ----------
    msg: str
        Base message returned to the caller.
    target: str
        The string to match.
    options: Iterable[str]
        Candidate strings.

    Returns
    -------
    str:
        ``msg``, suffixed with the closest candidate when one is found.
    """
    candidates = difflib.get_close_matches(target, options, n=1, cutoff=0.6)
    if candidates:
        return f"{msg}, did you mean {candidates[0]}?"
    return msg
282
+
283
+
284
+ def load_plugins(group: str) -> Dict[str, Any]:
285
+ """Load harverster plugins."""
286
+ eps = entry_points().select(group=group)
287
+ plugins = {}
288
+ for ep in eps:
289
+ plugins[ep.name] = ep.load()
290
+ return plugins
291
+
292
+
293
def exception_handler(exception: BaseException) -> None:
    """Log an exception and terminate the process with exit code 1.

    With the logger at INFO or above only the message is logged, plus a
    hint to raise verbosity; below INFO the full traceback is attached.
    """
    message = str(exception)
    if logger.level >= logging.INFO:
        logger.critical(
            message + " - increase verbosity for more information",
            exc_info=None,
        )
    else:
        logger.critical(message, exc_info=exception)
    raise SystemExit(1)
303
+
304
+
305
def daemon(
    func: Callable[..., Any],
) -> Callable[..., mctx.ForkProcess]:
    """Decorator that runs the wrapped function in a daemon process.

    Put ``@daemon`` above a function to have each call fork a daemonised
    child process executing it; the started process object is returned.
    """

    def wrapper(*args: Any, **kwargs: Any) -> mctx.ForkProcess:
        # Explicit fork start method; children are daemonic so they die
        # with the parent.
        process = mp.get_context("fork").Process(
            target=func, args=args, kwargs=kwargs, daemon=True
        )
        process.start()
        return process

    return wrapper
320
+
321
+
322
def timedelta_to_str(seconds: Union[int, float]) -> str:
    """Convert seconds to a more human readable format.

    Parameters
    ----------
    seconds: int or float
        Duration in seconds.

    Returns
    -------
    str:
        E.g. ``"1 Hour 1 Min. 1 Sec."``; components that are zero are
        omitted, so ``0`` yields an empty string.
    """
    hours = seconds // 60**2
    minutes = (seconds // 60) % 60
    sec = round(seconds - (hours * 60 + minutes) * 60, 2)
    # Bug fix: the previous dict keyed on the numeric component values
    # ({sec: "Sec.", ...}), so numerically equal components collided and
    # dropped units (e.g. 3661 s rendered as "1 Hour" only).  A tuple
    # sequence keeps every component.
    parts = []
    for num, unit in ((hours, "Hour"), (minutes, "Min."), (sec, "Sec.")):
        if num > 0:
            parts.append(f"{num} {unit}")
    return " ".join(parts)
332
+
333
+
334
class IndexProgress:
    """A helper that displays the progress of index tasks.

    In interactive mode a rich progress bar is rendered; otherwise
    plain-text summaries are printed at most every ``MDC_LOG_INTERVAL``
    seconds (default 30).
    """

    def __init__(
        self,
        total: int = 0,
        interactive: Optional[bool] = None,
        text: str = "Indexing: ",
    ) -> None:
        """Set up progress state.

        Parameters
        ----------
        total: int, default: 0
            Total number of items; 0 means the total is unknown.
        interactive: bool, optional
            Force interactive/non-interactive display.  When ``None``,
            the ``MDC_INTERACTIVE`` environment variable (defaulting to
            terminal detection) decides.
        text: str, default: "Indexing: "
            Prefix text shown next to the progress display.
        """
        if interactive is None:
            self._interactive = bool(
                int(os.getenv("MDC_INTERACTIVE", str(int(Console.is_terminal))))
            )
        else:
            self._interactive = interactive
        # Minimum seconds between plain-text log lines (non-interactive mode).
        self._log_interval = int(os.getenv("MDC_LOG_INTERVAL", "30"))
        self.text = text
        self._done = 0
        self._task: TaskID = TaskID(0)
        self._total = total
        self._start = self._last_log = time.time()
        self._progress = Progress()
        self._last_printed_percent: float = -1.0

    def start(self) -> None:
        """Start the progress bar."""
        self._start = self._last_log = time.time()

        if self._interactive:
            # total=None renders an indeterminate bar when the total is unknown.
            self._task = self._progress.add_task(
                f"[green] {self.text}", total=self._total or None
            )
            self._progress.start()

    def stop(self) -> None:
        """Stop the progress bar."""
        if self._interactive:
            self._progress.stop()
        else:
            # Final text refresh (prints only if the log interval elapsed).
            self._text_update()

    def _text_update(self, bar_width: int = 40) -> None:
        # Render an ASCII bar (known total) or a raw counter (unknown total)
        # and print it when at least ``_log_interval`` seconds have passed.
        elapsed = timedelta(seconds=int(time.time() - self._start))
        log_interval = timedelta(seconds=int(time.time() - self._last_log))
        if self._total > 0:
            # Uses the percentage cached by ``update``; not recomputed here.
            filled = int((self._last_printed_percent / 100) * bar_width)
            bar = "#" * filled + "-" * (bar_width - filled)
            text = f"{self.text} [{bar}] {self._last_printed_percent:>6,.02f}%"
        else:
            text = f"{self.text} [{self._done:>12,}]"
        if log_interval.total_seconds() >= self._log_interval:
            print(f"{text} ({elapsed})", flush=True)
            self._last_log = time.time()

    def update(self, inc: int) -> None:
        """Update the status progress bar by an increment."""
        self._done += inc

        if self._interactive is True:
            # NOTE(review): ``self._done == 0`` can only hold here when the
            # just-incremented counter lands back on zero — the condition
            # looks inverted; confirm when a new description should be set.
            desc = f"{self.text} [{self._done:>10d}]" if self._done == 0 else None
            self._progress.update(self._task, advance=inc, description=desc)
            return

        # Non-interactive: only emit text when the percentage advanced
        # (or always, when the total is unknown).
        frac = self._done / max(self._total, 1)
        pct = frac * 100
        if pct > self._last_printed_percent or self._total == 0:
            self._last_printed_percent = pct
            self._text_update()
402
+
403
+
404
@daemon
def print_performance(
    print_status: EventLike,
    num_files: Counter,
    ingest_queue: QueueLike[Any],
    num_objects: Counter,
) -> None:
    """Display the progress of the crawler.

    Runs in a background daemon process (via ``@daemon``) until
    ``print_status`` is cleared.

    Parameters
    ----------
    print_status: EventLike
        Loop runs while this event is set.
    num_files: Counter
        Shared counter of discovered files; sampled to estimate rate.
    ingest_queue: QueueLike[Any]
        Queue whose backlog size is reported.
    num_objects: Counter
        Shared counter of indexed objects.
    """
    spinner = rich.spinner.Spinner(
        os.getenv("SPINNER", "earth"), text="[b]Preparing crawler ...[/]"
    )
    interactive = bool(
        int(os.getenv("MDC_INTERACTIVE", str(int(Console.is_terminal))))
    )
    log_interval = int(os.getenv("MDC_LOG_INTERVAL", "30"))
    # Sample the counters less often when nobody is watching a live display.
    sample_interval = 1.0 if interactive else 10.0

    def _snapshot() -> Tuple[float, int, int, int]:
        # Sleep between two reads of num_files to estimate files/second.
        start = time.monotonic()
        n0 = num_files.value
        time.sleep(sample_interval)
        dn = num_files.value - n0
        dt = max(1e-6, time.monotonic() - start)
        perf_file = dn / dt
        queue_size = ingest_queue.qsize()
        return perf_file, n0, queue_size, num_objects.value

    def _build_msg(
        perf_file: float,
        discovered: int,
        queue_size: int,
        indexed: int,
        *,
        markup: bool,
    ) -> str:
        # Color thresholds only when markup=True (interactive)
        if markup:
            f_col = (
                "green"
                if perf_file > 500
                else "red" if perf_file < 100 else "blue"
            )
            q_col = (
                "red"
                if queue_size > 100_000
                else "green" if queue_size < 10_000 else "blue"
            )
            return (
                f"[bold]Discovering: [{f_col}]{perf_file:>6,.1f}[/{f_col}] files/s "
                f"#files: [blue]{discovered:>10,.0f}[/blue] "
                f"in queue: [{q_col}]{queue_size:>6,.0f}[/{q_col}] "
                f"#indexed: [blue]{indexed:>10,.0f}[/blue][/bold]"
            )
        else:
            return (
                f"Discovering: {perf_file:,.1f} files/s | "
                f"files={discovered:,} | queue={queue_size:,} | indexed={indexed:,}"
            )

    if interactive:
        with Live(
            spinner, console=Console, refresh_per_second=2.5, transient=True
        ):
            while print_status.is_set():
                perf, disc, qsz, idx = _snapshot()
                spinner.update(text=_build_msg(perf, disc, qsz, idx, markup=True))
            # Clear the last line when done
            Console.print(" " * Console.width, end="\r")
            Console.print(" ")
    else:
        # Non-TTY (e.g. systemd): emit a plain summary every log_interval secs
        next_log = time.monotonic()
        while print_status.is_set():
            perf, disc, qsz, idx = _snapshot()
            now = time.monotonic()
            if now >= next_log:
                # Print one clean line; journald/Cockpit will show one entry
                print(_build_msg(perf, disc, qsz, idx, markup=False), flush=True)
                next_log = now + log_interval
@@ -0,0 +1,207 @@
1
+ """Utilities to convert time delta to CMOR time frequencies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import datetime as dt
6
+ from typing import Any, Optional, cast
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import xarray as xr
11
+
12
+
13
+ def _seconds_from_timedelta(delta: Any) -> float:
14
+ """Normalize timedelta-like objects to seconds (float)."""
15
+ if hasattr(delta, "to_numpy"):
16
+ delta = delta.to_numpy()
17
+
18
+ if isinstance(delta, np.timedelta64):
19
+ return cast(float, delta.astype("timedelta64[ns]").astype("int64") / 1e9)
20
+ if isinstance(delta, dt.timedelta):
21
+ return delta.total_seconds()
22
+ if isinstance(delta, pd.Timedelta):
23
+ return cast(float, delta.total_seconds())
24
+
25
+ try:
26
+ return float(delta)
27
+ except Exception as exc: # pragma: no cover
28
+ raise TypeError(f"Unrecognized timedelta type: {type(delta)}") from exc
29
+
30
+
31
+ def _near(
32
+ value: float, target: float, rel: float = 0.01, abs_tol: float = 60.0
33
+ ) -> bool:
34
+ """Compare two float values within relative or absolute tolerance."""
35
+ return abs(value - target) <= max(abs_tol, abs(target) * rel)
36
+
37
+
38
+ def _map_seconds_to_cmor_like_frequency(sec: float) -> str:
39
+ """
40
+ Map a timestep in seconds to a CMOR-like frequency string.
41
+
42
+ Returns one of:
43
+ subhr, 1hr, 3hr, 6hr,
44
+ day, 6d, 1w, sem,
45
+ mon, season, yr, dec,
46
+ <Xd> (generic fallback),
47
+ unknown
48
+ """
49
+ day = 86400.0
50
+ hour = 3600.0
51
+
52
+ if sec <= 0:
53
+ return "unknown"
54
+
55
+ # Sub-hourly
56
+ if sec < 0.5 * hour:
57
+ return "subhr"
58
+
59
+ # Hourly
60
+ if _near(sec, hour):
61
+ return "1hr"
62
+ if _near(sec, 3 * hour):
63
+ return "3hr"
64
+ if _near(sec, 6 * hour):
65
+ return "6hr"
66
+
67
+ # Daily
68
+ if _near(sec, day):
69
+ return "day"
70
+
71
+ # Multi-day (requested extras)
72
+ if _near(sec, 6 * day):
73
+ return "6d"
74
+ if _near(sec, 7 * day):
75
+ return "1w"
76
+ if _near(sec, 14 * day):
77
+ return "sem"
78
+
79
+ # Monthly-ish
80
+ if 20 * day <= sec <= 40 * day:
81
+ return "mon"
82
+
83
+ # Seasonal-ish (~3 months)
84
+ if 80 * day <= sec <= 100 * day:
85
+ return "season"
86
+
87
+ # Yearly-ish
88
+ if 350 * day <= sec <= 380 * day:
89
+ return "yr"
90
+
91
+ # Decadal-ish
92
+ if 9 * 365 * day <= sec <= 11 * 365 * day:
93
+ return "dec"
94
+
95
+ # Fallback: express as days
96
+ return f"{sec / day:.3g}d"
97
+
98
+
99
def _find_time_coord(
    ds: xr.Dataset,
    time_coord: Optional[str] = None,
) -> Optional[xr.DataArray]:
    """Locate the dataset's time coordinate, best effort.

    Search order: the explicitly named coordinate/variable, a coordinate
    literally named ``time``, CF attributes (``standard_name``/``axis``),
    coordinates dimensioned by ``time``, and finally any datetime-like
    variable on a ``time`` dimension.  Returns ``None`` when nothing
    matches.
    """
    # Explicit name wins; None when it does not exist at all.
    if time_coord is not None:
        if time_coord in ds.coords:
            return ds.coords[time_coord]
        return ds[time_coord] if time_coord in ds.variables else None

    if "time" in ds.coords:
        return ds.coords["time"]

    # CF metadata: standard_name == "time" or axis == "T".
    for candidate in ds.coords.values():
        if (
            candidate.attrs.get("standard_name", "").lower() == "time"
            or candidate.attrs.get("axis", "") == "T"
        ):
            return cast(xr.DataArray, candidate)

    # First coordinate that is dimensioned by a "time" dim.
    for candidate in ds.coords.values():
        if any(dim.lower() == "time" for dim in candidate.dims):
            return candidate

    # Last resort: any variable on a "time" dim whose dtype is
    # datetime-like or object (object covers cftime values).
    for name in ds.variables:
        variable = ds[name]
        if any(str(dim).lower() == "time" for dim in variable.dims):
            if np.issubdtype(variable.dtype, np.datetime64) or variable.dtype == "O":
                return ds[name]

    return None
140
+
141
+
142
def infer_cmor_like_time_frequency(
    ds: xr.Dataset,
    time_coord: Optional[str] = None,
) -> str:
    """
    Infer a CMOR-like time frequency from the first two valid time entries.

    Parameters
    ----------
    ds:
        Open xarray Dataset.
    time_coord:
        Optional explicit name of the time coordinate.

    Returns
    -------
    freq : str
        One of:
        - 'fx' : no/insufficient/constant time
        - 'subhr'
        - '1hr', '3hr', '6hr'
        - 'day'
        - '6d', '1w', 'sem'
        - 'mon', 'season', 'yr', 'dec'
        - '<Xd>' : generic days fallback
        - 'unknown' : invalid/negative step
    """
    t = _find_time_coord(ds, time_coord=time_coord)

    if t is None:
        return "fx"

    # Ensure 1D along its primary dim
    # NOTE(review): ``isel`` with a full slice does not reduce
    # dimensionality, so a >1-D coordinate remains >1-D here — confirm
    # the intended behaviour.
    if t.ndim != 1:
        main_dim = t.dims[0]
        t = t.isel({main_dim: slice(None)})

    if t.size < 2:
        return "fx"

    # Extract values
    vals = np.asarray(t.values).ravel()

    # Try via pandas for datetime64 / cftime that can be coerced
    # (non-coercible entries become NaT and are dropped below).
    dt_like = pd.to_datetime(vals, errors="coerce")
    valid = dt_like[~dt_like.isna()]

    if valid.size >= 2:
        uniq = np.unique(valid)
        if uniq.size < 2:
            return "fx"
        # np.unique already returns sorted values; this sort is a no-op.
        uniq.sort()
        # Only the first adjacent step is inspected; irregular spacing
        # later in the series is not detected.
        delta = uniq[1] - uniq[0]
    else:
        # Fallback for non-coercible types: keep non-null, sort, diff
        non_null = [v for v in vals if v is not None]
        if len(non_null) < 2:
            return "fx"
        non_null = sorted(non_null)
        if non_null[0] == non_null[1]:
            return "fx"
        delta = non_null[1] - non_null[0]

    sec = _seconds_from_timedelta(delta)
    freq = _map_seconds_to_cmor_like_frequency(sec)
    return freq