metadata_crawler-2510.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metadata_crawler/__init__.py +263 -0
- metadata_crawler/__main__.py +8 -0
- metadata_crawler/_version.py +1 -0
- metadata_crawler/api/__init__.py +1 -0
- metadata_crawler/api/cli.py +57 -0
- metadata_crawler/api/config.py +831 -0
- metadata_crawler/api/drs_config.toml +440 -0
- metadata_crawler/api/index.py +151 -0
- metadata_crawler/api/metadata_stores.py +755 -0
- metadata_crawler/api/mixin/__init__.py +7 -0
- metadata_crawler/api/mixin/lookup_mixin.py +112 -0
- metadata_crawler/api/mixin/lookup_tables.py +10010 -0
- metadata_crawler/api/mixin/path_mixin.py +46 -0
- metadata_crawler/api/mixin/template_mixin.py +145 -0
- metadata_crawler/api/storage_backend.py +277 -0
- metadata_crawler/backends/__init__.py +1 -0
- metadata_crawler/backends/intake.py +211 -0
- metadata_crawler/backends/posix.py +121 -0
- metadata_crawler/backends/s3.py +140 -0
- metadata_crawler/backends/swift.py +305 -0
- metadata_crawler/cli.py +547 -0
- metadata_crawler/data_collector.py +278 -0
- metadata_crawler/ingester/__init__.py +1 -0
- metadata_crawler/ingester/mongo.py +206 -0
- metadata_crawler/ingester/solr.py +282 -0
- metadata_crawler/logger.py +153 -0
- metadata_crawler/py.typed +0 -0
- metadata_crawler/run.py +419 -0
- metadata_crawler/utils/__init__.py +482 -0
- metadata_crawler/utils/cftime_utils.py +207 -0
- metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
- metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
- metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
- metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
- metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0

metadata_crawler/utils/__init__.py
@@ -0,0 +1,482 @@
"""Random utility functions."""

import difflib
import logging
import multiprocessing as mp
import multiprocessing.context as mctx
import os
import sys
import time
from datetime import datetime, timedelta
from importlib.metadata import entry_points
from typing import (
    IO,
    Any,
    AsyncIterator,
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Protocol,
    Set,
    Tuple,
    TypeAlias,
    TypeVar,
    Union,
)

import ciso8601
import orjson
import rich.console
import rich.spinner
from dateutil.parser import isoparse
from rich.live import Live
from rich.progress import Progress, TaskID

from ..logger import logger

T = TypeVar("T")
U = TypeVar("U")


class SimpleQueueLike(Protocol[T]):
    """A simple queue-like protocol class."""

    def put(self, item: T) -> None:  # noqa
        ...

    def get(self) -> T:  # noqa
        ...


class QueueLike(Protocol[T]):
    """A queue-like protocol class."""

    def put(self, item: T) -> None:  # noqa
        ...

    def get(
        self, block: bool = True, timeout: Optional[float] = ...
    ) -> T:  # noqa
        ...

    def qsize(self) -> int:  # noqa
        ...


class EventLike(Protocol):
    """An event-like protocol class."""

    def set(self) -> None:  # noqa
        ...

    def clear(self) -> None:  # noqa
        ...

    def is_set(self) -> bool:  # noqa
        ...

    def wait(self) -> None:  # noqa
        ...


class LockLike(Protocol):
    """A lock-like protocol class."""

    def acquire(
        self, blocking: bool = ..., timeout: Optional[float] = ...
    ) -> bool:  # noqa
        ...

    def release(self) -> None:  # noqa
        ...

    def __enter__(self) -> "LockLike": ...

    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None: ...


class ValueLike(Protocol[U]):
    """A value-like protocol class."""

    value: U

    def get_lock(self) -> Any:  # noqa
        ...


class FilesystemLike(Protocol):
    """File-like opener protocol (e.g., fsspec)."""

    def open(
        self,
        path: str,
        mode: str = "rt",
        compression: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> IO[str]:  # noqa
        ...


Counter: TypeAlias = ValueLike[int]
PrintLock = mp.Lock()
Console = rich.console.Console(force_terminal=sys.stdout.isatty(), stderr=True)
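
These are structural (PEP 544) protocols: the concrete multiprocessing primitives satisfy them without inheriting from them, which keeps call sites agnostic of the exact queue/event/counter implementation. A minimal sketch (the drain function below is illustrative, not part of the package):

import multiprocessing as mp
import queue as _queue


def drain(items: QueueLike[str], stop: EventLike, done: Counter) -> None:
    # Works with anything of the right shape, e.g. mp.Queue/mp.Event/mp.Value
    while not stop.is_set():
        try:
            items.get(block=True, timeout=1.0)
        except _queue.Empty:
            continue
        with done.get_lock():       # ValueLike exposes value and get_lock()
            done.value += 1


work: QueueLike[str] = mp.Queue()   # put/get/qsize match QueueLike
halt: EventLike = mp.Event()        # set/clear/is_set/wait match EventLike
seen: Counter = mp.Value("i", 0)    # Counter = ValueLike[int]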


class MetadataCrawlerException(Exception):
    """Custom exception for the crawling."""


class EmptyCrawl(MetadataCrawlerException):
    """Custom exception for a crawl with no results."""


async def create_async_iterator(itt: Iterable[Any]) -> AsyncIterator[Any]:
    """Create an async iterator from a sync iterable."""
    for item in itt:
        yield item


def _parse_iso_datetime(s: str) -> datetime:
    # ciso8601 raises ValueError on malformed input (it does not return
    # None), so fall back to ``datetime.fromisoformat`` explicitly.
    try:
        return ciso8601.parse_datetime(s)
    except ValueError:
        return datetime.fromisoformat(s)


def parse_batch(
    lines: List[str],
    timestamp_keys: Set[str],
) -> List[Dict[str, Any]]:
    """Parse a batch of NDJSON lines and convert timestamp fields.

    Parameters
    ----------
    lines : list of str
        Raw NDJSON lines.
    timestamp_keys : set of str
        Keys that should be parsed as datetimes.

    Returns
    -------
    list of dict
        Parsed objects with timestamp fields converted to ``datetime``.
    """
    out: List[Dict[str, Any]] = []
    append = out.append
    loads = orjson.loads
    parse_dt = _parse_iso_datetime

    for line in lines:
        obj: Dict[str, Any] = loads(line)
        for k in timestamp_keys:
            v = obj.get(k, None)
            if v is None:
                continue
            if isinstance(v, str):
                obj[k] = parse_dt(v)
            elif isinstance(v, list):
                obj[k] = [parse_dt(x) if isinstance(x, str) else x for x in v]
        append(obj)
    return out
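
A short usage sketch (the sample records are made up):

lines = [
    '{"file": "a.nc", "time": "2020-01-01T00:00:00"}',
    '{"file": "b.nc", "time": ["2020-01-01T00:00:00", "2020-02-01T00:00:00"]}',
]
records = parse_batch(lines, timestamp_keys={"time"})
# records[0]["time"] is now a datetime, records[1]["time"] a list of
# datetimes; every other field passes through unchanged.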


def convert_str_to_timestamp(
    time_str: str, alternative: str = "0001-01-01"
) -> datetime:
    """Convert a string representation of a time step to a timestamp.

    Parameters
    ----------
    time_str: str
        Representation of the time step in formats:

        - %Y%m%d%H%M%S%f (year, month, day, hour, minute, second, millisecond)
        - %Y%m%d%H%M (year, month, day, hour, minute)
        - %Y%m (year, month)
        - %Y%m%dT%H%M (year, month, day, hour, minute with T separator)
        - %Y%j (year and day of year, e.g. 2022203 for 22nd July 2022)
        - %Y (year only)
    alternative: str, default: 0001-01-01
        If conversion fails, the alternative/default value the time step
        gets assigned to.

    Returns
    -------
    datetime: Timestamp representation of the input time step, equivalent
        to ISO strings such as %Y, %Y-%m-%d or %Y-%m-%dT%H:%M:%S
    """
    _date = isoparse(alternative)
    _time = f"{_date.strftime('%H')}:{_date.strftime('%M')}"
    _day = _date.strftime("%d")
    _mon = _date.strftime("%m")
    has_t_separator = "T" in time_str
    position_t = time_str.find("T") if has_t_separator else -1
    if not time_str:
        return _date
    # Strip anything that's not a number from the string; the result is
    # not valid if it is empty or starts with a letter, such as 'fx'.
    digits = "".join(filter(str.isdigit, time_str))
    l_times = len(digits)
    if not l_times:
        return _date
    try:
        if l_times <= 4:
            # Suppose this is a year only
            return isoparse(f"{digits.zfill(4)}-{_mon}-{_day}T{_time}")
        if l_times <= 6:
            # Suppose this is %Y%m
            return isoparse(f"{digits[:4]}-{digits[4:].zfill(2)}-{_day}T{_time}")
        if l_times == 7:
            # Suppose this is %Y%j (year and day of year)
            year = int(digits[:4])
            day_of_year = int(digits[4:])
            date = datetime(year, 1, 1, _date.hour, _date.minute) + timedelta(
                days=day_of_year - 1
            )
            return date
        if l_times <= 8:
            # Suppose this is %Y%m%d
            return isoparse(
                f"{digits[:4]}-{digits[4:6]}-{digits[6:].zfill(2)}T{_time}"
            )

        date_str = f"{digits[:4]}-{digits[4:6]}-{digits[6:8]}"
        time = digits[8:]
        if len(time) <= 2:
            time = time.zfill(2)
        else:
            # Always drop seconds
            time = time[:2] + ":" + time[2 : min(4, len(time))].zfill(2)
        return isoparse(f"{date_str}T{time}")

    except ValueError:
        if has_t_separator and position_t > 0:
            date_part = time_str[:position_t]
            time_part = time_str[position_t + 1 :]

            date_digits = "".join(filter(str.isdigit, date_part))
            if len(date_digits) >= 8:
                return isoparse(
                    f"{date_digits[:4]}-{date_digits[4:6]}"
                    f"-{date_digits[6:8]}T{time_part[:2].zfill(2)}"
                )

        return _date
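
A few concrete inputs and the timestamps they resolve to (each follows one branch above; with the default alternative, missing parts fall back to 0001-01-01T00:00):

convert_str_to_timestamp("2000")          # datetime(2000, 1, 1, 0, 0)    %Y
convert_str_to_timestamp("200007")        # datetime(2000, 7, 1, 0, 0)    %Y%m
convert_str_to_timestamp("2022203")       # datetime(2022, 7, 22, 0, 0)   %Y%j
convert_str_to_timestamp("20000701")      # datetime(2000, 7, 1, 0, 0)    %Y%m%d
convert_str_to_timestamp("200007011230")  # datetime(2000, 7, 1, 12, 30)  %Y%m%d%H%M
convert_str_to_timestamp("fx")            # datetime(1, 1, 1, 0, 0), the alternative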


def find_closest(msg: str, target: str, options: Iterable[str]) -> str:
    """Find the closest match for a target within a collection of items.

    Parameters
    ----------
    msg: The base message a suggestion gets appended to.
    target: The string to match.
    options: A list of candidate strings.

    Returns
    -------
    str: The message, with a "did you mean" suggestion if one was found.
    """
    matches = difflib.get_close_matches(target, options, n=1, cutoff=0.6)
    suffix = f", did you mean {matches[0]}?" if matches else ""
    return msg + suffix
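
For example, when a user passes an unknown backend name:

find_closest("Unknown backend 'posx'", "posx", ["posix", "s3", "swift"])
# -> "Unknown backend 'posx', did you mean posix?"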


def load_plugins(group: str) -> Dict[str, Any]:
    """Load harvester plugins."""
    eps = entry_points().select(group=group)
    plugins = {}
    for ep in eps:
        plugins[ep.name] = ep.load()
    return plugins
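
Plugins are discovered via standard importlib.metadata entry points. A hypothetical third-party distribution would register itself in its pyproject.toml (the group and plugin names below are made up for illustration):

# [project.entry-points."metadata_crawler.backends"]
# my_backend = "my_pkg.backend:MyBackend"
#
# ... which this helper then picks up:
backends = load_plugins("metadata_crawler.backends")
backend_cls = backends["my_backend"]  # the loaded MyBackend object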


def exception_handler(exception: BaseException) -> None:
    """Handle raising exceptions appropriately."""
    msg = str(exception)
    if logger.level >= logging.INFO:
        msg += " - increase verbosity for more information"
        exc_info = None
    else:
        exc_info = exception
    logger.critical(msg, exc_info=exc_info)
    raise SystemExit(1)


def daemon(
    func: Callable[..., Any],
) -> Callable[..., mctx.ForkProcess]:
    """Background-process decorator.

    Use @daemon above the function you want to run in the background.
    """

    def background_func(*args: Any, **kwargs: Any) -> mctx.ForkProcess:
        ctx = mp.get_context("fork")
        proc = ctx.Process(target=func, args=args, kwargs=kwargs, daemon=True)
        proc.start()
        return proc

    return background_func
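
Decorated functions return immediately with a handle to the forked worker (note the "fork" start method, so this is POSIX-only). For example:

@daemon
def heartbeat(interval: float) -> None:
    while True:
        print("still crawling ...", flush=True)
        time.sleep(interval)


proc = heartbeat(30.0)  # starts a daemonized ForkProcess and returns at once
# ... do the actual work ...
proc.terminate()        # daemons also exit together with the parent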


def timedelta_to_str(seconds: Union[int, float]) -> str:
    """Convert seconds to a more human readable format."""
    hours = int(seconds // 60**2)
    minutes = int(seconds // 60) % 60
    sec = round(seconds - (hours * 60 + minutes) * 60, 2)
    out = []
    # Iterate over tuples rather than a dict: equal values (e.g. 1 hour,
    # 1 minute and 1 second) would collide as dict keys.
    for num, letter in ((sec, "Sec."), (minutes, "Min."), (hours, "Hour")):
        if num > 0:
            out.append(f"{num} {letter}")
    return " ".join(out[::-1])


class IndexProgress:
    """A helper that displays the progress of index tasks."""

    def __init__(
        self,
        total: int = 0,
        interactive: Optional[bool] = None,
        text: str = "Indexing: ",
    ) -> None:
        if interactive is None:
            self._interactive = bool(
                int(os.getenv("MDC_INTERACTIVE", str(int(Console.is_terminal))))
            )
        else:
            self._interactive = interactive
        self._log_interval = int(os.getenv("MDC_LOG_INTERVAL", "30"))
        self.text = text
        self._done = 0
        self._task: TaskID = TaskID(0)
        self._total = total
        self._start = self._last_log = time.time()
        self._progress = Progress()
        self._last_printed_percent: float = -1.0

    def start(self) -> None:
        """Start the progress bar."""
        self._start = self._last_log = time.time()

        if self._interactive:
            self._task = self._progress.add_task(
                f"[green] {self.text}", total=self._total or None
            )
            self._progress.start()

    def stop(self) -> None:
        """Stop the progress bar."""
        if self._interactive:
            self._progress.stop()
        else:
            self._text_update()

    def _text_update(self, bar_width: int = 40) -> None:
        elapsed = timedelta(seconds=int(time.time() - self._start))
        log_interval = timedelta(seconds=int(time.time() - self._last_log))
        if self._total > 0:
            filled = int((self._last_printed_percent / 100) * bar_width)
            bar = "#" * filled + "-" * (bar_width - filled)
            text = f"{self.text} [{bar}] {self._last_printed_percent:>6,.02f}%"
        else:
            text = f"{self.text} [{self._done:>12,}]"
        if log_interval.total_seconds() >= self._log_interval:
            print(f"{text} ({elapsed})", flush=True)
            self._last_log = time.time()

    def update(self, inc: int) -> None:
        """Update the status progress bar by an increment."""
        self._done += inc

        if self._interactive is True:
            # With an unknown total there is no percentage to show, so
            # display the running count in the description instead.
            desc = f"{self.text} [{self._done:>10d}]" if self._total == 0 else None
            self._progress.update(self._task, advance=inc, description=desc)
            return

        frac = self._done / max(self._total, 1)
        pct = frac * 100
        if pct > self._last_printed_percent or self._total == 0:
            self._last_printed_percent = pct
            self._text_update()
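
Typical lifecycle, with illustrative numbers:

progress = IndexProgress(total=1_000, text="Indexing: ")
progress.start()
for _ in range(100):
    # ... index a batch of 10 records ...
    progress.update(10)
progress.stop()

Interactive terminals get a rich progress bar; non-TTY runs (MDC_INTERACTIVE=0 or redirected output) fall back to one plain text line every MDC_LOG_INTERVAL seconds.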


@daemon
def print_performance(
    print_status: EventLike,
    num_files: Counter,
    ingest_queue: QueueLike[Any],
    num_objects: Counter,
) -> None:
    """Display the progress of the crawler."""
    spinner = rich.spinner.Spinner(
        os.getenv("SPINNER", "earth"), text="[b]Preparing crawler ...[/]"
    )
    interactive = bool(
        int(os.getenv("MDC_INTERACTIVE", str(int(Console.is_terminal))))
    )
    log_interval = int(os.getenv("MDC_LOG_INTERVAL", "30"))
    sample_interval = 1.0 if interactive else 10.0

    def _snapshot() -> Tuple[float, int, int, int]:
        start = time.monotonic()
        n0 = num_files.value
        time.sleep(sample_interval)
        dn = num_files.value - n0
        dt = max(1e-6, time.monotonic() - start)
        perf_file = dn / dt
        queue_size = ingest_queue.qsize()
        return perf_file, n0, queue_size, num_objects.value

    def _build_msg(
        perf_file: float,
        discovered: int,
        queue_size: int,
        indexed: int,
        *,
        markup: bool,
    ) -> str:
        # Color thresholds only when markup=True (interactive)
        if markup:
            f_col = (
                "green"
                if perf_file > 500
                else "red" if perf_file < 100 else "blue"
            )
            q_col = (
                "red"
                if queue_size > 100_000
                else "green" if queue_size < 10_000 else "blue"
            )
            return (
                f"[bold]Discovering: [{f_col}]{perf_file:>6,.1f}[/{f_col}] files/s "
                f"#files: [blue]{discovered:>10,.0f}[/blue] "
                f"in queue: [{q_col}]{queue_size:>6,.0f}[/{q_col}] "
                f"#indexed: [blue]{indexed:>10,.0f}[/blue][/bold]"
            )
        else:
            return (
                f"Discovering: {perf_file:,.1f} files/s | "
                f"files={discovered:,} | queue={queue_size:,} | indexed={indexed:,}"
            )

    if interactive:
        with Live(
            spinner, console=Console, refresh_per_second=2.5, transient=True
        ):
            while print_status.is_set():
                perf, disc, qsz, idx = _snapshot()
                spinner.update(text=_build_msg(perf, disc, qsz, idx, markup=True))
        # Clear the last line when done
        Console.print(" " * Console.width, end="\r")
        Console.print(" ")
    else:
        # Non-TTY (e.g. systemd): emit a plain summary every log_interval secs
        next_log = time.monotonic()
        while print_status.is_set():
            perf, disc, qsz, idx = _snapshot()
            now = time.monotonic()
            if now >= next_log:
                # Print one clean line; journald/Cockpit will show one entry
                print(_build_msg(perf, disc, qsz, idx, markup=False), flush=True)
                next_log = now + log_interval
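
Because of @daemon, calling print_performance forks a reporter process and returns its handle immediately; the caller flips the event off to stop it. A wiring sketch (the loop below stands in for the real data collector):

import multiprocessing as mp

status = mp.Event()
files_seen = mp.Value("i", 0)
objects_indexed = mp.Value("i", 0)
ingest = mp.Queue()

status.set()
reporter = print_performance(status, files_seen, ingest, objects_indexed)
try:
    for path in ("a.nc", "b.nc"):  # stand-in for the real crawl
        ingest.put(path)
        with files_seen.get_lock():
            files_seen.value += 1
finally:
    status.clear()            # the reporter loop exits
    reporter.join(timeout=5)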

metadata_crawler/utils/cftime_utils.py
@@ -0,0 +1,207 @@
"""Utilities to convert time delta to CMOR time frequencies."""

from __future__ import annotations

import datetime as dt
from typing import Any, Optional, cast

import numpy as np
import pandas as pd
import xarray as xr


def _seconds_from_timedelta(delta: Any) -> float:
    """Normalize timedelta-like objects to seconds (float)."""
    if hasattr(delta, "to_numpy"):
        delta = delta.to_numpy()

    if isinstance(delta, np.timedelta64):
        return cast(float, delta.astype("timedelta64[ns]").astype("int64") / 1e9)
    if isinstance(delta, dt.timedelta):
        return delta.total_seconds()
    if isinstance(delta, pd.Timedelta):
        return cast(float, delta.total_seconds())

    try:
        return float(delta)
    except Exception as exc:  # pragma: no cover
        raise TypeError(f"Unrecognized timedelta type: {type(delta)}") from exc


def _near(
    value: float, target: float, rel: float = 0.01, abs_tol: float = 60.0
) -> bool:
    """Compare two float values within relative or absolute tolerance."""
    return abs(value - target) <= max(abs_tol, abs(target) * rel)


def _map_seconds_to_cmor_like_frequency(sec: float) -> str:
    """
    Map a timestep in seconds to a CMOR-like frequency string.

    Returns one of:
        subhr, 1hr, 3hr, 6hr,
        day, 6d, 1w, sem,
        mon, season, yr, dec,
        <Xd> (generic fallback),
        unknown
    """
    day = 86400.0
    hour = 3600.0

    if sec <= 0:
        return "unknown"

    # Sub-hourly
    if sec < 0.5 * hour:
        return "subhr"

    # Hourly
    if _near(sec, hour):
        return "1hr"
    if _near(sec, 3 * hour):
        return "3hr"
    if _near(sec, 6 * hour):
        return "6hr"

    # Daily
    if _near(sec, day):
        return "day"

    # Multi-day
    if _near(sec, 6 * day):
        return "6d"
    if _near(sec, 7 * day):
        return "1w"
    if _near(sec, 14 * day):
        return "sem"

    # Monthly-ish
    if 20 * day <= sec <= 40 * day:
        return "mon"

    # Seasonal-ish (~3 months)
    if 80 * day <= sec <= 100 * day:
        return "season"

    # Yearly-ish
    if 350 * day <= sec <= 380 * day:
        return "yr"

    # Decadal-ish
    if 9 * 365 * day <= sec <= 11 * 365 * day:
        return "dec"

    # Fallback: express as days
    return f"{sec / day:.3g}d"


def _find_time_coord(
    ds: xr.Dataset,
    time_coord: Optional[str] = None,
) -> Optional[xr.DataArray]:
    """Best-effort detection of the time coordinate."""
    # 1) Explicit
    if time_coord is not None:
        if time_coord in ds.coords:
            return ds.coords[time_coord]
        if time_coord in ds.variables:
            return ds[time_coord]
        return None

    # 2) Coordinate literally named "time"
    if "time" in ds.coords:
        return ds.coords["time"]

    # 3) Coordinate with standard_name="time" or axis="T"
    for coord in ds.coords.values():
        std_name = coord.attrs.get("standard_name", "").lower()
        axis = coord.attrs.get("axis", "")
        if std_name == "time" or axis == "T":
            return cast(xr.DataArray, coord)

    # 4) Any coord whose dim is named "time"
    time_like_coords: list[xr.DataArray] = []
    for coord in ds.coords.values():
        if any(str(dim).lower() == "time" for dim in coord.dims):
            time_like_coords.append(coord)
    if time_like_coords:
        return time_like_coords[0]

    # 5) As a last resort: any variable (coord or not) that looks time-like
    for vname in ds.variables:
        var = ds[vname]
        if any(str(dim).lower() == "time" for dim in var.dims):
            # Require datetime-like or object (for cftime) to avoid bogus matches
            if np.issubdtype(var.dtype, np.datetime64) or var.dtype == "O":
                return ds[vname]

    return None


def infer_cmor_like_time_frequency(
    ds: xr.Dataset,
    time_coord: Optional[str] = None,
) -> str:
    """
    Infer a CMOR-like time frequency from the first two valid time entries.

    Parameters
    ----------
    ds:
        Open xarray Dataset.
    time_coord:
        Optional explicit name of the time coordinate.

    Returns
    -------
    freq : str
        One of:

        - 'fx' : no/insufficient/constant time
        - 'subhr'
        - '1hr', '3hr', '6hr'
        - 'day'
        - '6d', '1w', 'sem'
        - 'mon', 'season', 'yr', 'dec'
        - '<Xd>' : generic days fallback
        - 'unknown' : invalid/negative step
    """
    t = _find_time_coord(ds, time_coord=time_coord)

    if t is None:
        return "fx"

    # Ensure 1D along its primary dim by taking the first index of every
    # other dimension (``slice(None)`` alone would be a no-op here).
    if t.ndim != 1:
        main_dim = t.dims[0]
        t = t.isel({dim: 0 for dim in t.dims if dim != main_dim})

    if t.size < 2:
        return "fx"

    # Extract values
    vals = np.asarray(t.values).ravel()

    # Try via pandas for datetime64 / cftime that can be coerced
    dt_like = pd.to_datetime(vals, errors="coerce")
    valid = dt_like[~dt_like.isna()]

    if valid.size >= 2:
        uniq = np.unique(valid)  # np.unique also sorts its result
        if uniq.size < 2:
            return "fx"
        delta = uniq[1] - uniq[0]
    else:
        # Fallback for non-coercible types: keep non-null, sort, diff
        non_null = [v for v in vals if v is not None]
        if len(non_null) < 2:
            return "fx"
        non_null = sorted(non_null)
        if non_null[0] == non_null[1]:
            return "fx"
        delta = non_null[1] - non_null[0]

    sec = _seconds_from_timedelta(delta)
    return _map_seconds_to_cmor_like_frequency(sec)
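
End to end, a minimal sketch with a synthetic daily dataset:

import numpy as np
import pandas as pd
import xarray as xr

ds = xr.Dataset(
    {"tas": ("time", np.random.rand(10))},
    coords={"time": pd.date_range("2000-01-01", periods=10, freq="D")},
)
infer_cmor_like_time_frequency(ds)  # 'day'

# Without any time axis the data counts as a fixed field:
infer_cmor_like_time_frequency(xr.Dataset({"orog": xr.DataArray(1.0)}))  # 'fx'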