metadata-crawler 2509.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of metadata-crawler might be problematic. Click here for more details.
- metadata_crawler/__init__.py +248 -0
- metadata_crawler/__main__.py +8 -0
- metadata_crawler/_version.py +1 -0
- metadata_crawler/api/__init__.py +1 -0
- metadata_crawler/api/cli.py +57 -0
- metadata_crawler/api/config.py +801 -0
- metadata_crawler/api/drs_config.toml +439 -0
- metadata_crawler/api/index.py +132 -0
- metadata_crawler/api/metadata_stores.py +749 -0
- metadata_crawler/api/mixin/__init__.py +7 -0
- metadata_crawler/api/mixin/lookup_mixin.py +112 -0
- metadata_crawler/api/mixin/lookup_tables.py +10010 -0
- metadata_crawler/api/mixin/path_mixin.py +46 -0
- metadata_crawler/api/mixin/template_mixin.py +145 -0
- metadata_crawler/api/storage_backend.py +277 -0
- metadata_crawler/backends/__init__.py +1 -0
- metadata_crawler/backends/intake.py +211 -0
- metadata_crawler/backends/posix.py +121 -0
- metadata_crawler/backends/s3.py +136 -0
- metadata_crawler/backends/swift.py +305 -0
- metadata_crawler/cli.py +539 -0
- metadata_crawler/data_collector.py +258 -0
- metadata_crawler/ingester/__init__.py +1 -0
- metadata_crawler/ingester/mongo.py +193 -0
- metadata_crawler/ingester/solr.py +152 -0
- metadata_crawler/logger.py +142 -0
- metadata_crawler/py.typed +0 -0
- metadata_crawler/run.py +373 -0
- metadata_crawler/utils.py +411 -0
- metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
- metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
- metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
- metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
- metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
"""Random utility functions."""
|
|
2
|
+
|
|
3
|
+
import difflib
|
|
4
|
+
import logging
|
|
5
|
+
import multiprocessing as mp
|
|
6
|
+
import multiprocessing.context as mctx
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
import time
|
|
10
|
+
from datetime import datetime, timedelta
|
|
11
|
+
from importlib.metadata import entry_points
|
|
12
|
+
from typing import (
|
|
13
|
+
IO,
|
|
14
|
+
Any,
|
|
15
|
+
AsyncIterator,
|
|
16
|
+
Callable,
|
|
17
|
+
Dict,
|
|
18
|
+
Iterable,
|
|
19
|
+
List,
|
|
20
|
+
Optional,
|
|
21
|
+
Protocol,
|
|
22
|
+
Set,
|
|
23
|
+
Tuple,
|
|
24
|
+
TypeAlias,
|
|
25
|
+
TypeVar,
|
|
26
|
+
Union,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
import ciso8601
|
|
30
|
+
import orjson
|
|
31
|
+
import rich.console
|
|
32
|
+
import rich.spinner
|
|
33
|
+
from dateutil.parser import isoparse
|
|
34
|
+
from rich.live import Live
|
|
35
|
+
|
|
36
|
+
from .logger import logger
|
|
37
|
+
|
|
38
|
+
# Generic type parameters shared by the structural (Protocol) types below.
T = TypeVar("T")
U = TypeVar("U")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class SimpleQueueLike(Protocol[T]):
    """A simple queue like Type class.

    Structural type for a minimal put/get queue (matches e.g.
    ``multiprocessing.SimpleQueue``).
    """

    def put(self, item: T) -> None:  # noqa
        """Enqueue *item*."""
        ...

    def get(self) -> T:  # noqa
        """Dequeue and return the next item."""
        ...
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class QueueLike(Protocol[T]):
    """A queue like Type class.

    Structural type for a sized queue with blocking/timeout semantics
    (matches e.g. ``queue.Queue`` / ``multiprocessing.Queue``).
    """

    def put(self, item: T) -> None:  # noqa
        """Enqueue *item*."""
        ...

    def get(
        self, block: bool = True, timeout: Optional[float] = ...
    ) -> T:  # noqa
        """Dequeue the next item, optionally blocking with a timeout."""
        ...

    def qsize(self) -> int:  # noqa
        """Return the approximate number of queued items."""
        ...
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class EventLike(Protocol):
    """An event like Type class.

    Structural type matching ``threading.Event`` / ``multiprocessing.Event``.
    """

    def set(self) -> None:  # noqa
        """Set the internal flag."""
        ...

    def clear(self) -> None:  # noqa
        """Reset the internal flag."""
        ...

    def is_set(self) -> bool:  # noqa
        """Return whether the internal flag is set."""
        ...

    def wait(self) -> None:  # noqa
        """Block until the internal flag is set."""
        ...
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class LockLike(Protocol):
    """A lock like Type class.

    Structural type matching ``threading.Lock`` / ``multiprocessing.Lock``.
    """

    def acquire(
        self, blocking: bool = ..., timeout: Optional[float] = ...
    ) -> bool:  # noqa
        """Acquire the lock; return ``True`` on success."""
        ...

    def release(self) -> None:  # noqa
        """Release the lock."""
        ...

    # Context-manager protocol so the lock can be used in ``with`` blocks.
    def __enter__(self) -> "LockLike": ...
    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None: ...
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class ValueLike(Protocol[U]):
    """A value like Type class.

    Structural type for a shared value (matches ``multiprocessing.Value``).
    """

    # The underlying shared value.
    value: U

    def get_lock(self) -> "Any":  # noqa
        """Return the lock guarding access to :attr:`value`."""
        ...
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class FilesystemLike(Protocol):
    """File-like opener protocol (e.g., fsspec)."""

    def open(
        self,
        path: str,
        mode: str = "rt",
        compression: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> IO[str]:  # noqa
        """Open *path* and return a (text-mode by default) file handle."""
        ...
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# Alias for the shared integer counters passed to the progress daemon.
Counter: TypeAlias = ValueLike[int]
# Process-wide lock for serialising printed output across forked workers.
PrintLock = mp.Lock()
# Rich console on stderr; terminal features only when stdout is a TTY.
Console = rich.console.Console(force_terminal=sys.stdout.isatty(), stderr=True)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class MetadataCrawlerException(Exception):
    """Base exception for errors raised during metadata crawling."""
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class EmptyCrawl(MetadataCrawlerException):
    """Custom exception for a crawl that yielded no results."""
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
async def create_async_iterator(itt: Iterable[Any]) -> AsyncIterator[Any]:
    """Create an async iterator from a synchronous iterable.

    NOTE(review): consuming ``itt`` itself is synchronous, so a slow
    iterable will still block the event loop between yields.
    """
    for item in itt:
        yield item
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _parse_iso_datetime(s: str) -> datetime:
    """Parse an ISO-8601 string, falling back to ``datetime.fromisoformat``.

    ``ciso8601.parse_datetime`` raises ``ValueError`` on invalid input
    rather than returning ``None``, so the previous
    ``ciso8601.parse_datetime(s) or datetime.fromisoformat(s)`` form never
    reached its fallback; an explicit try/except restores the intent.
    """
    try:
        return ciso8601.parse_datetime(s)
    except ValueError:
        return datetime.fromisoformat(s)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def parse_batch(
    lines: List[str],
    timestamp_keys: Set[str],
) -> List[Dict[str, Any]]:
    """Parse a batch of NDJSON lines and convert timestamp fields.

    Parameters
    ^^^^^^^^^^
    lines : list of str
        Raw NDJSON lines.
    timestamp_keys : set of str
        Keys that should be parsed as datetimes.

    Returns
    ^^^^^^^
    list of dict
        Parsed objects with timestamp fields converted to ``datetime``.
    """
    parsed: List[Dict[str, Any]] = []
    for raw_line in lines:
        record: Dict[str, Any] = orjson.loads(raw_line)
        for key in timestamp_keys:
            value = record.get(key)
            if isinstance(value, str):
                record[key] = _parse_iso_datetime(value)
            elif isinstance(value, list):
                # Lists may mix strings and already-parsed values; convert
                # only the string entries.
                record[key] = [
                    _parse_iso_datetime(entry) if isinstance(entry, str) else entry
                    for entry in value
                ]
            # Missing keys and any other value types are left untouched.
        parsed.append(record)
    return parsed
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def convert_str_to_timestamp(
    time_str: str, alternative: str = "0001-01-01"
) -> datetime:
    """Convert a string representation of a time step to a datetime.

    Parameters
    ----------
    time_str: str
        Representation of the time step in formats:
        - %Y%m%d%H%M%S%f (year, month, day, hour, minute, second, millisecond)
        - %Y%m%d%H%M (year, month, day, hour, minute)
        - %Y%m (year, month)
        - %Y%m%dT%H%M (year, month, day, hour, minute with T separator)
        - %Y%j (year and day of year, e.g. 2022203 for 22nd July 2022)
        - %Y (year only)
    alternative: str, default: "0001-01-01"
        If conversion fails, the alternative/default date the time step
        gets assigned to.

    Returns
    -------
    datetime: Parsed timestamp; missing components (month, day, time) are
        filled from *alternative*.
    """
    # Fallback date; its month/day/time fill components absent in time_str.
    _date = isoparse(alternative)
    _time = f"{_date.strftime('%H')}:{_date.strftime('%M')}"
    _day = _date.strftime("%d")
    _mon = _date.strftime("%m")
    has_t_separator = "T" in time_str
    position_t = time_str.find("T") if has_t_separator else -1
    # Strip anything that's not a number from the string
    if not time_str:
        return _date
    # Not valid if time repr empty or starts with a letter, such as 'fx'
    digits = "".join(filter(str.isdigit, time_str))
    l_times = len(digits)
    if not l_times:
        return _date
    try:
        if l_times <= 4:
            # Suppose this is a year only
            return isoparse(f"{digits.zfill(4)}-{_mon}-{_day}T{_time}")
        if l_times <= 6:
            # Suppose this is %Y%m or %Y%e
            return isoparse(f"{digits[:4]}-{digits[4:].zfill(2)}-{_day}T{_time}")
        # Year and day of year
        if l_times == 7:
            # Suppose this is %Y%j
            year = int(digits[:4])
            day_of_year = int(digits[4:])
            date = datetime(year, 1, 1, _date.hour, _date.minute) + timedelta(
                days=day_of_year - 1
            )
            return date
        if l_times <= 8:
            # Suppose this is %Y%m%d
            return isoparse(
                f"{digits[:4]}-{digits[4:6]}-{digits[6:].zfill(2)}T{_time}"
            )

        # 9+ digits: %Y%m%d followed by a time-of-day component.
        date_str = f"{digits[:4]}-{digits[4:6]}-{digits[6:8]}"
        time = digits[8:]
        if len(time) <= 2:
            time = time.zfill(2)
        else:
            # Always drop seconds
            time = time[:2] + ":" + time[2 : min(4, len(time))].zfill(2)
        return isoparse(f"{date_str}T{time}")

    except ValueError:
        # Last resort: split manually on the 'T' separator and retry with
        # hours only before giving up and returning the fallback date.
        if has_t_separator and position_t > 0:
            date_part = time_str[:position_t]
            time_part = time_str[position_t + 1 :]

            date_digits = "".join(filter(str.isdigit, date_part))
            if len(date_digits) >= 8:
                return isoparse(
                    f"{date_digits[:4]}-{date_digits[4:6]}"
                    f"-{date_digits[6:8]}T{time_part[:2].zfill(2)}"
                )

        return _date
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def find_closest(msg: str, target: str, options: Iterable[str]) -> str:
    """Append a "did you mean" hint to *msg* when a close match exists.

    Parameters
    ----------
    msg: The base message to return.
    target: The string to match.
    options: A list of candidate strings.

    Returns
    -------
    str: Message, possibly extended with the closest candidate.
    """
    candidates = difflib.get_close_matches(target, options, n=1, cutoff=0.6)
    if candidates:
        return f"{msg}, did you mean {candidates[0]}?"
    return msg
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def load_plugins(group: str) -> Dict[str, Any]:
|
|
284
|
+
"""Load harverster plugins."""
|
|
285
|
+
eps = entry_points().select(group=group)
|
|
286
|
+
plugins = {}
|
|
287
|
+
for ep in eps:
|
|
288
|
+
plugins[ep.name] = ep.load()
|
|
289
|
+
return plugins
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def exception_handler(exception: BaseException) -> None:
    """Log *exception* as critical and terminate with exit status 1."""
    message = str(exception)
    if logger.level >= logging.INFO:
        # Quiet mode: omit the traceback but hint at raising verbosity.
        logger.critical(
            message + " - increase verbosity for more information",
            exc_info=None,
        )
    else:
        # Verbose mode: attach the full traceback to the log record.
        logger.critical(message, exc_info=exception)
    raise SystemExit(1)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def daemon(
    func: Callable[..., Any],
) -> Callable[..., mctx.ForkProcess]:
    """Threading decorator.

    use @daemon above the function you want to run in the background
    """

    def _spawn(*args: Any, **kwargs: Any) -> mctx.ForkProcess:
        # Use the fork start method so the target needs no pickling.
        fork_ctx = mp.get_context("fork")
        process = fork_ctx.Process(
            target=func, args=args, kwargs=kwargs, daemon=True
        )
        process.start()
        return process

    return _spawn
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def timedelta_to_str(seconds: Union[int, float]) -> str:
    """Convert seconds to a more human readable format.

    Parameters
    ----------
    seconds: int or float
        Duration in seconds.

    Returns
    -------
    str: Duration rendered as ``"<h> Hour <m> Min. <s> Sec."`` with
        zero-valued components omitted (empty string for zero duration).
    """
    hours = seconds // 60**2
    minutes = (seconds // 60) % 60
    sec = round(seconds - (hours * 60 + minutes) * 60, 2)
    parts = []
    # Iterate largest unit first. The previous implementation built a dict
    # keyed by the numeric values, so components with equal values collided
    # and were silently dropped (e.g. 61 s rendered as "1 Min." instead of
    # "1 Min. 1 Sec.").
    for num, unit in ((hours, "Hour"), (minutes, "Min."), (sec, "Sec.")):
        if num > 0:
            parts.append(f"{num} {unit}")
    return " ".join(parts)
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
@daemon
def print_performance(
    print_status: EventLike,
    num_files: Counter,
    ingest_queue: QueueLike[Any],
    num_objects: Counter,
) -> None:
    """Display the progress of the crawler.

    Runs as a daemonised fork process (via ``@daemon``) and loops until
    *print_status* is cleared. On a terminal it renders a live spinner;
    otherwise it prints a plain summary line at a fixed interval.
    """
    spinner = rich.spinner.Spinner(
        os.getenv("SPINNER", "earth"), text="[b]Preparing crawler ...[/]"
    )
    # MDC_INTERACTIVE overrides TTY autodetection (1 = force live display).
    interactive = bool(
        int(os.getenv("MDC_INTERACTIVE", str(int(Console.is_terminal))))
    )
    log_interval = int(os.getenv("MDC_LOG_INTERVAL", "30"))
    # Sample more frequently when drawing live; sparsely for plain logging.
    sample_interval = 1.0 if interactive else 10.0

    def _snapshot() -> Tuple[float, int, int, int]:
        # Measure files/s over one sample window; returns
        # (rate, files-at-window-start, queue size, indexed count).
        start = time.monotonic()
        n0 = num_files.value
        time.sleep(sample_interval)
        dn = num_files.value - n0
        dt = max(1e-6, time.monotonic() - start)
        perf_file = dn / dt
        queue_size = ingest_queue.qsize()
        return perf_file, n0, queue_size, num_objects.value

    def _build_msg(
        perf_file: float,
        discovered: int,
        queue_size: int,
        indexed: int,
        *,
        markup: bool,
    ) -> str:
        # Format one status line; rich markup only for the live display.
        # Color thresholds only when markup=True (interactive)
        if markup:
            f_col = (
                "green"
                if perf_file > 500
                else "red" if perf_file < 100 else "blue"
            )
            q_col = (
                "red"
                if queue_size > 100_000
                else "green" if queue_size < 10_000 else "blue"
            )
            return (
                f"[bold]Discovering: [{f_col}]{perf_file:>6,.1f}[/{f_col}] files/s "
                f"#files: [blue]{discovered:>10,.0f}[/blue] "
                f"in queue: [{q_col}]{queue_size:>6,.0f}[/{q_col}] "
                f"#indexed: [blue]{indexed:>10,.0f}[/blue][/bold]"
            )
        else:
            return (
                f"Discovering: {perf_file:,.1f} files/s | "
                f"files={discovered:,} | queue={queue_size:,} | indexed={indexed:,}"
            )

    if interactive:
        with Live(
            spinner, console=Console, refresh_per_second=2.5, transient=True
        ):
            while print_status.is_set():
                perf, disc, qsz, idx = _snapshot()
                spinner.update(text=_build_msg(perf, disc, qsz, idx, markup=True))
        # Clear the last line when done
        Console.print(" " * Console.width, end="\r")
        Console.print(" ")
    else:
        # Non-TTY (e.g. systemd): emit a plain summary every log_interval secs
        next_log = time.monotonic()
        while print_status.is_set():
            perf, disc, qsz, idx = _snapshot()
            now = time.monotonic()
            if now >= next_log:
                # Print one clean line; journald/Cockpit will show one entry
                print(_build_msg(perf, disc, qsz, idx, markup=False), flush=True)
                next_log = now + log_interval