metadata-crawler 2509.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metadata-crawler might be problematic. Click here for more details.

Files changed (34) hide show
  1. metadata_crawler/__init__.py +248 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +801 -0
  7. metadata_crawler/api/drs_config.toml +439 -0
  8. metadata_crawler/api/index.py +132 -0
  9. metadata_crawler/api/metadata_stores.py +749 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +136 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +539 -0
  22. metadata_crawler/data_collector.py +258 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +193 -0
  25. metadata_crawler/ingester/solr.py +152 -0
  26. metadata_crawler/logger.py +142 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +373 -0
  29. metadata_crawler/utils.py +411 -0
  30. metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
  31. metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
  32. metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
  33. metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
  34. metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,411 @@
1
+ """Random utility functions."""
2
+
3
+ import difflib
4
+ import logging
5
+ import multiprocessing as mp
6
+ import multiprocessing.context as mctx
7
+ import os
8
+ import sys
9
+ import time
10
+ from datetime import datetime, timedelta
11
+ from importlib.metadata import entry_points
12
+ from typing import (
13
+ IO,
14
+ Any,
15
+ AsyncIterator,
16
+ Callable,
17
+ Dict,
18
+ Iterable,
19
+ List,
20
+ Optional,
21
+ Protocol,
22
+ Set,
23
+ Tuple,
24
+ TypeAlias,
25
+ TypeVar,
26
+ Union,
27
+ )
28
+
29
+ import ciso8601
30
+ import orjson
31
+ import rich.console
32
+ import rich.spinner
33
+ from dateutil.parser import isoparse
34
+ from rich.live import Live
35
+
36
+ from .logger import logger
37
+
38
# Generic type variables used by the structural (Protocol) types below.
T = TypeVar("T")
U = TypeVar("U")
40
+
41
+
42
class SimpleQueueLike(Protocol[T]):
    """Structural type for a minimal queue offering only ``put``/``get``."""

    def put(self, item: T) -> None:  # noqa
        ...

    def get(self) -> T:  # noqa
        ...
50
+
51
+
52
class QueueLike(Protocol[T]):
    """Structural type for a queue with blocking gets and size reporting."""

    def put(self, item: T) -> None:  # noqa
        ...

    def get(
        self, block: bool = True, timeout: Optional[float] = ...
    ) -> T:  # noqa
        ...

    def qsize(self) -> int:  # noqa
        ...
65
+
66
+
67
class EventLike(Protocol):
    """Structural type for an event flag (e.g. ``multiprocessing.Event``)."""

    def set(self) -> None:  # noqa
        ...

    def clear(self) -> None:  # noqa
        ...

    def is_set(self) -> bool:  # noqa
        ...

    def wait(self) -> None:  # noqa
        ...
81
+
82
+
83
class LockLike(Protocol):
    """Structural type for a lock usable both directly and as a context manager."""

    def acquire(
        self, blocking: bool = ..., timeout: Optional[float] = ...
    ) -> bool:  # noqa
        ...

    def release(self) -> None:  # noqa
        ...

    def __enter__(self) -> "LockLike": ...
    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None: ...
96
+
97
+
98
class ValueLike(Protocol[U]):
    """Structural type for a shared value (e.g. ``multiprocessing.Value``)."""

    # The wrapped value itself.
    value: U

    def get_lock(self) -> "Any":  # noqa
        ...
105
+
106
+
107
class FilesystemLike(Protocol):
    """File-like opener protocol (e.g., fsspec)."""

    # Signature mirrors fsspec's ``AbstractFileSystem.open``.
    def open(
        self,
        path: str,
        mode: str = "rt",
        compression: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> IO[str]:  # noqa
        ...
119
+
120
+
121
# Alias for a shared integer counter used across worker processes.
Counter: TypeAlias = ValueLike[int]
# Process-wide lock for serialising console output.
PrintLock = mp.Lock()
# Shared rich console writing to stderr; terminal features are forced
# only when stdout is attached to a TTY.
Console = rich.console.Console(force_terminal=sys.stdout.isatty(), stderr=True)
124
+
125
+
126
class MetadataCrawlerException(Exception):
    """Base exception for errors raised during metadata crawling."""
128
+
129
+
130
class EmptyCrawl(MetadataCrawlerException):
    """Custom exception for a crawl that produced no results."""
132
+
133
+
134
async def create_async_iterator(itt: Iterable[Any]) -> AsyncIterator[Any]:
    """Wrap a synchronous iterable in an asynchronous iterator.

    Parameters
    ----------
    itt: Iterable[Any]
        The synchronous iterable whose items should be yielded.

    Yields
    ------
    Any:
        The items of ``itt``, one at a time.
    """
    for element in itt:
        yield element
138
+
139
+
140
def _parse_iso_datetime(s: str) -> datetime:
    """Parse an ISO timestamp with ciso8601, falling back to the stdlib.

    ``ciso8601.parse_datetime`` raises ``ValueError`` on input it cannot
    parse — it does not return ``None`` — so the previous expression
    ``ciso8601.parse_datetime(s) or datetime.fromisoformat(s)`` could never
    reach its fallback.  Catching the error makes ``fromisoformat`` a real
    second chance for formats ciso8601 rejects.
    """
    try:
        return ciso8601.parse_datetime(s)
    except ValueError:
        return datetime.fromisoformat(s)
142
+
143
+
144
def parse_batch(
    lines: List[str],
    timestamp_keys: Set[str],
) -> List[Dict[str, Any]]:
    """Parse a batch of NDJSON lines and convert timestamp fields.

    Parameters
    ----------
    lines: list of str
        Raw NDJSON lines.
    timestamp_keys: set of str
        Keys that should be parsed as datetimes.

    Returns
    -------
    list of dict
        Parsed objects with timestamp fields converted to ``datetime``.
    """

    def _to_datetime(value: Any) -> Any:
        # Strings are parsed directly; lists have their string members
        # parsed element-wise; everything else is passed through as-is.
        if isinstance(value, str):
            return _parse_iso_datetime(value)
        if isinstance(value, list):
            return [
                _parse_iso_datetime(item) if isinstance(item, str) else item
                for item in value
            ]
        return value

    records: List[Dict[str, Any]] = []
    for line in lines:
        record: Dict[str, Any] = orjson.loads(line)
        for key in timestamp_keys:
            value = record.get(key, None)
            if value is not None:
                record[key] = _to_datetime(value)
        records.append(record)
    return records
179
+
180
+
181
def convert_str_to_timestamp(
    time_str: str, alternative: str = "0001-01-01"
) -> datetime:
    """Convert a string representation of a time step to an iso timestamp.

    Parameters
    ----------
    time_str: str
        Representation of the time step in formats:
        - %Y%m%d%H%M%S%f (year, month, day, hour, minute, second, millisecond)
        - %Y%m%d%H%M (year, month, day, hour, minute)
        - %Y%m (year, month)
        - %Y%m%dT%H%M (year, month, day, hour, minute with T separator)
        - %Y%j (year and day of year, e.g. 2022203 for 22nd July 2022)
        - %Y (year only)
    alternative: str, default: "0001-01-01"
        If conversion fails, the ISO date string whose components fill in
        (or entirely replace) the missing parts of ``time_str``.

    Returns
    -------
    datetime: Parsed timestamp of the input time step; the ``alternative``
        date when ``time_str`` is empty or cannot be interpreted.
    """
    # Components of the fallback date, reused to pad partial inputs.
    _date = isoparse(alternative)
    _time = f"{_date.strftime('%H')}:{_date.strftime('%M')}"
    _day = _date.strftime("%d")
    _mon = _date.strftime("%m")
    # Remember the "T" position up front: the except-branch below re-parses
    # the raw string around it if the digit heuristics fail.
    has_t_separator = "T" in time_str
    position_t = time_str.find("T") if has_t_separator else -1
    # Strip anything that's not a number from the string
    if not time_str:
        return _date
    # Not valid if time repr empty or starts with a letter, such as 'fx'
    digits = "".join(filter(str.isdigit, time_str))
    l_times = len(digits)
    if not l_times:
        return _date
    # Dispatch on the number of digits; order matters, each branch assumes
    # the previous ones did not match.
    try:
        if l_times <= 4:
            # Suppose this is a year only
            return isoparse(f"{digits.zfill(4)}-{_mon}-{_day}T{_time}")
        if l_times <= 6:
            # Suppose this is %Y%m
            return isoparse(f"{digits[:4]}-{digits[4:].zfill(2)}-{_day}T{_time}")
        # Year and day of year
        if l_times == 7:
            # Suppose this is %Y%j (ordinal day of year)
            year = int(digits[:4])
            day_of_year = int(digits[4:])
            date = datetime(year, 1, 1, _date.hour, _date.minute) + timedelta(
                days=day_of_year - 1
            )
            return date
        if l_times <= 8:
            # Suppose this is %Y%m%d
            return isoparse(
                f"{digits[:4]}-{digits[4:6]}-{digits[6:].zfill(2)}T{_time}"
            )

        # More than 8 digits: date plus a time-of-day portion.
        date_str = f"{digits[:4]}-{digits[4:6]}-{digits[6:8]}"
        time = digits[8:]
        if len(time) <= 2:
            # Hour only.
            time = time.zfill(2)
        else:
            # Always drop seconds; keep only HH:MM.
            time = time[:2] + ":" + time[2 : min(4, len(time))].zfill(2)
        return isoparse(f"{date_str}T{time}")

    except ValueError:
        # Digit heuristics produced an invalid date; as a last resort split
        # the raw string on its "T" separator and retry date + hour.
        if has_t_separator and position_t > 0:
            date_part = time_str[:position_t]
            time_part = time_str[position_t + 1 :]

            date_digits = "".join(filter(str.isdigit, date_part))
            if len(date_digits) >= 8:
                return isoparse(
                    f"{date_digits[:4]}-{date_digits[4:6]}"
                    f"-{date_digits[6:8]}T{time_part[:2].zfill(2)}"
                )

        return _date
263
+
264
+
265
def find_closest(msg: str, target: str, options: Iterable[str]) -> str:
    """Append a "did you mean" hint to a message when a close match exists.

    Parameters
    ----------
    msg: str
        The base message the suggestion is appended to.
    target: str
        The string to match.
    options: Iterable[str]
        Candidate strings to search for a close match.

    Returns
    -------
    str:
        ``msg``, extended with the closest candidate if one scores above
        the similarity cutoff, otherwise unchanged.
    """
    candidates = difflib.get_close_matches(target, options, n=1, cutoff=0.6)
    if not candidates:
        return msg
    return f"{msg}, did you mean {candidates[0]}?"
281
+
282
+
283
+ def load_plugins(group: str) -> Dict[str, Any]:
284
+ """Load harverster plugins."""
285
+ eps = entry_points().select(group=group)
286
+ plugins = {}
287
+ for ep in eps:
288
+ plugins[ep.name] = ep.load()
289
+ return plugins
290
+
291
+
292
def exception_handler(exception: BaseException) -> None:
    """Log a fatal exception appropriately and exit the process.

    When the logger is at INFO level or above, only the message plus a
    hint to raise verbosity is logged; at more verbose levels the full
    traceback is included.  Always raises ``SystemExit(1)``.
    """
    verbose = logger.level < logging.INFO
    if verbose:
        logger.critical(str(exception), exc_info=exception)
    else:
        logger.critical(
            str(exception) + " - increase verbosity for more information",
            exc_info=None,
        )
    raise SystemExit(1)
302
+
303
+
304
def daemon(
    func: Callable[..., Any],
) -> Callable[..., mctx.ForkProcess]:
    """Threading decorator.

    Use @daemon above the function you want to run in the background.
    The wrapped callable starts ``func`` in a forked daemon process and
    returns the process handle.
    """

    def run_in_background(*args: Any, **kwargs: Any) -> mctx.ForkProcess:
        fork_ctx = mp.get_context("fork")
        process = fork_ctx.Process(
            target=func, args=args, kwargs=kwargs, daemon=True
        )
        process.start()
        return process

    return run_in_background
319
+
320
+
321
def timedelta_to_str(seconds: Union[int, float]) -> str:
    """Convert seconds to a more human readable format.

    Parameters
    ----------
    seconds: int or float
        Duration in seconds.

    Returns
    -------
    str:
        Human readable duration such as ``"1 Hour 1 Min. 30 Sec."``;
        an empty string for a zero duration.
    """
    hours = seconds // 60**2
    minutes = (seconds // 60) % 60
    sec = round(seconds - (hours * 60 + minutes) * 60, 2)
    out = []
    # Iterate over (value, unit) pairs.  A dict keyed by the numeric values
    # (as the original did) silently drops units whenever two components
    # are equal, e.g. 3661 s rendered as "1 Hour" instead of
    # "1 Hour 1 Min. 1 Sec.".
    for num, letter in ((sec, "Sec."), (minutes, "Min."), (hours, "Hour")):
        if num > 0:
            out.append(f"{num} {letter}")
    # Units were collected smallest-first; display largest-first.
    return " ".join(out[::-1])
331
+
332
+
333
@daemon
def print_performance(
    print_status: EventLike,
    num_files: Counter,
    ingest_queue: QueueLike[Any],
    num_objects: Counter,
) -> None:
    """Display the progress of the crawler.

    Runs as a background daemon process (via ``@daemon``) and keeps
    reporting discovery rate, number of discovered files, queue backlog
    and number of indexed objects while ``print_status`` is set.

    Parameters
    ----------
    print_status: EventLike
        Event flag; the reporting loop keeps running while it is set.
    num_files: Counter
        Shared counter of discovered files.
    ingest_queue: QueueLike[Any]
        Queue whose backlog size is reported.
    num_objects: Counter
        Shared counter of indexed objects.
    """
    spinner = rich.spinner.Spinner(
        os.getenv("SPINNER", "earth"), text="[b]Preparing crawler ...[/]"
    )
    # Interactive mode defaults to whether the console is a terminal but
    # can be overridden with MDC_INTERACTIVE=0/1.
    interactive = bool(
        int(os.getenv("MDC_INTERACTIVE", str(int(Console.is_terminal))))
    )
    log_interval = int(os.getenv("MDC_LOG_INTERVAL", "30"))
    # Sample faster when a human is watching, slower for plain logs.
    sample_interval = 1.0 if interactive else 10.0

    def _snapshot() -> Tuple[float, int, int, int]:
        # Measure the file-discovery rate over one sampling window.
        # NOTE: the returned "discovered" count is the value at the start
        # of the window (n0), not after the sleep.
        start = time.monotonic()
        n0 = num_files.value
        time.sleep(sample_interval)
        dn = num_files.value - n0
        dt = max(1e-6, time.monotonic() - start)
        perf_file = dn / dt
        queue_size = ingest_queue.qsize()
        return perf_file, n0, queue_size, num_objects.value

    def _build_msg(
        perf_file: float,
        discovered: int,
        queue_size: int,
        indexed: int,
        *,
        markup: bool,
    ) -> str:
        # Color thresholds only when markup=True (interactive)
        if markup:
            f_col = (
                "green"
                if perf_file > 500
                else "red" if perf_file < 100 else "blue"
            )
            q_col = (
                "red"
                if queue_size > 100_000
                else "green" if queue_size < 10_000 else "blue"
            )
            return (
                f"[bold]Discovering: [{f_col}]{perf_file:>6,.1f}[/{f_col}] files/s "
                f"#files: [blue]{discovered:>10,.0f}[/blue] "
                f"in queue: [{q_col}]{queue_size:>6,.0f}[/{q_col}] "
                f"#indexed: [blue]{indexed:>10,.0f}[/blue][/bold]"
            )
        else:
            return (
                f"Discovering: {perf_file:,.1f} files/s | "
                f"files={discovered:,} | queue={queue_size:,} | indexed={indexed:,}"
            )

    if interactive:
        # Live spinner display; transient=True removes it on exit.
        with Live(
            spinner, console=Console, refresh_per_second=2.5, transient=True
        ):
            while print_status.is_set():
                perf, disc, qsz, idx = _snapshot()
                spinner.update(text=_build_msg(perf, disc, qsz, idx, markup=True))
            # Clear the last line when done
            Console.print(" " * Console.width, end="\r")
            Console.print(" ")
    else:
        # Non-TTY (e.g. systemd): emit a plain summary every log_interval secs
        next_log = time.monotonic()
        while print_status.is_set():
            perf, disc, qsz, idx = _snapshot()
            now = time.monotonic()
            if now >= next_log:
                # Print one clean line; journald/Cockpit will show one entry
                print(_build_msg(perf, disc, qsz, idx, markup=False), flush=True)
                next_log = now + log_interval