metadata-crawler 2509.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metadata-crawler might be problematic. Click here for more details.

Files changed (34) hide show
  1. metadata_crawler/__init__.py +248 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +801 -0
  7. metadata_crawler/api/drs_config.toml +439 -0
  8. metadata_crawler/api/index.py +132 -0
  9. metadata_crawler/api/metadata_stores.py +749 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +136 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +539 -0
  22. metadata_crawler/data_collector.py +258 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +193 -0
  25. metadata_crawler/ingester/solr.py +152 -0
  26. metadata_crawler/logger.py +142 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +373 -0
  29. metadata_crawler/utils.py +411 -0
  30. metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
  31. metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
  32. metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
  33. metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
  34. metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,539 @@
1
+ """Command line interface for the data crawler."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import asyncio
7
+ import inspect
8
+ import os
9
+ import sys
10
+ from functools import partial
11
+ from json import dumps
12
+ from pathlib import Path
13
+ from typing import (
14
+ Annotated,
15
+ Any,
16
+ Callable,
17
+ Dict,
18
+ List,
19
+ Optional,
20
+ Tuple,
21
+ Union,
22
+ cast,
23
+ get_args,
24
+ get_origin,
25
+ get_type_hints,
26
+ )
27
+
28
+ from rich_argparse import ArgumentDefaultsRichHelpFormatter
29
+
30
+ from metadata_crawler import add, delete, get_config, index
31
+
32
+ from ._version import __version__
33
+ from .api.metadata_stores import CatalogueBackends, IndexName
34
+ from .backends.intake import IntakePath
35
+ from .logger import (
36
+ THIS_NAME,
37
+ add_file_handle,
38
+ apply_verbosity,
39
+ logger,
40
+ )
41
+ from .utils import exception_handler, load_plugins
42
+
43
# Scalar value types a single storage option may hold.
StorageScalar = Union[str, int, float, bool]

# Storage options: option name mapped to its scalar value.
StorageOptions = Dict[str, StorageScalar]

# Union of the value types used to annotate the keyword arguments that are
# forwarded to the selected sub-command function (see ``_run``).
KwargValue = Optional[
    Union[str, int, float, Path, StorageOptions, List[str], List[int]]
]
48
+
49
+
50
def walk_catalogue(
    path: str, storage_options: Optional[Dict[str, Any]] = None, **kwargs: Any
) -> int:
    """Print every entry found while walking an intake catalogue.

    Parameters
    ^^^^^^^^^^

    path:
        The path to the intake catalogue
    storage_options:
        Optional configuration passed to open catalogues residing on non posix
        storage backends, such as S3/MinIO
    **kwargs:
        Accepted so the function can be called with the full set of parsed
        command line arguments; the values are not used.

    Returns
    ^^^^^^^

    int:
        The number of entries that were yielded (and printed).
    """
    # ``None`` (or an empty mapping) means "no extra options".
    options: Dict[str, Any] = storage_options if storage_options else {}

    async def _count_entries() -> int:
        # The walker is created inside the coroutine, i.e. while the event
        # loop started by ``asyncio.run`` is already running.
        intake_path = IntakePath(**options)
        counter = 0
        async for metadata in intake_path.walk(path):
            print(metadata)
            counter += 1
        return counter

    return asyncio.run(_count_entries())
75
+
76
+
77
+ def _flatten(inp: Union[List[str], List[List[str]]]) -> List[str]:
78
+
79
+ out = []
80
+ for item in inp:
81
+ out += item if isinstance(item, list) else [item]
82
+ return out
83
+
84
+
85
+ def _process_storage_option(option: str) -> Union[str, bool, int, float]:
86
+
87
+ if option.lower() in ("false", "true"):
88
+ return option.lower() == "true"
89
+ try:
90
+ return int(option)
91
+ except ValueError:
92
+ pass
93
+ try:
94
+ return float(option)
95
+ except ValueError:
96
+ pass
97
+ return option
98
+
99
+
100
def display_config(
    config: Optional[Union[Path, str]], json: bool = False, **kwargs: Any
) -> None:
    """Print the configuration to stdout.

    Parameters
    ----------
    config:
        Location of the config file, handed over to ``get_config``.
    json:
        Print the merged config document as JSON instead of the output of
        the config object's ``dumps`` method.
    **kwargs:
        Additional parsed command line arguments; they are not used.
    """
    loaded = get_config(config)
    if json is False:
        rendered = loaded.dumps()
    else:
        rendered = dumps(loaded.merged_doc, indent=3)
    print(rendered)
109
+
110
+
111
class ArgParse:
    """Command line interface definition.

    On instantiation the top level parser and all sub-commands (``config``,
    ``walk-intake``, ``add`` and one sub-command per ingester plugin) are
    created. Calling :py:meth:`parse_args` parses the command line and
    collects the keyword arguments for the selected sub-command function.

    Properties
    ----------
    kwargs: dict[str, Union[str, float, Path]]
        property holding all parsed keyword arguments.
    verbose: int
        the verbosity level (number of ``-v`` flags) of the last parse.
    """

    kwargs: Optional[Dict[str, KwargValue]] = None
    verbose: int = 0
    epilog: str = (
        "See also "
        "https://metadata-crawler.readthedocs.io"
        " for a detailed documentation."
    )

    def __init__(self) -> None:
        """Instantiate the CLI class."""
        self.verbose: int = 0
        self.parser = argparse.ArgumentParser(
            prog=THIS_NAME,
            description="Add/Remove metadata to/from a metadata index.",
            formatter_class=ArgumentDefaultsRichHelpFormatter,
            epilog=self.epilog,
        )
        self.parser.add_argument(
            "-V",
            "--version",
            action="version",
            version=f"%(prog)s {__version__}",
            help="Print the version and exit",
        )
        self._add_general_config_to_parser(self.parser)
        self.subparsers = self.parser.add_subparsers(
            description="Collect or ingest metadata",
            required=True,
        )
        self._add_config_parser()
        self._add_walk_catalogue()
        self._add_crawler_subcommand()
        self._index_submcommands()

    @staticmethod
    def _add_storage_option_to_parser(parser: argparse.ArgumentParser) -> None:
        """Add the repeatable ``--storage_option KEY VALUE`` argument."""
        parser.add_argument(
            "--storage_option",
            "-s",
            help=(
                "Set additional storage options for adding metadata to the "
                "metadata store"
            ),
            action="append",
            nargs=2,
        )

    def _add_config_parser(self) -> None:
        """Add the sub command that displays the configuration."""
        parser = self.subparsers.add_parser(
            "config",
            description="Display config",
            help="Display config",
            formatter_class=ArgumentDefaultsRichHelpFormatter,
            epilog=self.epilog,
        )
        parser.add_argument(
            "-c",
            "--config",
            help="Path to the config_file",
            type=Path,
        )
        parser.add_argument(
            "--json", help="Print in json format.", action="store_true"
        )
        parser.set_defaults(apply_func=display_config)
        # Only ``--verbose`` is added here, ``--log-suffix`` is solely
        # available via the top level parser for this sub command.
        parser.add_argument(
            "-v",
            "--verbose",
            action="count",
            default=self.verbose,
            help="Increase the verbosity level.",
        )

    def _add_crawler_subcommand(self) -> None:
        """Add sub command for crawling metadata."""
        parser = self.subparsers.add_parser(
            "add",
            description="Harvest (add) metadata",
            help="Harvest (crawl) metadata",
            formatter_class=ArgumentDefaultsRichHelpFormatter,
            epilog=self.epilog,
        )
        parser.add_argument(
            "store",
            type=str,
            help="Path to the intake catalogue",
        )
        parser.add_argument(
            "--catalogue-backend",
            "-cb",
            type=str,
            help="Source type of the catalogue backend.",
            choices=CatalogueBackends.__members__.keys(),
            # The first declared backend acts as the default.
            default=list(CatalogueBackends.__members__.keys())[0],
        )
        parser.add_argument(
            "--data-store-prefix",
            "--prefix",
            type=str,
            help=(
                "Set the path prefix for the metadata store, this can either "
                "be an absolute path or, if no absolute path is given, a "
                "path prefix relative to the yaml catalogue file."
            ),
            default="metadata",
        )
        parser.add_argument(
            "-c",
            "--config-file",
            "--config-dir",
            type=Path,
            help="Directory holding the metadata and server settings.",
            default=os.environ.get("EVALUATION_SYSTEM_CONFIG_DIR"),
        )
        parser.add_argument(
            "-b",
            "--batch-size",
            type=int,
            default=25_000,
            help="Set the batch size for ingestion.",
        )
        parser.add_argument(
            "--scan-concurrency",
            "--concurrency",
            type=int,
            default=1024,
            help="Level of async concurrency for data discovery.",
        )
        parser.add_argument(
            "-d",
            "--data-object",
            "--data-obj",
            type=str,
            help="Objects (directories or catalogue files) that are processed.",
            default=None,
            action="append",
        )
        parser.add_argument(
            "-ds",
            "--data-set",
            type=str,
            help=(
                "The name of the dataset(s) that are processed. "
                "names can contain wildcards such as ``xces-*``."
            ),
            default=None,
            action="append",
        )
        parser.add_argument(
            "-p",
            "--password",
            help=(
                "Ask for a password and set it to the DRS_STORAGE_PASSWD "
                "env variable."
            ),
            action="store_true",
        )
        parser.add_argument(
            "--n-procs",
            "--procs",
            help="Set the number of parallel processes for collecting.",
            type=int,
            default=None,
        )
        parser.add_argument(
            "--latest-version",
            type=str,
            default=IndexName().latest,
            help="Name of the core holding 'latest' metadata.",
        )
        parser.add_argument(
            "--all-versions",
            type=str,
            default=IndexName().all,
            help="Name of the core holding 'all' metadata versions.",
        )
        parser.add_argument(
            "--comp-level",
            "-z",
            help="Set the compression level for compressing files.",
            default=4,
            type=int,
        )
        self._add_storage_option_to_parser(parser)
        parser.add_argument(
            "--fail-under",
            help=(
                "Fail if less than X of the discovered files could be "
                "indexed."
            ),
            type=int,
            default=-1,
        )
        parser.add_argument(
            "--shadow",
            help=(
                "'Shadow' these storage options. This is useful to hide secrets "
                "in public data catalogues."
            ),
            # ``append`` + ``nargs='+'`` yields a list of lists, which is
            # flattened in ``parse_args``.
            action="append",
            default=None,
            nargs="+",
        )
        self._add_general_config_to_parser(parser)
        parser.set_defaults(apply_func=add)

    def _add_general_config_to_parser(
        self, parser: argparse.ArgumentParser
    ) -> None:
        """Add the most common arguments to a given parser."""
        parser.add_argument(
            "-v",
            "--verbose",
            action="count",
            default=self.verbose,
            help="Increase the verbosity level.",
        )
        parser.add_argument(
            "--log-suffix",
            type=str,
            help="Add a suffix to the log file output.",
            default=None,
        )

    def _add_walk_catalogue(self) -> None:
        """Add a subcommand for walking an intake catalogue."""
        parser = self.subparsers.add_parser(
            "walk-intake",
            description="Walk an intake catalogue",
            help="Walk an intake catalogue",
            formatter_class=ArgumentDefaultsRichHelpFormatter,
            epilog=self.epilog,
        )

        parser.add_argument(
            "path",
            type=str,
            help="Path/Url to the intake catalogue",
        )
        self._add_storage_option_to_parser(parser)
        parser.set_defaults(apply_func=walk_catalogue)

    @staticmethod
    def _add_cli_parameters(
        parser: argparse.ArgumentParser, method: Callable[..., Any]
    ) -> None:
        """Turn the annotated parameters of ``method`` into parser arguments.

        A parameter becomes a command line argument if a dict holding an
        ``args`` key is found either among the extras of its
        ``Annotated[...]`` type hint or as its default value. All other
        parameters are skipped.
        """
        params = inspect.signature(method).parameters
        annotations = get_type_hints(method, include_extras=True)
        for param_name, param in params.items():
            if param_name == "self":
                continue
            cli_meta = None
            base_type = None
            ann = annotations.get(param_name, None)

            # 1) Annotated[...] style
            if get_origin(ann) is Annotated:
                base_type, *extras = get_args(ann)
                # find the dict emitted by cli_parameter()
                cli_meta = next(
                    (
                        e
                        for e in extras
                        if isinstance(e, dict) and "args" in e
                    ),
                    None,
                )

            # 2) default-as-parameter style
            if (
                cli_meta is None
                and isinstance(param.default, dict)
                and "args" in param.default
            ):
                cli_meta = param.default
                # annotation is the base type
                base_type = ann if ann is not inspect.Parameter.empty else None

            if not cli_meta:
                continue

            # everything but ``args`` is passed on to ``add_argument``
            arg_names = cli_meta["args"]
            add_kwargs = {k: v for k, v in cli_meta.items() if k != "args"}

            # preserve any explicit default
            if (
                param.default is not inspect.Parameter.empty
                and cli_meta is not param.default
            ):
                add_kwargs["default"] = param.default

            # enforce the base type if supplied
            if base_type and "type" not in add_kwargs:
                add_kwargs["type"] = base_type
            parser.add_argument(*arg_names, dest=param_name, **add_kwargs)

    def _index_submcommands(self) -> None:
        """Add sub commands for the ingester plugins.

        Every plugin registered under the ``metadata_crawler.ingester`` entry
        point gets its own sub command, provided that at least one of its
        ``index``/``delete`` methods carries a ``_cli_help`` attribute. Each
        of those methods becomes a nested sub command of the plugin command.
        """
        entry_point = "metadata_crawler.ingester"
        for plugin, cls in load_plugins(entry_point).items():
            cli_methods: Dict[str, Callable[..., Any]] = {}
            for name in ("index", "delete"):
                method = getattr(cls, name, None)
                if hasattr(method, "_cli_help"):
                    cli_methods[name] = cast(Callable[..., Any], method)
            if not cli_methods:
                continue
            subparser = self.subparsers.add_parser(
                plugin,
                help=cls.__doc__,
                description=cls.__doc__,
                formatter_class=ArgumentDefaultsRichHelpFormatter,
                epilog=self.epilog,
            )
            cmd_parser = subparser.add_subparsers(required=True)
            for name, method in cli_methods.items():
                parser = cmd_parser.add_parser(
                    name,
                    help=getattr(method, "_cli_help", ""),
                    description=getattr(method, "_cli_help", ""),
                    formatter_class=ArgumentDefaultsRichHelpFormatter,
                    epilog=self.epilog,
                )
                parser.add_argument(
                    "-b",
                    "--batch-size",
                    type=int,
                    default=25_000,
                    help="Set the batch size for ingestion.",
                )
                self._add_storage_option_to_parser(parser)
                self._add_cli_parameters(parser, method)
                if name == "index":
                    parser.add_argument(
                        "catalogue_files",
                        help="File path to the metadata store.",
                        type=str,
                        nargs="*",
                    )
                    parser.set_defaults(
                        apply_func=partial(index, index_system=plugin)
                    )
                else:
                    parser.set_defaults(
                        apply_func=partial(delete, index_system=plugin)
                    )
                self._add_general_config_to_parser(parser)

    def parse_args(self, argv: list[str]) -> argparse.Namespace:
        """Parse the arguments for the command line interface.

        Besides parsing, the keyword arguments for the selected sub command
        are stored in ``self.kwargs``, the verbosity is stored in
        ``self.verbose`` and ``add_file_handle`` is called with the given
        log suffix.

        Parameters
        ----------
        argv: list[str]
            List of command line arguments that is parsed.

        Returns
        -------
        argparse.Namespace
        """
        args = self.parser.parse_args(argv)
        # These entries are either consumed here or need post processing
        # before they can be handed over to the sub command function.
        skipped = (
            "apply_func",
            "verbose",
            "version",
            "log_suffix",
            "storage_option",
            "shadow",
        )
        self.kwargs = {
            k: v for (k, v) in args._get_kwargs() if k not in skipped
        }
        storage_option_pairs: List[Tuple[str, str]] = (
            getattr(args, "storage_option", None) or []
        )
        so: StorageOptions = {}
        for option, value in storage_option_pairs:
            so[option] = _process_storage_option(value)
        # ``shadow`` is only passed on if it was given on the command line.
        shadow = getattr(args, "shadow", None)
        if shadow:
            self.kwargs["shadow"] = _flatten(shadow)
        self.kwargs["storage_options"] = so
        self.verbose = args.verbose
        add_file_handle(args.log_suffix)
        self.kwargs["verbosity"] = self.verbose
        return args
515
+
516
+
517
def _run(
    parser: argparse.Namespace,
    **kwargs: KwargValue,
) -> None:
    """Call the function attached to the parsed command line arguments.

    Parameters
    ----------
    parser:
        The parsed arguments; ``apply_func`` is the function that gets
        called and ``verbose`` (0 if missing) sets the verbosity.
    **kwargs:
        Keyword arguments passed on to ``apply_func``.
    """
    requested = getattr(parser, "verbose", 0)
    # Whatever ``apply_verbosity`` returns is handed back to the logger once
    # the command is done, no matter whether it succeeded.
    previous_level = apply_verbosity(requested)
    try:
        parser.apply_func(**kwargs)
    except Exception as error:
        # Errors are delegated to the central handler, not raised from here.
        exception_handler(error)
    finally:
        logger.set_level(previous_level)
529
+
530
+
531
def cli(sys_args: list[str] | None = None) -> None:
    """Create the command line argument parser and run the chosen command.

    Parameters
    ----------
    sys_args: list[str] | None
        The command line arguments to parse. ``sys.argv[1:]`` is used only
        if ``None`` is given; an explicitly passed empty list is parsed as
        is instead of silently falling back to the process arguments.

    Raises
    ------
    SystemExit
        If the program gets interrupted by the user (``KeyboardInterrupt``).
    """
    argv = sys.argv[1:] if sys_args is None else sys_args
    try:
        parser = ArgParse()
        args = parser.parse_args(argv)
        kwargs = parser.kwargs or {}
        _run(args, **kwargs)
    except KeyboardInterrupt:
        # The interrupt itself carries no information worth chaining.
        raise SystemExit("Exiting program") from None