metadata-crawler 2510.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of metadata-crawler might be problematic.

Files changed (35)
  1. metadata_crawler/__init__.py +263 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +831 -0
  7. metadata_crawler/api/drs_config.toml +440 -0
  8. metadata_crawler/api/index.py +151 -0
  9. metadata_crawler/api/metadata_stores.py +755 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +140 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +547 -0
  22. metadata_crawler/data_collector.py +278 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +206 -0
  25. metadata_crawler/ingester/solr.py +282 -0
  26. metadata_crawler/logger.py +153 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +419 -0
  29. metadata_crawler/utils/__init__.py +482 -0
  30. metadata_crawler/utils/cftime_utils.py +207 -0
  31. metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
  32. metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
  33. metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
  34. metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
  35. metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/run.py
@@ -0,0 +1,419 @@
+ """Apply the metadata collector."""
+
+ import os
+ import time
+ from fnmatch import fnmatch
+ from pathlib import Path
+ from types import NoneType
+ from typing import Any, Collection, Dict, List, Optional, Sequence, Union, cast
+
+ import tomlkit
+ import yaml
+ from rich.prompt import Prompt
+
+ from .api.config import CrawlerSettings, DRSConfig, strip_protocol
+ from .api.metadata_stores import (
+     CatalogueBackendType,
+     CatalogueReader,
+     IndexName,
+ )
+ from .data_collector import DataCollector
+ from .logger import apply_verbosity, get_level_from_verbosity, logger
+ from .utils import (
+     Console,
+     EmptyCrawl,
+     IndexProgress,
+     MetadataCrawlerException,
+     find_closest,
+     load_plugins,
+     timedelta_to_str,
+ )
+
+ FilesArg = Optional[Union[str, Path, Sequence[Union[str, Path]]]]
+
+
+ def _norm_files(catalogue_files: FilesArg) -> List[str]:
+     if catalogue_files is None:
+         return [""]
+     return (
+         [str(catalogue_files)]
+         if isinstance(catalogue_files, (str, Path))
+         else [str(p) for p in catalogue_files]
+     )
+
+
+ def _match(match: str, items: Collection[str]) -> List[str]:
+     out: List[str] = []
+     for item in items:
+         if fnmatch(item, match):
+             out.append(item)
+
+     if not out:
+         msg = find_closest(f"No such dataset: {match}", match, items)
+         raise MetadataCrawlerException(msg) from None
+     return out
+
+
+ def _get_num_of_indexed_objects(
+     catalogue_files: FilesArg, storage_options: Optional[Dict[str, Any]] = None
+ ) -> int:
+     num_objects = 0
+     storage_options = storage_options or {}
+     for cat_file in _norm_files(catalogue_files):
+         try:
+             cat = CatalogueReader.load_catalogue(cat_file, **storage_options)
+             num_objects += cat.get("metadata", {}).get("indexed_objects", 0)
+         except (FileNotFoundError, IsADirectoryError, yaml.parser.ParserError):
+             pass
+     return num_objects
+
+
+ def _get_search(
+     config_file: Union[str, Path, Dict[str, Any], tomlkit.TOMLDocument],
+     search_dirs: Optional[List[str]] = None,
+     datasets: Optional[List[str]] = None,
+ ) -> list[CrawlerSettings]:
+     _search_items = []
+     search_dirs = search_dirs or []
+     datasets = datasets or []
+     config = DRSConfig.load(config_file).datasets
+     if not datasets and not search_dirs:
+         return [
+             CrawlerSettings(name=k, search_path=cfg.root_path)
+             for (k, cfg) in config.items()
+         ]
+     for item in datasets or []:
+         for ds in _match(item, config.keys()):
+             logger.debug("Adding dataset %s", ds)
+             _search_items.append(
+                 CrawlerSettings(name=ds, search_path=config[ds].root_path)
+             )
+     for num, _dir in enumerate(map(strip_protocol, search_dirs or [])):
+         for name, cfg in config.items():
+             if _dir.is_relative_to(strip_protocol(cfg.root_path)):
+                 logger.debug("Adding dataset %s", name)
+                 _search_items.append(
+                     CrawlerSettings(name=name, search_path=str(search_dirs[num]))
+                 )
+
+     return _search_items
+
+
+ async def async_call(
+     index_system: str,
+     method: str,
+     batch_size: int = 2500,
+     catalogue_files: Optional[Sequence[Union[Path, str]]] = None,
+     verbosity: int = 0,
+     log_suffix: Optional[str] = None,
+     num_objects: int = 0,
+     *args: Any,
+     **kwargs: Any,
+ ) -> None:
+     """Add or delete metadata from the index."""
+     env = cast(os._Environ[str], os.environ.copy())
+     old_level = apply_verbosity(verbosity, suffix=log_suffix)
+
+     try:
+         progress = IndexProgress(total=num_objects)
+         os.environ["MDC_LOG_INIT"] = "1"
+         os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
+         os.environ["MDC_LOG_SUFFIX"] = (
+             log_suffix or os.getenv("MDC_LOG_SUFFIX") or ""
+         )
+         backends = load_plugins("metadata_crawler.ingester")
+         try:
+             cls = backends[index_system]
+         except KeyError:
+             msg = find_closest(
+                 f"No such backend: {index_system}", index_system, backends.keys()
+             )
+             raise ValueError(msg) from None
+         flat_files = _norm_files(catalogue_files)
+         flat_files = flat_files or [""]
+         storage_options = kwargs.pop("storage_options", {})
+         progress.start()
+         for cf in flat_files:
+             async with cls(
+                 batch_size=batch_size,
+                 catalogue_file=cf or None,
+                 storage_options=storage_options,
+                 progress=progress,
+             ) as obj:
+                 func = getattr(obj, method)
+                 await func(**kwargs)
+
+     finally:
+         os.environ = env
+         progress.stop()
+         logger.set_level(old_level)
+
+
+ async def async_index(
+     index_system: str,
+     *catalogue_files: Union[Path, str, List[str], List[Path]],
+     batch_size: int = 2500,
+     verbosity: int = 0,
+     log_suffix: Optional[str] = None,
+     **kwargs: Any,
+ ) -> None:
+     """Index metadata in the indexing system.
+
+     Parameters
+     ^^^^^^^^^^
+
+     index_system:
+         The index server where the metadata is indexed.
+     catalogue_files:
+         Paths to the files where the metadata was stored.
+     batch_size:
+         If the index system supports batch sizes, the size of the batches.
+     verbosity:
+         Set the verbosity of the system.
+     log_suffix:
+         Add a suffix to the log file output.
+
+     Other Parameters
+     ^^^^^^^^^^^^^^^^
+
+     **kwargs:
+         Keyword arguments passed on to the indexing backend.
+
+
+     Example
+     ^^^^^^^
+
+     .. code-block:: python
+
+         await async_index(
+             "solr",
+             "/tmp/catalog.yaml",
+             server="localhost:8983",
+             batch_size=1000,
+         )
+     """
+     kwargs.setdefault("catalogue_files", catalogue_files)
+     await async_call(
+         index_system,
+         "index",
+         batch_size=batch_size,
+         verbosity=verbosity,
+         log_suffix=log_suffix,
+         num_objects=_get_num_of_indexed_objects(
+             kwargs["catalogue_files"],
+             storage_options=kwargs.get("storage_options"),
+         ),
+         **kwargs,
+     )
+
+
+ async def async_delete(
+     index_system: str,
+     batch_size: int = 2500,
+     verbosity: int = 0,
+     log_suffix: Optional[str] = None,
+     **kwargs: Any,
+ ) -> None:
+     """Delete metadata from the indexing system.
+
+     Parameters
+     ^^^^^^^^^^
+     index_system:
+         The index server where the metadata is indexed.
+     batch_size:
+         If the index system supports batch sizes, the size of the batches.
+     verbosity:
+         Set the verbosity of the system.
+     log_suffix:
+         Add a suffix to the log file output.
+
+     Other Parameters
+     ^^^^^^^^^^^^^^^^
+
+     **kwargs:
+         Keyword arguments used to delete data from the index.
+
+     Examples
+     ^^^^^^^^
+
+     .. code-block:: python
+
+         await async_delete(
+             "solr",
+             server="localhost:8983",
+             latest_version="latest",
+             facets=[("file", "*.nc"), ("project", "OBS")],
+         )
+     """
+     await async_call(
+         index_system,
+         "delete",
+         batch_size=batch_size,
+         verbosity=verbosity,
+         log_suffix=log_suffix,
+         **kwargs,
+     )
+
+
+ async def async_add(
+     store: Optional[
+         Union[str, Path, Dict[str, Any], tomlkit.TOMLDocument]
+     ] = None,
+     config_file: Optional[
+         Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
+     ] = None,
+     data_object: Optional[Union[str, List[str]]] = None,
+     data_set: Optional[Union[List[str], str]] = None,
+     data_store_prefix: str = "metadata",
+     batch_size: int = 25_000,
+     comp_level: int = 4,
+     storage_options: Optional[Dict[str, Any]] = None,
+     shadow: Optional[Union[str, List[str]]] = None,
+     catalogue_backend: CatalogueBackendType = "jsonlines",
+     latest_version: str = IndexName().latest,
+     all_versions: str = IndexName().all,
+     password: bool = False,
+     n_procs: Optional[int] = None,
+     verbosity: int = 0,
+     log_suffix: Optional[str] = None,
+     fail_under: int = -1,
+     **kwargs: Any,
+ ) -> None:
+     """Harvest metadata from storage systems and add it to an intake catalogue.
+
+     Parameters
+     ^^^^^^^^^^
+
+     store:
+         Path to the intake catalogue.
+     config_file:
+         Path to the drs-config file / loaded configuration.
+     data_object:
+         Objects (directories or catalogue files) that are processed.
+         Instead of defining datasets that are to be crawled, you can crawl
+         data based on their directories. The directories must be root dirs
+         given in the drs-config file. By default all root dirs are
+         crawled.
+     data_set:
+         Dataset(s) that should be crawled. The datasets need to be defined
+         in the drs-config file. By default all datasets are crawled.
+         Names can contain wildcards such as ``xces-*``.
+     data_store_prefix:
+         Absolute or relative path to the intake catalogue source.
+     batch_size:
+         Batch size that is used to collect the metadata. This can affect
+         performance.
+     comp_level:
+         Compression level used to write the metadata to csv.gz.
+     storage_options:
+         Set additional storage options for adding metadata to the metadata store.
+     shadow:
+         'Shadow' these storage options. This is useful to hide secrets in public
+         data catalogues.
+     catalogue_backend:
+         Intake catalogue backend.
+     latest_version:
+         Name of the core holding 'latest' metadata.
+     all_versions:
+         Name of the core holding 'all' metadata versions.
+     password:
+         Display a password prompt before beginning.
+     n_procs:
+         Set the number of parallel processes for collecting.
+     verbosity:
+         Set the verbosity of the system.
+     log_suffix:
+         Add a suffix to the log file output.
+     fail_under:
+         Fail if fewer than this many of the discovered files could be indexed.
+
+     Other Parameters
+     ^^^^^^^^^^^^^^^^
+
+     **kwargs:
+         Additional keyword arguments.
+
+
+     Examples
+     ^^^^^^^^
+
+     .. code-block:: python
+
+         await async_add(
+             store="my-data.yaml",
+             config_file="~/data/drs-config.toml",
+             data_set=["cmip6", "cordex"],
+         )
+
+     """
+     env = cast(os._Environ[str], os.environ.copy())
+     old_level = apply_verbosity(verbosity, suffix=log_suffix)
+     try:
+         os.environ["MDC_LOG_INIT"] = "1"
+         os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
+         os.environ["MDC_LOG_SUFFIX"] = (
+             log_suffix or os.getenv("MDC_LOG_SUFFIX") or ""
+         )
+         config_file = config_file or os.environ.get(
+             "EVALUATION_SYSTEM_CONFIG_DIR"
+         )
+         if not config_file:
+             raise MetadataCrawlerException(
+                 "You must give a config file/directory"
+             )
+         st = time.time()
+         passwd: Optional[str] = None
+         if password:  # pragma: no cover
+             passwd = Prompt.ask(
+                 "[b]Enter the password", password=True
+             )  # pragma: no cover
+
+         if passwd:
+             os.environ["DRS_STORAGE_PASSWD"] = passwd
+         data_object = (
+             data_object
+             if isinstance(data_object, (NoneType, list))
+             else [str(data_object)]
+         )
+         data_set = (
+             data_set
+             if isinstance(data_set, (NoneType, list))
+             else [str(data_set)]
+         )
+         async with DataCollector(
+             config_file,
+             store,
+             IndexName(latest=latest_version, all=all_versions),
+             *_get_search(config_file, data_object, data_set),
+             batch_size=batch_size,
+             comp_level=comp_level,
+             backend=catalogue_backend,
+             data_store_prefix=data_store_prefix,
+             n_procs=n_procs,
+             storage_options=storage_options or {},
+             shadow=shadow,
+             **kwargs,
+         ) as data_col:
+             await data_col.ingest_data()
+             num_files = data_col.ingested_objects
+             files_discovered = data_col.crawled_files
+             dt = timedelta_to_str(time.time() - st)
+             logger.info("Discovered: %s files", f"{files_discovered:10,.0f}")
+             logger.info("Ingested: %s files", f"{num_files:10,.0f}")
+             logger.info("Spent: %s", dt)
+             Console.print(" " * Console.width, end="\r")
+             Console.print(
+                 (
+                     f"[bold]Ingested [green]{num_files:10,.0f}[/green] "
+                     f"within [green]{dt}[/green][/bold]"
+                 )
+             )
+
+             if (
+                 files_discovered >= fail_under and num_files < fail_under
+             ) or files_discovered == 0:
+                 await data_col.ingest_queue.delete()
+                 raise EmptyCrawl("Could not fulfill discovery threshold!") from None
+     finally:
+         os.environ = env
+         logger.set_level(old_level)
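
Taken together, the docstring examples above describe a two-step workflow: harvest metadata into a catalogue with ``async_add``, then push that catalogue into an index with ``async_index``. Below is a minimal driver sketch of that workflow, assuming the coroutines are imported directly from ``metadata_crawler.run`` (the module shown in this hunk). The catalogue path, drs-config path, and Solr address are placeholders taken from the docstring examples, and feeding the catalogue written by ``async_add`` straight into ``async_index`` is an assumption, not documented behaviour.

    import asyncio

    from metadata_crawler.run import async_add, async_index


    async def main() -> None:
        # Harvest metadata for two datasets defined in the drs-config file
        # into an intake catalogue (paths are placeholders).
        await async_add(
            store="my-data.yaml",
            config_file="~/data/drs-config.toml",
            data_set=["cmip6", "cordex"],
        )
        # Push the harvested catalogue into a Solr index
        # (server address and catalogue path are placeholders).
        await async_index(
            "solr",
            "my-data.yaml",
            server="localhost:8983",
            batch_size=1000,
        )


    if __name__ == "__main__":
        asyncio.run(main())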