metadata-crawler 2509.0.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of metadata-crawler might be problematic.
Files changed (34)
  1. metadata_crawler/__init__.py +248 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +801 -0
  7. metadata_crawler/api/drs_config.toml +439 -0
  8. metadata_crawler/api/index.py +132 -0
  9. metadata_crawler/api/metadata_stores.py +749 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +136 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +539 -0
  22. metadata_crawler/data_collector.py +258 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +193 -0
  25. metadata_crawler/ingester/solr.py +152 -0
  26. metadata_crawler/logger.py +142 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +373 -0
  29. metadata_crawler/utils.py +411 -0
  30. metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
  31. metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
  32. metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
  33. metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
  34. metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/logger.py
@@ -0,0 +1,142 @@
+ """Logging utilities."""
+
+ import logging
+ import logging.config
+ import os
+ from logging.handlers import RotatingFileHandler
+ from pathlib import Path
+ from typing import Any, Optional, cast
+
+ import appdirs
+ from rich.console import Console
+ from rich.logging import RichHandler
+
+ THIS_NAME = "data-crawler"
+
+ logging.basicConfig(
+     level=logging.WARNING,
+     format="%(asctime)s %(levelname)s: %(name)s - %(message)s",
+ )
+
+ logging.config.dictConfig(
+     {
+         "version": 1,
+         # keep existing handlers
+         "disable_existing_loggers": False,
+         "root": {
+             "level": "WARNING",
+             "handlers": ["default"],
+         },
+         "formatters": {
+             "standard": {
+                 "format": "%(asctime)s %(levelname)s: %(name)s - %(message)s",
+             },
+         },
+         "handlers": {
+             "default": {
+                 "class": "logging.StreamHandler",
+                 "formatter": "standard",
+                 "level": "WARNING",
+             },
+         },
+     }
+ )
+
+ logging.getLogger("sqlalchemy").setLevel(logging.WARNING)
+ logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
+ logging.getLogger("sqlalchemy.pool").setLevel(logging.WARNING)
+
+
+ class Logger(logging.Logger):
+     """Custom Logger defining the logging behaviour."""
+
+     logfmt: str = "%(name)s: %(message)s"
+     filelogfmt: str = "%(asctime)s %(levelname)s: %(name)s - %(message)s"
+     datefmt: str = "%Y-%m-%dT%H:%M:%S"
+     no_debug: list[str] = ["watchfiles", "httpcore", "pymongo", "pika"]
+
+     def __init__(
+         self, name: Optional[str] = None, level: Optional[int] = None
+     ) -> None:
+         """Instantiate this logger only once and for all."""
+         level = level or int(
+             cast(str, os.getenv("MDC_LOG_LEVEL", str(logging.WARNING)))
+         )
+         name = name or THIS_NAME
+         logger_format = logging.Formatter(self.logfmt, self.datefmt)
+         self.file_format = logging.Formatter(self.filelogfmt, self.datefmt)
+         self._logger_file_handle: Optional[RotatingFileHandler] = None
+         self._logger_stream_handle = RichHandler(
+             rich_tracebacks=True,
+             tracebacks_max_frames=10,
+             tracebacks_extra_lines=5,
+             show_path=True,
+             console=Console(
+                 soft_wrap=False,
+                 force_jupyter=False,
+                 stderr=True,
+             ),
+         )
+         self._logger_stream_handle.setFormatter(logger_format)
+         self._logger_stream_handle.setLevel(level)
+         super().__init__(name, level)
+
+         self.propagate = False
+         self.handlers = [self._logger_stream_handle]
+
+     def set_level(self, level: int) -> None:
+         """Set the logger level to level."""
+         for handler in self.handlers:
+             log_level = level
+             if isinstance(handler, RotatingFileHandler):
+                 log_level = min(level, logging.CRITICAL)
+             handler.setLevel(log_level)
+         self.setLevel(level)
+         logger.level = level
+
+     def error(
+         self,
+         msg: object,
+         *args: Any,
+         **kwargs: Any,
+     ) -> None:
+         """Log an error. When log level is smaller than INFO, log exceptions."""
+         if self.level < logging.INFO:
+             kwargs.setdefault("exc_info", True)
+         self._log(logging.ERROR, msg, args, **kwargs)
+
+
+ logger = Logger()
+
+
+ def add_file_handle(
+     suffix: Optional[str], log_level: int = logging.CRITICAL
+ ) -> None:
+     """Add a file log handle to the logger."""
+     base_name = f"{THIS_NAME}-{suffix}" if suffix else THIS_NAME
+     log_dir = Path(appdirs.user_log_dir(THIS_NAME))
+     log_dir.mkdir(exist_ok=True, parents=True)
+     logger_file_handle = RotatingFileHandler(
+         log_dir / f"{base_name}.log",
+         mode="a",
+         maxBytes=5 * 1024**2,
+         backupCount=5,
+         encoding="utf-8",
+         delay=False,
+     )
+     logger_file_handle.setFormatter(logger.file_format)
+     logger_file_handle.setLevel(min(log_level, logging.CRITICAL))
+     logger.addHandler(logger_file_handle)
+
+
+ def get_level_from_verbosity(verbosity: int) -> int:
+     """Calculate the log level from a verbosity."""
+     return max(logging.CRITICAL - 10 * verbosity, -1)
+
+
+ def apply_verbosity(level: int) -> int:
+     """Set the logging level of the handlers to a certain level."""
+     old_level = logger.level
+     level = get_level_from_verbosity(level)
+     logger.set_level(level)
+     return old_level
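
For orientation, here is a minimal usage sketch of the helpers this module defines (logger, add_file_handle, apply_verbosity, get_level_from_verbosity). It is illustrative only, not part of the wheel contents; the behaviour noted in the comments is read off the source above, and the "demo" suffix and chosen verbosity are arbitrary.

# Minimal usage sketch (not part of the package) based on the module above.
import logging

from metadata_crawler.logger import (
    add_file_handle,
    apply_verbosity,
    get_level_from_verbosity,
    logger,
)

# Verbosity maps to a level via CRITICAL - 10 * verbosity (floored at -1),
# so a verbosity of 3 corresponds to INFO.
assert get_level_from_verbosity(3) == logging.INFO

# apply_verbosity() switches the handlers to the new level and returns the
# previous level so it can be restored later.
old_level = apply_verbosity(3)
logger.info("Now visible on the Rich stderr handler")

# error() attaches exception info automatically once the level drops below INFO.
logger.error("Something went wrong")

# Optionally mirror messages into a rotating log file in the platform-specific
# appdirs user log directory; "demo" is an arbitrary file-name suffix.
add_file_handle("demo", log_level=logging.INFO)

logger.set_level(old_level)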
metadata_crawler/py.typed — file without changes (empty marker file)
metadata_crawler/run.py
@@ -0,0 +1,373 @@
+ """Apply the metadata collector."""
+
+ import asyncio
+ import os
+ import time
+ from fnmatch import fnmatch
+ from pathlib import Path
+ from types import NoneType
+ from typing import Any, Collection, Dict, List, Optional, Sequence, Union, cast
+
+ import tomlkit
+ from rich.prompt import Prompt
+
+ from .api.config import CrawlerSettings, DRSConfig, strip_protocol
+ from .api.metadata_stores import CatalogueBackendType, IndexName
+ from .data_collector import DataCollector
+ from .logger import apply_verbosity, get_level_from_verbosity, logger
+ from .utils import (
+     Console,
+     EmptyCrawl,
+     MetadataCrawlerException,
+     find_closest,
+     load_plugins,
+     timedelta_to_str,
+ )
+
+ FilesArg = Optional[Union[str, Path, Sequence[Union[str, Path]]]]
+
+
+ def _norm_files(catalogue_files: FilesArg) -> List[str]:
+     if catalogue_files is None:
+         return [""]
+     return (
+         [str(catalogue_files)]
+         if isinstance(catalogue_files, (str, Path))
+         else [str(p) for p in catalogue_files]
+     )
+
+
+ def _match(match: str, items: Collection[str]) -> List[str]:
+     out: List[str] = []
+     for item in items:
+         if fnmatch(item, match):
+             out.append(item)
+
+     if not out:
+         msg = find_closest(f"No such dataset: {match}", match, items)
+         raise MetadataCrawlerException(msg) from None
+     return out
+
+
+ def _get_search(
+     config_file: Union[str, Path, Dict[str, Any], tomlkit.TOMLDocument],
+     search_dirs: Optional[List[str]] = None,
+     datasets: Optional[List[str]] = None,
+ ) -> list[CrawlerSettings]:
+     _search_items = []
+     search_dirs = search_dirs or []
+     datasets = datasets or []
+     config = DRSConfig.load(config_file).datasets
+     if not datasets and not search_dirs:
+         return [
+             CrawlerSettings(name=k, search_path=cfg.root_path)
+             for (k, cfg) in config.items()
+         ]
+     for item in datasets or []:
+         for ds in _match(item, config.keys()):
+             logger.debug("Adding dataset %s", ds)
+             _search_items.append(
+                 CrawlerSettings(name=ds, search_path=config[ds].root_path)
+             )
+     for num, _dir in enumerate(map(strip_protocol, search_dirs or [])):
+         for name, cfg in config.items():
+             if _dir.is_relative_to(strip_protocol(cfg.root_path)):
+                 logger.debug("Adding dataset %s", name)
+                 _search_items.append(
+                     CrawlerSettings(name=name, search_path=str(search_dirs[num]))
+                 )
+
+     return _search_items
+
+
+ async def async_call(
+     index_system: str,
+     method: str,
+     batch_size: int = 2500,
+     catalogue_files: Optional[Sequence[Union[Path, str]]] = None,
+     verbosity: int = 0,
+     *args: Any,
+     **kwargs: Any,
+ ) -> None:
+     """Index metadata."""
+     env = cast(os._Environ[str], os.environ.copy())
+     old_level = apply_verbosity(verbosity)
+     try:
+         os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
+         backends = load_plugins("metadata_crawler.ingester")
+         try:
+             cls = backends[index_system]
+         except KeyError:
+             msg = find_closest(
+                 f"No such backend: {index_system}", index_system, backends.keys()
+             )
+             raise ValueError(msg) from None
+         flat_files = _norm_files(catalogue_files)
+         _event_loop = asyncio.get_event_loop()
+         flat_files = flat_files or [""]
+         futures = []
+         storage_options = kwargs.pop("storage_options", {})
+         for cf in flat_files:
+             obj = cls(
+                 batch_size=batch_size,
+                 catalogue_file=cf or None,
+                 storage_options=storage_options,
+             )
+             func = getattr(obj, method)
+             future = _event_loop.create_task(func(**kwargs))
+             futures.append(future)
+         await asyncio.gather(*futures)
+     finally:
+         os.environ = env
+         logger.set_level(old_level)
+
+
+ async def async_index(
+     index_system: str,
+     *catalogue_files: Union[Path, str, List[str], List[Path]],
+     batch_size: int = 2500,
+     verbosity: int = 0,
+     **kwargs: Any,
+ ) -> None:
+     """Index metadata in the indexing system.
+
+     Parameters
+     ^^^^^^^^^^
+
+     index_system:
+         The index server where the metadata is indexed.
+     catalogue_files:
+         Path(s) to the file(s) where the metadata was stored.
+     batch_size:
+         If the index system supports batch-sizes, the size of the batches.
+     verbosity:
+         Set the verbosity of the system.
+
+     Other Parameters
+     ^^^^^^^^^^^^^^^^
+
+     **kwargs:
+         Additional keyword arguments passed on to the indexing backend.
+
+
+     Example
+     ^^^^^^^
+
+     .. code-block:: python
+
+         await async_index(
+             "solr",
+             "/tmp/catalog.yaml",
+             server="localhost:8983",
+             batch_size=1000,
+         )
+     """
+     kwargs.setdefault("catalogue_files", catalogue_files)
+     await async_call(
+         index_system,
+         "index",
+         batch_size=batch_size,
+         verbosity=verbosity,
+         **kwargs,
+     )
+
+
+ async def async_delete(
+     index_system: str,
+     batch_size: int = 2500,
+     verbosity: int = 0,
+     **kwargs: Any,
+ ) -> None:
+     """Delete metadata from the indexing system.
+
+     Parameters
+     ^^^^^^^^^^^
+     index_system:
+         The index server where the metadata is indexed.
+     batch_size:
+         If the index system supports batch-sizes, the size of the batches.
+     verbosity:
+         Set the verbosity of the system.
+
+     Other Parameters
+     ^^^^^^^^^^^^^^^^^
+
+     **kwargs:
+         Keyword arguments used to delete data from the index.
+
+     Examples
+     ^^^^^^^^
+
+     .. code-block:: python
+
+         await async_delete(
+             "solr",
+             server="localhost:8983",
+             latest_version="latest",
+             facets=[("file", "*.nc"), ("project", "OBS")],
+         )
+     """
+     await async_call(
+         index_system,
+         "delete",
+         batch_size=batch_size,
+         verbosity=verbosity,
+         **kwargs,
+     )
+
+
+ async def async_add(
+     store: Optional[
+         Union[str, Path, Dict[str, Any], tomlkit.TOMLDocument]
+     ] = None,
+     config_file: Optional[
+         Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
+     ] = None,
+     data_object: Optional[Union[str, List[str]]] = None,
+     data_set: Optional[Union[List[str], str]] = None,
+     data_store_prefix: str = "metadata",
+     batch_size: int = 25_000,
+     comp_level: int = 4,
+     storage_options: Optional[Dict[str, Any]] = None,
+     shadow: Optional[Union[str, List[str]]] = None,
+     catalogue_backend: CatalogueBackendType = "jsonlines",
+     latest_version: str = IndexName().latest,
+     all_versions: str = IndexName().all,
+     password: bool = False,
+     n_procs: Optional[int] = None,
+     verbosity: int = 0,
+     fail_under: int = -1,
+     **kwargs: Any,
+ ) -> None:
+     """Harvest metadata from storage systems and add them to an intake catalogue.
+
+     Parameters
+     ^^^^^^^^^^
+
+     store:
+         Path to the intake catalogue.
+     config_file:
+         Path to the drs-config file / loaded configuration.
+     data_object:
+         Objects (directories or catalogue files) that are processed. Instead
+         of defining datasets that are to be crawled, you can crawl data based
+         on their directories. The directories must be root dirs given in the
+         drs-config file. By default all root dirs are crawled.
+     data_set:
+         Dataset(s) that should be crawled. The datasets need to be defined
+         in the drs-config file. By default all datasets are crawled.
+         Names can contain wildcards such as ``xces-*``.
+     data_store_prefix:
+         Absolute or relative path to the intake catalogue source.
+     batch_size:
+         Batch size that is used to collect the metadata. This can affect
+         performance.
+     comp_level:
+         Compression level used to write the metadata to csv.gz.
+     storage_options:
+         Set additional storage options for adding metadata to the metadata store.
+     shadow:
+         'Shadow' these storage options. This is useful to hide secrets in public
+         data catalogues.
+     catalogue_backend:
+         Intake catalogue backend.
+     latest_version:
+         Name of the core holding 'latest' metadata.
+     all_versions:
+         Name of the core holding 'all' metadata versions.
+     password:
+         Display a password prompt before beginning.
+     n_procs:
+         Set the number of parallel processes for collecting.
+     verbosity:
+         Set the verbosity of the system.
+     fail_under:
+         Fail if fewer than this many of the discovered files could be indexed.
+
+     Other Parameters
+     ^^^^^^^^^^^^^^^^
+
+     **kwargs:
+         Additional keyword arguments.
+
+
+     Examples
+     ^^^^^^^^
+
+     .. code-block:: python
+
+         await async_add(
+             store="my-data.yaml",
+             config_file="~/data/drs-config.toml",
+             data_set=["cmip6", "cordex"],
+         )
+
+     """
+     env = cast(os._Environ[str], os.environ.copy())
+     old_level = apply_verbosity(verbosity)
+     try:
+         os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
+         config_file = config_file or os.environ.get(
+             "EVALUATION_SYSTEM_CONFIG_DIR"
+         )
+         if not config_file:
+             raise MetadataCrawlerException(
+                 "You must give a config file/directory"
+             )
+         st = time.time()
+         passwd = ""
+         if password: # pragma: no cover
+             passwd = Prompt.ask(
+                 "[b]Enter the password", password=True
+             ) # pragma: no cover
+
+         if passwd:
+             os.environ["DRS_STORAGE_PASSWD"] = passwd
+         data_object = (
+             data_object
+             if isinstance(data_object, (NoneType, list))
+             else [str(data_object)]
+         )
+         data_set = (
+             data_set
+             if isinstance(data_set, (NoneType, list))
+             else [str(data_set)]
+         )
+         async with DataCollector(
+             config_file,
+             store,
+             IndexName(latest=latest_version, all=all_versions),
+             *_get_search(config_file, data_object, data_set),
+             batch_size=batch_size,
+             comp_level=comp_level,
+             backend=catalogue_backend,
+             data_store_prefix=data_store_prefix,
+             n_procs=n_procs,
+             storage_options=storage_options or {},
+             shadow=shadow,
+             **kwargs,
+         ) as data_col:
+             await data_col.ingest_data()
+             num_files = data_col.ingested_objects
+             files_discovered = data_col.crawled_files
+             dt = timedelta_to_str(time.time() - st)
+             logger.info("Discovered: %s files", f"{files_discovered:10,.0f}")
+             logger.info("Ingested: %s files", f"{num_files:10,.0f}")
+             logger.info("Spent: %s", dt)
+             Console.print(" " * Console.width, end="\r")
+             Console.print(
+                 (
+                     f"[bold]Ingested [green]{num_files:10,.0f}[/green] "
+                     f"within [green]{dt}[/green][/bold]"
+                 )
+             )
+
+             if (
+                 files_discovered >= fail_under and num_files < fail_under
+             ) or files_discovered == 0:
+                 await data_col.ingest_queue.delete()
+                 raise EmptyCrawl("Could not fulfill discovery threshold!") from None
+     finally:
+         os.environ = env
+         logger.set_level(old_level)
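
To round off the picture, here is a hedged sketch of how async_add and async_index from this module might be combined, based on the signatures and docstring examples above. The catalogue path, drs-config path and Solr address are placeholders, and the assumption that the store written by async_add is the catalogue file handed to async_index is mine, not stated in the source.

# Illustrative sketch only; paths and the Solr address are placeholders.
import asyncio

from metadata_crawler.run import async_add, async_index


async def main() -> None:
    # Harvest metadata for selected datasets into an intake catalogue
    # (dataset names may contain wildcards, e.g. "xces-*").
    await async_add(
        store="my-data.yaml",
        config_file="~/data/drs-config.toml",
        data_set=["cmip6", "cordex"],
        verbosity=1,
    )
    # Hand the harvested catalogue to an indexing backend; extra keyword
    # arguments such as server= are forwarded to the ingester plugin.
    await async_index(
        "solr",
        "my-data.yaml",
        server="localhost:8983",
        batch_size=1000,
    )


if __name__ == "__main__":
    asyncio.run(main())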