metadata-crawler 2510.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of metadata-crawler might be problematic.
- metadata_crawler/__init__.py +263 -0
- metadata_crawler/__main__.py +8 -0
- metadata_crawler/_version.py +1 -0
- metadata_crawler/api/__init__.py +1 -0
- metadata_crawler/api/cli.py +57 -0
- metadata_crawler/api/config.py +831 -0
- metadata_crawler/api/drs_config.toml +440 -0
- metadata_crawler/api/index.py +151 -0
- metadata_crawler/api/metadata_stores.py +755 -0
- metadata_crawler/api/mixin/__init__.py +7 -0
- metadata_crawler/api/mixin/lookup_mixin.py +112 -0
- metadata_crawler/api/mixin/lookup_tables.py +10010 -0
- metadata_crawler/api/mixin/path_mixin.py +46 -0
- metadata_crawler/api/mixin/template_mixin.py +145 -0
- metadata_crawler/api/storage_backend.py +277 -0
- metadata_crawler/backends/__init__.py +1 -0
- metadata_crawler/backends/intake.py +211 -0
- metadata_crawler/backends/posix.py +121 -0
- metadata_crawler/backends/s3.py +140 -0
- metadata_crawler/backends/swift.py +305 -0
- metadata_crawler/cli.py +547 -0
- metadata_crawler/data_collector.py +278 -0
- metadata_crawler/ingester/__init__.py +1 -0
- metadata_crawler/ingester/mongo.py +206 -0
- metadata_crawler/ingester/solr.py +282 -0
- metadata_crawler/logger.py +153 -0
- metadata_crawler/py.typed +0 -0
- metadata_crawler/run.py +419 -0
- metadata_crawler/utils/__init__.py +482 -0
- metadata_crawler/utils/cftime_utils.py +207 -0
- metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
- metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
- metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
- metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
- metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/run.py
ADDED
@@ -0,0 +1,419 @@
"""Apply the metadata collector."""

import os
import time
from fnmatch import fnmatch
from pathlib import Path
from types import NoneType
from typing import Any, Collection, Dict, List, Optional, Sequence, Union, cast

import tomlkit
import yaml
from rich.prompt import Prompt

from .api.config import CrawlerSettings, DRSConfig, strip_protocol
from .api.metadata_stores import (
    CatalogueBackendType,
    CatalogueReader,
    IndexName,
)
from .data_collector import DataCollector
from .logger import apply_verbosity, get_level_from_verbosity, logger
from .utils import (
    Console,
    EmptyCrawl,
    IndexProgress,
    MetadataCrawlerException,
    find_closest,
    load_plugins,
    timedelta_to_str,
)

FilesArg = Optional[Union[str, Path, Sequence[Union[str, Path]]]]


def _norm_files(catalogue_files: FilesArg) -> List[str]:
    if catalogue_files is None:
        return [""]
    return (
        [str(catalogue_files)]
        if isinstance(catalogue_files, (str, Path))
        else [str(p) for p in catalogue_files]
    )


def _match(match: str, items: Collection[str]) -> List[str]:
    out: List[str] = []
    for item in items:
        if fnmatch(item, match):
            out.append(item)

    if not out:
        msg = find_closest(f"No such dataset: {match}", match, items)
        raise MetadataCrawlerException(msg) from None
    return out


def _get_num_of_indexed_objects(
    catalogue_files: FilesArg, storage_options: Optional[Dict[str, Any]] = None
) -> int:
    num_objects = 0
    storage_options = storage_options or {}
    for cat_file in _norm_files(catalogue_files):
        try:
            cat = CatalogueReader.load_catalogue(cat_file, **storage_options)
            num_objects += cat.get("metadata", {}).get("indexed_objects", 0)
        except (FileNotFoundError, IsADirectoryError, yaml.parser.ParserError):
            pass
    return num_objects


def _get_search(
    config_file: Union[str, Path, Dict[str, Any], tomlkit.TOMLDocument],
    search_dirs: Optional[List[str]] = None,
    datasets: Optional[List[str]] = None,
) -> list[CrawlerSettings]:
    _search_items = []
    search_dirs = search_dirs or []
    datasets = datasets or []
    config = DRSConfig.load(config_file).datasets
    if not datasets and not search_dirs:
        return [
            CrawlerSettings(name=k, search_path=cfg.root_path)
            for (k, cfg) in config.items()
        ]
    for item in datasets or []:
        for ds in _match(item, config.keys()):
            logger.debug("Adding dataset %s", ds)
            _search_items.append(
                CrawlerSettings(name=ds, search_path=config[ds].root_path)
            )
    for num, _dir in enumerate(map(strip_protocol, search_dirs or [])):
        for name, cfg in config.items():
            if _dir.is_relative_to(strip_protocol(cfg.root_path)):
                logger.debug("Adding dataset %s", name)
                _search_items.append(
                    CrawlerSettings(name=name, search_path=str(search_dirs[num]))
                )

    return _search_items


async def async_call(
    index_system: str,
    method: str,
    batch_size: int = 2500,
    catalogue_files: Optional[Sequence[Union[Path, str]]] = None,
    verbosity: int = 0,
    log_suffix: Optional[str] = None,
    num_objects: int = 0,
    *args: Any,
    **kwargs: Any,
) -> None:
    """Add / delete metadata from the index."""
    env = cast(os._Environ[str], os.environ.copy())
    old_level = apply_verbosity(verbosity, suffix=log_suffix)

    try:
        progress = IndexProgress(total=num_objects)
        os.environ["MDC_LOG_INIT"] = "1"
        os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
        os.environ["MDC_LOG_SUFFIX"] = (
            log_suffix or os.getenv("MDC_LOG_SUFFIX") or ""
        )
        backends = load_plugins("metadata_crawler.ingester")
        try:
            cls = backends[index_system]
        except KeyError:
            msg = find_closest(
                f"No such backend: {index_system}", index_system, backends.keys()
            )
            raise ValueError(msg) from None
        flat_files = _norm_files(catalogue_files)
        flat_files = flat_files or [""]
        storage_options = kwargs.pop("storage_options", {})
        progress.start()
        for cf in flat_files:
            async with cls(
                batch_size=batch_size,
                catalogue_file=cf or None,
                storage_options=storage_options,
                progress=progress,
            ) as obj:
                func = getattr(obj, method)
                await func(**kwargs)

    finally:
        os.environ = env
        progress.stop()
        logger.set_level(old_level)


async def async_index(
    index_system: str,
    *catalogue_files: Union[Path, str, List[str], List[Path]],
    batch_size: int = 2500,
    verbosity: int = 0,
    log_suffix: Optional[str] = None,
    **kwargs: Any,
) -> None:
    """Index metadata in the indexing system.

    Parameters
    ^^^^^^^^^^

    index_system:
        The index server where the metadata is indexed.
    catalogue_files:
        Path(s) to the file(s) where the metadata was stored.
    batch_size:
        If the index system supports batch sizes, the size of the batches.
    verbosity:
        Set the verbosity of the system.
    log_suffix:
        Add a suffix to the log file output.

    Other Parameters
    ^^^^^^^^^^^^^^^^

    **kwargs:
        Keyword arguments used to add data to the index.


    Example
    ^^^^^^^

    .. code-block:: python

        await async_index(
            "solr",
            "/tmp/catalog.yaml",
            server="localhost:8983",
            batch_size=1000,
        )
    """
    kwargs.setdefault("catalogue_files", catalogue_files)
    await async_call(
        index_system,
        "index",
        batch_size=batch_size,
        verbosity=verbosity,
        log_suffix=log_suffix,
        num_objects=_get_num_of_indexed_objects(
            kwargs["catalogue_files"],
            storage_options=kwargs.get("storage_options"),
        ),
        **kwargs,
    )


async def async_delete(
    index_system: str,
    batch_size: int = 2500,
    verbosity: int = 0,
    log_suffix: Optional[str] = None,
    **kwargs: Any,
) -> None:
    """Delete metadata from the indexing system.

    Parameters
    ^^^^^^^^^^
    index_system:
        The index server where the metadata is indexed.
    batch_size:
        If the index system supports batch sizes, the size of the batches.
    verbosity:
        Set the verbosity of the system.
    log_suffix:
        Add a suffix to the log file output.

    Other Parameters
    ^^^^^^^^^^^^^^^^

    **kwargs:
        Keyword arguments used to delete data from the index.

    Examples
    ^^^^^^^^

    .. code-block:: python

        await async_delete(
            "solr",
            server="localhost:8983",
            latest_version="latest",
            facets=[("file", "*.nc"), ("project", "OBS")],
        )
    """
    await async_call(
        index_system,
        "delete",
        batch_size=batch_size,
        verbosity=verbosity,
        log_suffix=log_suffix,
        **kwargs,
    )


async def async_add(
    store: Optional[
        Union[str, Path, Dict[str, Any], tomlkit.TOMLDocument]
    ] = None,
    config_file: Optional[
        Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
    ] = None,
    data_object: Optional[Union[str, List[str]]] = None,
    data_set: Optional[Union[List[str], str]] = None,
    data_store_prefix: str = "metadata",
    batch_size: int = 25_000,
    comp_level: int = 4,
    storage_options: Optional[Dict[str, Any]] = None,
    shadow: Optional[Union[str, List[str]]] = None,
    catalogue_backend: CatalogueBackendType = "jsonlines",
    latest_version: str = IndexName().latest,
    all_versions: str = IndexName().all,
    password: bool = False,
    n_procs: Optional[int] = None,
    verbosity: int = 0,
    log_suffix: Optional[str] = None,
    fail_under: int = -1,
    **kwargs: Any,
) -> None:
    """Harvest metadata from storage systems and add it to an intake catalogue.

    Parameters
    ^^^^^^^^^^

    store:
        Path to the intake catalogue.
    config_file:
        Path to the drs-config file / loaded configuration.
    data_object:
        Objects (directories or catalogue files) that are processed. Instead
        of defining datasets that are to be crawled, you can crawl data based
        on their directories. The directories must be root dirs given in the
        drs-config file. By default all root dirs are crawled.
    data_set:
        Dataset(s) that should be crawled. The datasets need to be defined
        in the drs-config file. By default all datasets are crawled.
        Names can contain wildcards such as ``xces-*``.
    data_store_prefix:
        Absolute or relative path to the intake catalogue source.
    batch_size:
        Batch size that is used to collect the metadata. This can affect
        performance.
    comp_level:
        Compression level used to write the metadata to csv.gz.
    storage_options:
        Set additional storage options for adding metadata to the metadata
        store.
    shadow:
        'Shadow' these storage options. This is useful to hide secrets in
        public data catalogues.
    catalogue_backend:
        Intake catalogue backend.
    latest_version:
        Name of the core holding 'latest' metadata.
    all_versions:
        Name of the core holding 'all' metadata versions.
    password:
        Display a password prompt before beginning.
    n_procs:
        Set the number of parallel processes for collecting.
    verbosity:
        Set the verbosity of the system.
    log_suffix:
        Add a suffix to the log file output.
    fail_under:
        Fail if fewer than X of the discovered files could be indexed.

    Other Parameters
    ^^^^^^^^^^^^^^^^

    **kwargs:
        Additional keyword arguments.


    Examples
    ^^^^^^^^

    .. code-block:: python

        await async_add(
            store="my-data.yaml",
            config_file="~/data/drs-config.toml",
            data_set=["cmip6", "cordex"],
        )

    """
    env = cast(os._Environ[str], os.environ.copy())
    old_level = apply_verbosity(verbosity, suffix=log_suffix)
    try:
        os.environ["MDC_LOG_INIT"] = "1"
        os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
        os.environ["MDC_LOG_SUFFIX"] = (
            log_suffix or os.getenv("MDC_LOG_SUFFIX") or ""
        )
        config_file = config_file or os.environ.get(
            "EVALUATION_SYSTEM_CONFIG_DIR"
        )
        if not config_file:
            raise MetadataCrawlerException(
                "You must give a config file/directory"
            )
        st = time.time()
        passwd: Optional[str] = None
        if password:  # pragma: no cover
            passwd = Prompt.ask(
                "[b]Enter the password", password=True
            )  # pragma: no cover

        if passwd:
            os.environ["DRS_STORAGE_PASSWD"] = passwd
        data_object = (
            data_object
            if isinstance(data_object, (NoneType, list))
            else [str(data_object)]
        )
        data_set = (
            data_set
            if isinstance(data_set, (NoneType, list))
            else [str(data_set)]
        )
        async with DataCollector(
            config_file,
            store,
            IndexName(latest=latest_version, all=all_versions),
            *_get_search(config_file, data_object, data_set),
            batch_size=batch_size,
            comp_level=comp_level,
            backend=catalogue_backend,
            data_store_prefix=data_store_prefix,
            n_procs=n_procs,
            storage_options=storage_options or {},
            shadow=shadow,
            **kwargs,
        ) as data_col:
            await data_col.ingest_data()
            num_files = data_col.ingested_objects
            files_discovered = data_col.crawled_files
            dt = timedelta_to_str(time.time() - st)
            logger.info("Discovered: %s files", f"{files_discovered:10,.0f}")
            logger.info("Ingested: %s files", f"{num_files:10,.0f}")
            logger.info("Spent: %s", dt)
            Console.print(" " * Console.width, end="\r")
            Console.print(
                (
                    f"[bold]Ingested [green]{num_files:10,.0f}[/green] "
                    f"within [green]{dt}[/green][/bold]"
                )
            )

            if (
                files_discovered >= fail_under and num_files < fail_under
            ) or files_discovered == 0:
                await data_col.ingest_queue.delete()
                raise EmptyCrawl("Could not fulfill discovery threshold!") from None
    finally:
        os.environ = env
        logger.set_level(old_level)
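For orientation, here is a minimal sketch (not part of the package) of how the public coroutines in run.py might be driven from a synchronous script. The catalogue path, drs-config path, dataset names, and the Solr address are illustrative assumptions taken from the docstring examples above; only the function signatures come from the file itself.

    # Hypothetical driver script: harvest metadata, then push it to a Solr index.
    import asyncio

    from metadata_crawler.run import async_add, async_index


    def main() -> None:
        # Harvest two datasets defined in an assumed drs-config file and write
        # the resulting catalogue to "my-data.yaml".
        asyncio.run(
            async_add(
                store="my-data.yaml",
                config_file="~/data/drs-config.toml",
                data_set=["cmip6", "cordex"],
            )
        )
        # Index the harvested catalogue; backend name "solr" and the server
        # address mirror the in-file docstring example.
        asyncio.run(
            async_index(
                "solr",
                "my-data.yaml",
                server="localhost:8983",
                batch_size=1000,
            )
        )


    if __name__ == "__main__":
        main()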