metadata-crawler 2509.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of metadata-crawler might be problematic. Click here for more details.
- metadata_crawler/__init__.py +248 -0
- metadata_crawler/__main__.py +8 -0
- metadata_crawler/_version.py +1 -0
- metadata_crawler/api/__init__.py +1 -0
- metadata_crawler/api/cli.py +57 -0
- metadata_crawler/api/config.py +801 -0
- metadata_crawler/api/drs_config.toml +439 -0
- metadata_crawler/api/index.py +132 -0
- metadata_crawler/api/metadata_stores.py +749 -0
- metadata_crawler/api/mixin/__init__.py +7 -0
- metadata_crawler/api/mixin/lookup_mixin.py +112 -0
- metadata_crawler/api/mixin/lookup_tables.py +10010 -0
- metadata_crawler/api/mixin/path_mixin.py +46 -0
- metadata_crawler/api/mixin/template_mixin.py +145 -0
- metadata_crawler/api/storage_backend.py +277 -0
- metadata_crawler/backends/__init__.py +1 -0
- metadata_crawler/backends/intake.py +211 -0
- metadata_crawler/backends/posix.py +121 -0
- metadata_crawler/backends/s3.py +136 -0
- metadata_crawler/backends/swift.py +305 -0
- metadata_crawler/cli.py +539 -0
- metadata_crawler/data_collector.py +258 -0
- metadata_crawler/ingester/__init__.py +1 -0
- metadata_crawler/ingester/mongo.py +193 -0
- metadata_crawler/ingester/solr.py +152 -0
- metadata_crawler/logger.py +142 -0
- metadata_crawler/py.typed +0 -0
- metadata_crawler/run.py +373 -0
- metadata_crawler/utils.py +411 -0
- metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
- metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
- metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
- metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
- metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""Metadata Crawler API high level functions."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Dict, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
import tomlkit
|
|
8
|
+
import uvloop
|
|
9
|
+
|
|
10
|
+
from ._version import __version__
|
|
11
|
+
from .api.config import ConfigMerger, DRSConfig
|
|
12
|
+
from .api.metadata_stores import CatalogueBackendType, IndexName
|
|
13
|
+
from .data_collector import DataCollector
|
|
14
|
+
from .logger import logger
|
|
15
|
+
from .run import async_add, async_delete, async_index
|
|
16
|
+
|
|
17
|
+
# Install uvloop's event loop policy as soon as this package is imported.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
|
|
18
|
+
|
|
19
|
+
# Public names of the package; every name is listed exactly once.
__all__ = [
    "logger",
    "__version__",
    "DataCollector",
    "index",
    "add",
    "delete",
    "get_config",
    "async_index",
    "async_delete",
    "async_add",
]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_config(config: Optional[Union[Path, str]] = None) -> ConfigMerger:
    """Return a drs config file merged with the default config.

    Handy for inspecting all possible configuration options together with
    their default values.

    Parameters
    ^^^^^^^^^^

    config:
        Path to a user defined config file that is going to be merged with
        the default config. The value is handed unchanged to
        ``DRSConfig.load`` and ``ConfigMerger``.

    Returns
    ^^^^^^^

    ConfigMerger
        The merger object constructed from ``config``.
    """
    # The loaded DRSConfig is discarded on purpose; presumably the call is
    # only made so that a broken config fails early -- TODO confirm.
    DRSConfig.load(config)
    merged = ConfigMerger(config)
    return merged
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def index(
    index_system: str,
    *catalogue_files: Union[Path, str, List[str], List[Path]],
    batch_size: int = 2500,
    verbosity: int = 0,
    **kwargs: Any,
) -> None:
    """Index metadata in the indexing system.

    Synchronous entry point: the ``async_index`` coroutine is created with
    the given arguments and run to completion via ``uvloop.run``.

    Parameters
    ^^^^^^^^^^

    index_system:
        The index server where the metadata is indexed.
    catalogue_files:
        Path to the file(s) where the metadata was stored.
    batch_size:
        If the index system supports batch-sizes, the size of the batches.
    verbosity:
        Set the verbosity level.

    Other Parameters
    ^^^^^^^^^^^^^^^^

    **kwargs:
        Additional keyword arguments forwarded unchanged to ``async_index``
        (for instance ``server`` in the example below).

    Examples
    ^^^^^^^^

    .. code-block:: python

        index(
            "solr",
            "/tmp/catalog-1.yml",
            "/tmp/catalog-2.yml",
            batch_size=50,
            server="localhost:8983",
        )
    """
    indexing_job = async_index(
        index_system,
        *catalogue_files,
        batch_size=batch_size,
        verbosity=verbosity,
        **kwargs,
    )
    uvloop.run(indexing_job)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def delete(
    index_system: str,
    batch_size: int = 2500,
    verbosity: int = 0,
    **kwargs: Any,
) -> None:
    """Delete metadata from the indexing system.

    Synchronous entry point: the ``async_delete`` coroutine is created with
    the given arguments and run to completion via ``uvloop.run``.

    Parameters
    ^^^^^^^^^^

    index_system:
        The index server where the metadata is indexed.
    batch_size:
        If the index system supports batch-sizes, the size of the batches.
    verbosity:
        Set the verbosity of the system.

    Other Parameters
    ^^^^^^^^^^^^^^^^

    **kwargs:
        Keyword arguments used to delete data from the index.


    Examples
    ^^^^^^^^

    .. code-block:: python

        delete(
            "solr",
            server="localhost:8983",
            facets=[("project", "CMIP6"), ("institute", "MPI-M")],
        )
    """
    # ``verbosity`` used to be accepted here but silently dropped; forward it
    # the same way ``index`` forwards it to ``async_index``.
    uvloop.run(
        async_delete(
            index_system,
            batch_size=batch_size,
            verbosity=verbosity,
            **kwargs,
        )
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def add(
    store: Optional[Union[str, Path]] = None,
    config_file: Optional[
        Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
    ] = None,
    data_object: Optional[Union[str, List[str]]] = None,
    data_set: Optional[Union[str, List[str]]] = None,
    data_store_prefix: str = "metadata",
    catalogue_backend: CatalogueBackendType = "jsonlines",
    batch_size: int = 25_000,
    comp_level: int = 4,
    storage_options: Optional[Dict[str, Any]] = None,
    shadow: Optional[Union[str, List[str]]] = None,
    latest_version: str = IndexName().latest,
    all_versions: str = IndexName().all,
    n_procs: Optional[int] = None,
    verbosity: int = 0,
    password: bool = False,
    fail_under: int = -1,
    **kwargs: Any,
) -> None:
    """Harvest metadata from storage systems and add them to an intake catalogue.

    Synchronous entry point: the ``async_add`` coroutine is created with the
    given arguments and run to completion via ``uvloop.run``.

    Parameters
    ^^^^^^^^^^

    store:
        Path to the intake catalogue.
    config_file:
        Path to the drs-config file / loaded configuration.
    data_object:
        Instead of defining datasets that are to be crawled you can crawl
        data based on their directories. The directories must be root dirs
        given in the drs-config file. By default all root dirs are crawled.
    data_set:
        Datasets that should be crawled. The datasets need to be defined
        in the drs-config file. By default all datasets are crawled.
        Names can contain wildcards such as ``xces-*``.
    data_store_prefix:
        Absolute path or relative path to intake catalogue source
    catalogue_backend:
        Intake catalogue backend
    batch_size:
        Batch size that is used to collect the meta data. This can affect
        performance.
    comp_level:
        Compression level used to write the meta data to csv.gz
    storage_options:
        Set additional storage options for adding metadata to the metadata store
    shadow:
        'Shadow' these storage options. This is useful to hide secrets in
        public data catalogues.
    latest_version:
        Name of the core holding 'latest' metadata.
    all_versions:
        Name of the core holding 'all' metadata versions.
    n_procs:
        Set the number of parallel processes for collecting.
    verbosity:
        Set the verbosity of the system.
    password:
        Display a password prompt and set password before beginning.
    fail_under:
        Fail if less than X of the discovered files could be indexed.

    Other Parameters
    ^^^^^^^^^^^^^^^^

    **kwargs:
        Additional keyword arguments.


    Examples
    ^^^^^^^^

    .. code-block:: python

        add(
            "my-data.yaml",
            "~/data/drs-config.toml",
            data_set=["cmip6", "cordex"],
        )
    """
    # Keywords follow the order of the signature above.
    harvest_job = async_add(
        store=store,
        config_file=config_file,
        data_object=data_object,
        data_set=data_set,
        data_store_prefix=data_store_prefix,
        catalogue_backend=catalogue_backend,
        batch_size=batch_size,
        comp_level=comp_level,
        storage_options=storage_options,
        shadow=shadow,
        latest_version=latest_version,
        all_versions=all_versions,
        n_procs=n_procs,
        verbosity=verbosity,
        password=password,
        fail_under=fail_under,
        **kwargs,
    )
    uvloop.run(harvest_job)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "2509.0.0"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Metadata-crawler API."""
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""API for adding commands to the cli."""
|
|
2
|
+
|
|
3
|
+
from functools import wraps
|
|
4
|
+
from typing import Any, Callable, Dict, Tuple, Union
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, ConfigDict
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Parameter(BaseModel):
    """CLI Parameter model.

    Keyword arguments beyond the declared fields are accepted as well
    (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    #: Names for the argparse.Namespace.
    args: Union[str, Tuple[str, ...]]
    #: Help string that is going to be displayed.
    help: str
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cli_parameter(*args: str, **kwargs: Any) -> Dict[str, Any]:
    """Bundle ``add_argument`` arguments into a plain dictionary.

    The arguments are passed through the ``Parameter`` model and the result
    of its ``model_dump()`` is returned.

    Parameters
    ^^^^^^^^^^
    *args:
        Any arguments passed to ``argparse.ArgumentParser().add_argument``
    **kwargs:
        Any keyword arguments passed to
        ``argparse.ArgumentParser().add_argument``. ``help`` has to be among
        them because ``Parameter`` declares it without a default.

    """
    parameter = Parameter(args=args, **kwargs)
    return parameter.model_dump()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def cli_function(
    help: str = "",
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Create a decorator that attaches a CLI help text to a function.

    The help text is what you would normally hand to an
    `argparse subcommand <https://docs.python.org/3/library/argparse.html>`_.
    It is stored on the decorated function as the ``_cli_help`` attribute.

    Parameters
    ^^^^^^^^^^
    help:
        Help string for this sub command. When empty, the docstring of the
        decorated function is used instead.
    """

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        if help:
            help_text = help
        else:
            help_text = func.__doc__
        # Set on the original function *before* wrapping: ``wraps`` copies
        # ``func.__dict__``, so the wrapper carries ``_cli_help`` as well.
        setattr(func, "_cli_help", help_text)

        @wraps(func)
        def wrapper(*call_args: Any, **call_kwargs: Any) -> Any:
            return func(*call_args, **call_kwargs)

        return wrapper

    return decorator
|