metadata-crawler 2510.1.0 (metadata_crawler-2510.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of metadata-crawler has been flagged as potentially problematic.
Files changed (35)
  1. metadata_crawler/__init__.py +263 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +831 -0
  7. metadata_crawler/api/drs_config.toml +440 -0
  8. metadata_crawler/api/index.py +151 -0
  9. metadata_crawler/api/metadata_stores.py +755 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +140 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +547 -0
  22. metadata_crawler/data_collector.py +278 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +206 -0
  25. metadata_crawler/ingester/solr.py +282 -0
  26. metadata_crawler/logger.py +153 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +419 -0
  29. metadata_crawler/utils/__init__.py +482 -0
  30. metadata_crawler/utils/cftime_utils.py +207 -0
  31. metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
  32. metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
  33. metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
  34. metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
  35. metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,46 @@
+ """Definitions for path manipulations."""
+
+ from pathlib import Path
+ from typing import Tuple, Union
+ from urllib.parse import urlsplit
+
+ import fsspec
+ from anyio import Path as aPath
+
+
+ class PathMixin:
+     """Class that defines typical Path operations."""
+
+     async def suffix(self, path: Union[str, Path, aPath]) -> str:
+         """Get the suffix of a given input path.
+
+         Parameters
+         ^^^^^^^^^^
+         path: str, anyio.Path, pathlib.Path
+             Path of the object store
+
+         Returns
+         ^^^^^^^
+         str: The file type extension of the path.
+         """
+         return Path(path).suffix
+
+     def get_fs_and_path(self, uri: str) -> Tuple[fsspec.AbstractFileSystem, str]:
+         """Return (fs, path) suitable for xarray.
+
+         Parameters
+         ^^^^^^^^^^
+         uri:
+             Path to the object store / file name
+
+         Returns
+         ^^^^^^^
+         fsspec.AbstractFileSystem, str:
+             The AbstractFileSystem class and the corresponding path to the
+             data store.
+         """
+         protocol, path = fsspec.core.split_protocol(uri)
+         protocol = protocol or "file"
+         path = urlsplit(uri.removeprefix(f"{protocol}://")).path
+         return fsspec.filesystem(protocol), path
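
The protocol split in get_fs_and_path is what lets the same crawler code serve plain POSIX paths and object-store URIs: the URI scheme picks the fsspec filesystem, and the remainder becomes the in-store path. A minimal usage sketch (the example URIs are illustrative, and the s3 case additionally needs s3fs installed; the import path follows the re-export in storage_backend.py below):

    from metadata_crawler.api.mixin import PathMixin

    mixin = PathMixin()
    # A bare POSIX path falls back to the "file" protocol.
    fs, path = mixin.get_fs_and_path("/data/cmip6/tas_day.nc")
    # fs is a LocalFileSystem, path == "/data/cmip6/tas_day.nc"
    # An object-store URI selects the matching fsspec implementation.
    fs, path = mixin.get_fs_and_path("s3://bucket/cmip6/tas_day.zarr")
    # fs is an S3FileSystem, path == "bucket/cmip6/tas_day.zarr"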
@@ -0,0 +1,145 @@
+ """Definitions for jinja2 templating."""
+
+ import dataclasses
+ import os
+ import pathlib
+ from functools import lru_cache
+ from typing import Any, Dict, Mapping, Optional
+
+ from jinja2 import Environment, Template, Undefined
+
+ ENV = Environment(undefined=Undefined, autoescape=True)
+
+
+ @lru_cache(maxsize=1024)
+ def _compile_jinja_template(s: str) -> Template:
+     return ENV.from_string(s)
+
+
+ class TemplateMixin:
+     """Apply the jinja2 templating engine."""
+
+     env_map: Optional[Dict[str, str]] = None
+     _rendered = False
+
+     def prep_template_env(self) -> None:
+         """Prepare the jinja2 env."""
+
+         def _env_get(name: str, default: Optional[str] = None) -> Optional[str]:
+             return os.getenv(name, default)
+
+         def _getenv_filter(
+             varname: str, default: Optional[str] = None
+         ) -> Optional[str]:
+             return os.getenv(varname, default)
+
+         ENV.globals.setdefault("env", _env_get)
+         ENV.globals.setdefault("ENV", dict(os.environ))
+         ENV.filters.setdefault("getenv", _getenv_filter)
+         self._rendered = True
+
+     def render_templates(
+         self,
+         data: Any,
+         context: Mapping[str, Any],
+         *,
+         max_passes: int = 2,
+     ) -> Any:
+         """Recursively render Jinja2 templates found in strings within data.
+
+         This method traverses common container types (``dict``, ``list``,
+         ``tuple``, ``set``), dataclasses, namedtuples, and ``pathlib.Path``
+         objects. Every string encountered is treated as a Jinja2 template
+         and rendered with the provided ``context``. Rendering can be
+         repeated up to ``max_passes`` times to resolve templates that
+         produce further templates on the first pass.
+
+         Parameters
+         ^^^^^^^^^^
+         data:
+             Arbitrary Python data structure. Supported containers are
+             ``dict`` (keys and values), ``list``, ``tuple`` (including
+             namedtuples), ``set``, dataclasses (fields), and
+             ``pathlib.Path``. Scalars (e.g., ``int``, ``float``, ``bool``,
+             ``None``) are returned unchanged. Strings are rendered as
+             Jinja2 templates.
+         context:
+             Mapping of template variables available to Jinja2 during
+             rendering.
+         max_passes:
+             Maximum number of rendering passes to perform on each string,
+             by default ``2``. Increase this if templates generate further
+             templates that need resolution.
+
+         Returns
+         ^^^^^^^
+         Any:
+             A structure of the same shape with all strings rendered.
+             Container and object types are preserved where feasible (e.g.,
+             a ``tuple`` stays a ``tuple``, a namedtuple stays a namedtuple,
+             a dataclass remains the same dataclass type).
+
+         Raises
+         ^^^^^^
+         jinja2.TemplateError
+             For Jinja2 template errors encountered during rendering.
+
+         Notes
+         ^^^^^
+         * Dictionary keys are also rendered if they are strings (or nested
+           containers with strings). If rendering causes key collisions, the
+           **last** rendered key wins.
+         * For dataclasses, all fields are rendered and a new instance is
+           returned using ``dataclasses.replace``. Frozen dataclasses are
+           supported.
+         * Namedtuples are detected via the ``_fields`` attribute and
+           reconstructed with the same type.
+
+         Examples
+         ^^^^^^^^
+
+         .. code-block:: python
+
+             data = {
+                 "greeting": "Hello, {{ name }}!",
+                 "items": ["{{ count }} item(s)", 42],
+                 "path": {"root": "/home/{{ user }}", "cfg": "{{ root }}/cfg"},
+             }
+             ctx = {"name": "Ada", "count": 3, "user": "ada", "root": "/opt/app"}
+             TemplateMixin().render_templates(data, ctx)
+             # {'greeting': 'Hello, Ada!',
+             #  'items': ['3 item(s)', 42],
+             #  'path': {'root': '/home/ada', 'cfg': '/opt/app/cfg'}}
+
+         """
+         if not self._rendered:
+             self.prep_template_env()
+
+         def _render_str(s: str) -> str:
+             out = s
+             if ("{{" not in s) and ("{%" not in s):
+                 return out
+             for _ in range(max_passes):
+                 new = _compile_jinja_template(out).render(context)
+                 if new == out:
+                     break
+                 out = new
+             return out
+
+         def _walk(obj: Any) -> Any:
+             if isinstance(obj, str):
+                 return _render_str(obj)
+
+             # Render path segments and rebuild the same Path type.
+             if isinstance(obj, pathlib.PurePath):
+                 return type(obj)(_render_str(str(obj)))
+
+             # Render all dataclass fields and rebuild via replace();
+             # this also works for frozen dataclasses.
+             if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
+                 return dataclasses.replace(
+                     obj,
+                     **{
+                         f.name: _walk(getattr(obj, f.name))
+                         for f in dataclasses.fields(obj)
+                     },
+                 )
+
+             if isinstance(obj, dict):
+                 rendered: Dict[Any, Any] = {}
+                 for k, v in obj.items():
+                     rk = _render_str(k) if isinstance(k, str) else k
+                     rendered[rk] = _walk(v)
+                 return rendered
+
+             if isinstance(obj, list):
+                 return [_walk(x) for x in obj]
+
+             if isinstance(obj, tuple):
+                 items = [_walk(x) for x in obj]
+                 # Namedtuples expose _fields and are rebuilt as their own type.
+                 if hasattr(obj, "_fields"):
+                     return type(obj)(*items)
+                 return tuple(items)
+
+             if isinstance(obj, set):
+                 return {_walk(x) for x in obj}
+
+             return obj
+
+         return _walk(data)
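
prep_template_env wires environment-variable access into Jinja2 in three ways: an env(...) global, an ENV snapshot of the full environment, and a getenv filter. A short sketch of what templates can then resolve; SCRATCH_DIR is an illustrative variable, and the rendered values depend on the local environment:

    import os

    from metadata_crawler.api.mixin import TemplateMixin

    os.environ["SCRATCH_DIR"] = "/scratch/ada"  # illustrative value
    cfg = {
        "work_dir": "{{ env('SCRATCH_DIR', '/tmp') }}/work",
        "home": "{{ 'HOME' | getenv }}",
    }
    # The first render call lazily registers the globals and the filter.
    print(TemplateMixin().render_templates(cfg, {}))
    # e.g. {'work_dir': '/scratch/ada/work', 'home': '/home/ada'}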
@@ -0,0 +1,277 @@
+ """API for adding new storage backends via :py:class:`BasePath`."""
+
+ import abc
+ import os
+ import pathlib
+ import threading
+ from getpass import getuser
+ from typing import (
+     Any,
+     AsyncIterator,
+     ClassVar,
+     Dict,
+     List,
+     Optional,
+     TypedDict,
+     Union,
+     cast,
+ )
+
+ import h5netcdf
+ import xarray as xr
+ from anyio import Path
+ from pydantic import BaseModel, Field
+
+ from .mixin import LookupMixin, PathMixin, TemplateMixin
+
+
+ class MetadataType(TypedDict):
+     """A dict representation of the metadata."""
+
+     path: str
+     metadata: Dict[str, Any]
+
+
+ class Metadata(BaseModel):
+     """Metadata that is attached to each discovered path."""
+
+     path: str
+     metadata: Dict[str, Any] = Field(default_factory=dict)
+
+
+ class BasePath(abc.ABCMeta):
+     """Every storage backend class should be of this type."""
+
+
+ class PathTemplate(
+     abc.ABC, PathMixin, TemplateMixin, LookupMixin, metaclass=BasePath
+ ):
+     """Base class for interacting with different storage systems.
+
+     This class defines the fundamental methods that must be implemented
+     to retrieve information across different storage systems.
+
+     Parameters
+     ^^^^^^^^^^
+     suffixes: List[str], default: [".nc", ".grib", ".zarr", ".tar", ".hdf5"]
+         A list of available file suffixes.
+
+     Other Parameters
+     ^^^^^^^^^^^^^^^^
+     storage_options: Any
+         Information needed to interact with the storage system.
+
+     Attributes
+     ^^^^^^^^^^
+     _user : str
+         Value of the ``DRS_STORAGE_USER`` env variable (defaults to the
+         current user)
+     _pw : str
+         A password passed via the ``DRS_STORAGE_PASSWD`` env variable
+     suffixes: List[str]
+         A list of available file suffixes.
+     storage_options: Dict[str, Any]
+         A dict with information needed to interact with the storage system.
+     """
+
+     _fs_type: ClassVar[Optional[str]]
+     """Definition of the file system type for each implementation."""
+
+     _lock = threading.RLock()
+
+     def __init__(
+         self, suffixes: Optional[List[str]] = None, **storage_options: Any
+     ) -> None:
+         self._user: str = os.environ.get("DRS_STORAGE_USER") or getuser()
+         self._pw: str = os.environ.get("DRS_STORAGE_PASSWD") or ""
+         self.suffixes = suffixes or [".nc", ".grib", ".zarr", ".tar", ".hdf5"]
+         self.storage_options = cast(
+             Dict[str, Any], self.render_templates(storage_options or {}, {})
+         )
+         self.set_static_from_nested()
+         self.__post_init__()
+
+     def __post_init__(self) -> None:
+         """Call this method after ``__init__`` has been called.
+
+         If you need to assign any attributes, redefine this method in your
+         class.
+         """
+
+     async def close(self) -> None:
+         """Close any open sessions."""
+
+     def open_dataset(
+         self, path: str, **read_kws: Any
+     ) -> Union[xr.Dataset, h5netcdf.core.File]:
+         """Open a dataset with xarray.
+
+         Parameters
+         ^^^^^^^^^^
+         path:
+             Path to the object store / file name
+         **read_kws:
+             Keyword arguments passed to open the datasets.
+
+         Returns
+         ^^^^^^^
+         xarray.Dataset or h5netcdf.core.File:
+             The opened dataset.
+         """
+         fs, path = self.get_fs_and_path(path)
+
+         def _get_engine(file_name: str) -> str:
+             engines = {
+                 "grb": "cfgrib",
+                 "grib": "cfgrib",
+                 "gb": "cfgrib",
+                 "nc": "h5netcdf",
+                 "nc4": "h5netcdf",
+                 "netcdf": "h5netcdf",
+                 "cdf": "h5netcdf",
+                 "hdf5": "h5netcdf",
+                 "h5": "h5netcdf",
+                 "zarr": "zarr",
+                 "zar": "zarr",
+             }
+             suffix = file_name.rpartition(".")[-1]
+             return engines.get(suffix, "")
+
+         kwargs = read_kws.copy()
+         engine = kwargs.setdefault("engine", _get_engine(path) or None)
+
+         if engine == "zarr":
+             dset: xr.Dataset = xr.open_zarr(fs.get_mapper(path))
+             return dset
+         if fs.protocol[0] == "file" and engine == "h5netcdf":
+             return h5netcdf.File(path)
+         if fs.protocol[0] == "file":
+             return xr.open_mfdataset(path, **kwargs)
+         with fs.open(path, "rb") as stream:
+             return xr.open_dataset(stream, **kwargs)
+
+     def read_attr(
+         self, attribute: str, path: Union[str, pathlib.Path], **read_kws: Any
+     ) -> Any:
+         """Get a metadata attribute from a datastore object.
+
+         Parameters
+         ^^^^^^^^^^
+         attribute:
+             The attribute that is queried; can be of the form
+             ``<attribute>`` or ``<variable>.<attribute>``.
+         path:
+             Path to the object store / file path
+         read_kws:
+             Keyword arguments for opening the datasets.
+
+         Returns
+         ^^^^^^^
+         str: Metadata from the data.
+         """
+         with self.open_dataset(str(path), **read_kws) as dset:
+             if "." not in attribute:
+                 return dset.attrs[attribute]
+             var, _, attr = attribute.partition(".")
+             return dset[var].attrs[attr]
+
+     @abc.abstractmethod
+     async def is_dir(self, path: Union[str, Path, pathlib.Path]) -> bool:
+         """Check if a given path is a directory object on the storage system.
+
+         Parameters
+         ^^^^^^^^^^
+         path : str, anyio.Path, pathlib.Path
+             Path of the object store
+
+         Returns
+         ^^^^^^^
+         bool: True if path is a dir object, False otherwise or if the
+         path doesn't exist
+         """
+
+     @abc.abstractmethod
+     async def is_file(self, path: Union[str, Path, pathlib.Path]) -> bool:
+         """Check if a given path is a file object on the storage system.
+
+         Parameters
+         ^^^^^^^^^^
+         path:
+             Path of the object store
+
+         Returns
+         ^^^^^^^
+         bool:
+             True if path is a file object, False otherwise or if the
+             path doesn't exist
+         """
+         ...  # pragma: no cover
+
+     @abc.abstractmethod
+     async def iterdir(
+         self,
+         path: Union[str, Path, pathlib.Path],
+     ) -> AsyncIterator[str]:
+         """Get all subdirectories of a given path.
+
+         Parameters
+         ^^^^^^^^^^
+         path:
+             Path of the object store
+
+         Yields
+         ^^^^^^
+         str:
+             1st level sub directory
+         """
+         yield ""  # pragma: no cover
+
+     @abc.abstractmethod
+     async def rglob(
+         self, path: Union[str, Path, pathlib.Path], glob_pattern: str = "*"
+     ) -> AsyncIterator[MetadataType]:
+         """Search recursively for paths matching a given glob pattern.
+
+         Parameters
+         ^^^^^^^^^^
+         path:
+             Path of the object store
+         glob_pattern: str
+             Pattern that the target files must match
+
+         Yields
+         ^^^^^^
+         MetadataType: Path of the object store that matches the glob pattern.
+         """
+         yield MetadataType(path="", metadata={})  # pragma: no cover
+
+     def fs_type(self, path: Union[str, Path, pathlib.Path]) -> str:
+         """Define the file system type."""
+         return self._fs_type or ""
+
+     @abc.abstractmethod
+     def path(self, path: Union[str, Path, pathlib.Path]) -> str:
+         """Get the full path (including any schemas/netlocs).
+
+         Parameters
+         ^^^^^^^^^^
+         path:
+             Path of the object store
+
+         Returns
+         ^^^^^^^
+         str:
+             URI of the object store
+         """
+         ...  # pragma: no cover
+
+     @abc.abstractmethod
+     def uri(self, path: Union[str, Path, pathlib.Path]) -> str:
+         """Get the uri of the object store.
+
+         Parameters
+         ^^^^^^^^^^
+         path:
+             Path of the object store
+
+         Returns
+         ^^^^^^^
+         str:
+             URI of the object store
+         """
+         ...  # pragma: no cover
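
A concrete backend therefore has to supply six abstract members: is_dir, is_file, iterdir, rglob, path, and uri. The sketch below shows a minimal POSIX-style backend; LocalDemoPath is an illustrative name, it uses synchronous pathlib calls inside the async methods purely for brevity (a production backend would use non-blocking I/O), and instantiation assumes the inherited LookupMixin/TemplateMixin setup in __init__ (not shown in this diff) needs no extra configuration:

    import pathlib
    from typing import AsyncIterator, Union

    from anyio import Path

    from metadata_crawler.api.storage_backend import MetadataType, PathTemplate


    class LocalDemoPath(PathTemplate):  # illustrative name
        _fs_type = "posix"

        async def is_dir(self, path: Union[str, Path, pathlib.Path]) -> bool:
            return pathlib.Path(str(path)).is_dir()

        async def is_file(self, path: Union[str, Path, pathlib.Path]) -> bool:
            return pathlib.Path(str(path)).is_file()

        async def iterdir(
            self, path: Union[str, Path, pathlib.Path]
        ) -> AsyncIterator[str]:
            # Yield first-level subdirectories only.
            for sub in pathlib.Path(str(path)).iterdir():
                if sub.is_dir():
                    yield str(sub)

        async def rglob(
            self, path: Union[str, Path, pathlib.Path], glob_pattern: str = "*"
        ) -> AsyncIterator[MetadataType]:
            # Keep only files whose suffix is in the configured allow-list.
            for hit in pathlib.Path(str(path)).rglob(glob_pattern):
                if hit.suffix in self.suffixes and hit.is_file():
                    yield MetadataType(path=str(hit), metadata={})

        def path(self, path: Union[str, Path, pathlib.Path]) -> str:
            return str(path)

        def uri(self, path: Union[str, Path, pathlib.Path]) -> str:
            return f"file://{path}"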
@@ -0,0 +1 @@
+ """Storage backend definitions."""
@@ -0,0 +1,211 @@
+ """Interact with the INTAKE metadata catalogues."""
+
+ from __future__ import annotations
+
+ import pathlib
+ from fnmatch import fnmatch
+ from types import NoneType
+ from typing import (
+     Any,
+     AsyncIterator,
+     Callable,
+     Dict,
+     Union,
+ )
+ from urllib.parse import unquote, urlparse
+
+ import fsspec
+ import intake
+ import pandas as pd
+ from anyio import Path
+
+ from ..api.storage_backend import Metadata, MetadataType, PathTemplate
+ from ..logger import logger
+
+
+ class IntakePath(PathTemplate):
+     """Class to interact with the Intake metadata catalogues."""
+
+     _fs_type = None
+
+     async def is_file(self, path: str | Path | pathlib.Path) -> bool:
+         """Check if a given path is a file."""
+         return True
+
+     async def is_dir(self, path: str | Path | pathlib.Path) -> bool:
+         """Check if a given path is a directory."""
+         return False
+
+     @staticmethod
+     def _normalize_path(path: str) -> str:
+         """Turn file:// URLs into OS paths; leave others as-is."""
+         if isinstance(path, str) and path.startswith("file://"):
+             return unquote(urlparse(path).path)
+         return path
+
+     async def _walk_yaml_catalogue(
+         self,
+         cat: intake.catalog.Catalog,
+     ) -> AsyncIterator[MetadataType]:
+
+         for name in cat:
+             entry = cat[name]
+             container = getattr(entry, "container", None)
+
+             # Nested catalogues are walked recursively.
+             if container == "catalog":
+                 async for md in self._walk_yaml_catalogue(entry()):
+                     yield md
+                 continue
+
+             src = entry()
+             meta = getattr(src, "_entry", src).describe() or {}
+             args = meta.get("args", {})
+             urlpath = (
+                 args.get("urlpath")
+                 or args.get("path")
+                 or args.get("url")
+                 or meta.get("uri")
+                 or meta.get("file")
+                 or args.get("urlpaths")
+             ) or []
+             for raw_path in urlpath if isinstance(urlpath, list) else [urlpath]:
+                 path = self._normalize_path(raw_path)
+                 logger.debug("Found file %s", path)
+                 yield MetadataType(
+                     path=path,
+                     metadata=getattr(src, "metadata", meta.get("metadata", {})),
+                 )
+
+     @staticmethod
+     def _to_py(value: Any) -> Any:
+         if isinstance(value, (float, int, bool, str, NoneType)):
+             return value
+         try:
+             if hasattr(value, "tolist"):
+                 return value.tolist()
+             if pd.isna(value):
+                 return None
+         except Exception:
+             pass
+         return value
+
+     async def _walk_esm_catalogue(
+         self,
+         cat: intake.catalog.Catalog,
+     ) -> AsyncIterator[MetadataType]:
+         df: pd.DataFrame = getattr(cat, "df", pd.DataFrame())
+         cols = list(df.columns)
+         for row in df.itertuples(index=False, name=None):
+             meta: Dict[str, Any] = {k: self._to_py(v) for k, v in zip(cols, row)}
+             urlpath = (
+                 meta.get("urlpath")
+                 or meta.get("path")
+                 or meta.get("url")
+                 or meta.get("uri")
+                 or meta.get("file")
+                 or meta.get("urlpaths")
+             ) or []
+             for raw_path in urlpath if isinstance(urlpath, list) else [urlpath]:
+                 path = self._normalize_path(raw_path)
+                 logger.debug("Found file %s", path)
+                 yield MetadataType(path=path, metadata=meta)
+
+     async def iterdir(
+         self,
+         path: Union[str, Path, pathlib.Path],
+     ) -> AsyncIterator[str]:
+         """Get all subdirectories of a given path.
+
+         Parameters
+         ----------
+         path : str, anyio.Path, pathlib.Path
+             Path of the object store
+
+         Yields
+         ------
+         str:
+             1st level sub directory
+         """
+         yield str(path)
+
+     def _is_esm_catalogue(self, path: str) -> bool:
+         """Check whether the first 20 lines of a .json file mention "esmcat"."""
+         if not self._normalize_path(path).endswith(".json"):
+             return False
+         esmcat = False
+         fs = fsspec.get_filesystem_class(
+             fsspec.core.split_protocol(path)[0] or "file"
+         )(**self.storage_options)
+         with fs.open(path, mode="rb", **self.storage_options) as stream:
+             num = 0
+             for line in stream:
+                 if "esmcat" in line.decode("utf-8"):
+                     esmcat = True
+                     break
+                 if num > 19:
+                     break
+                 num += 1
+         return esmcat
+
+     async def rglob(
+         self, path: str | Path | pathlib.Path, glob_pattern: str = "*"
+     ) -> AsyncIterator[MetadataType]:
+         """Go through the catalogue path."""
+         path = str(path)
+         if self._is_esm_catalogue(path):
+             cat: intake.catalog.Catalog = intake.open_esm_datastore(
+                 path, **self.storage_options
+             )
+             func: Callable[
+                 [intake.catalog.Catalog], AsyncIterator[MetadataType]
+             ] = self._walk_esm_catalogue
+         else:
+             cat = intake.open_catalog(path, **self.storage_options)
+             func = self._walk_yaml_catalogue
+
+         async for md in func(cat):
+             if "." + md["path"].rpartition(".")[-1] in self.suffixes and fnmatch(
+                 md["path"], glob_pattern
+             ):
+                 yield md
+
+     def path(self, path: Union[str, Path, pathlib.Path]) -> str:
+         """Get the full path (including any schemas/netlocs).
+
+         Parameters
+         ----------
+         path: str, anyio.Path, pathlib.Path
+             Path of the object store
+
+         Returns
+         -------
+         str:
+             URI of the object store
+         """
+         return str(path)
+
+     def uri(self, path: Union[str, Path, pathlib.Path]) -> str:
+         """Get the uri of the object store.
+
+         Parameters
+         ----------
+         path: str, anyio.Path, pathlib.Path
+             Path of the object store
+
+         Returns
+         -------
+         str:
+             URI of the object store
+         """
+         fs_type, path = fsspec.core.split_protocol(str(path))
+         fs_type = fs_type or "file"
+         return f"{fs_type}://{path}"
+
+     def fs_type(self, path: Union[str, Path, pathlib.Path]) -> str:
+         """Define the file system type."""
+         fs_type, _ = fsspec.core.split_protocol(str(path))
+         return fs_type or "posix"
+
+     async def walk(self, path: str) -> AsyncIterator[Metadata]:
+         """Walk a catalogue."""
+         async for md in self.rglob(path):
+             yield Metadata(path=md["path"], metadata=md["metadata"])
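
Tying it together, walk() is the async entry point a caller drives from an event loop: it delegates to rglob, which first sniffs whether the target is an intake-esm datastore or a plain YAML catalogue. A hedged usage sketch; "catalog.json" stands in for a real catalogue path, and constructing IntakePath assumes the inherited mixin setup needs no further options:

    import asyncio

    from metadata_crawler.backends.intake import IntakePath


    async def main() -> None:
        backend = IntakePath(suffixes=[".nc", ".zarr"])
        async for entry in backend.walk("catalog.json"):
            print(entry.path, entry.metadata)


    asyncio.run(main())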