metadata-crawler 2510.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of metadata-crawler might be problematic. Click here for more details.
- metadata_crawler/__init__.py +263 -0
- metadata_crawler/__main__.py +8 -0
- metadata_crawler/_version.py +1 -0
- metadata_crawler/api/__init__.py +1 -0
- metadata_crawler/api/cli.py +57 -0
- metadata_crawler/api/config.py +831 -0
- metadata_crawler/api/drs_config.toml +440 -0
- metadata_crawler/api/index.py +151 -0
- metadata_crawler/api/metadata_stores.py +755 -0
- metadata_crawler/api/mixin/__init__.py +7 -0
- metadata_crawler/api/mixin/lookup_mixin.py +112 -0
- metadata_crawler/api/mixin/lookup_tables.py +10010 -0
- metadata_crawler/api/mixin/path_mixin.py +46 -0
- metadata_crawler/api/mixin/template_mixin.py +145 -0
- metadata_crawler/api/storage_backend.py +277 -0
- metadata_crawler/backends/__init__.py +1 -0
- metadata_crawler/backends/intake.py +211 -0
- metadata_crawler/backends/posix.py +121 -0
- metadata_crawler/backends/s3.py +140 -0
- metadata_crawler/backends/swift.py +305 -0
- metadata_crawler/cli.py +547 -0
- metadata_crawler/data_collector.py +278 -0
- metadata_crawler/ingester/__init__.py +1 -0
- metadata_crawler/ingester/mongo.py +206 -0
- metadata_crawler/ingester/solr.py +282 -0
- metadata_crawler/logger.py +153 -0
- metadata_crawler/py.typed +0 -0
- metadata_crawler/run.py +419 -0
- metadata_crawler/utils/__init__.py +482 -0
- metadata_crawler/utils/cftime_utils.py +207 -0
- metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
- metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
- metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
- metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
- metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Definitions for path manipulatins."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Tuple, Union
|
|
5
|
+
from urllib.parse import urlsplit
|
|
6
|
+
|
|
7
|
+
import fsspec
|
|
8
|
+
from anyio import Path as aPath
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PathMixin:
    """Mixin providing common path helpers shared by storage backends."""

    async def suffix(self, path: Union[str, Path, aPath]) -> str:
        """Get the suffix of a given input path.

        Parameters
        ^^^^^^^^^^
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        ^^^^^^^
        str: The file type extension of the path.
        """
        # pathlib handles the extension logic for all accepted path types.
        return Path(path).suffix

    def get_fs_and_path(self, uri: str) -> Tuple[fsspec.AbstractFileSystem, str]:
        """Return (fs, path) suitable for xarray.

        Parameters
        ^^^^^^^^^^
        uri:
            Path to the object store / file name

        Returns
        ^^^^^^^
        fsspec.AbstractFileSystem, str:
            The AbstractFileSystem class and the corresponding path to the
            data store.
        """
        # Fall back to the local file system when the URI carries no scheme.
        protocol = fsspec.core.split_protocol(uri)[0] or "file"
        # Strip the scheme prefix, then keep only the path component of
        # whatever remains (drops query strings, fragments and netloc noise).
        stripped = uri.removeprefix(f"{protocol}://")
        return fsspec.filesystem(protocol), urlsplit(stripped).path
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Definitions for jinja2 templating."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
from typing import Any, Dict, Mapping, Optional
|
|
6
|
+
|
|
7
|
+
from jinja2 import Environment, Template, Undefined
|
|
8
|
+
|
|
9
|
+
# Shared Jinja2 environment for all templating mixins.  ``autoescape=True``
# enables HTML-escaping of substituted values; ``Undefined`` renders missing
# variables as empty output instead of raising.
ENV = Environment(undefined=Undefined, autoescape=True)


@lru_cache(maxsize=1024)
def _compile_jinja_template(template_source: str) -> Template:
    """Compile *template_source* once; repeated strings hit the LRU cache."""
    return ENV.from_string(template_source)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TemplateMixin:
    """Apply the templating engine jinja2."""

    env_map: Optional[Dict[str, str]] = None
    # One-shot flag: set once the env helpers have been registered in ENV.
    _rendered = False

    def prep_template_env(self) -> None:
        """Prepare the jinja2 env.

        Registers an ``env(name, default)`` global, an ``ENV`` snapshot of the
        process environment, and a ``getenv`` filter on the shared jinja2
        environment.  ``setdefault`` keeps any pre-existing registrations.
        """

        def _env_get(name: str, default: Optional[str] = None) -> Optional[str]:
            return os.getenv(name, default)

        def _getenv_filter(
            varname: str, default: Optional[str] = None
        ) -> Optional[str]:
            return os.getenv(varname, default)

        ENV.globals.setdefault("env", _env_get)
        ENV.globals.setdefault("ENV", dict(os.environ))
        ENV.filters.setdefault("getenv", _getenv_filter)
        self._rendered = True

    def render_templates(
        self,
        data: Any,
        context: Mapping[str, Any],
        *,
        max_passes: int = 2,
    ) -> Any:
        """Recursively render Jinja2 templates found in strings within data.

        This function traverses common container types (``dict``, ``list``,
        ``tuple``, ``set``), dataclasses, namedtuples, and ``pathlib.Path`` objects.
        Every string encountered is treated as a Jinja2 template and rendered with
        the provided ``context``. Rendering can be repeated up to ``max_passes``
        times to resolve templates that produce further templates on the first pass.

        Parameters
        ^^^^^^^^^^
        data:
            Arbitrary Python data structure. Supported containers are ``dict``
            (keys and values), ``list``, ``tuple`` (including namedtuples),
            ``set``, dataclasses (fields), and ``pathlib.Path``.
            Scalars (e.g., ``int``, ``float``, ``bool``, ``None``) are returned
            unchanged. Strings are rendered as Jinja2 templates.
        context:
            Mapping of template variables available to Jinja2 during rendering.
        max_passes:
            Maximum number of rendering passes to perform on each string,
            by default ``2``. Increase this if templates generate further
            templates that need resolution.

        Returns
        ^^^^^^^
        Any:
            A structure of the same shape with all strings rendered. Container and
            object types are preserved where feasible (e.g., ``tuple`` stays a
            ``tuple``, namedtuple stays a namedtuple, dataclass remains the
            same dataclass type).

        Raises
        ^^^^^^
        jinja2.TemplateError
            For Jinja2 template errors encountered during rendering.

        Notes
        ^^^^^
        * Dictionary keys are also rendered if they are strings. If rendering
          causes key collisions, the **last** rendered key wins.
        * For dataclasses, all init fields are rendered and a new instance is
          returned using ``dataclasses.replace``. Frozen dataclasses are
          supported.
        * Namedtuples are detected via the ``_fields`` attribute and
          reconstructed with the same type.
        """
        # Local imports keep the module's import-time footprint unchanged.
        import dataclasses
        import pathlib

        if not self._rendered:
            self.prep_template_env()

        def _render_str(s: str) -> str:
            out = s
            # Fast path: strings without template markers never touch jinja2.
            if ("{{" not in s) and ("{%" not in s):
                return out
            for _ in range(max_passes):
                new = _compile_jinja_template(out).render(context)
                if new == out:
                    break
                out = new
            return out

        def _walk(obj: Any) -> Any:
            if isinstance(obj, str):
                return _render_str(obj)

            # pathlib paths: render the string form, rebuild the same type.
            if isinstance(obj, pathlib.PurePath):
                return type(obj)(_render_str(str(obj)))

            # Dataclass instances (not classes): render every init field and
            # build a new instance of the same type; works for frozen ones.
            if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
                return dataclasses.replace(
                    obj,
                    **{
                        f.name: _walk(getattr(obj, f.name))
                        for f in dataclasses.fields(obj)
                        if f.init
                    },
                )

            if isinstance(obj, dict):
                rendered: Dict[Any, Any] = {}
                for k, v in obj.items():
                    # String keys are rendered too; last rendered key wins.
                    rk = _render_str(k) if isinstance(k, str) else k
                    rendered[rk] = _walk(v)
                return rendered

            if isinstance(obj, list):
                return [_walk(x) for x in obj]

            if isinstance(obj, tuple):
                items = [_walk(x) for x in obj]
                # Namedtuples (detected via ``_fields``) keep their type.
                if hasattr(obj, "_fields"):
                    return type(obj)(*items)
                return tuple(items)

            if isinstance(obj, set):
                return {_walk(x) for x in obj}

            # Scalars (int, float, bool, None, ...) pass through unchanged.
            return obj

        return _walk(data)
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""API for adding new storage backends via :py:class:`BasePath`."""
|
|
2
|
+
|
|
3
|
+
import abc
|
|
4
|
+
import os
|
|
5
|
+
import pathlib
|
|
6
|
+
import threading
|
|
7
|
+
from getpass import getuser
|
|
8
|
+
from typing import (
|
|
9
|
+
Any,
|
|
10
|
+
AsyncIterator,
|
|
11
|
+
ClassVar,
|
|
12
|
+
Dict,
|
|
13
|
+
List,
|
|
14
|
+
Optional,
|
|
15
|
+
TypedDict,
|
|
16
|
+
Union,
|
|
17
|
+
cast,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
import h5netcdf
|
|
21
|
+
import xarray as xr
|
|
22
|
+
from anyio import Path
|
|
23
|
+
from pydantic import BaseModel, Field
|
|
24
|
+
|
|
25
|
+
from .mixin import LookupMixin, PathMixin, TemplateMixin
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class MetadataType(TypedDict):
    """A dict representation of the metadata attached to a discovered path."""

    # URI / path of the discovered object on the storage backend.
    path: str
    # Arbitrary metadata key/value pairs describing the object.
    metadata: Dict[str, Any]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Metadata(BaseModel):
    """Meta data that is attached to each discovered path."""

    # URI / path of the discovered object on the storage backend.
    path: str
    # Arbitrary metadata key/value pairs; defaults to an empty dict per instance.
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class BasePath(abc.ABCMeta):
    """Every storage backend class should be of this type.

    NOTE: this is a *metaclass* (it derives from :class:`abc.ABCMeta`) and is
    applied via ``metaclass=BasePath`` on :class:`PathTemplate`, so every
    storage backend class is an instance of ``BasePath``.
    """
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class PathTemplate(
    abc.ABC, PathMixin, TemplateMixin, LookupMixin, metaclass=BasePath
):
    """Base class for interacting with different storage systems.

    This class defines fundamental methods that should be implemented
    to retrieve information across different storage systems.

    Parameters
    ^^^^^^^^^^
    suffixes: List[str], default: [".nc", ".grib", ".girb", ".zarr", ".tar", ".hdf5"]
        A list of available file suffixes.

    Other Parameters
    ^^^^^^^^^^^^^^^^
    storage_options: Any
        Information needed to interact with the storage system.

    Attributes
    ^^^^^^^^^^
    _user : str
        Value of the ``DRS_STORAGE_USER`` env variable (defaults to current user)
    _pw : str
        a password passed by the ``DRS_STORAGE_PASSWD`` env variable
    suffixes: List[str]
        A list of available file suffixes.
    storage_options: Dict[str, Any]
        A dict with information needed to interact with the storage system.
    """

    _fs_type: ClassVar[Optional[str]]
    """Definition of the file system type for each implementation."""

    # Re-entrant lock available to implementations that share mutable state.
    _lock = threading.RLock()

    def __init__(
        self, suffixes: Optional[List[str]] = None, **storage_options: Any
    ) -> None:

        self._user: str = os.environ.get("DRS_STORAGE_USER") or getuser()
        self._pw: str = os.environ.get("DRS_STORAGE_PASSWD") or ""
        # ".girb" was a long-standing typo for ".grib"; it is kept so that
        # existing setups relying on it keep working, while the correct
        # ".grib" suffix is now recognised as well.
        self.suffixes = suffixes or [
            ".nc",
            ".grib",
            ".girb",
            ".zarr",
            ".tar",
            ".hdf5",
        ]
        # storage_options may contain jinja2 templates (e.g. env-var lookups
        # for credentials); render them once up front.
        self.storage_options = cast(
            Dict[str, Any], self.render_templates(storage_options or {}, {})
        )
        self.set_static_from_nested()
        self.__post_init__()

    def __post_init__(self) -> None:
        """Call this method after the __init__ get called.

        If you need to assign any attributes redefine this method in your class.
        """

    async def close(self) -> None:
        """Close any open sessions."""

    def open_dataset(
        self, path: str, **read_kws: Any
    ) -> Union[xr.Dataset, h5netcdf.core.File]:
        """Open a dataset with xarray.

        Parameters
        ^^^^^^^^^^
        path:
            Path to the object store / file name
        **read_kws:
            Keyword arguments passed to open the datasets.

        Returns
        ^^^^^^^
        xarray.Dataset:
            The xarray dataset (or an ``h5netcdf.File`` for local netCDF data).
        """
        fs, path = self.get_fs_and_path(path)

        def _get_engine(file_name: str) -> str:
            # Map known file suffixes to xarray backend engines; unknown
            # suffixes yield "" so xarray can auto-detect.
            engines = {
                "grb": "cfgrib",
                "grib": "cfgrib",
                # ".gb" is a GRIB extension; it previously mapped to the
                # non-existent "gb" engine.
                "gb": "cfgrib",
                "nc": "h5netcdf",
                "nc4": "h5netcdf",
                "netcdf": "h5netcdf",
                "cdf": "h5netcdf",
                "hdf5": "h5netcdf",
                "h5": "h5netcdf",
                "zarr": "zarr",
                "zar": "zarr",
            }
            suffix = file_name.rpartition(".")[-1]
            return engines.get(suffix, "")

        kwargs = read_kws.copy()
        # An explicit user-supplied engine always wins over the guess.
        engine = kwargs.setdefault("engine", _get_engine(path) or None)

        if engine == "zarr":
            dset: xr.Dataset = xr.open_zarr(fs.get_mapper(path))
            return dset
        if fs.protocol[0] == "file" and engine == "h5netcdf":
            # Local netCDF: h5netcdf gives cheap attribute-only access.
            return h5netcdf.File(path)
        if fs.protocol[0] == "file":
            return xr.open_mfdataset(path, **kwargs)
        # Remote stores: stream the object through fsspec.
        with fs.open(path, "rb") as stream:
            return xr.open_dataset(stream, **kwargs)

    def read_attr(
        self, attribute: str, path: Union[str, pathlib.Path], **read_kws: Any
    ) -> Any:
        """Get a metadata attribute from a datastore object.

        Parameters
        ^^^^^^^^^^
        attribute: The attribute that is queried, either a global attribute
            ``<attribute>`` or a variable attribute ``<variable>.<attribute>``.
        path: Path to the object store / file path
        read_kws: Keyword arguments for opening the datasets.

        Returns
        ^^^^^^^
        str: Metadata from the data.

        Raises
        ^^^^^^
        KeyError: if the attribute (or variable) does not exist.
        """
        with self.open_dataset(str(path), **read_kws) as dset:
            if "." not in attribute:
                return dset.attrs[attribute]
            var, _, attr = attribute.partition(".")
            return dset[var].attrs[attr]

    @abc.abstractmethod
    async def is_dir(self, path: Union[str, Path, pathlib.Path]) -> bool:
        """Check if a given path is a directory object on the storage system.

        Parameters
        ^^^^^^^^^^
        path : str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        ^^^^^^^
        bool: True if path is dir object, False if otherwise or doesn't exist
        """
        ...  # pragma: no cover

    @abc.abstractmethod
    async def is_file(self, path: Union[str, Path, pathlib.Path]) -> bool:
        """Check if a given path is a file object on the storage system.

        Parameters
        ^^^^^^^^^^
        path:
            Path of the object store

        Returns
        ^^^^^^^
        bool:
            True if path is file object, False if otherwise or doesn't exist
        """
        ...  # pragma: no cover

    @abc.abstractmethod
    async def iterdir(
        self,
        path: Union[str, Path, pathlib.Path],
    ) -> AsyncIterator[str]:
        """Get all sub directories from a given path.

        Parameters
        ^^^^^^^^^^
        path:
            Path of the object store

        Yields
        ^^^^^^
        str:
            1st level sub directory
        """
        yield ""  # pragma: no cover

    @abc.abstractmethod
    async def rglob(
        self, path: Union[str, Path, pathlib.Path], glob_pattern: str = "*"
    ) -> AsyncIterator[MetadataType]:
        """Search recursively for paths matching a given glob pattern.

        Parameters
        ^^^^^^^^^^
        path:
            Path of the object store
        glob_pattern: str
            Pattern that the target files must match

        Yields
        ^^^^^^
        MetadataType: Path of the object store that matches the glob pattern.
        """
        yield MetadataType(path="", metadata={})  # pragma: no cover

    def fs_type(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Define the file system type."""
        return self._fs_type or ""

    @abc.abstractmethod
    def path(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the full path (including any schemas/netlocs).

        Parameters
        ^^^^^^^^^^
        path:
            Path of the object store

        Returns
        ^^^^^^^
        str:
            URI of the object store
        """
        ...  # pragma: no cover

    @abc.abstractmethod
    def uri(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the uri of the object store.

        Parameters
        ^^^^^^^^^^
        path:
            Path of the object store

        Returns
        ^^^^^^^
        str:
            URI of the object store
        """
        ...  # pragma: no cover
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Storage backend definitions."""
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""Interact with the INTAKE metadata catalogues."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pathlib
|
|
6
|
+
from fnmatch import fnmatch
|
|
7
|
+
from types import NoneType
|
|
8
|
+
from typing import (
|
|
9
|
+
Any,
|
|
10
|
+
AsyncIterator,
|
|
11
|
+
Callable,
|
|
12
|
+
Dict,
|
|
13
|
+
Union,
|
|
14
|
+
)
|
|
15
|
+
from urllib.parse import unquote, urlparse
|
|
16
|
+
|
|
17
|
+
import fsspec
|
|
18
|
+
import intake
|
|
19
|
+
import pandas as pd
|
|
20
|
+
from anyio import Path
|
|
21
|
+
|
|
22
|
+
from ..api.storage_backend import Metadata, MetadataType, PathTemplate
|
|
23
|
+
from ..logger import logger
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class IntakePath(PathTemplate):
    """Class to interact with the Intake metadata catalogues."""

    # Intake catalogues are not bound to one file system type.
    _fs_type = None

    async def is_file(self, path: str | Path | pathlib.Path) -> bool:
        """Check if a given path is a file.

        Catalogue entries are always treated as file objects.
        """
        return True

    async def is_dir(self, path: str | Path | pathlib.Path) -> bool:
        """Check if a given path is a directory.

        Catalogue entries are never directories.
        """
        return False

    @staticmethod
    def _normalize_path(path: str) -> str:
        """Turn file:// URLs into OS paths; leave others as-is."""
        # The isinstance guard is defensive: catalogue entries may yield
        # non-string values even though the annotation says ``str``.
        if isinstance(path, str) and path.startswith("file://"):
            return unquote(urlparse(path).path)
        return path

    async def _walk_yaml_catalogue(
        self,
        cat: intake.catalog.Catalog,
    ) -> AsyncIterator[MetadataType]:
        """Recursively yield path metadata for every source in a YAML catalogue."""
        for name in cat:
            entry = cat[name]
            container = getattr(entry, "container", None)

            # Nested sub-catalogues are walked recursively.
            if container == "catalog":
                async for md in self._walk_yaml_catalogue(entry()):
                    yield md
                continue

            src = entry()
            # ``describe()`` exposes the entry's constructor arguments.
            meta = getattr(src, "_entry", src).describe() or {}
            args = meta.get("args", {})
            # The data path may live under several different keys depending
            # on the intake driver; try them in order of likelihood.
            urlpath = (
                args.get("urlpath")
                or args.get("path")
                or args.get("url")
                or meta.get("uri")
                or meta.get("file")
                or args.get("urlpaths")
            ) or []
            for raw_path in urlpath if isinstance(urlpath, list) else [urlpath]:
                path = self._normalize_path(raw_path)
                logger.debug("Found file %s", path)
                yield MetadataType(
                    path=path,
                    metadata=getattr(src, "metadata", meta.get("metadata", {})),
                )

    @staticmethod
    def _to_py(value: Any) -> Any:
        """Best-effort conversion of a pandas/numpy cell value to plain Python."""
        if isinstance(value, (float, int, bool, str, NoneType)):
            return value
        try:
            # numpy scalars and arrays expose ``tolist``.
            if hasattr(value, "tolist"):
                return value.tolist()
            if pd.isna(value):
                return None
        except Exception:
            # ``pd.isna`` can raise for array-likes; fall through to raw value.
            pass
        return value

    async def _walk_esm_catalogue(
        self,
        cat: intake.catalog.Catalog,
    ) -> AsyncIterator[MetadataType]:
        """Yield path metadata for every row of an ESM datastore catalogue."""
        df: pd.DataFrame = getattr(cat, "df", pd.DataFrame())
        cols = list(df.columns)
        for row in df.itertuples(index=False, name=None):
            meta: Dict[str, Any] = {k: self._to_py(v) for k, v in zip(cols, row)}
            # Same path-key fallback chain as for YAML catalogues.
            urlpath = (
                meta.get("urlpath")
                or meta.get("path")
                or meta.get("url")
                or meta.get("uri")
                or meta.get("file")
                or meta.get("urlpaths")
            ) or []
            for raw_path in urlpath if isinstance(urlpath, list) else [urlpath]:
                path = self._normalize_path(raw_path)
                logger.debug("Found file %s", path)
                yield MetadataType(path=path, metadata=meta)

    async def iterdir(
        self,
        path: Union[str, Path, pathlib.Path],
    ) -> AsyncIterator[str]:
        """Get all sub directories from a given path.

        Catalogues have no directory hierarchy, so the catalogue path itself
        is the only entry yielded.

        Parameter
        ---------
        path : str, asyncio.Path, pathlib.Path
            Path of the object store

        Yields
        ------
        str:
            1st level sub directory
        """
        yield str(path)

    def _is_esm_catalogue(self, path: str) -> bool:
        """Detect an intake-esm datastore: a JSON file mentioning "esmcat" early on."""
        if not self._normalize_path(path).endswith(".json"):
            return False
        esmcat = False
        fs = fsspec.get_filesystem_class(
            fsspec.core.split_protocol(path)[0] or "file"
        )(**self.storage_options)
        # NOTE(review): storage_options are passed to both the filesystem
        # constructor and ``open`` — confirm the backends accept them twice.
        with fs.open(path, mode="rb", **self.storage_options) as stream:
            # Scan only the first ~20 lines for the "esmcat" marker.
            num = 0
            for line in stream:
                if "esmcat" in line.decode("utf-8"):
                    esmcat = True
                    break
                if num > 19:
                    break
                num += 1
        return esmcat

    async def rglob(
        self, path: str | Path | pathlib.Path, glob_pattern: str = "*"
    ) -> AsyncIterator[MetadataType]:
        """Go through catalogue path."""
        path = str(path)
        if self._is_esm_catalogue(path):
            cat: intake.catalog.Catalog = intake.open_esm_datastore(
                path, **self.storage_options
            )
            func: Callable[[str], AsyncIterator[MetadataType]] = (
                self._walk_esm_catalogue
            )
        else:
            cat = intake.open_catalog(path, **self.storage_options)
            func = self._walk_yaml_catalogue

        # Keep only entries whose suffix is registered in ``self.suffixes``
        # and whose path matches the glob pattern.
        async for md in func(cat):
            if "." + md["path"].rpartition(".")[-1] in self.suffixes and fnmatch(
                md["path"], glob_pattern
            ):
                yield md

    def path(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the full path (including any schemas/netlocs).

        Parameters
        ----------
        path: str, asyncio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            URI of the object store
        """
        return str(path)

    def uri(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the uri of the object store.

        Parameters
        ----------
        path: str, asyncio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            URI of the object store
        """
        fs_type, path = fsspec.core.split_protocol(str(path))
        fs_type = fs_type or "file"
        return f"{fs_type}://{path}"

    def fs_type(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Define the file system type."""
        fs_type, _ = fsspec.core.split_protocol(str(path))
        return fs_type or "posix"

    async def walk(self, path: str) -> AsyncIterator[Metadata]:
        """Walk a catalogue, wrapping each entry in a :class:`Metadata` model."""
        async for md in self.rglob(path):
            yield Metadata(path=md["path"], metadata=md["metadata"])
|