metadata-crawler 2509.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metadata-crawler might be problematic. Click here for more details.

Files changed (34) hide show
  1. metadata_crawler/__init__.py +248 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +801 -0
  7. metadata_crawler/api/drs_config.toml +439 -0
  8. metadata_crawler/api/index.py +132 -0
  9. metadata_crawler/api/metadata_stores.py +749 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +136 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +539 -0
  22. metadata_crawler/data_collector.py +258 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +193 -0
  25. metadata_crawler/ingester/solr.py +152 -0
  26. metadata_crawler/logger.py +142 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +373 -0
  29. metadata_crawler/utils.py +411 -0
  30. metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
  31. metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
  32. metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
  33. metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
  34. metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,121 @@
1
+ """Interact with the a posix file system."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pathlib
6
+ from typing import AsyncIterator, Union
7
+
8
+ from anyio import Path
9
+
10
+ from ..api.storage_backend import MetadataType, PathTemplate
11
+
12
+
13
class PosixPath(PathTemplate):
    """Class to interact with a Posix file system."""

    _fs_type = "posix"

    async def is_dir(self, path: Union[str, Path, pathlib.Path]) -> bool:
        """Check if a given path is a directory object on the storage system.

        Parameters
        ----------
        path : str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        bool: True if path is dir object, False if otherwise or doesn't exist
        """
        return await Path(path).is_dir()

    async def is_file(self, path: Union[str, Path, pathlib.Path]) -> bool:
        """Check if a given path is a file object on the storage system.

        Parameters
        ----------
        path : str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        bool: True if path is file object, False if otherwise or doesn't exist
        """
        return await Path(path).is_file()

    async def iterdir(
        self, path: Union[str, Path, pathlib.Path]
    ) -> AsyncIterator[str]:
        """Get all 1st level entries of a given path.

        NOTE(review): ``Path.iterdir`` yields files as well as directories,
        so despite the original wording ("sub directories") every 1st level
        entry is yielded; behaviour kept as callers may rely on it.

        Parameters
        ----------
        path : str, anyio.Path, pathlib.Path
            Path of the object store

        Yields
        ------
        str: 1st level directory entry
        """
        try:
            async for entry in Path(path).iterdir():
                yield str(entry)
        except NotADirectoryError:
            # The path is a file: treat the path itself as the only entry.
            yield str(path)
        except FileNotFoundError:
            # Non-existing paths yield nothing.
            pass

    async def rglob(
        self, path: Union[str, Path, pathlib.Path], glob_pattern: str = "*"
    ) -> AsyncIterator[MetadataType]:
        """Search recursively for paths matching a given glob pattern.

        Parameters
        ----------
        path : str, anyio.Path, pathlib.Path
            Path of the object store
        glob_pattern: str
            Pattern that the target files must match

        Yields
        ------
        MetadataType: Path of the object store that matches the glob pattern.
        """
        p = Path(path)
        # A ``.zarr`` suffix marks a zarr store: a directory that is
        # treated as a single dataset rather than recursed into.
        if await self.is_file(p) or p.suffix == ".zarr":
            yield MetadataType(path=str(p), metadata={})
        else:
            async for candidate in p.rglob(glob_pattern):
                if candidate.suffix in self.suffixes:
                    yield MetadataType(path=str(candidate), metadata={})

    def path(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the full path (including any schemas/netlocs).

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            Absolute path on the file system
        """
        return str(pathlib.Path(path).absolute())

    def uri(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the uri of the object store.

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            ``file://`` URI of the object store
        """
        return f"file://{pathlib.Path(path).absolute()}"
@@ -0,0 +1,136 @@
1
+ """Interact with an S3 Object Store."""
2
+
3
+ import asyncio
4
+ import pathlib
5
+ from typing import AsyncIterator, Optional, Tuple, Union, cast
6
+
7
+ import fsspec
8
+ from anyio import Path
9
+ from s3fs import S3FileSystem
10
+
11
+ from ..api.storage_backend import MetadataType, PathTemplate
12
+ from ..logger import logger
13
+
14
+
15
class S3Path(PathTemplate):
    """Class to interact with an S3 object store."""

    _fs_type = "s3"

    def __post_init__(self) -> None:
        # Lazily created, event-loop-bound filesystem client.
        self._client: Optional[S3FileSystem] = None
        # Default to anonymous access when no credentials are configured.
        self.storage_options = self.storage_options or {"anon": True}

    async def close(self) -> None:
        """Close the connection."""
        # Only tear down a session that was actually opened; the original
        # code called _get_client() here, creating (and connecting) a new
        # client just to close it again when none existed yet.
        if self._client is not None:
            await self._client.s3.close()

    def get_fs_and_path(self, path: str) -> Tuple[fsspec.AbstractFileSystem, str]:
        """S3 implementation for returning (fs, path) suitable for xarray.

        Parameters
        ----------
        path:
            Path to the object store / file name

        Returns
        -------
        fsspec.AbstractFileSystem, str:
            The AbstractFileSystem class and the corresponding path to the
            data store.
        """
        return fsspec.filesystem("s3", **self.storage_options), path

    async def _get_client(self) -> S3FileSystem:
        # Create the filesystem on first use, bound to the running loop.
        if self._client is None:
            logger.debug(
                "Creating S3 Filesystem with storage_options: %s",
                self.storage_options,
            )
            loop = asyncio.get_running_loop()
            self._client = S3FileSystem(
                asynchronous=True, loop=loop, **self.storage_options
            )
            self._client._loop = loop

        await self._client.set_session()
        return self._client

    async def is_file(self, path: Union[str, Path, pathlib.Path]) -> bool:
        """Check if a given path is a file object on the storage system."""
        client = await self._get_client()
        return cast(bool, await client._isfile(str(path)))

    async def is_dir(self, path: Union[str, Path, pathlib.Path]) -> bool:
        """Check if a given path is a directory object on the storage system."""
        client = await self._get_client()
        return cast(bool, await client._isdir(str(path)))

    async def iterdir(
        self, path: Union[str, Path, pathlib.Path]
    ) -> AsyncIterator[str]:
        """Retrieve sub directories of a directory."""
        path = str(path)
        client = await self._get_client()
        for entry in await client._lsdir(path):
            # _lsdir lists both objects and prefixes; keep directories only.
            if entry.get("type", "") == "directory":
                yield f'{entry.get("name", "")}'

    async def rglob(
        self, path: Union[str, Path, pathlib.Path], glob_pattern: str = "*"
    ) -> AsyncIterator[MetadataType]:
        """Search recursively for files matching a ``glob_pattern``.

        NOTE(review): the implementation matches on ``self.suffixes`` and
        does not consult ``glob_pattern`` — confirm whether the '|'-separated
        pattern support described below is still intended.

        Parameters
        ----------
        path: str
            A resource composed by:
            - bucket, 'bucketname'
            - prefix, 'prefix/to/a/path'
            E.g.: '/bucketname/prefix/to/objects'
            Will be translated into a request to
            `self.url`+`/bucketname?prefix="prefix/to/objects`
        glob_pattern: str
            A string representing several glob patterns, separated by '|'
            E.g.: '*.zarr|*.nc|*.hdf5'
        """
        client = await self._get_client()
        if await self.is_file(path):
            yield MetadataType(path=str(path), metadata={})
        else:
            for suffix in self.suffixes:
                for content in await client._glob(f"{path}/**/*{suffix}"):
                    yield MetadataType(path=f"/{content}", metadata={})

    def path(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the full path (including any schemas/netlocs).

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            URI of the object store
        """
        return cast(
            str, fsspec.filesystem("s3", **self.storage_options).url(str(path))
        )

    def uri(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the uri of the object store.

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            URI of the object store
        """
        return self.path(path)
@@ -0,0 +1,305 @@
1
+ """Interact with the OpenStack swift cloud."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import pathlib
7
+ from fnmatch import fnmatch
8
+ from typing import AsyncIterator, Dict, List, Optional, Tuple, Union, cast
9
+ from urllib.parse import SplitResult, urljoin, urlsplit, urlunparse
10
+
11
+ import aiohttp
12
+ import fsspec
13
+ from anyio import Path
14
+
15
+ from ..api.storage_backend import MetadataType, PathTemplate
16
+
17
+
18
+ def _basename(key: str) -> str:
19
+ return pathlib.PosixPath(key[:-1] if key.endswith("/") else key).name
20
+
21
+
22
class SwiftPath(PathTemplate):
    """Class to interact with the OpenStack swift cloud storage system."""

    _fs_type = "swift"

    def __post_init__(self) -> None:
        # All connection settings come from storage_options, with fallbacks
        # to the credentials held by the PathTemplate base class.
        self.storage_options = self.storage_options or {}
        self.os_password = self.storage_options.get("os_password", self._pw)
        self.os_user_id = self.storage_options.get("os_user_id", self._user)
        self.os_project_id = self.storage_options.get("os_project_id")
        self.os_auth_token = self.storage_options.get("os_auth_token") or None
        self._os_storage_url = self.storage_options.get(
            "os_storage_url", ""
        ).rstrip("/")
        self.os_auth_url = self.storage_options.get(
            "os_auth_url", self._guess_tempauth_url(self._os_storage_url)
        )
        # The container defaults to the last path component of the storage
        # url and is then stripped off of it.
        self._container = self.storage_options.get(
            "container", self._os_storage_url.split("/")[-1]
        ).rstrip("/")
        self._os_storage_url = self._os_storage_url.removesuffix(self._container)
        # Cached by the url_split property.
        self._url_split: Optional[SplitResult] = None

    @staticmethod
    def _guess_tempauth_url(storage_url: str) -> str:
        """Construct the swift auth url.

        Heuristic: For TempAuth, switch '/v1/...' to '/auth/v1.0' on the
        same host:port.  Returns an empty string if ``storage_url`` doesn't
        look like a Swift v1 endpoint.
        """
        parts = urlsplit(storage_url)
        # Typical Swift proxy paths: '/v1/...' or '/swift/v1/...'
        if not (parts.path.startswith("/v1/") or parts.path.startswith("/swift/v1/")):
            return ""
        # Use same scheme+netloc, set path to /auth/v1.0
        return urlunparse((parts.scheme, parts.netloc, "/auth/v1.0", "", "", ""))

    @property
    def storage_path(self) -> str:
        """Path part of the storage url."""
        split = self.url_split
        return "/" + split.path.lstrip("/").rstrip("/")

    @property
    def url_split(self) -> SplitResult:
        """Retrieve the split parts of the storage url (cached)."""
        if self._url_split is not None:
            return self._url_split
        if not self._os_storage_url:
            raise RuntimeError("os_storage_url must be set")
        storage_url = self._os_storage_url.removesuffix(self._container)
        self._url_split = urlsplit(urljoin(storage_url, self._container))
        return self._url_split

    @property
    def _anon(self) -> bool:
        """Decide if we can logon at all."""
        return False if self.os_password or self.headers else True

    async def logon(self) -> None:
        """Logon to the swift system if necessary.

        Raises
        ------
        ValueError: if the auth endpoint does not answer with HTTP 200.
        """
        headers = {
            "X-Auth-User": f"{self.os_project_id}:{self.os_user_id}",
            "X-Auth-Key": self.os_password,
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(self.os_auth_url, headers=headers) as res:
                if res.status != 200:
                    raise ValueError(f"Logon to {self.os_auth_url} failed")
                self.os_auth_token = res.headers["X-Auth-Token"]

    def _is_zarr_like_match(self, key: str, glob_pattern: str) -> bool:
        # A directory-like key ending in '.zarr' is a zarr store: treat it
        # as a single dataset if zarr suffixes are configured and the name
        # matches the glob pattern.
        key_l = key.lower()
        base = _basename(key)
        if key_l.endswith(".zarr") or key_l.endswith(".zarr/"):
            if ".zarr" in self.suffixes and fnmatch(base, glob_pattern):
                return True
        return False

    async def _url_fragments(self, url: str) -> Tuple[str, str]:
        # Split a (possibly relative) url into the request head
        # (scheme://host/version/account/container) and the object prefix.
        url_split = urlsplit(url)
        url_path = (
            ("/" + url_split.path.lstrip("/"))
            .removeprefix(self.storage_path)
            .rstrip("/")
            .lstrip("/")
        )

        parsed_url = SplitResult(
            url_split.scheme or self.url_split.scheme,
            url_split.netloc or self.url_split.netloc,
            f"{self.storage_path}/{url_path}",
            url_split.query,
            url_split.fragment,
        )
        # First three path parts form the API endpoint, the rest the prefix.
        _path = pathlib.PosixPath(parsed_url.path).parts[1:]
        url_prefix = "/".join(_path[:3])
        prefix = "/".join(_path[3:])
        if prefix:
            prefix += "/"
        url_head = f"{parsed_url.scheme}://{parsed_url.netloc}/{url_prefix}"
        return url_head, prefix

    async def _read_json(
        self, path: str, delimiter: Optional[str] = "/"
    ) -> List[Dict[str, str]]:
        """Query the swift listing API for ``path`` and return parsed JSON.

        Raises
        ------
        PermissionError: on HTTP 403 or when authentication keeps failing.
        FileNotFoundError: on HTTP 404.
        RuntimeError: on any other non-success status.
        """
        url, prefix = await self._url_fragments(path)
        suffix = f"?format=json&prefix={prefix}"
        if delimiter:
            suffix += f"&delimiter={delimiter}"
        else:
            suffix = suffix.rstrip("/")
        url = f"{url}{suffix}"
        errors = {
            403: PermissionError(f"Permission denied for {path}"),
            404: FileNotFoundError(f"No such file or directory {path}"),
        }
        async with aiohttp.ClientSession() as session:
            # Try twice: a 401 on the first attempt triggers a (re-)logon.
            for _ in range(2):
                async with session.get(url, headers=self.headers) as res:
                    if res.status < 300:
                        return cast(list[dict[str, str]], await res.json())
                    if res.status == 401:
                        await self.logon()
                        continue
                    raise errors.get(
                        res.status, RuntimeError(f"Failed to query {path}")
                    )
        # Both attempts answered 401: the original code fell off the loop
        # and implicitly returned None, crashing callers with a TypeError.
        raise PermissionError(f"Authentication failed for {path}")

    def _get_dir_from_path(self, data: dict[str, str]) -> str | None:
        # Listing entries are directories when they carry a 'subdir' key or
        # are marked with the 'application/directory' content type.
        if (
            data.get("subdir")
            or data.get("content_type", "") == "application/directory"
        ):
            return data.get("subdir") or data.get("name")
        return None

    @property
    def headers(self) -> dict[str, str]:
        """Define the headers used to interact with swift."""
        if self.os_auth_token is None:
            return {}
        return {"X-Auth-Token": self.os_auth_token}

    async def is_file(self, path: str | Path | pathlib.Path) -> bool:
        """Check if a given path is a file object on the storage system."""
        try:
            data = (await self._read_json(str(path)))[0]
        except (FileNotFoundError, IndexError):
            return False
        return self._get_dir_from_path(data) is None

    async def is_dir(self, path: str | Path | pathlib.Path) -> bool:
        """Check if a given path is a directory object on the storage system."""
        try:
            data = (await self._read_json(str(path)))[0]
        except (FileNotFoundError, IndexError):
            return False
        return self._get_dir_from_path(data) is not None

    async def iterdir(
        self, path: Union[str, Path, pathlib.Path]
    ) -> AsyncIterator[str]:
        """Get all sub directories of a directory."""
        try:
            for data in await self._read_json(str(path)):
                new_path = self._get_dir_from_path(data)
                if new_path:
                    out = (
                        str(path).lstrip("/")
                        + "/"
                        + pathlib.PosixPath(new_path).name
                    )
                    yield out
        except (FileNotFoundError, PermissionError):
            # Missing or forbidden directories simply yield nothing.
            pass

    async def rglob(
        self,
        path: Union[str, Path, pathlib.Path],
        glob_pattern: str = "*",
    ) -> AsyncIterator[MetadataType]:
        """Search recursively for files matching a glob_pattern."""
        # Use a delimiter only for directories so that sub directories show
        # up as 'subdir' entries and can be recursed into.
        delimiter: Optional[str] = None
        if await self.is_dir(path):
            delimiter = "/"
        for data in await self._read_json(str(path), delimiter=delimiter):
            name = data.get("name")
            dir_name = self._get_dir_from_path(data)
            if dir_name:
                # An object named foo.zarr is treated as a zarr store, not
                # recursed into.
                if self._is_zarr_like_match(dir_name, glob_pattern):
                    yield MetadataType(path=dir_name.rstrip("/"), metadata={})
                else:
                    async for md in self.rglob(dir_name, glob_pattern):
                        yield md
            elif name:
                if pathlib.PosixPath(name).suffix in self.suffixes and fnmatch(
                    name, glob_pattern
                ):
                    yield MetadataType(path=name, metadata={})

    def get_fs_and_path(self, uri: str) -> Tuple[fsspec.AbstractFileSystem, str]:
        """Return (fs, path) suitable for xarray.

        Parameters
        ----------
        uri:
            Path to the object store / file name

        Returns
        -------
        fsspec.AbstractFileSystem, str:
            The AbstractFileSystem class and the corresponding path to the
            data store.
        """
        url_split = urlsplit(uri)
        url_path = (
            ("/" + url_split.path.lstrip("/"))
            .removeprefix(self.storage_path)
            .rstrip("/")
            .lstrip("/")
        )
        url = SplitResult(
            url_split.scheme or self.url_split.scheme,
            url_split.netloc or self.url_split.netloc,
            f"{self.storage_path}/{url_path}",
            url_split.query,
            url_split.fragment,
        ).geturl()
        if not self._anon:
            asyncio.run(self.logon())
        return (
            fsspec.filesystem(
                "http",
                headers=self.headers,
                block_size=2**20,
            ),
            url,
        )

    def path(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the full path (including any schemas/netlocs).

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            URI of the object store
        """
        url_split = urlsplit(str(path))
        if not url_split.netloc:
            # Relative paths get anchored below the configured storage path.
            path = f"{self.url_split.path}/{url_split.path}"
        else:
            path = url_split.path

        res = SplitResult(
            url_split.scheme or self.url_split.scheme,
            url_split.netloc or self.url_split.netloc,
            path,
            url_split.query,
            url_split.fragment,
        ).geturl()
        return res

    def uri(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the uri of the object store.

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            URI of the object store
        """
        return self.path(path)