metadata-crawler 2509.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metadata-crawler might be problematic. Click here for more details.

Files changed (34) hide show
  1. metadata_crawler/__init__.py +248 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +801 -0
  7. metadata_crawler/api/drs_config.toml +439 -0
  8. metadata_crawler/api/index.py +132 -0
  9. metadata_crawler/api/metadata_stores.py +749 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +136 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +539 -0
  22. metadata_crawler/data_collector.py +258 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +193 -0
  25. metadata_crawler/ingester/solr.py +152 -0
  26. metadata_crawler/logger.py +142 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +373 -0
  29. metadata_crawler/utils.py +411 -0
  30. metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
  31. metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
  32. metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
  33. metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
  34. metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,121 @@
1
+ """Interact with the a posix file system."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pathlib
6
+ from typing import AsyncIterator, Union
7
+
8
+ from anyio import Path
9
+
10
+ from ..api.storage_backend import MetadataType, PathTemplate
11
+
12
+
13
class PosixPath(PathTemplate):
    """Class to interact with a Posix file system."""

    _fs_type = "posix"

    async def is_dir(self, path: Union[str, Path, pathlib.Path]) -> bool:
        """Check if a given path is a directory object on the storage system.

        Parameters
        ----------
        path : str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        bool: True if path is dir object, False if otherwise or doesn't exist
        """
        return await Path(path).is_dir()

    async def is_file(self, path: Union[str, Path, pathlib.Path]) -> bool:
        """Check if a given path is a file object on the storage system.

        Parameters
        ----------
        path : str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        bool: True if path is file object, False if otherwise or doesn't exist
        """
        return await Path(path).is_file()

    async def iterdir(
        self, path: Union[str, Path, pathlib.Path]
    ) -> AsyncIterator[str]:
        """Get all 1st level entries of a given path.

        NOTE(review): ``Path.iterdir`` yields files as well as directories,
        so despite the original wording ("sub directories") every 1st level
        entry is yielded; behaviour kept as callers may rely on it.

        Parameters
        ----------
        path : str, anyio.Path, pathlib.Path
            Path of the object store

        Yields
        ------
        str: 1st level directory entry
        """
        try:
            async for entry in Path(path).iterdir():
                yield str(entry)
        except NotADirectoryError:
            # The path is a file: treat the path itself as the only entry.
            yield str(path)
        except FileNotFoundError:
            # Non-existing paths yield nothing.
            pass

    async def rglob(
        self, path: Union[str, Path, pathlib.Path], glob_pattern: str = "*"
    ) -> AsyncIterator[MetadataType]:
        """Search recursively for paths matching a given glob pattern.

        Parameters
        ----------
        path : str, anyio.Path, pathlib.Path
            Path of the object store
        glob_pattern: str
            Pattern that the target files must match

        Yields
        ------
        MetadataType: Path of the object store that matches the glob pattern.
        """
        p = Path(path)
        # A ``.zarr`` suffix marks a zarr store: a directory that is
        # treated as a single dataset rather than recursed into.
        if await self.is_file(p) or p.suffix == ".zarr":
            yield MetadataType(path=str(p), metadata={})
        else:
            async for candidate in p.rglob(glob_pattern):
                if candidate.suffix in self.suffixes:
                    yield MetadataType(path=str(candidate), metadata={})

    def path(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the full path (including any schemas/netlocs).

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            Absolute path on the file system
        """
        return str(pathlib.Path(path).absolute())

    def uri(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the uri of the object store.

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            ``file://`` URI of the object store
        """
        return f"file://{pathlib.Path(path).absolute()}"
@@ -0,0 +1,136 @@
1
+ """Interact with an S3 Object Store."""
2
+
3
+ import asyncio
4
+ import pathlib
5
+ from typing import AsyncIterator, Optional, Tuple, Union, cast
6
+
7
+ import fsspec
8
+ from anyio import Path
9
+ from s3fs import S3FileSystem
10
+
11
+ from ..api.storage_backend import MetadataType, PathTemplate
12
+ from ..logger import logger
13
+
14
+
15
class S3Path(PathTemplate):
    """Class to interact with an S3 object store."""

    _fs_type = "s3"

    def __post_init__(self) -> None:
        # Lazily created, event-loop-bound filesystem client.
        self._client: Optional[S3FileSystem] = None
        # Default to anonymous access when no credentials are configured.
        self.storage_options = self.storage_options or {"anon": True}

    async def close(self) -> None:
        """Close the connection."""
        # Only tear down a session that was actually opened; the original
        # code called _get_client() here, creating (and connecting) a new
        # client just to close it again when none existed yet.
        if self._client is not None:
            await self._client.s3.close()

    def get_fs_and_path(self, path: str) -> Tuple[fsspec.AbstractFileSystem, str]:
        """S3 implementation for returning (fs, path) suitable for xarray.

        Parameters
        ----------
        path:
            Path to the object store / file name

        Returns
        -------
        fsspec.AbstractFileSystem, str:
            The AbstractFileSystem class and the corresponding path to the
            data store.
        """
        return fsspec.filesystem("s3", **self.storage_options), path

    async def _get_client(self) -> S3FileSystem:
        # Create the filesystem on first use, bound to the running loop.
        if self._client is None:
            logger.debug(
                "Creating S3 Filesystem with storage_options: %s",
                self.storage_options,
            )
            loop = asyncio.get_running_loop()
            self._client = S3FileSystem(
                asynchronous=True, loop=loop, **self.storage_options
            )
            self._client._loop = loop

        await self._client.set_session()
        return self._client

    async def is_file(self, path: Union[str, Path, pathlib.Path]) -> bool:
        """Check if a given path is a file object on the storage system."""
        client = await self._get_client()
        return cast(bool, await client._isfile(str(path)))

    async def is_dir(self, path: Union[str, Path, pathlib.Path]) -> bool:
        """Check if a given path is a directory object on the storage system."""
        client = await self._get_client()
        return cast(bool, await client._isdir(str(path)))

    async def iterdir(
        self, path: Union[str, Path, pathlib.Path]
    ) -> AsyncIterator[str]:
        """Retrieve sub directories of a directory."""
        path = str(path)
        client = await self._get_client()
        for entry in await client._lsdir(path):
            # _lsdir lists both objects and prefixes; keep directories only.
            if entry.get("type", "") == "directory":
                yield f'{entry.get("name", "")}'

    async def rglob(
        self, path: Union[str, Path, pathlib.Path], glob_pattern: str = "*"
    ) -> AsyncIterator[MetadataType]:
        """Search recursively for files matching a ``glob_pattern``.

        NOTE(review): the implementation matches on ``self.suffixes`` and
        does not consult ``glob_pattern`` — confirm whether the '|'-separated
        pattern support described below is still intended.

        Parameters
        ----------
        path: str
            A resource composed by:
            - bucket, 'bucketname'
            - prefix, 'prefix/to/a/path'
            E.g.: '/bucketname/prefix/to/objects'
            Will be translated into a request to
            `self.url`+`/bucketname?prefix="prefix/to/objects`
        glob_pattern: str
            A string representing several glob patterns, separated by '|'
            E.g.: '*.zarr|*.nc|*.hdf5'
        """
        client = await self._get_client()
        if await self.is_file(path):
            yield MetadataType(path=str(path), metadata={})
        else:
            for suffix in self.suffixes:
                for content in await client._glob(f"{path}/**/*{suffix}"):
                    yield MetadataType(path=f"/{content}", metadata={})

    def path(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the full path (including any schemas/netlocs).

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            URI of the object store
        """
        return cast(
            str, fsspec.filesystem("s3", **self.storage_options).url(str(path))
        )

    def uri(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the uri of the object store.

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            URI of the object store
        """
        return self.path(path)
@@ -0,0 +1,305 @@
1
+ """Interact with the OpenStack swift cloud."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import pathlib
7
+ from fnmatch import fnmatch
8
+ from typing import AsyncIterator, Dict, List, Optional, Tuple, Union, cast
9
+ from urllib.parse import SplitResult, urljoin, urlsplit, urlunparse
10
+
11
+ import aiohttp
12
+ import fsspec
13
+ from anyio import Path
14
+
15
+ from ..api.storage_backend import MetadataType, PathTemplate
16
+
17
+
18
+ def _basename(key: str) -> str:
19
+ return pathlib.PosixPath(key[:-1] if key.endswith("/") else key).name
20
+
21
+
22
class SwiftPath(PathTemplate):
    """Class to interact with the OpenStack swift cloud storage system."""

    _fs_type = "swift"

    def __post_init__(self) -> None:
        # All connection settings come from storage_options, with fallbacks
        # to the credentials held by the PathTemplate base class.
        self.storage_options = self.storage_options or {}
        self.os_password = self.storage_options.get("os_password", self._pw)
        self.os_user_id = self.storage_options.get("os_user_id", self._user)
        self.os_project_id = self.storage_options.get("os_project_id")
        self.os_auth_token = self.storage_options.get("os_auth_token") or None
        self._os_storage_url = self.storage_options.get(
            "os_storage_url", ""
        ).rstrip("/")
        self.os_auth_url = self.storage_options.get(
            "os_auth_url", self._guess_tempauth_url(self._os_storage_url)
        )
        # The container defaults to the last path component of the storage
        # url and is then stripped off of it.
        self._container = self.storage_options.get(
            "container", self._os_storage_url.split("/")[-1]
        ).rstrip("/")
        self._os_storage_url = self._os_storage_url.removesuffix(self._container)
        # Cached by the url_split property.
        self._url_split: Optional[SplitResult] = None

    @staticmethod
    def _guess_tempauth_url(storage_url: str) -> str:
        """Construct the swift auth url.

        Heuristic: For TempAuth, switch '/v1/...' to '/auth/v1.0' on the
        same host:port.  Returns an empty string if ``storage_url`` doesn't
        look like a Swift v1 endpoint.
        """
        parts = urlsplit(storage_url)
        # Typical Swift proxy paths: '/v1/...' or '/swift/v1/...'
        if not (parts.path.startswith("/v1/") or parts.path.startswith("/swift/v1/")):
            return ""
        # Use same scheme+netloc, set path to /auth/v1.0
        return urlunparse((parts.scheme, parts.netloc, "/auth/v1.0", "", "", ""))

    @property
    def storage_path(self) -> str:
        """Path part of the storage url."""
        split = self.url_split
        return "/" + split.path.lstrip("/").rstrip("/")

    @property
    def url_split(self) -> SplitResult:
        """Retrieve the split parts of the storage url (cached)."""
        if self._url_split is not None:
            return self._url_split
        if not self._os_storage_url:
            raise RuntimeError("os_storage_url must be set")
        storage_url = self._os_storage_url.removesuffix(self._container)
        self._url_split = urlsplit(urljoin(storage_url, self._container))
        return self._url_split

    @property
    def _anon(self) -> bool:
        """Decide if we can logon at all."""
        return False if self.os_password or self.headers else True

    async def logon(self) -> None:
        """Logon to the swift system if necessary.

        Raises
        ------
        ValueError: if the auth endpoint does not answer with HTTP 200.
        """
        headers = {
            "X-Auth-User": f"{self.os_project_id}:{self.os_user_id}",
            "X-Auth-Key": self.os_password,
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(self.os_auth_url, headers=headers) as res:
                if res.status != 200:
                    raise ValueError(f"Logon to {self.os_auth_url} failed")
                self.os_auth_token = res.headers["X-Auth-Token"]

    def _is_zarr_like_match(self, key: str, glob_pattern: str) -> bool:
        # A directory-like key ending in '.zarr' is a zarr store: treat it
        # as a single dataset if zarr suffixes are configured and the name
        # matches the glob pattern.
        key_l = key.lower()
        base = _basename(key)
        if key_l.endswith(".zarr") or key_l.endswith(".zarr/"):
            if ".zarr" in self.suffixes and fnmatch(base, glob_pattern):
                return True
        return False

    async def _url_fragments(self, url: str) -> Tuple[str, str]:
        # Split a (possibly relative) url into the request head
        # (scheme://host/version/account/container) and the object prefix.
        url_split = urlsplit(url)
        url_path = (
            ("/" + url_split.path.lstrip("/"))
            .removeprefix(self.storage_path)
            .rstrip("/")
            .lstrip("/")
        )

        parsed_url = SplitResult(
            url_split.scheme or self.url_split.scheme,
            url_split.netloc or self.url_split.netloc,
            f"{self.storage_path}/{url_path}",
            url_split.query,
            url_split.fragment,
        )
        # First three path parts form the API endpoint, the rest the prefix.
        _path = pathlib.PosixPath(parsed_url.path).parts[1:]
        url_prefix = "/".join(_path[:3])
        prefix = "/".join(_path[3:])
        if prefix:
            prefix += "/"
        url_head = f"{parsed_url.scheme}://{parsed_url.netloc}/{url_prefix}"
        return url_head, prefix

    async def _read_json(
        self, path: str, delimiter: Optional[str] = "/"
    ) -> List[Dict[str, str]]:
        """Query the swift listing API for ``path`` and return parsed JSON.

        Raises
        ------
        PermissionError: on HTTP 403 or when authentication keeps failing.
        FileNotFoundError: on HTTP 404.
        RuntimeError: on any other non-success status.
        """
        url, prefix = await self._url_fragments(path)
        suffix = f"?format=json&prefix={prefix}"
        if delimiter:
            suffix += f"&delimiter={delimiter}"
        else:
            suffix = suffix.rstrip("/")
        url = f"{url}{suffix}"
        errors = {
            403: PermissionError(f"Permission denied for {path}"),
            404: FileNotFoundError(f"No such file or directory {path}"),
        }
        async with aiohttp.ClientSession() as session:
            # Try twice: a 401 on the first attempt triggers a (re-)logon.
            for _ in range(2):
                async with session.get(url, headers=self.headers) as res:
                    if res.status < 300:
                        return cast(list[dict[str, str]], await res.json())
                    if res.status == 401:
                        await self.logon()
                        continue
                    raise errors.get(
                        res.status, RuntimeError(f"Failed to query {path}")
                    )
        # Both attempts answered 401: the original code fell off the loop
        # and implicitly returned None, crashing callers with a TypeError.
        raise PermissionError(f"Authentication failed for {path}")

    def _get_dir_from_path(self, data: dict[str, str]) -> str | None:
        # Listing entries are directories when they carry a 'subdir' key or
        # are marked with the 'application/directory' content type.
        if (
            data.get("subdir")
            or data.get("content_type", "") == "application/directory"
        ):
            return data.get("subdir") or data.get("name")
        return None

    @property
    def headers(self) -> dict[str, str]:
        """Define the headers used to interact with swift."""
        if self.os_auth_token is None:
            return {}
        return {"X-Auth-Token": self.os_auth_token}

    async def is_file(self, path: str | Path | pathlib.Path) -> bool:
        """Check if a given path is a file object on the storage system."""
        try:
            data = (await self._read_json(str(path)))[0]
        except (FileNotFoundError, IndexError):
            return False
        return self._get_dir_from_path(data) is None

    async def is_dir(self, path: str | Path | pathlib.Path) -> bool:
        """Check if a given path is a directory object on the storage system."""
        try:
            data = (await self._read_json(str(path)))[0]
        except (FileNotFoundError, IndexError):
            return False
        return self._get_dir_from_path(data) is not None

    async def iterdir(
        self, path: Union[str, Path, pathlib.Path]
    ) -> AsyncIterator[str]:
        """Get all sub directories of a directory."""
        try:
            for data in await self._read_json(str(path)):
                new_path = self._get_dir_from_path(data)
                if new_path:
                    out = (
                        str(path).lstrip("/")
                        + "/"
                        + pathlib.PosixPath(new_path).name
                    )
                    yield out
        except (FileNotFoundError, PermissionError):
            # Missing or forbidden directories simply yield nothing.
            pass

    async def rglob(
        self,
        path: Union[str, Path, pathlib.Path],
        glob_pattern: str = "*",
    ) -> AsyncIterator[MetadataType]:
        """Search recursively for files matching a glob_pattern."""
        # Use a delimiter only for directories so that sub directories show
        # up as 'subdir' entries and can be recursed into.
        delimiter: Optional[str] = None
        if await self.is_dir(path):
            delimiter = "/"
        for data in await self._read_json(str(path), delimiter=delimiter):
            name = data.get("name")
            dir_name = self._get_dir_from_path(data)
            if dir_name:
                # An object named foo.zarr is treated as a zarr store, not
                # recursed into.
                if self._is_zarr_like_match(dir_name, glob_pattern):
                    yield MetadataType(path=dir_name.rstrip("/"), metadata={})
                else:
                    async for md in self.rglob(dir_name, glob_pattern):
                        yield md
            elif name:
                if pathlib.PosixPath(name).suffix in self.suffixes and fnmatch(
                    name, glob_pattern
                ):
                    yield MetadataType(path=name, metadata={})

    def get_fs_and_path(self, uri: str) -> Tuple[fsspec.AbstractFileSystem, str]:
        """Return (fs, path) suitable for xarray.

        Parameters
        ----------
        uri:
            Path to the object store / file name

        Returns
        -------
        fsspec.AbstractFileSystem, str:
            The AbstractFileSystem class and the corresponding path to the
            data store.
        """
        url_split = urlsplit(uri)
        url_path = (
            ("/" + url_split.path.lstrip("/"))
            .removeprefix(self.storage_path)
            .rstrip("/")
            .lstrip("/")
        )
        url = SplitResult(
            url_split.scheme or self.url_split.scheme,
            url_split.netloc or self.url_split.netloc,
            f"{self.storage_path}/{url_path}",
            url_split.query,
            url_split.fragment,
        ).geturl()
        if not self._anon:
            asyncio.run(self.logon())
        return (
            fsspec.filesystem(
                "http",
                headers=self.headers,
                block_size=2**20,
            ),
            url,
        )

    def path(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the full path (including any schemas/netlocs).

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            URI of the object store
        """
        url_split = urlsplit(str(path))
        if not url_split.netloc:
            # Relative paths get anchored below the configured storage path.
            path = f"{self.url_split.path}/{url_split.path}"
        else:
            path = url_split.path

        res = SplitResult(
            url_split.scheme or self.url_split.scheme,
            url_split.netloc or self.url_split.netloc,
            path,
            url_split.query,
            url_split.fragment,
        ).geturl()
        return res

    def uri(self, path: Union[str, Path, pathlib.Path]) -> str:
        """Get the uri of the object store.

        Parameters
        ----------
        path: str, anyio.Path, pathlib.Path
            Path of the object store

        Returns
        -------
        str:
            URI of the object store
        """
        return self.path(path)