metadata-crawler 2510.1.0 (py3-none-any.whl)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of metadata-crawler might be problematic.

Files changed (35)
  1. metadata_crawler/__init__.py +263 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +831 -0
  7. metadata_crawler/api/drs_config.toml +440 -0
  8. metadata_crawler/api/index.py +151 -0
  9. metadata_crawler/api/metadata_stores.py +755 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +140 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +547 -0
  22. metadata_crawler/data_collector.py +278 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +206 -0
  25. metadata_crawler/ingester/solr.py +282 -0
  26. metadata_crawler/logger.py +153 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +419 -0
  29. metadata_crawler/utils/__init__.py +482 -0
  30. metadata_crawler/utils/cftime_utils.py +207 -0
  31. metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
  32. metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
  33. metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
  34. metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
  35. metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/data_collector.py
@@ -0,0 +1,278 @@
+"""Gather metadata and add it to a temporary metadata store."""
+
+from __future__ import annotations
+
+import asyncio
+import os
+from multiprocessing import Event, Value
+from pathlib import Path
+from types import TracebackType
+from typing import (
+    Any,
+    AsyncIterator,
+    Callable,
+    Coroutine,
+    Dict,
+    Iterator,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    cast,
+)
+
+import tomlkit
+
+from .api.config import CrawlerSettings, DRSConfig
+from .api.metadata_stores import CatalogueWriter, IndexName
+from .api.storage_backend import PathTemplate
+from .logger import logger
+from .utils import (
+    Counter,
+    MetadataCrawlerException,
+    create_async_iterator,
+    print_performance,
+)
+
+ScanItem = Tuple[str, str, bool, bool]
+
+
+class DataCollector:
+    """Collect file objects from a given directory object and search for files.
+
+    Parameters
+    ----------
+    config_file:
+        Path to the drs-config file / loaded configuration.
+    *search_objects:
+        Paths of the search directories, e.g. the `root_path` attribute in the drs-config.
+    uri: str
+        The URI of the metadata store.
+    password: str
+        Password for the ingestion.
+    batch_size: int
+        Batch size for the ingestion.
+    """
+
+    def __init__(
+        self,
+        config_file: Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument],
+        metadata_store: Optional[
+            Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
+        ],
+        index_name: IndexName,
+        *search_objects: CrawlerSettings,
+        **kwargs: Any,
+    ):
+        self._search_objects = search_objects
+        if not search_objects:
+            raise MetadataCrawlerException("You have to give search directories")
+        self._num_files: Counter = Value("i", 0)
+        self.index_name = index_name
+        self.config = DRSConfig.load(config_file)
+        kwargs.setdefault("scan_concurrency", os.getenv("SCAN_CONCURRENCY", "64"))
+        self._scan_concurrency: int = int(kwargs.pop("scan_concurrency", 64))
+        self._scan_queue: asyncio.Queue[Optional[ScanItem]] = asyncio.Queue(
+            maxsize=int(kwargs.pop("scan_queue_size", 10_000))
+        )
+        self._print_status = Event()
+        self.ingest_queue = CatalogueWriter(
+            str(metadata_store or "metadata.yaml"),
+            index_name=index_name,
+            config=config_file,
+            **kwargs,
+        )
+        self.ingest_queue.run_consumer()
+        self._max_files = int(cast(str, os.getenv("MDC_MAX_FILES", "-1")))
+
+    @property
+    def crawled_files(
+        self,
+    ) -> int:
+        """Get the total number of crawled files."""
+        return self._num_files.value
+
+    @property
+    def ingested_objects(self) -> int:
+        """Get the number of ingested objects."""
+        return self.ingest_queue.ingested_objects
+
+    @property
+    def search_objects(self) -> Iterator[tuple[str, str]]:
+        """Iterate over the search directories."""
+        for cfg in self._search_objects:
+            yield cfg.name, str(cfg.search_path)
+
+    async def __aenter__(self) -> "DataCollector":
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: TracebackType,
+    ) -> None:
+        self._print_status.clear()
+        self.ingest_queue.join_all_tasks()
+        await self.ingest_queue.close()
+
+        async def _safe_close(b: PathTemplate) -> None:
+            try:
+                await asyncio.wait_for(b.close(), timeout=3)
+            except Exception:
+                pass
+
+        await asyncio.gather(
+            *[_safe_close(ds.backend) for ds in self.config.datasets.values()],
+            return_exceptions=True,
+        )
+
+    def _test_env(self) -> bool:
+        return (
+            True
+            if self._max_files > 0 and self._max_files < self.crawled_files
+            else False
+        )
+
+    async def _ingest_dir(
+        self,
+        drs_type: str,
+        search_dir: str,
+        iterable: bool = True,
+        is_versioned: bool = True,
+    ) -> None:
+        if iterable:
+            try:
+                sub_dirs = self.config.datasets[drs_type].backend.iterdir(
+                    search_dir
+                )
+            except Exception as error:
+                logger.error(error)
+                return
+        else:
+            sub_dirs = cast(
+                AsyncIterator[str], create_async_iterator([search_dir])
+            )
+        rank = 0
+        async for _dir in sub_dirs:
+            async for _inp in self.config.datasets[drs_type].backend.rglob(
+                _dir, self.config.datasets[drs_type].glob_pattern
+            ):
+                if self._test_env():
+                    return
+                await self.ingest_queue.put(
+                    _inp, drs_type, name=self.index_name.all
+                )
+                if rank == 0 or is_versioned is False:
+                    await self.ingest_queue.put(
+                        _inp, drs_type, name=self.index_name.latest
+                    )
+                self._num_files.value += 1
+            rank += 1
+        return None
+
+    async def _scan_worker(self) -> None:
+        """Drain _scan_queue and run _ingest_dir concurrently (bounded pool)."""
+        while True:
+            item = await self._scan_queue.get()  # blocks
+            if item is None:  # sentinel -> exit
+                # do not task_done() for sentinel
+                break
+            drs_type, path, iterable, is_versioned = item
+            try:
+                await self._ingest_dir(
+                    drs_type, path, iterable=iterable, is_versioned=is_versioned
+                )
+            except Exception as error:
+                logger.error(error)
+            finally:
+                self._scan_queue.task_done()
+
+    async def _iter_content(
+        self,
+        drs_type: str,
+        inp_dir: str,
+        pos: int = 0,
+        is_versioned: bool = True,
+    ) -> None:
+        """Walk recursively until files or the version level is reached."""
+        store = self.config.datasets[drs_type].backend
+        if self._test_env():
+            return
+        try:
+            is_file, iterable, suffix = await asyncio.gather(
+                store.is_file(inp_dir),
+                store.is_dir(inp_dir),
+                store.suffix(inp_dir),
+            )
+        except Exception as error:
+            logger.error("Error checking file %s", error)
+            return
+
+        iterable = False if suffix == ".zarr" else iterable
+        op: Optional[Callable[..., Coroutine[Any, Any, None]]] = None
+        if is_file and suffix in self.config.suffixes:
+            op = self._ingest_dir
+        elif pos <= 0 or suffix == ".zarr":
+            op = self._ingest_dir
+
+        if op is not None:
+            # enqueue the heavy scan; workers will run _ingest_dir concurrently
+            await self._scan_queue.put(
+                (drs_type, inp_dir, iterable, is_versioned)
+            )
+            return
+
+        # otherwise, recurse sequentially (cheap) — no task per directory
+        try:
+            async for sub in store.iterdir(inp_dir):
+                await self._iter_content(
+                    drs_type, sub, pos - 1, is_versioned=is_versioned
+                )
+        except Exception as error:
+            logger.error(error)
+
+    async def ingest_data(self) -> None:
+        """Produce scan tasks and process them with a bounded worker pool."""
+        self._print_status.set()
+        self._num_files.value = 0
+        print_performance(
+            self._print_status,
+            self._num_files,
+            self.ingest_queue.queue,
+            self.ingest_queue.num_objects,
+        )
+
+        async with asyncio.TaskGroup() as tg:
+            # start scan workers
+            for _ in range(self._scan_concurrency):
+                tg.create_task(self._scan_worker())
+
+            # produce scan items by walking roots sequentially
+            for drs_type, path in self.search_objects:  # <- property is sync
+                pos, is_versioned = self.config.max_directory_tree_level(
+                    path, drs_type=drs_type
+                )
+                if pos < 0:
+                    logger.warning(
+                        "Can't define latest version of versioned dataset."
+                        " This might lead to unexpected results. Try adjusting"
+                        " your search path."
+                    )
+
+                await self._iter_content(
+                    drs_type, path, pos, is_versioned=is_versioned
+                )
+
+            # wait until all queued scan items are processed
+            await self._scan_queue.join()
+
+            # stop workers (one sentinel per worker)
+            for _ in range(self._scan_concurrency):
+                await self._scan_queue.put(None)
+
+        logger.info(
+            "%i ingestion tasks have been completed", len(self._search_objects)
+        )
+        self.ingest_queue.join_all_tasks()
+        self._print_status.clear()
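
For orientation, the class above is driven by constructing a DataCollector with a drs-config, a metadata store target, an index name and one or more search settings, then awaiting ingest_data() inside the async context manager. The following is a minimal usage sketch, not documented API: the CrawlerSettings and IndexName constructors shown here are assumptions inferred from the attributes this module accesses (cfg.name, cfg.search_path, index_name.all, index_name.latest).

    import asyncio

    from metadata_crawler.api.config import CrawlerSettings
    from metadata_crawler.api.metadata_stores import IndexName
    from metadata_crawler.data_collector import DataCollector


    async def main() -> None:
        # Assumed constructor fields; DataCollector only reads .name and .search_path.
        settings = CrawlerSettings(name="cmip6", search_path="/data/cmip6")
        async with DataCollector(
            "drs_config.toml",   # config_file: path to the drs-config (or a loaded dict)
            "metadata.yaml",     # metadata_store; "metadata.yaml" is the code's own fallback
            IndexName(),         # assumed default; must expose .all and .latest
            settings,
        ) as collector:
            await collector.ingest_data()
            print(collector.crawled_files, "files crawled,",
                  collector.ingested_objects, "objects ingested")


    asyncio.run(main())
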
metadata_crawler/ingester/__init__.py
@@ -0,0 +1 @@
+"""Module for ingesting data to the metadata index."""
metadata_crawler/ingester/mongo.py
@@ -0,0 +1,206 @@
+"""Collection of async data ingest classes."""
+
+from __future__ import annotations
+
+import asyncio
+import re
+from functools import cached_property
+from typing import Annotated, Any, Dict, List, Optional, Tuple
+from urllib.parse import ParseResult, parse_qs, urlencode, urlparse, urlunparse
+
+from motor.motor_asyncio import (
+    AsyncIOMotorClient,
+    AsyncIOMotorCollection,
+    AsyncIOMotorDatabase,
+)
+from pymongo import DeleteMany, UpdateOne
+
+from ..api.cli import cli_function, cli_parameter
+from ..api.index import BaseIndex
+from ..logger import logger
+
+
+class MongoIndex(BaseIndex):
+    """Ingest metadata into a mongoDB server."""
+
+    def __post_init__(self) -> None:
+        self._raw_uri = ""
+        self._url = ""
+        self._client: Optional[AsyncIOMotorClient[Any]] = None
+
+    @property
+    def uri(self) -> str:
+        """Create the connection URI for the mongoDB server."""
+        if self._url:
+            return self._url
+        parsed_url = urlparse(self._raw_uri)
+        query = parse_qs(parsed_url.query)
+        if "timeout" not in parsed_url.query.lower():
+            query["timeoutMS"] = ["5000"]
+        new_query = urlencode(query, doseq=True)
+        self._url = urlunparse(
+            ParseResult(
+                parsed_url.scheme or "mongodb",
+                parsed_url.netloc,
+                parsed_url.path.rstrip("/"),
+                parsed_url.params,
+                new_query,
+                parsed_url.fragment,
+            )
+        )
+        return self._url
+
+    @cached_property
+    def unique_index(self) -> str:
+        """Get the index."""
+        for name, schema in self.index_schema.items():
+            if schema.unique:
+                return name
+        raise ValueError("The schema doesn't define a unique value.")
+
+    @property
+    def client(self) -> AsyncIOMotorClient[Any]:
+        """Get the mongoDB client."""
+        if self._client is None:
+            logger.debug("Creating async mongoDB client: %s", self.uri)
+            self._client = AsyncIOMotorClient(self.uri)
+        return self._client
+
+    async def _bulk_upsert(
+        self, chunk: List[Dict[str, Any]], collection: AsyncIOMotorCollection[Any]
+    ) -> None:
+        ops = [
+            UpdateOne(
+                {self.unique_index: m[self.unique_index]},
+                {"$set": m},
+                upsert=True,
+            )
+            for m in chunk
+        ]
+        await collection.bulk_write(ops, ordered=False)
+
+    async def _index_collection(
+        self, db: AsyncIOMotorDatabase[Any], collection: str, suffix: str = ""
+    ) -> None:
+        """Index a collection."""
+        col = collection + suffix
+        await db[col].create_index(self.unique_index, unique=True)
+        async for chunk in self.get_metadata(collection):
+            await self._bulk_upsert(chunk, db[col])
+
+    async def _prep_db_connection(
+        self, database: str, url: str
+    ) -> AsyncIOMotorDatabase[Any]:
+
+        await self.close()
+        self._raw_uri = url or ""
+        return self.client[database]
+
+    @cli_function(
+        help="Add metadata to the mongoDB metadata server.",
+    )
+    async def index(
+        self,
+        *,
+        url: Annotated[
+            Optional[str],
+            cli_parameter(
+                "--url",
+                help="The <host>:<port> of the mongoDB server",
+                type=str,
+            ),
+        ] = None,
+        database: Annotated[
+            str,
+            cli_parameter(
+                "--database",
+                "--db",
+                help="The DB name holding the metadata.",
+                type=str,
+                default="metadata",
+            ),
+        ] = "metadata",
+        index_suffix: Annotated[
+            Optional[str],
+            cli_parameter(
+                "--index-suffix",
+                help="Suffix for the latest and all version collections.",
+                type=str,
+            ),
+        ] = None,
+    ) -> None:
+        """Add metadata to the mongoDB metadata server."""
+        db = await self._prep_db_connection(database, url or "")
+        async with asyncio.TaskGroup() as tg:
+            for collection in self.index_names:
+                tg.create_task(
+                    self._index_collection(
+                        db, collection, suffix=index_suffix or ""
+                    )
+                )
+
+    async def close(self) -> None:
+        """Close the mongoDB connection."""
+        self._client.close() if self._client is not None else None
+        self._url = ""
+        self._raw_uri = ""
+
+    @cli_function(
+        help="Remove metadata from the mongoDB metadata server.",
+    )
+    async def delete(
+        self,
+        *,
+        url: Annotated[
+            Optional[str],
+            cli_parameter(
+                "--url",
+                help="The <host>:<port> of the mongoDB server",
+                type=str,
+            ),
+        ] = None,
+        database: Annotated[
+            str,
+            cli_parameter(
+                "--database",
+                "--db",
+                help="The DB name holding the metadata.",
+                type=str,
+                default="metadata",
+            ),
+        ] = "metadata",
+        facets: Annotated[
+            Optional[List[Tuple[str, str]]],
+            cli_parameter(
+                "-f",
+                "--facets",
+                type=str,
+                nargs=2,
+                action="append",
+                help="Search facets matching the delete query.",
+            ),
+        ] = None,
+    ) -> None:
+        """Remove metadata from the mongoDB metadata server."""
+        db = await self._prep_db_connection(database, url or "")
+        if not facets:
+            logger.info("Nothing to delete")
+            return
+
+        def glob_to_regex(glob: str) -> str:
+            """Turn a shell-style glob into an anchored mongo regex."""
+            # escape everything, then un-escape our wildcards
+            esc = re.escape(glob)
+            esc = esc.replace(r"\*", ".*").replace(r"\?", ".")
+            return f"^{esc}$"
+
+        ops: List[DeleteMany] = []
+        for field, val in facets:
+            if "*" in val or "?" in val:
+                pattern = glob_to_regex(val)
+                ops.append(DeleteMany({field: {"$regex": pattern}}))
+            else:
+                ops.append(DeleteMany({field: val}))
+        logger.debug("Deleting entries matching %s", ops)
+        for collection in await db.list_collection_names():
+            await db[collection].bulk_write(ops, ordered=False)
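
A note on the delete query above: facet values containing * or ? are turned into anchored regular expressions before being wrapped in DeleteMany, while everything else is escaped. The snippet below is a standalone illustration of that translation; it mirrors the glob_to_regex helper shown in the diff and is not imported from the package.

    import re


    def glob_to_regex(glob: str) -> str:
        """Turn a shell-style glob into an anchored regex; all other characters are escaped."""
        esc = re.escape(glob)
        esc = esc.replace(r"\*", ".*").replace(r"\?", ".")
        return f"^{esc}$"


    print(glob_to_regex("cmip6*"))   # ^cmip6.*$
    print(glob_to_regex("v2021?"))   # ^v2021.$
    # A CLI call such as "--facets project cmip6*" therefore ends up as
    # DeleteMany({"project": {"$regex": "^cmip6.*$"}}) applied to every collection.
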