metadata_crawler-2509.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of metadata-crawler might be problematic.

Files changed (34)
  1. metadata_crawler/__init__.py +248 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +801 -0
  7. metadata_crawler/api/drs_config.toml +439 -0
  8. metadata_crawler/api/index.py +132 -0
  9. metadata_crawler/api/metadata_stores.py +749 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +136 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +539 -0
  22. metadata_crawler/data_collector.py +258 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +193 -0
  25. metadata_crawler/ingester/solr.py +152 -0
  26. metadata_crawler/logger.py +142 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +373 -0
  29. metadata_crawler/utils.py +411 -0
  30. metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
  31. metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
  32. metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
  33. metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
  34. metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/data_collector.py
@@ -0,0 +1,258 @@
+"""Gather metadata and add it to a temporary metadata store."""
+
+from __future__ import annotations
+
+import asyncio
+import os
+from multiprocessing import Event, Value
+from pathlib import Path
+from types import TracebackType
+from typing import (
+    Any,
+    AsyncIterator,
+    Callable,
+    Coroutine,
+    Dict,
+    Iterator,
+    Optional,
+    Type,
+    Union,
+    cast,
+)
+
+import tomlkit
+
+from .api.config import CrawlerSettings, DRSConfig
+from .api.metadata_stores import CatalogueWriter, IndexName
+from .api.storage_backend import PathTemplate
+from .logger import logger
+from .utils import (
+    Counter,
+    MetadataCrawlerException,
+    create_async_iterator,
+    print_performance,
+)
+
+ScanItem = tuple[str, str, bool]
+
+
+class DataCollector:
+    """Collect file objects from a given directory object and search for files.
+
+    Parameters
+    ----------
+    config_file:
+        Path to the drs-config file / loaded configuration.
+    metadata_store:
+        Path to the temporary metadata store (defaults to `metadata.yaml`).
+    index_name:
+        Names of the 'latest' and 'all' metadata indexes.
+    *search_objects:
+        Paths of the search directories, e.g. the `root_path` attr in drs_config.
+    uri: str
+        The URI of the metadata store.
+    password: str
+        Password for the ingestion.
+    batch_size: int
+        Batch size for the ingestion.
+    """
+
+    def __init__(
+        self,
+        config_file: Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument],
+        metadata_store: Optional[
+            Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
+        ],
+        index_name: IndexName,
+        *search_objects: CrawlerSettings,
+        **kwargs: Any,
+    ):
+        self._search_objects = search_objects
+        if not search_objects:
+            raise MetadataCrawlerException("You have to give search directories")
+        self._num_files: Counter = Value("i", 0)
+        self.index_name = index_name
+        self.config = DRSConfig.load(config_file)
+        kwargs.setdefault("scan_concurrency", os.getenv("SCAN_CONCURRENCY", "64"))
+        self._scan_concurrency: int = int(kwargs.pop("scan_concurrency", 64))
+        self._scan_queue: asyncio.Queue[Optional[ScanItem]] = asyncio.Queue(
+            maxsize=int(kwargs.pop("scan_queue_size", 10_000))
+        )
+        self._print_status = Event()
+        self.ingest_queue = CatalogueWriter(
+            str(metadata_store or "metadata.yaml"),
+            index_name=index_name,
+            config=config_file,
+            **kwargs,
+        )
+        self.ingest_queue.run_consumer()
+        self._max_files = int(cast(str, os.getenv("MDC_MAX_FILES", "-1")))
+
+    @property
+    def crawled_files(self) -> int:
+        """Get the total number of crawled files."""
+        return self._num_files.value
+
+    @property
+    def ingested_objects(self) -> int:
+        """Get the number of ingested objects."""
+        return self.ingest_queue.ingested_objects
+
+    @property
+    def search_objects(self) -> Iterator[tuple[str, str]]:
+        """Iterate over the configured search directories."""
+        for cfg in self._search_objects:
+            yield cfg.name, str(cfg.search_path)
+
+    async def __aenter__(self) -> "DataCollector":
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ) -> None:
+        self._print_status.clear()
+        self.ingest_queue.join_all_tasks()
+        await self.ingest_queue.close()
+
+        async def _safe_close(b: PathTemplate) -> None:
+            try:
+                await asyncio.wait_for(b.close(), timeout=3)
+            except Exception:
+                pass
+
+        await asyncio.gather(
+            *[_safe_close(ds.backend) for ds in self.config.datasets.values()],
+            return_exceptions=True,
+        )
+
+    def _test_env(self) -> bool:
+        return 0 < self._max_files < self.crawled_files
+
+    async def _ingest_dir(
+        self,
+        drs_type: str,
+        search_dir: str,
+        iterable: bool = True,
+    ) -> None:
+        if iterable:
+            try:
+                sub_dirs = self.config.datasets[drs_type].backend.iterdir(
+                    search_dir
+                )
+            except Exception as error:
+                logger.error(error)
+                return
+        else:
+            sub_dirs = cast(
+                AsyncIterator[str], create_async_iterator([search_dir])
+            )
+        rank = 0
+        async for _dir in sub_dirs:
+            async for _inp in self.config.datasets[drs_type].backend.rglob(
+                _dir, self.config.datasets[drs_type].glob_pattern
+            ):
+                if self._test_env():
+                    return
+                await self.ingest_queue.put(
+                    _inp, drs_type, name=self.index_name.all
+                )
+                if rank == 0:
+                    await self.ingest_queue.put(
+                        _inp, drs_type, name=self.index_name.latest
+                    )
+                self._num_files.value += 1
+            rank += 1
+        return None
+
+    async def _scan_worker(self) -> None:
+        """Drain _scan_queue and run _ingest_dir concurrently (bounded pool)."""
+        while True:
+            item = await self._scan_queue.get()  # blocks
+            if item is None:  # sentinel -> exit
+                # do not task_done() for sentinel
+                break
+            drs_type, path, iterable = item
+            try:
+                await self._ingest_dir(drs_type, path, iterable=iterable)
+            except Exception as error:
+                logger.error(error)
+            finally:
+                self._scan_queue.task_done()
+
+    async def _iter_content(
+        self, drs_type: str, inp_dir: str, pos: int = 0
+    ) -> None:
+        """Walk recursively until files or the version level is reached."""
+        store = self.config.datasets[drs_type].backend
+        if self._test_env():
+            return
+        try:
+            is_file, iterable, suffix = await asyncio.gather(
+                store.is_file(inp_dir),
+                store.is_dir(inp_dir),
+                store.suffix(inp_dir),
+            )
+        except Exception as error:
+            logger.error("Error checking file %s", error)
+            return
+
+        iterable = False if suffix == ".zarr" else iterable
+        op: Optional[Callable[..., Coroutine[Any, Any, None]]] = None
+
+        if is_file and suffix in self.config.suffixes:
+            op = self._ingest_dir
+        elif pos <= 0 or suffix == ".zarr":
+            op = self._ingest_dir
+
+        if op is not None:
+            # enqueue the heavy scan; workers will run _ingest_dir concurrently
+            await self._scan_queue.put((drs_type, inp_dir, iterable))
+            return
+
+        # otherwise, recurse sequentially (cheap); no task per directory
+        try:
+            async for sub in store.iterdir(inp_dir):
+                await self._iter_content(drs_type, sub, pos - 1)
+        except Exception as error:
+            logger.error(error)
+
+    async def ingest_data(self) -> None:
+        """Produce scan tasks and process them with a bounded worker pool."""
+        self._print_status.set()
+        self._num_files.value = 0
+        print_performance(
+            self._print_status,
+            self._num_files,
+            self.ingest_queue.queue,
+            self.ingest_queue.num_objects,
+        )
+
+        async with asyncio.TaskGroup() as tg:
+            # start scan workers
+            for _ in range(self._scan_concurrency):
+                tg.create_task(self._scan_worker())
+
+            # produce scan items by walking roots sequentially
+            for drs_type, path in self.search_objects:  # <- property is sync
+                pos = self.config.max_directory_tree_level(
+                    path, drs_type=drs_type
+                )
+                await self._iter_content(drs_type, path, pos)
+
+            # wait until all queued scan items are processed
+            await self._scan_queue.join()
+
+            # stop workers (one sentinel per worker)
+            for _ in range(self._scan_concurrency):
+                await self._scan_queue.put(None)
+
+        logger.info(
+            "%i ingestion tasks have been completed", len(self._search_objects)
+        )
+        self.ingest_queue.join_all_tasks()
+        self._print_status.clear()
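For orientation, a minimal usage sketch of the collector above. The config path, dataset name and the `CrawlerSettings(...)` keyword arguments are illustrative assumptions; only the `DataCollector` call signature, the `scan_concurrency` keyword, the `async with` protocol and the counters are taken from the code:

    # Hypothetical usage sketch; paths, dataset names and the CrawlerSettings
    # constructor arguments are placeholders, not values shipped with the package.
    import asyncio

    from metadata_crawler.api.config import CrawlerSettings
    from metadata_crawler.api.metadata_stores import IndexName
    from metadata_crawler.data_collector import DataCollector


    async def main() -> None:
        settings = CrawlerSettings(  # assumed constructor signature
            name="cmip6", search_path="/data/cmip6"
        )
        async with DataCollector(
            "drs-config.toml",   # config_file
            "metadata.yaml",     # temporary metadata store
            IndexName(),         # default 'latest'/'all' index names
            settings,
            scan_concurrency=32,  # overrides the SCAN_CONCURRENCY env default
        ) as collector:
            await collector.ingest_data()
            print(collector.crawled_files, collector.ingested_objects)


    asyncio.run(main())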
metadata_crawler/ingester/__init__.py
@@ -0,0 +1 @@
+"""Module for ingesting data into the metadata index."""
metadata_crawler/ingester/mongo.py
@@ -0,0 +1,193 @@
+"""Collection of async data ingest classes."""
+
+from __future__ import annotations
+
+import asyncio
+import re
+from functools import cached_property
+from typing import Annotated, Any, Dict, List, Optional, Tuple
+from urllib.parse import ParseResult, parse_qs, urlencode, urlparse, urlunparse
+
+from motor.motor_asyncio import (
+    AsyncIOMotorClient,
+    AsyncIOMotorCollection,
+    AsyncIOMotorDatabase,
+)
+from pymongo import DeleteMany, UpdateOne
+
+from ..api.cli import cli_function, cli_parameter
+from ..api.index import BaseIndex
+from ..logger import logger
+
+
+class MongoIndex(BaseIndex):
+    """Ingest metadata into a mongoDB server."""
+
+    def __post_init__(self) -> None:
+        self._raw_uri = ""
+        self._url = ""
+        self._client: Optional[AsyncIOMotorClient[Any]] = None
+
+    @property
+    def uri(self) -> str:
+        """Create the connection URI for the mongoDB server."""
+        if self._url:
+            return self._url
+        parsed_url = urlparse(self._raw_uri)
+        query = parse_qs(parsed_url.query)
+        if "timeout" not in parsed_url.query.lower():
+            query["timeoutMS"] = ["5000"]
+        new_query = urlencode(query, doseq=True)
+        self._url = urlunparse(
+            ParseResult(
+                parsed_url.scheme or "mongodb",
+                parsed_url.netloc,
+                parsed_url.path.rstrip("/"),
+                parsed_url.params,
+                new_query,
+                parsed_url.fragment,
+            )
+        )
+        return self._url
+
+    @cached_property
+    def unique_index(self) -> str:
+        """Get the name of the unique index field."""
+        for name, schema in self.index_schema.items():
+            if schema.unique:
+                return name
+        raise ValueError("The schema doesn't define a unique value.")
+
+    @property
+    def client(self) -> AsyncIOMotorClient[Any]:
+        """Get the mongoDB client."""
+        if self._client is None:
+            logger.debug("Creating async mongoDB client: %s", self.uri)
+            self._client = AsyncIOMotorClient(self.uri)
+        return self._client
+
+    async def _bulk_upsert(
+        self, chunk: List[Dict[str, Any]], collection: AsyncIOMotorCollection[Any]
+    ) -> None:
+        ops = [
+            UpdateOne(
+                {self.unique_index: m[self.unique_index]},
+                {"$set": m},
+                upsert=True,
+            )
+            for m in chunk
+        ]
+        await collection.bulk_write(ops, ordered=False)
+
+    async def _index_collection(
+        self, db: AsyncIOMotorDatabase[Any], collection: str
+    ) -> None:
+        """Index a collection."""
+        await db[collection].create_index(self.unique_index, unique=True)
+        async for chunk in self.get_metadata(collection):
+            await self._bulk_upsert(chunk, db[collection])
+
+    async def _prep_db_connection(
+        self, database: str, url: str
+    ) -> AsyncIOMotorDatabase[Any]:
+        await self.close()
+        self._raw_uri = url or ""
+        return self.client[database]
+
+    @cli_function(
+        help="Add metadata to the mongoDB metadata server.",
+    )
+    async def index(
+        self,
+        *,
+        url: Annotated[
+            Optional[str],
+            cli_parameter(
+                "--url",
+                help="The <host>:<port> of the mongoDB server.",
+                type=str,
+            ),
+        ] = None,
+        database: Annotated[
+            str,
+            cli_parameter(
+                "--database",
+                "--db",
+                help="The DB name holding the metadata.",
+                type=str,
+                default="metadata",
+            ),
+        ] = "metadata",
+    ) -> None:
+        """Add metadata to the mongoDB metadata server."""
+        db = await self._prep_db_connection(database, url or "")
+        async with asyncio.TaskGroup() as tg:
+            for collection in self.index_names:
+                tg.create_task(self._index_collection(db, collection))
+
+    async def close(self) -> None:
+        """Close the mongoDB connection."""
+        if self._client is not None:
+            self._client.close()
+            self._client = None
+        self._url = ""
+        self._raw_uri = ""
+
+    @cli_function(
+        help="Remove metadata from the mongoDB metadata server.",
+    )
+    async def delete(
+        self,
+        *,
+        url: Annotated[
+            Optional[str],
+            cli_parameter(
+                "--url",
+                help="The <host>:<port> of the mongoDB server.",
+                type=str,
+            ),
+        ] = None,
+        database: Annotated[
+            str,
+            cli_parameter(
+                "--database",
+                "--db",
+                help="The DB name holding the metadata.",
+                type=str,
+                default="metadata",
+            ),
+        ] = "metadata",
+        facets: Annotated[
+            Optional[List[Tuple[str, str]]],
+            cli_parameter(
+                "-f",
+                "--facets",
+                type=str,
+                nargs=2,
+                action="append",
+                help="Search facets matching the delete query.",
+            ),
+        ] = None,
+    ) -> None:
+        """Remove metadata from the mongoDB metadata server."""
+        db = await self._prep_db_connection(database, url or "")
+        if not facets:
+            logger.info("Nothing to delete")
+            return
+
+        def glob_to_regex(glob: str) -> str:
+            """Turn a shell-style glob into an anchored mongo regex."""
+            # escape everything, then un-escape our wildcards
+            esc = re.escape(glob)
+            esc = esc.replace(r"\*", ".*").replace(r"\?", ".")
+            return f"^{esc}$"
+
+        ops: List[DeleteMany] = []
+        for field, val in facets:
+            if "*" in val or "?" in val:
+                pattern = glob_to_regex(val)
+                ops.append(DeleteMany({field: {"$regex": pattern}}))
+            else:
+                ops.append(DeleteMany({field: val}))
+        logger.debug("Deleting entries matching %s", ops)
+        for collection in await db.list_collection_names():
+            await db[collection].bulk_write(ops, ordered=False)
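The `delete` method above translates shell-style globs in facet values into anchored regular expressions before building the `DeleteMany` operations; values without wildcards are matched verbatim. A small self-contained sketch of that translation, re-implemented here purely for illustration:

    import re


    def glob_to_regex(glob: str) -> str:
        """Same translation as in MongoIndex.delete: escape, then restore wildcards."""
        esc = re.escape(glob)
        esc = esc.replace(r"\*", ".*").replace(r"\?", ".")
        return f"^{esc}$"


    assert glob_to_regex("cmip6*") == "^cmip6.*$"
    assert glob_to_regex("r?i1p1f1") == "^r.i1p1f1$"
    # A facet such as ("experiment_id", "hist*") therefore becomes
    # DeleteMany({"experiment_id": {"$regex": "^hist.*$"}}).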
metadata_crawler/ingester/solr.py
@@ -0,0 +1,152 @@
+"""Collection of async data ingest classes."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+from typing import Annotated, Any, Dict, List, Optional
+
+import aiohttp
+
+from ..api.cli import cli_function, cli_parameter
+from ..api.index import BaseIndex
+from ..api.metadata_stores import IndexName
+from ..logger import logger
+
+
+class SolrIndex(BaseIndex):
+    """Ingest metadata into an apache solr server."""
+
+    def __post_init__(self) -> None:
+        self.timeout = aiohttp.ClientTimeout(total=50)
+        self._uri: str = ""
+
+    async def solr_url(self, server: str, core: str) -> str:
+        """Construct the solr url from a given solr core."""
+        if not self._uri:
+            scheme, _, server = server.rpartition("://")
+            scheme = scheme or "http"
+            solr_server, _, solr_port = server.partition(":")
+            solr_port = solr_port or "8983"
+            solr_server = solr_server or "localhost"
+            self._uri = f"{scheme}://{solr_server}:{solr_port}/solr"
+        return f"{self._uri}/{core}/update/json?commit=true"
+
+    @cli_function(
+        help="Remove metadata from the apache solr server.",
+    )
+    async def delete(
+        self,
+        *,
+        server: Annotated[
+            Optional[str],
+            cli_parameter(
+                "-sv",
+                "--server",
+                help="The <host>:<port> of the solr server.",
+                type=str,
+            ),
+        ] = None,
+        facets: Annotated[
+            Optional[List[tuple[str, str]]],
+            cli_parameter(
+                "-f",
+                "--facets",
+                type=str,
+                nargs=2,
+                action="append",
+                help="Search facets matching the delete query.",
+            ),
+        ] = None,
+        latest_version: Annotated[
+            str,
+            cli_parameter(
+                "--latest-version",
+                type=str,
+                help="Name of the core holding 'latest' metadata.",
+            ),
+        ] = IndexName().latest,
+        all_versions: Annotated[
+            str,
+            cli_parameter(
+                "--all-versions",
+                type=str,
+                help="Name of the core holding 'all' metadata versions.",
+            ),
+        ] = IndexName().all,
+    ) -> None:
+        """Remove metadata from the apache solr server."""
+        query = []
+        for key, value in facets or []:
+            if key.lower() == "file":
+                if value[0] in (os.sep, "/"):
+                    value = f"\\{value}"
+                value = value.replace(":", "\\:")
+            else:
+                value = value.lower()
+            query.append(f"{key.lower()}:{value}")
+        query_str = " AND ".join(query)
+        server = server or ""
+        async with aiohttp.ClientSession(timeout=self.timeout) as session:
+            logger.debug("Deleting entries matching %s", query_str)
+            for core in (all_versions, latest_version):
+                url = await self.solr_url(server, core)
+                async with session.post(
+                    url, json={"delete": {"query": query_str}}
+                ) as resp:
+                    level = (
+                        logging.WARNING
+                        if resp.status not in (200, 201)
+                        else logging.DEBUG
+                    )
+                    logger.log(level, await resp.text())
+
+    def _convert(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
+        for k, v in metadata.items():
+            match self.index_schema[k].type:
+                case "bbox":
+                    metadata[k] = f"ENVELOPE({v[0]}, {v[1]}, {v[3]}, {v[2]})"
+                case "daterange":
+                    metadata[k] = f"[{v[0].isoformat()} TO {v[-1].isoformat()}]"
+
+        return metadata
+
+    async def _index_core(self, server: str, core: str) -> None:
+        """Index data to a solr core."""
+        url = await self.solr_url(server, core)
+        async for chunk in self.get_metadata(core):
+            async with aiohttp.ClientSession(
+                timeout=self.timeout, raise_for_status=True
+            ) as session:
+                try:
+                    payload = list(map(self._convert, chunk))
+                    async with session.post(url, json=payload) as resp:
+                        logger.debug(await resp.text())
+                except Exception as error:
+                    logger.log(
+                        logging.WARNING,
+                        error,
+                        exc_info=logger.level < logging.INFO,
+                    )
+
+    @cli_function(
+        help="Add metadata to the apache solr metadata server.",
+    )
+    async def index(
+        self,
+        *,
+        server: Annotated[
+            Optional[str],
+            cli_parameter(
+                "-sv",
+                "--server",
+                help="The <host>:<port> of the solr server.",
+                type=str,
+            ),
+        ] = None,
+    ) -> None:
+        """Add metadata to the apache solr metadata server."""
+        async with asyncio.TaskGroup() as tg:
+            for core in self.index_names:
+                tg.create_task(self._index_core(server or "", core))
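`solr_url` is forgiving about how the server is specified: scheme, host and port fall back to `http`, `localhost` and `8983`, and the resulting endpoint always targets the committing JSON update handler. A standalone sketch that mirrors this parsing, with placeholder hostnames (the cached `self._uri` behaviour is omitted):

    def build_update_url(server: str, core: str) -> str:
        """Mirror of SolrIndex.solr_url for illustration (no caching)."""
        scheme, _, rest = server.rpartition("://")
        scheme = scheme or "http"
        host, _, port = rest.partition(":")
        return (
            f"{scheme}://{host or 'localhost'}:{port or '8983'}"
            f"/solr/{core}/update/json?commit=true"
        )


    assert (
        build_update_url("", "latest")
        == "http://localhost:8983/solr/latest/update/json?commit=true"
    )
    assert (
        build_update_url("solr.example.org:8984", "files")
        == "http://solr.example.org:8984/solr/files/update/json?commit=true"
    )

Because `self._uri` is cached after the first call, one `SolrIndex` instance is effectively bound to a single solr server for its lifetime.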