metadata_crawler-2510.1.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release: this version of metadata-crawler might be problematic.

Files changed (35)
  1. metadata_crawler/__init__.py +263 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +831 -0
  7. metadata_crawler/api/drs_config.toml +440 -0
  8. metadata_crawler/api/index.py +151 -0
  9. metadata_crawler/api/metadata_stores.py +755 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +140 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +547 -0
  22. metadata_crawler/data_collector.py +278 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +206 -0
  25. metadata_crawler/ingester/solr.py +282 -0
  26. metadata_crawler/logger.py +153 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +419 -0
  29. metadata_crawler/utils/__init__.py +482 -0
  30. metadata_crawler/utils/cftime_utils.py +207 -0
  31. metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
  32. metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
  33. metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
  34. metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
  35. metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/ingester/solr.py
@@ -0,0 +1,282 @@
+ """Collection of async data ingest classes."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import logging
+ import os
+ import time
+ from concurrent.futures import ThreadPoolExecutor
+ from types import TracebackType
+ from typing import Annotated, Any, Dict, List, Optional, Tuple, Type, cast
+
+ import aiohttp
+ import orjson
+
+ from ..api.cli import cli_function, cli_parameter
+ from ..api.index import BaseIndex
+ from ..api.metadata_stores import IndexName
+ from ..logger import logger
+
+
+ class SolrIndex(BaseIndex):
+     """Ingest metadata into an apache solr server."""
+
+     senteniel: Optional[bytes] = None
+
+     def __post_init__(self) -> None:
+         self.timeout = aiohttp.ClientTimeout(
+             connect=10, sock_connect=10, sock_read=180, total=None
+         )
+         self.semaphore = asyncio.Event()
+         self.max_http_workers: int = 0
+         queue_max: int = 128
+         encode_workers: int = 4
+         self._uri: str = ""
+         self.cpu_pool = ThreadPoolExecutor(max_workers=encode_workers)
+         self.producer_queue: asyncio.Queue[Tuple[str, Optional[bytes]]] = (
+             asyncio.Queue(maxsize=queue_max)
+         )
+         self.connector = aiohttp.TCPConnector(
+             ttl_dns_cache=300,
+             use_dns_cache=True,
+             enable_cleanup_closed=True,
+         )
+
+     async def solr_url(self, server: str, core: str) -> str:
+         """Construct the solr url from a given solr core."""
+         if not self._uri:
+             scheme, _, server = server.rpartition("://")
+             scheme = scheme or "http"
+             solr_server, _, solr_port = server.partition(":")
+             solr_server = solr_server or "localhost"
+             uri = f"{scheme}://{solr_server}"
+             uri = f"{uri}:{solr_port}" if solr_port else uri
+             self._uri = f"{uri}/solr"
+         return f"{self._uri}/{core}/update/json?commit=true"
+
+     @cli_function(
+         help="Remove metadata from the apache solr server.",
+     )
+     async def delete(
+         self,
+         *,
+         server: Annotated[
+             Optional[str],
+             cli_parameter(
+                 "-sv",
+                 "--server",
+                 help="The <host>:<port> to the solr server",
+                 type=str,
+             ),
+         ] = None,
+         facets: Annotated[
+             Optional[List[tuple[str, str]]],
+             cli_parameter(
+                 "-f",
+                 "--facets",
+                 type=str,
+                 nargs=2,
+                 action="append",
+                 help="Search facets matching the delete query.",
+             ),
+         ] = None,
+         latest_version: Annotated[
+             str,
+             cli_parameter(
+                 "--latest-version",
+                 type=str,
+                 help="Name of the core holding 'latest' metadata.",
+             ),
+         ] = IndexName().latest,
+         all_versions: Annotated[
+             str,
+             cli_parameter(
+                 "--all-versions",
+                 type=str,
+                 help="Name of the core holding 'all' metadata versions.",
+             ),
+         ] = IndexName().all,
+     ) -> None:
+         """Remove metadata from the apache solr server."""
+         query = []
+         for key, value in facets or []:
+             if key.lower() == "file":
+                 if value[0] in (os.sep, "/"):
+                     value = f"\\{value}"
+                 value = value.replace(":", "\\:")
+             else:
+                 value = value.lower()
+             query.append(f"{key.lower()}:{value}")
+         query_str = " AND ".join(query)
+         server = server or ""
+         async with aiohttp.ClientSession(timeout=self.timeout) as session:
+             logger.debug("Deleting entries matching %s", query_str)
+             for core in (all_versions, latest_version):
+                 url = await self.solr_url(server, core)
+                 async with session.post(
+                     url, json={"delete": {"query": query_str}}
+                 ) as resp:
+                     level = (
+                         logging.WARNING
+                         if resp.status not in (200, 201)
+                         else logging.DEBUG
+                     )
+                     logger.log(level, await resp.text())
+
+     def _convert(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
+         for k, v in metadata.items():
+             match self.index_schema[k].type:
+                 case "bbox":
+                     metadata[k] = f"ENVELOPE({v[0]}, {v[1]}, {v[3]}, {v[2]})"
+                 case "daterange":
+                     metadata[k] = f"[{v[0].isoformat()} TO {v[-1].isoformat()}]"
+
+         return metadata
+
+     def _encode_payload(self, chunk: List[Dict[str, Any]]) -> bytes:
+         """CPU-bound: convert docs and JSON-encode off the event loop."""
+         return orjson.dumps([self._convert(x) for x in chunk])
+
+     async def _post_chunk(
+         self,
+         session: aiohttp.ClientSession,
+         url: str,
+         body: bytes,
+     ) -> None:
+         """POST one batch with minimal overhead and simple retries."""
+         status = 500
+         t0 = time.perf_counter()
+         try:
+             async with session.post(
+                 url, data=body, headers={"Content-Type": "application/json"}
+             ) as resp:
+                 status = resp.status
+                 await resp.read()
+
+         except Exception as error:
+             logger.log(
+                 logging.WARNING,
+                 error,
+                 exc_info=logger.level < logging.INFO,
+             )
+             return
+         logger.debug(
+             "POST %s -> %i (index time: %.3f)",
+             url,
+             status,
+             time.perf_counter() - t0,
+         )
+
+     async def consumer(self, session: aiohttp.ClientSession) -> None:
+         """Consume the metadata read by the producers."""
+         while True:
+             update_url, body = await self.producer_queue.get()
+             if body is self.senteniel:
+                 self.producer_queue.task_done()
+                 break
+             try:
+                 await self._post_chunk(session, update_url, cast(bytes, body))
+             finally:
+                 self.producer_queue.task_done()
+
+     async def _index_core(
+         self,
+         session: aiohttp.ClientSession,
+         server: str,
+         core: str,
+         suffix: str,
+         http_workers: int = 8,
+     ) -> None:
+         """Zero-copy-ish, backpressured, bounded-concurrency indexer.
+
+         - No per-batch commit.
+         - Bounded queue so tasks don't pile up.
+         - Constant number of worker tasks (not O(batches)).
+         """
+         base_url = await self.solr_url(server, core + suffix)
+         update_url = base_url.split("?", 1)[0]  # guard
+         loop = asyncio.get_running_loop()
+         async for batch in self.get_metadata(core):
+             body = await loop.run_in_executor(
+                 self.cpu_pool, self._encode_payload, batch
+             )
+             await self.producer_queue.put((update_url, body))
+         commit_url = f"{update_url}?commit=true"
+         async with session.post(
+             commit_url,
+             data=b"[]",
+             headers={"Content-Type": "application/json"},
+         ) as resp:
+             if resp.status >= 400:
+                 text = await resp.text()
+                 logger.warning(
+                     "COMMIT %s -> %i: %s", commit_url, resp.status, text
+                 )
+
+     async def __aexit__(
+         self,
+         exc_type: Optional[Type[BaseException]],
+         exc_val: Optional[BaseException],
+         exc_tb: Optional[TracebackType],
+     ) -> None:
+
+         try:
+             self.producer_queue.shutdown()
+         except AttributeError:  # pragma: no cover
+             pass  # pragma: no cover
+         self.cpu_pool.shutdown()
+
+     @cli_function(
+         help="Add metadata to the apache solr metadata server.",
+     )
+     async def index(
+         self,
+         *,
+         server: Annotated[
+             Optional[str],
+             cli_parameter(
+                 "-sv",
+                 "--server",
+                 help="The <host>:<port> to the solr server",
+                 type=str,
+             ),
+         ] = None,
+         index_suffix: Annotated[
+             Optional[str],
+             cli_parameter(
+                 "--index-suffix",
+                 help="Suffix for the latest and all version collections.",
+                 type=str,
+             ),
+         ] = None,
+         http_workers: Annotated[
+             int,
+             cli_parameter(
+                 "--http-workers", help="Number of ingestion threads.", type=int
+             ),
+         ] = 8,
+     ) -> None:
+         """Add metadata to the apache solr metadata server."""
+         async with aiohttp.ClientSession(
+             timeout=self.timeout, connector=self.connector, raise_for_status=True
+         ) as session:
+             consumers = [
+                 asyncio.create_task(self.consumer(session))
+                 for _ in range(http_workers)
+             ]
+             async with asyncio.TaskGroup() as tg:
+                 for core in self.index_names:
+                     tg.create_task(
+                         self._index_core(
+                             session,
+                             server or "",
+                             core,
+                             suffix=index_suffix or "",
+                             http_workers=http_workers,
+                         )
+                     )
+             for _ in range(http_workers):
+                 await self.producer_queue.put(("", self.senteniel))
+             await self.producer_queue.join()
+             await asyncio.gather(*consumers)
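
Note on the hunk above (metadata_crawler/ingester/solr.py): `index` wires a bounded producer/consumer pipeline. `_index_core` JSON-encodes each batch in a thread pool and puts it on a size-limited `asyncio.Queue`, a fixed set of `consumer` tasks POSTs the payloads to Solr, and one sentinel per worker shuts the pool down before a final commit. The following is a minimal, self-contained sketch of that pattern only; it is not part of the package, and the `print` call and `batch-<i>` payloads are placeholders for the real encode/POST steps:

```python
import asyncio
from typing import Optional

SENTINEL: Optional[bytes] = None  # same idea as SolrIndex.senteniel


async def consumer(queue: "asyncio.Queue[Optional[bytes]]") -> None:
    """Drain payloads until the sentinel arrives (stand-in for the HTTP POST)."""
    while True:
        body = await queue.get()
        try:
            if body is SENTINEL:
                break
            print("posting", body.decode())
        finally:
            queue.task_done()


async def main(workers: int = 3, batches: int = 10) -> None:
    # Bounded queue: the producer blocks once `maxsize` payloads are in flight.
    queue: "asyncio.Queue[Optional[bytes]]" = asyncio.Queue(maxsize=4)
    tasks = [asyncio.create_task(consumer(queue)) for _ in range(workers)]
    for i in range(batches):
        await queue.put(f"batch-{i}".encode())  # stand-in for an encoded JSON payload
    for _ in range(workers):
        await queue.put(SENTINEL)  # one sentinel per worker stops all of them
    await queue.join()
    await asyncio.gather(*tasks)


asyncio.run(main())
```

Bounding the queue is what provides backpressure: the crawler can never run ahead of the HTTP workers by more than `maxsize` encoded batches.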
metadata_crawler/logger.py
@@ -0,0 +1,153 @@
+ """Logging utilities."""
+
+ import logging
+ import logging.config
+ import os
+ from logging.handlers import RotatingFileHandler
+ from pathlib import Path
+ from typing import Any, Optional, cast
+
+ import appdirs
+ from rich.console import Console
+ from rich.logging import RichHandler
+
+ THIS_NAME = "metadata-crawler"
+
+ logging.basicConfig(
+     level=logging.WARNING,
+     format="%(asctime)s %(levelname)s: %(name)s - %(message)s",
+ )
+
+ logging.config.dictConfig(
+     {
+         "version": 1,
+         # keep existing handlers
+         "disable_existing_loggers": False,
+         "root": {
+             "level": "CRITICAL",
+             "handlers": ["default"],
+         },
+         "formatters": {
+             "standard": {
+                 "format": "%(asctime)s %(levelname)s: %(name)s - %(message)s",
+             },
+         },
+         "handlers": {
+             "default": {
+                 "class": "logging.StreamHandler",
+                 "formatter": "standard",
+                 "level": "CRITICAL",
+             },
+         },
+     }
+ )
+
+
+ class Logger(logging.Logger):
+     """Custom Logger defining the logging behaviour."""
+
+     logfmt: str = "%(name)s: %(message)s"
+     filelogfmt: str = "%(asctime)s %(levelname)s: %(name)s - %(message)s"
+     datefmt: str = "%Y-%m-%dT%H:%M:%S"
+     no_debug: list[str] = ["watchfiles", "httpcore", "pymongo", "pika"]
+
+     def __init__(
+         self,
+         name: Optional[str] = None,
+         level: Optional[int] = None,
+         suffix: Optional[str] = None,
+     ) -> None:
+         """Instantiate this logger only once and for all."""
+         self.level = level or int(
+             cast(str, os.getenv("MDC_LOG_LEVEL", str(logging.CRITICAL)))
+         )
+         name = name or THIS_NAME
+         logger_format = logging.Formatter(self.logfmt, self.datefmt)
+         self.file_format = logging.Formatter(self.filelogfmt, self.datefmt)
+         self._logger_file_handle: Optional[RotatingFileHandler] = None
+         self._logger_stream_handle = RichHandler(
+             rich_tracebacks=True,
+             tracebacks_max_frames=10,
+             tracebacks_extra_lines=5,
+             show_path=True,
+             console=Console(
+                 soft_wrap=False,
+                 force_jupyter=False,
+                 stderr=True,
+             ),
+         )
+         self._logger_stream_handle.setFormatter(logger_format)
+         self._logger_stream_handle.setLevel(self.level)
+         super().__init__(name, self.level)
+
+         self.propagate = False
+         self.handlers = [self._logger_stream_handle]
+         (
+             self.add_file_handle(suffix=suffix)
+             if os.getenv("MDC_LOG_INIT", "0") == "1"
+             else None
+         )
+
+     def set_level(self, level: int) -> None:
+         """Set the logger level to level."""
+         for handler in self.handlers:
+             log_level = level
+             if isinstance(handler, RotatingFileHandler):
+                 log_level = min(level, logging.CRITICAL)
+             handler.setLevel(log_level)
+         self.setLevel(level)
+         self.level = level
+
+     def error(
+         self,
+         msg: object,
+         *args: Any,
+         **kwargs: Any,
+     ) -> None:
+         """Log an error. When log level is smaller than INFO, log exceptions."""
+         if self.level < logging.INFO:
+             kwargs.setdefault("exc_info", True)
+         self._log(logging.ERROR, msg, args, **kwargs)
+
+     def add_file_handle(
+         self,
+         suffix: Optional[str] = None,
+         level: int = logging.CRITICAL,
+     ) -> None:
+         """Add a file log handle to the logger."""
+         suffix = suffix or os.getenv("MDC_LOG_SUFFIX", "")
+         base_name = f"{THIS_NAME}-{suffix}" if suffix else THIS_NAME
+         log_dir = Path(os.getenv("MDC_LOG_DIR", appdirs.user_log_dir(THIS_NAME)))
+         log_dir.mkdir(exist_ok=True, parents=True)
+         logger_file_handle = RotatingFileHandler(
+             log_dir / f"{base_name}.log",
+             mode="a",
+             maxBytes=5 * 1024**2,
+             backupCount=5,
+             encoding="utf-8",
+             delay=False,
+         )
+         logger_file_handle.setFormatter(self.file_format)
+         logger_file_handle.setLevel(self.level)
+         self.addHandler(logger_file_handle)
+
+
+ logger = Logger()
+
+
+ def get_level_from_verbosity(verbosity: int) -> int:
+     """Calculate the log level from a verbosity."""
+     return max(logging.CRITICAL - 10 * verbosity, -1)
+
+
+ def apply_verbosity(
+     level: Optional[int] = None, suffix: Optional[str] = None
+ ) -> int:
+     """Set the logging level of the handlers to a certain level."""
+     level = logger.level if level is None else level
+     old_level = logger.level
+     level = get_level_from_verbosity(level)
+     logger.set_level(level)
+     logger.add_file_handle(suffix, level)
+
+     return old_level
File without changes
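
For context, the `get_level_from_verbosity` helper at the end of metadata_crawler/logger.py maps a CLI verbosity count onto a standard logging level by subtracting 10 per step from `logging.CRITICAL` (50), clamped at -1: one `-v` lands on ERROR (40), `-vv` on WARNING (30), `-vvv` on INFO (20), `-vvvv` on DEBUG (10). A quick standalone check of that arithmetic (illustrative only; it simply copies the one function shown above):

```python
import logging


def get_level_from_verbosity(verbosity: int) -> int:
    """Each -v lowers the threshold by 10, floored at -1 (log everything)."""
    return max(logging.CRITICAL - 10 * verbosity, -1)


for verbosity in range(6):
    level = get_level_from_verbosity(verbosity)
    # 0 -> 50 CRITICAL, 1 -> 40 ERROR, 2 -> 30 WARNING,
    # 3 -> 20 INFO, 4 -> 10 DEBUG, 5 -> 0 NOTSET
    print(verbosity, level, logging.getLevelName(level))
```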