metadata-crawler 2510.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of metadata-crawler might be problematic.
- metadata_crawler/__init__.py +263 -0
- metadata_crawler/__main__.py +8 -0
- metadata_crawler/_version.py +1 -0
- metadata_crawler/api/__init__.py +1 -0
- metadata_crawler/api/cli.py +57 -0
- metadata_crawler/api/config.py +831 -0
- metadata_crawler/api/drs_config.toml +440 -0
- metadata_crawler/api/index.py +151 -0
- metadata_crawler/api/metadata_stores.py +755 -0
- metadata_crawler/api/mixin/__init__.py +7 -0
- metadata_crawler/api/mixin/lookup_mixin.py +112 -0
- metadata_crawler/api/mixin/lookup_tables.py +10010 -0
- metadata_crawler/api/mixin/path_mixin.py +46 -0
- metadata_crawler/api/mixin/template_mixin.py +145 -0
- metadata_crawler/api/storage_backend.py +277 -0
- metadata_crawler/backends/__init__.py +1 -0
- metadata_crawler/backends/intake.py +211 -0
- metadata_crawler/backends/posix.py +121 -0
- metadata_crawler/backends/s3.py +140 -0
- metadata_crawler/backends/swift.py +305 -0
- metadata_crawler/cli.py +547 -0
- metadata_crawler/data_collector.py +278 -0
- metadata_crawler/ingester/__init__.py +1 -0
- metadata_crawler/ingester/mongo.py +206 -0
- metadata_crawler/ingester/solr.py +282 -0
- metadata_crawler/logger.py +153 -0
- metadata_crawler/py.typed +0 -0
- metadata_crawler/run.py +419 -0
- metadata_crawler/utils/__init__.py +482 -0
- metadata_crawler/utils/cftime_utils.py +207 -0
- metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
- metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
- metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
- metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
- metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/ingester/solr.py
@@ -0,0 +1,282 @@
"""Collection of async data ingest classes."""

from __future__ import annotations

import asyncio
import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor
from types import TracebackType
from typing import Annotated, Any, Dict, List, Optional, Tuple, Type, cast

import aiohttp
import orjson

from ..api.cli import cli_function, cli_parameter
from ..api.index import BaseIndex
from ..api.metadata_stores import IndexName
from ..logger import logger


class SolrIndex(BaseIndex):
    """Ingest metadata into an apache solr server."""

    senteniel: Optional[bytes] = None

    def __post_init__(self) -> None:
        self.timeout = aiohttp.ClientTimeout(
            connect=10, sock_connect=10, sock_read=180, total=None
        )
        self.semaphore = asyncio.Event()
        self.max_http_workers: int = 0
        queue_max: int = 128
        encode_workers: int = 4
        self._uri: str = ""
        self.cpu_pool = ThreadPoolExecutor(max_workers=encode_workers)
        self.producer_queue: asyncio.Queue[Tuple[str, Optional[bytes]]] = (
            asyncio.Queue(maxsize=queue_max)
        )
        self.connector = aiohttp.TCPConnector(
            ttl_dns_cache=300,
            use_dns_cache=True,
            enable_cleanup_closed=True,
        )

    async def solr_url(self, server: str, core: str) -> str:
        """Construct the solr url from a given solr core."""
        if not self._uri:
            scheme, _, server = server.rpartition("://")
            scheme = scheme or "http"
            solr_server, _, solr_port = server.partition(":")
            solr_server = solr_server or "localhost"
            uri = f"{scheme}://{solr_server}"
            uri = f"{uri}:{solr_port}" if solr_port else uri
            self._uri = f"{uri}/solr"
        return f"{self._uri}/{core}/update/json?commit=true"

    @cli_function(
        help="Remove metadata from the apache solr server.",
    )
    async def delete(
        self,
        *,
        server: Annotated[
            Optional[str],
            cli_parameter(
                "-sv",
                "--server",
                help="The <host>:<port> to the solr server",
                type=str,
            ),
        ] = None,
        facets: Annotated[
            Optional[List[tuple[str, str]]],
            cli_parameter(
                "-f",
                "--facets",
                type=str,
                nargs=2,
                action="append",
                help="Search facets matching the delete query.",
            ),
        ] = None,
        latest_version: Annotated[
            str,
            cli_parameter(
                "--latest-version",
                type=str,
                help="Name of the core holding 'latest' metadata.",
            ),
        ] = IndexName().latest,
        all_versions: Annotated[
            str,
            cli_parameter(
                "--all-versions",
                type=str,
                help="Name of the core holding 'all' metadata versions.",
            ),
        ] = IndexName().all,
    ) -> None:
        """Remove metadata from the apache solr server."""
        query = []
        for key, value in facets or []:
            if key.lower() == "file":
                if value[0] in (os.sep, "/"):
                    value = f"\\{value}"
                value = value.replace(":", "\\:")
            else:
                value = value.lower()
            query.append(f"{key.lower()}:{value}")
        query_str = " AND ".join(query)
        server = server or ""
        async with aiohttp.ClientSession(timeout=self.timeout) as session:
            logger.debug("Deleting entries matching %s", query_str)
            for core in (all_versions, latest_version):
                url = await self.solr_url(server, core)
                async with session.post(
                    url, json={"delete": {"query": query_str}}
                ) as resp:
                    level = (
                        logging.WARNING
                        if resp.status not in (200, 201)
                        else logging.DEBUG
                    )
                    logger.log(level, await resp.text())

    def _convert(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        for k, v in metadata.items():
            match self.index_schema[k].type:
                case "bbox":
                    metadata[k] = f"ENVELOPE({v[0]}, {v[1]}, {v[3]}, {v[2]})"
                case "daterange":
                    metadata[k] = f"[{v[0].isoformat()} TO {v[-1].isoformat()}]"

        return metadata

    def _encode_payload(self, chunk: List[Dict[str, Any]]) -> bytes:
        """CPU-bound: convert docs and JSON-encode off the event loop."""
        return orjson.dumps([self._convert(x) for x in chunk])

    async def _post_chunk(
        self,
        session: aiohttp.ClientSession,
        url: str,
        body: bytes,
    ) -> None:
        """POST one batch with minimal overhead and simple retries."""
        status = 500
        t0 = time.perf_counter()
        try:
            async with session.post(
                url, data=body, headers={"Content-Type": "application/json"}
            ) as resp:
                status = resp.status
                await resp.read()

        except Exception as error:
            logger.log(
                logging.WARNING,
                error,
                exc_info=logger.level < logging.INFO,
            )
            return
        logger.debug(
            "POST %s -> %i (index time: %.3f)",
            url,
            status,
            time.perf_counter() - t0,
        )

    async def consumer(self, session: aiohttp.ClientSession) -> None:
        """Consume the metadata read by the producers."""
        while True:
            update_url, body = await self.producer_queue.get()
            if body is self.senteniel:
                self.producer_queue.task_done()
                break
            try:
                await self._post_chunk(session, update_url, cast(bytes, body))
            finally:
                self.producer_queue.task_done()

    async def _index_core(
        self,
        session: aiohttp.ClientSession,
        server: str,
        core: str,
        suffix: str,
        http_workers: int = 8,
    ) -> None:
        """Zero-copy-ish, backpressured, bounded-concurrency indexer.

        - No per-batch commit.
        - Bounded queue so tasks don't pile up.
        - Constant number of worker tasks (not O(batches)).
        """
        base_url = await self.solr_url(server, core + suffix)
        update_url = base_url.split("?", 1)[0]  # guard
        loop = asyncio.get_running_loop()
        async for batch in self.get_metadata(core):
            body = await loop.run_in_executor(
                self.cpu_pool, self._encode_payload, batch
            )
            await self.producer_queue.put((update_url, body))
        commit_url = f"{update_url}?commit=true"
        async with session.post(
            commit_url,
            data=b"[]",
            headers={"Content-Type": "application/json"},
        ) as resp:
            if resp.status >= 400:
                text = await resp.text()
                logger.warning(
                    "COMMIT %s -> %i: %s", commit_url, resp.status, text
                )

    async def __aexit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:

        try:
            self.producer_queue.shutdown()
        except AttributeError:  # pragma: no cover
            pass  # pragma: no cover
        self.cpu_pool.shutdown()

    @cli_function(
        help="Add metadata to the apache solr metadata server.",
    )
    async def index(
        self,
        *,
        server: Annotated[
            Optional[str],
            cli_parameter(
                "-sv",
                "--server",
                help="The <host>:<port> to the solr server",
                type=str,
            ),
        ] = None,
        index_suffix: Annotated[
            Optional[str],
            cli_parameter(
                "--index-suffix",
                help="Suffix for the latest and all version collections.",
                type=str,
            ),
        ] = None,
        http_workers: Annotated[
            int,
            cli_parameter(
                "--http-workers", help="Number of ingestion threads.", type=int
            ),
        ] = 8,
    ) -> None:
        """Add metadata to the apache solr metadata server."""
        async with aiohttp.ClientSession(
            timeout=self.timeout, connector=self.connector, raise_for_status=True
        ) as session:
            consumers = [
                asyncio.create_task(self.consumer(session))
                for _ in range(http_workers)
            ]
            async with asyncio.TaskGroup() as tg:
                for core in self.index_names:
                    tg.create_task(
                        self._index_core(
                            session,
                            server or "",
                            core,
                            suffix=index_suffix or "",
                            http_workers=http_workers,
                        )
                    )
            for _ in range(http_workers):
                await self.producer_queue.put(("", self.senteniel))
            await self.producer_queue.join()
            await asyncio.gather(*consumers)
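The _convert helper above rewrites two schema types into Solr's native string syntax: bbox values become an ENVELOPE(minX, maxX, maxY, minY) expression and daterange values become a "[start TO end]" range. A minimal sketch of those two conversions in isolation, assuming a bbox arrives as [min_x, max_x, min_y, max_y] (inferred from the v[0], v[1], v[3], v[2] ordering) and a date range as a pair of datetimes; the helper names are hypothetical:

from datetime import datetime

def to_solr_bbox(v: list[float]) -> str:
    # [min_x, max_x, min_y, max_y] -> Solr/JTS ENVELOPE(minX, maxX, maxY, minY)
    return f"ENVELOPE({v[0]}, {v[1]}, {v[3]}, {v[2]})"

def to_solr_daterange(v: list[datetime]) -> str:
    # first/last timestamp -> Solr date range syntax
    return f"[{v[0].isoformat()} TO {v[-1].isoformat()}]"

print(to_solr_bbox([-10.0, 30.0, 40.0, 65.0]))
# ENVELOPE(-10.0, 30.0, 65.0, 40.0)
print(to_solr_daterange([datetime(2000, 1, 1), datetime(2010, 12, 31)]))
# [2000-01-01T00:00:00 TO 2010-12-31T00:00:00]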
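index() fans work out through a bounded asyncio.Queue: producers JSON-encode each batch off the event loop and enqueue it, a fixed pool of consumer tasks POSTs the payloads, and one sentinel per consumer stops the pool once the queue has drained. A self-contained sketch of that shutdown pattern, in plain asyncio rather than the package's classes:

import asyncio

SENTINEL = None  # marks "no more work"; the class above uses its "senteniel" attribute the same way

async def producer(queue: asyncio.Queue, items: list[str]) -> None:
    for item in items:
        await queue.put(item)  # blocks when the queue is full (backpressure)

async def consumer(name: str, queue: asyncio.Queue) -> None:
    while True:
        item = await queue.get()
        try:
            if item is SENTINEL:  # one sentinel stops exactly one consumer
                break
            print(f"{name} handled {item}")
        finally:
            queue.task_done()

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue(maxsize=4)
    workers = [asyncio.create_task(consumer(f"worker-{i}", queue)) for i in range(2)]
    await producer(queue, [f"batch-{i}" for i in range(6)])
    for _ in workers:        # one sentinel per consumer
        await queue.put(SENTINEL)
    await queue.join()       # wait until every queued item is marked done
    await asyncio.gather(*workers)

asyncio.run(main())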
metadata_crawler/logger.py
@@ -0,0 +1,153 @@
"""Logging utilities."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import logging.config
|
|
5
|
+
import os
|
|
6
|
+
from logging.handlers import RotatingFileHandler
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Optional, cast
|
|
9
|
+
|
|
10
|
+
import appdirs
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
from rich.logging import RichHandler
|
|
13
|
+
|
|
14
|
+
THIS_NAME = "metadata-crawler"
|
|
15
|
+
|
|
16
|
+
logging.basicConfig(
|
|
17
|
+
level=logging.WARNING,
|
|
18
|
+
format="%(asctime)s %(levelname)s: %(name)s - %(message)s",
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
logging.config.dictConfig(
|
|
22
|
+
{
|
|
23
|
+
"version": 1,
|
|
24
|
+
# keep existing handlers
|
|
25
|
+
"disable_existing_loggers": False,
|
|
26
|
+
"root": {
|
|
27
|
+
"level": "CRITICAL",
|
|
28
|
+
"handlers": ["default"],
|
|
29
|
+
},
|
|
30
|
+
"formatters": {
|
|
31
|
+
"standard": {
|
|
32
|
+
"format": "%(asctime)s %(levelname)s: %(name)s - %(message)s",
|
|
33
|
+
},
|
|
34
|
+
},
|
|
35
|
+
"handlers": {
|
|
36
|
+
"default": {
|
|
37
|
+
"class": "logging.StreamHandler",
|
|
38
|
+
"formatter": "standard",
|
|
39
|
+
"level": "CRITICAL",
|
|
40
|
+
},
|
|
41
|
+
},
|
|
42
|
+
}
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Logger(logging.Logger):
|
|
47
|
+
"""Custom Logger defining the logging behaviour."""
|
|
48
|
+
|
|
49
|
+
logfmt: str = "%(name)s: %(message)s"
|
|
50
|
+
filelogfmt: str = "%(asctime)s %(levelname)s: %(name)s - %(message)s"
|
|
51
|
+
datefmt: str = "%Y-%m-%dT%H:%M:%S"
|
|
52
|
+
no_debug: list[str] = ["watchfiles", "httpcore", "pymongo", "pika"]
|
|
53
|
+
|
|
54
|
+
def __init__(
|
|
55
|
+
self,
|
|
56
|
+
name: Optional[str] = None,
|
|
57
|
+
level: Optional[int] = None,
|
|
58
|
+
suffix: Optional[str] = None,
|
|
59
|
+
) -> None:
|
|
60
|
+
"""Instantiate this logger only once and for all."""
|
|
61
|
+
self.level = level or int(
|
|
62
|
+
cast(str, os.getenv("MDC_LOG_LEVEL", str(logging.CRITICAL)))
|
|
63
|
+
)
|
|
64
|
+
name = name or THIS_NAME
|
|
65
|
+
logger_format = logging.Formatter(self.logfmt, self.datefmt)
|
|
66
|
+
self.file_format = logging.Formatter(self.filelogfmt, self.datefmt)
|
|
67
|
+
self._logger_file_handle: Optional[RotatingFileHandler] = None
|
|
68
|
+
self._logger_stream_handle = RichHandler(
|
|
69
|
+
rich_tracebacks=True,
|
|
70
|
+
tracebacks_max_frames=10,
|
|
71
|
+
tracebacks_extra_lines=5,
|
|
72
|
+
show_path=True,
|
|
73
|
+
console=Console(
|
|
74
|
+
soft_wrap=False,
|
|
75
|
+
force_jupyter=False,
|
|
76
|
+
stderr=True,
|
|
77
|
+
),
|
|
78
|
+
)
|
|
79
|
+
self._logger_stream_handle.setFormatter(logger_format)
|
|
80
|
+
self._logger_stream_handle.setLevel(self.level)
|
|
81
|
+
super().__init__(name, self.level)
|
|
82
|
+
|
|
83
|
+
self.propagate = False
|
|
84
|
+
self.handlers = [self._logger_stream_handle]
|
|
85
|
+
(
|
|
86
|
+
self.add_file_handle(suffix=suffix)
|
|
87
|
+
if os.getenv("MDC_LOG_INIT", "0") == "1"
|
|
88
|
+
else None
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
def set_level(self, level: int) -> None:
|
|
92
|
+
"""Set the logger level to level."""
|
|
93
|
+
for handler in self.handlers:
|
|
94
|
+
log_level = level
|
|
95
|
+
if isinstance(handler, RotatingFileHandler):
|
|
96
|
+
log_level = min(level, logging.CRITICAL)
|
|
97
|
+
handler.setLevel(log_level)
|
|
98
|
+
self.setLevel(level)
|
|
99
|
+
self.level = level
|
|
100
|
+
|
|
101
|
+
def error(
|
|
102
|
+
self,
|
|
103
|
+
msg: object,
|
|
104
|
+
*args: Any,
|
|
105
|
+
**kwargs: Any,
|
|
106
|
+
) -> None:
|
|
107
|
+
"""Log an error. When log level is smaller than INFO, log exceptions."""
|
|
108
|
+
if self.level < logging.INFO:
|
|
109
|
+
kwargs.setdefault("exc_info", True)
|
|
110
|
+
self._log(logging.ERROR, msg, args, **kwargs)
|
|
111
|
+
|
|
112
|
+
def add_file_handle(
|
|
113
|
+
self,
|
|
114
|
+
suffix: Optional[str] = None,
|
|
115
|
+
level: int = logging.CRITICAL,
|
|
116
|
+
) -> None:
|
|
117
|
+
"""Add a file log handle to the logger."""
|
|
118
|
+
suffix = suffix or os.getenv("MDC_LOG_SUFFIX", "")
|
|
119
|
+
base_name = f"{THIS_NAME}-{suffix}" if suffix else THIS_NAME
|
|
120
|
+
log_dir = Path(os.getenv("MDC_LOG_DIR", appdirs.user_log_dir(THIS_NAME)))
|
|
121
|
+
log_dir.mkdir(exist_ok=True, parents=True)
|
|
122
|
+
logger_file_handle = RotatingFileHandler(
|
|
123
|
+
log_dir / f"{base_name}.log",
|
|
124
|
+
mode="a",
|
|
125
|
+
maxBytes=5 * 1024**2,
|
|
126
|
+
backupCount=5,
|
|
127
|
+
encoding="utf-8",
|
|
128
|
+
delay=False,
|
|
129
|
+
)
|
|
130
|
+
logger_file_handle.setFormatter(self.file_format)
|
|
131
|
+
logger_file_handle.setLevel(self.level)
|
|
132
|
+
self.addHandler(logger_file_handle)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
logger = Logger()
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def get_level_from_verbosity(verbosity: int) -> int:
|
|
139
|
+
"""Calculate the log level from a verbosity."""
|
|
140
|
+
return max(logging.CRITICAL - 10 * verbosity, -1)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def apply_verbosity(
|
|
144
|
+
level: Optional[int] = None, suffix: Optional[str] = None
|
|
145
|
+
) -> int:
|
|
146
|
+
"""Set the logging level of the handlers to a certain level."""
|
|
147
|
+
level = logger.level if level is None else level
|
|
148
|
+
old_level = logger.level
|
|
149
|
+
level = get_level_from_verbosity(level)
|
|
150
|
+
logger.set_level(level)
|
|
151
|
+
logger.add_file_handle(suffix, level)
|
|
152
|
+
|
|
153
|
+
return old_level
|
|
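get_level_from_verbosity steps down from logging.CRITICAL (50) by 10 per verbosity point and floors the result at -1, so verbosity 1 maps to ERROR, 3 to INFO, 4 to DEBUG, and 5 or more effectively lets every record through. The loop below simply replays that arithmetic:

import logging

def get_level_from_verbosity(verbosity: int) -> int:
    # same formula as above: CRITICAL (50) minus 10 per verbosity step, floored at -1
    return max(logging.CRITICAL - 10 * verbosity, -1)

for verbosity in range(7):
    level = get_level_from_verbosity(verbosity)
    print(verbosity, level, logging.getLevelName(level))
# 0 50 CRITICAL
# 1 40 ERROR
# 2 30 WARNING
# 3 20 INFO
# 4 10 DEBUG
# 5 0 NOTSET
# 6 -1 Level -1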