metadata_crawler-2510.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of metadata-crawler has been flagged by the registry.
- metadata_crawler/__init__.py +263 -0
- metadata_crawler/__main__.py +8 -0
- metadata_crawler/_version.py +1 -0
- metadata_crawler/api/__init__.py +1 -0
- metadata_crawler/api/cli.py +57 -0
- metadata_crawler/api/config.py +831 -0
- metadata_crawler/api/drs_config.toml +440 -0
- metadata_crawler/api/index.py +151 -0
- metadata_crawler/api/metadata_stores.py +755 -0
- metadata_crawler/api/mixin/__init__.py +7 -0
- metadata_crawler/api/mixin/lookup_mixin.py +112 -0
- metadata_crawler/api/mixin/lookup_tables.py +10010 -0
- metadata_crawler/api/mixin/path_mixin.py +46 -0
- metadata_crawler/api/mixin/template_mixin.py +145 -0
- metadata_crawler/api/storage_backend.py +277 -0
- metadata_crawler/backends/__init__.py +1 -0
- metadata_crawler/backends/intake.py +211 -0
- metadata_crawler/backends/posix.py +121 -0
- metadata_crawler/backends/s3.py +140 -0
- metadata_crawler/backends/swift.py +305 -0
- metadata_crawler/cli.py +547 -0
- metadata_crawler/data_collector.py +278 -0
- metadata_crawler/ingester/__init__.py +1 -0
- metadata_crawler/ingester/mongo.py +206 -0
- metadata_crawler/ingester/solr.py +282 -0
- metadata_crawler/logger.py +153 -0
- metadata_crawler/py.typed +0 -0
- metadata_crawler/run.py +419 -0
- metadata_crawler/utils/__init__.py +482 -0
- metadata_crawler/utils/cftime_utils.py +207 -0
- metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
- metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
- metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
- metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
- metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/api/metadata_stores.py
@@ -0,0 +1,755 @@
"""Metadata Storage definitions."""

from __future__ import annotations

import abc
import asyncio
import gzip
import json
import multiprocessing as mp
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from enum import Enum
from io import BytesIO
from multiprocessing import sharedctypes
from pathlib import Path
from types import NoneType
from typing import (
    Any,
    AsyncIterator,
    ClassVar,
    Dict,
    List,
    Literal,
    NamedTuple,
    Optional,
    Set,
    Tuple,
    Type,
    TypeAlias,
    Union,
    cast,
)

import fsspec
import orjson
import tomlkit
import yaml

import metadata_crawler

from ..logger import logger
from ..utils import (
    Counter,
    MetadataCrawlerException,
    QueueLike,
    SimpleQueueLike,
    create_async_iterator,
    parse_batch,
)
from .config import DRSConfig, SchemaField
from .storage_backend import MetadataType

ISO_FORMAT_REGEX = re.compile(
    r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z?$"
)

BATCH_SECS_THRESHOLD = 20
BATCH_ITEM = List[Tuple[str, Dict[str, Any]]]


ConsumerQueueType: TypeAlias = QueueLike[
    Union[int, Tuple[str, str, MetadataType]]
]
WriterQueueType: TypeAlias = SimpleQueueLike[Union[int, BATCH_ITEM]]


class Stream(NamedTuple):
    """A representation of a path stream as a named tuple."""

    name: str
    path: str


class DateTimeEncoder(json.JSONEncoder):
    """JSON‐Encoder that emits datetimes as ISO‐8601 strings."""

    def default(self, obj: Any) -> Any:
        """Set default time encoding."""
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)


class DateTimeDecoder(json.JSONDecoder):
    """JSON Decoder that converts ISO‐8601 strings to datetime objects."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(object_hook=self._decode_objects, *args, **kwargs)

    def _decode_datetime(self, obj: Any) -> Any:
        if isinstance(obj, list):
            return list(map(self._decode_datetime, obj))
        elif isinstance(obj, dict):
            for key in obj:
                obj[key] = self._decode_datetime(obj[key])
        if isinstance(obj, str):
            try:
                return datetime.fromisoformat(obj.replace("Z", "+00:00"))
            except ValueError:
                return obj
        return obj

    def _decode_objects(self, obj: Dict[str, Any]) -> Any:
        for key, value in obj.items():
            obj[key] = self._decode_datetime(value)
        return obj


class IndexName(NamedTuple):
    """A paired set of metadata index representations.

    - `latest`: Metadata for the latest version of each dataset.
    - `files`: Metadata for all available versions of datasets.

    This abstraction is backend-agnostic and can be used with any index system,
    such as Apache Solr cores, MongoDB collections, or SQL tables.

    """

    latest: str = "latest"
    all: str = "files"


class IndexStore:
    """Base class for all metadata stores."""

    suffix: ClassVar[str]
    """Path suffix of the metadata store."""

    driver: ClassVar[str]
    """Intake driver."""

    def __init__(
        self,
        path: str,
        index_name: IndexName,
        schema: Dict[str, SchemaField],
        batch_size: int = 25_000,
        mode: Literal["r", "w"] = "r",
        storage_options: Optional[Dict[str, Any]] = None,
        shadow: Optional[Union[str, List[str]]] = None,
        **kwargs: Any,
    ) -> None:
        self.storage_options = storage_options or {}
        self._shadow_options = (
            shadow or [] if isinstance(shadow, (list, NoneType)) else [shadow]
        )
        self._ctx = mp.get_context("spawn")
        self.queue: WriterQueueType = self._ctx.SimpleQueue()
        self._sent = 42
        self._fs, self._is_local_path = self.get_fs(path, **self.storage_options)
        self._path = self._fs.unstrip_protocol(path)
        self.schema = schema
        self.batch_size = batch_size
        self.index_names: Tuple[str, str] = (index_name.latest, index_name.all)
        self.mode = mode
        self._rows_since_flush = 0
        self._last_flush = time.time()
        self._paths: List[Stream] = []
        self.max_workers: int = max(1, (os.cpu_count() or 4))
        for name in self.index_names:
            out_path = self.get_path(name)
            self._paths.append(Stream(name=name, path=out_path))
        self._timestamp_keys: Set[str] = {
            k
            for k, col in schema.items()
            if getattr(getattr(col, "base_type", None), "value", None)
            == "timestamp"
        }

    @staticmethod
    def get_fs(
        uri: str, **storage_options: Any
    ) -> Tuple[fsspec.AbstractFileSystem, bool]:
        """Get the base-url from a path."""
        protocol, path = fsspec.core.split_protocol(uri)
        protocol = protocol or "file"
        add = {"anon": True} if protocol == "s3" else {}
        storage_options = storage_options or add
        fs = fsspec.filesystem(protocol, **storage_options)
        return fs, protocol == "file"

    @abc.abstractmethod
    async def read(
        self,
        index_name: str,
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """Yield batches of metadata records from a specific table.

        Parameters
        ^^^^^^^^^^
        index_name:
            The name of the index.

        Yields
        ^^^^^^
        List[Dict[str, Any]]:
            Deserialised metadata records.
        """
        yield [{}]  # pragma: no cover

    def get_path(self, path_suffix: Optional[str] = None) -> str:
        """Construct a path name for a given suffix."""
        path = self._path.removesuffix(self.suffix)
        new_path = (
            f"{path}-{path_suffix}{self.suffix}"
            if path_suffix
            else f"{path}{self.suffix}"
        )
        return new_path

    def join(self) -> None:
        """Shutdown the writer task."""
        self.queue.put(self._sent)
        if self.proc is not None:
            self.proc.join()

    def close(self) -> None:
        """Shutdown the write worker."""
        self.join()

    @property
    def proc(self) -> Optional["mp.process.BaseProcess"]:
        """The writer process."""
        raise NotImplementedError("This property must be defined.")

    @abc.abstractmethod
    def get_args(self, index_name: str) -> Dict[str, Any]:
        """Define the intake arguments."""
        ...  # pragma: no cover

    def catalogue_storage_options(
        self, path: Optional[str] = None
    ) -> Dict[str, Any]:
        """Construct the storage options for the catalogue."""
        is_s3 = (path or "").startswith("s3://")
        opts = {
            k: v
            for k, v in self.storage_options.items()
            if k not in self._shadow_options
        }
        shadow_keys = {
            "key",
            "secret",
            "token",
            "username",
            "user",
            "password",
            "secret_file",
            "secretfile",
        }
        opts |= {"anon": True} if is_s3 and not shadow_keys & opts.keys() else {}
        return opts


class JSONLineWriter:
    """Write JSONLines to disk."""

    def __init__(
        self,
        *streams: Stream,
        comp_level: int = 4,
        shadow: Optional[Union[str, List[str]]] = None,
        **storage_options: Any,
    ) -> None:

        self._comp_level = comp_level
        self._f: Dict[str, BytesIO] = {}
        self._streams = {s.name: s.path for s in streams}
        self._records = 0
        self.storage_options = storage_options
        for _stream in streams:
            fs, _ = IndexStore.get_fs(_stream.path, **storage_options)
            parent = os.path.dirname(_stream.path).rstrip("/")
            try:
                fs.makedirs(parent, exist_ok=True)
            except Exception:  # pragma: no cover
                pass  # pragma: no cover
            self._f[_stream.name] = fs.open(_stream.path, mode="wb")

    @classmethod
    def as_daemon(
        cls,
        queue: WriterQueueType,
        semaphore: int,
        *streams: Stream,
        comp_level: int = 4,
        **storage_options: Any,
    ) -> None:
        """Start the writer process as a daemon."""
        this = cls(*streams, comp_level=comp_level, **storage_options)
        get = queue.get
        add = this._add
        while True:
            item = get()
            if item == semaphore:
                logger.info("Closing writer task.")
                break
            try:
                add(cast(BATCH_ITEM, item))
            except Exception as error:
                logger.error(error)
        this.close()

    @staticmethod
    def _encode_records(records: List[Dict[str, Any]]) -> bytes:
        """Serialize a list of dicts into one JSONL bytes blob."""
        parts = [orjson.dumps(rec) for rec in records]
        return b"".join(p + b"\n" for p in parts)

    def _gzip_once(self, payload: bytes) -> bytes:
        """Compress a whole JSONL blob into a single gz member (fast)."""
        return gzip.compress(payload, compresslevel=self._comp_level)

    def _add(self, metadata_batch: List[Tuple[str, Dict[str, Any]]]) -> None:
        """Add a batch of metadata to the gzip store."""
        by_index: Dict[str, List[Dict[str, Any]]] = {
            name: [] for name in self._streams
        }
        for index_name, metadata in metadata_batch:
            by_index[index_name].append(metadata)
        for index_name, records in by_index.items():
            if not records:
                continue
            payload = self._encode_records(records)
            gz = self._gzip_once(payload)
            self._f[index_name].write(gz)
            self._records += len(records)

    def close(self) -> None:
        """Close the files."""
        for name, stream in self._f.items():
            try:
                stream.flush()
            except Exception:
                pass
            stream.close()
            if not self._records:
                fs, _ = IndexStore.get_fs(
                    self._streams[name], **self.storage_options
                )
                fs.rm(self._streams[name])


class JSONLines(IndexStore):
    """Write metadata to gzipped JSONLines files."""

    suffix = ".json.gz"
    driver = "intake.source.jsonfiles.JSONLinesFileSource"

    def __init__(
        self,
        path: str,
        index_name: IndexName,
        schema: Dict[str, SchemaField],
        mode: Literal["w", "r"] = "r",
        storage_options: Optional[Dict[str, Any]] = None,
        shadow: Optional[Union[str, List[str]]] = None,
        batch_size: int = 25_000,
        **kwargs: Any,
    ):
        super().__init__(
            path,
            index_name,
            schema,
            mode=mode,
            shadow=shadow,
            storage_options=storage_options,
            batch_size=batch_size,
        )
        _comp_level = int(kwargs.get("comp_level", "4"))
        self._proc: Optional["mp.process.BaseProcess"] = None
        if mode == "w":
            self._proc = self._ctx.Process(
                target=JSONLineWriter.as_daemon,
                args=(
                    self.queue,
                    self._sent,
                )
                + tuple(self._paths),
                kwargs={**{"comp_level": _comp_level}, **self.storage_options},
                daemon=True,
            )
            self._proc.start()

    @property
    def proc(self) -> Optional["mp.process.BaseProcess"]:
        """The writer process."""
        return self._proc

    def get_args(self, index_name: str) -> Dict[str, Any]:
        """Define the intake arguments."""
        path = self.get_path(index_name)
        return {
            "urlpath": path,
            "compression": "gzip",
            "text_mode": True,
            "storage_options": self.catalogue_storage_options(path),
        }

    async def read(
        self,
        index_name: str,
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """Yield batches of metadata records from a specific table.

        Parameters
        ^^^^^^^^^^
        index_name:
            The name of the index.

        Yields
        ^^^^^^
        List[Dict[str, Any]]:
            Deserialised metadata records.
        """
        loop = asyncio.get_running_loop()
        ts_keys = self._timestamp_keys
        path = self.get_path(index_name)
        with (
            self._fs.open(
                path,
                mode="rt",
                compression="gzip",
                encoding="utf-8",
            ) as stream,
            ThreadPoolExecutor(max_workers=self.max_workers) as pool,
        ):
            raw_lines: List[str] = []
            async for line in create_async_iterator(stream):
                raw_lines.append(line)
                if len(raw_lines) >= self.batch_size:
                    batch = await loop.run_in_executor(
                        pool, parse_batch, raw_lines, ts_keys
                    )
                    yield batch
                    raw_lines.clear()
            if raw_lines:
                batch = await loop.run_in_executor(
                    pool, parse_batch, raw_lines, ts_keys
                )
                yield batch


class CatalogueBackends(Enum):
    """Define the implemented catalogue backends."""

    jsonlines = JSONLines


CatalogueBackendType: TypeAlias = Literal["jsonlines"]


class CatalogueReader:
    """Backend for reading the content of an intake catalogue.

    Parameters
    ^^^^^^^^^^
    catalogue_file:
        Path to the intake catalogue.
    batch_size:
        Size of the metadata chunks that should be read.
    """

    def __init__(
        self,
        catalogue_file: Union[str, Path],
        batch_size: int = 2500,
        storage_options: Optional[Dict[str, Any]] = None,
    ) -> None:
        catalogue_file = str(catalogue_file)
        storage_options = storage_options or {}
        cat = self.load_catalogue(catalogue_file, **storage_options)
        _schema_json = cat["metadata"]["schema"]
        schema = {s["key"]: SchemaField(**s) for k, s in _schema_json.items()}
        index_name = IndexName(**cat["metadata"]["index_names"])
        cls: Type[IndexStore] = CatalogueBackends[
            cat["metadata"]["backend"]
        ].value
        storage_options = cat["metadata"].get("storage_options", {})
        self.store = cls(
            cat["metadata"]["prefix"],
            index_name,
            schema,
            mode="r",
            batch_size=batch_size,
            storage_options=storage_options,
        )

    @staticmethod
    def load_catalogue(path: Union[str, Path], **storage_options: Any) -> Any:
        """Load an intake YAML catalogue (remote or local)."""
        fs, _ = IndexStore.get_fs(str(path), **storage_options)
        cat_path = fs.unstrip_protocol(path)
        with fs.open(cat_path) as stream:
            return yaml.safe_load(stream.read())


class QueueConsumer:
    """Class that consumes the file discovery queue."""

    def __init__(
        self,
        config: Optional[Union[str, Path]],
        num_objects: "sharedctypes.Synchronized[Any]",
        writer_queue: WriterQueueType,
    ) -> None:
        self.config = DRSConfig.load(config)
        self._writer_queue = writer_queue
        self.num_objects = num_objects

    def _flush_batch(
        self,
        batch: List[Tuple[str, Dict[str, Any]]],
    ) -> None:
        logger.info("Ingesting %i items", len(batch))
        try:
            self._writer_queue.put(batch.copy())
            with self.num_objects.get_lock():
                self.num_objects.value += len(batch)
        except Exception as error:  # pragma: no cover
            logger.error(error)  # pragma: no cover
        batch.clear()

    @classmethod
    def run_consumer_task(
        cls,
        queue: ConsumerQueueType,
        writer_queue: WriterQueueType,
        config: Optional[Union[str, Path]],
        num_objects: "sharedctypes.Synchronized[Any]",
        batch_size: int,
        poison_pill: int,
    ) -> None:
        """Set up a consumer task waiting for incoming data to be ingested."""
        this = cls(config, num_objects, writer_queue)
        this_worker = os.getpid()
        logger.info("Adding %i consumer to consumers.", this_worker)
        batch: List[Tuple[str, Dict[str, Any]]] = []
        append = batch.append
        read_metadata = this.config.read_metadata
        flush = this._flush_batch
        get = queue.get
        while True:
            item = get()
            if item == poison_pill:
                break
            try:
                name, drs_type, inp = cast(Tuple[str, str, MetadataType], item)
                metadata = read_metadata(drs_type, inp)
            except MetadataCrawlerException as error:
                logger.warning(error)
                continue
            except Exception as error:
                logger.error(error)
                continue
            append((name, metadata))
            if len(batch) >= batch_size:
                flush(batch)
        if batch:
            flush(batch)
        logger.info("Closing consumer %i", this_worker)


class CatalogueWriter:
    """Create intake catalogues that store metadata entries.

    Parameters
    ^^^^^^^^^^
    yaml_path:
        Path to the intake catalogue that should be created.
    index_name:
        Names of the metadata indexes.
    data_store_prefix:
        Prefix of the path/url where the metadata is stored.
    batch_size:
        Size of the metadata chunks that should be added to the data store.
    index_schema:
        Schema of the metadata.
    storage_options:
        Set additional storage options for adding metadata to the metadata store.
    shadow:
        'Shadow' these storage options. This is useful to hide secrets in public
        data catalogues.
    """

    def __init__(
        self,
        yaml_path: str,
        index_name: IndexName,
        data_store_prefix: str = "metadata",
        backend: str = "jsonlines",
        batch_size: int = 25_000,
        config: Optional[
            Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
        ] = None,
        n_procs: Optional[int] = None,
        storage_options: Optional[Dict[str, Any]] = None,
        shadow: Optional[Union[str, List[str]]] = None,
        **kwargs: Any,
    ) -> None:
        self.config = DRSConfig.load(config)
        storage_options = storage_options or {}
        self.fs, _ = IndexStore.get_fs(yaml_path, **storage_options)
        self.path = self.fs.unstrip_protocol(yaml_path)
        scheme, _, _ = data_store_prefix.rpartition("://")
        self.backend = backend
        if not scheme and not os.path.isabs(data_store_prefix):
            data_store_prefix = os.path.join(
                os.path.abspath(os.path.dirname(yaml_path)), data_store_prefix
            )
        self.prefix = data_store_prefix
        self.index_name = index_name
        cls: Type[IndexStore] = CatalogueBackends[backend].value
        self.store = cls(
            data_store_prefix,
            index_name,
            self.config.index_schema,
            mode="w",
            storage_options=storage_options,
            shadow=shadow,
            **kwargs,
        )
        self._ctx = mp.get_context("spawn")
        self.queue: ConsumerQueueType = self._ctx.Queue()
        self._poison_pill = 13
        self.num_objects: Counter = self._ctx.Value("i", 0)
        n_procs = n_procs or min(mp.cpu_count(), 15)
        batch_size_per_proc = max(int(batch_size / n_procs), 100)
        self._tasks = [
            self._ctx.Process(
                target=QueueConsumer.run_consumer_task,
                args=(
                    self.queue,
                    self.store.queue,
                    config,
                    self.num_objects,
                    batch_size_per_proc,
                    self._poison_pill,
                ),
            )
            for i in range(n_procs)
        ]

    async def put(
        self,
        inp: MetadataType,
        drs_type: str,
        name: str = "",
    ) -> None:
        """Add items to the fifo queue.

        This method is used by the data crawling (discovery) method
        to add the name of the catalogue, the path to the input file object
        and a reference of the Data Reference Syntax class for this
        type of dataset.

        Parameters
        ^^^^^^^^^^
        inp:
            Path and metadata of the discovered object.
        drs_type:
            The data type the discovered object belongs to.
        name:
            Name of the catalogue, if applicable. This variable depends on
            the cataloguing system. For example, Apache Solr would use a `core`.
        """
        self.queue.put((name, drs_type, inp))

    @property
    def ingested_objects(self) -> int:
        """Get the number of ingested objects."""
        return self.num_objects.value

    @property
    def size(self) -> int:
        """Get the size of the worker queue."""
        return self.queue.qsize()

    def join_all_tasks(self) -> None:
        """Block the execution until all tasks are marked as done."""
        logger.debug("Releasing consumers from their duty.")
        for _ in self._tasks:
            self.queue.put(self._poison_pill)
        for task in self._tasks:
            task.join()
        self.store.join()

    async def close(self, create_catalogue: bool = True) -> None:
        """Close any connections."""
        self.store.join()
        self.store.close()
        if create_catalogue:
            self._create_catalogue_file()

    async def delete(self) -> None:
        """Delete all stores."""
        await self.close(False)
        for name in self.index_name.latest, self.index_name.all:
            path = self.store.get_path(name)
            self.store._fs.rm(path) if self.store._fs.exists(path) else None
        self.fs.rm(self.path) if self.fs.exists(self.path) else None

    def run_consumer(self) -> None:
        """Set up all the consumers."""
        for task in self._tasks:
            task.start()

    def _create_catalogue_file(self) -> None:
        catalog = {
            "description": (
                f"{metadata_crawler.__name__} "
                f"(v{metadata_crawler.__version__})"
                f" at {datetime.now().strftime('%c')}"
            ),
            "metadata": {
                "version": 1,
                "backend": self.backend,
                "prefix": self.prefix,
                "storage_options": self.store.catalogue_storage_options(
                    self.prefix
                ),
                "index_names": {
                    "latest": self.index_name.latest,
                    "all": self.index_name.all,
                },
                "indexed_objects": self.ingested_objects,
                "schema": {
                    k: json.loads(s.model_dump_json())
                    for k, s in self.store.schema.items()
                },
            },
            "sources": {
                self.index_name.latest: {
                    "description": "Latest metadata versions.",
                    "driver": self.store.driver,
                    "args": self.store.get_args(self.index_name.latest),
                },
                self.index_name.all: {
                    "description": "All metadata versions only.",
                    "driver": self.store.driver,
                    "args": self.store.get_args(self.index_name.all),
                },
            },
        }
        with self.fs.open(self.path, "w", encoding="utf-8") as f:
            yaml.safe_dump(
                catalog,
                f,
                sort_keys=False,  # preserve our ordering
                default_flow_style=False,
            )
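
For orientation, here is a minimal sketch of how the CatalogueReader defined above might be used to stream records back out of a crawled catalogue. The catalogue path "catalogue.yaml" and the batch size are illustrative assumptions; the real values come from whatever CatalogueWriter produced for your data.

import asyncio

from metadata_crawler.api.metadata_stores import CatalogueReader


async def dump_latest(catalogue_file: str) -> None:
    # CatalogueReader resolves the backend, schema and index names from the
    # catalogue's "metadata" section and instantiates the matching IndexStore.
    reader = CatalogueReader(catalogue_file, batch_size=5_000)
    latest_name, _all_name = reader.store.index_names
    # IndexStore.read() is an async generator that yields lists of records.
    async for batch in reader.store.read(latest_name):
        print(f"read {len(batch)} metadata records")


if __name__ == "__main__":
    asyncio.run(dump_latest("catalogue.yaml"))  # hypothetical catalogue path

On the writing side, the lifecycle suggested by the code above is: CatalogueWriter.run_consumer() starts the consumer processes, the discovery code feeds them via await writer.put(...), and join_all_tasks() followed by await writer.close() flushes the gzipped JSONLines stores and writes the intake YAML catalogue.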