metadata-crawler 2509.0.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of metadata-crawler might be problematic.

Files changed (34)
  1. metadata_crawler/__init__.py +248 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +801 -0
  7. metadata_crawler/api/drs_config.toml +439 -0
  8. metadata_crawler/api/index.py +132 -0
  9. metadata_crawler/api/metadata_stores.py +749 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +136 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +539 -0
  22. metadata_crawler/data_collector.py +258 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +193 -0
  25. metadata_crawler/ingester/solr.py +152 -0
  26. metadata_crawler/logger.py +142 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +373 -0
  29. metadata_crawler/utils.py +411 -0
  30. metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
  31. metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
  32. metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
  33. metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
  34. metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
metadata_crawler/api/metadata_stores.py
@@ -0,0 +1,749 @@
+"""Metadata Storage definitions."""
+
+from __future__ import annotations
+
+import abc
+import asyncio
+import gzip
+import json
+import multiprocessing as mp
+import os
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+from enum import Enum
+from io import BytesIO
+from multiprocessing import sharedctypes
+from pathlib import Path
+from types import NoneType
+from typing import (
+    Any,
+    AsyncIterator,
+    ClassVar,
+    Dict,
+    List,
+    Literal,
+    NamedTuple,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    TypeAlias,
+    Union,
+    cast,
+)
+
+import fsspec
+import orjson
+import tomlkit
+import yaml
+
+import metadata_crawler
+
+from ..logger import logger
+from ..utils import (
+    Counter,
+    MetadataCrawlerException,
+    QueueLike,
+    SimpleQueueLike,
+    create_async_iterator,
+    parse_batch,
+)
+from .config import DRSConfig, SchemaField
+from .storage_backend import MetadataType
+
+ISO_FORMAT_REGEX = re.compile(
+    r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z?$"
+)
+
+BATCH_SECS_THRESHOLD = 20
+BATCH_ITEM = List[Tuple[str, Dict[str, Any]]]
+
+
+ConsumerQueueType: TypeAlias = QueueLike[
+    Union[int, Tuple[str, str, MetadataType]]
+]
+WriterQueueType: TypeAlias = SimpleQueueLike[Union[int, BATCH_ITEM]]
+
+
+class Stream(NamedTuple):
+    """A representation of a path stream as a named tuple."""
+
+    name: str
+    path: str
+
+
+class DateTimeEncoder(json.JSONEncoder):
+    """JSON encoder that emits datetimes as ISO-8601 strings."""
+
+    def default(self, obj: Any) -> Any:
+        """Set default time encoding."""
+        if isinstance(obj, datetime):
+            return obj.isoformat()
+        return super().default(obj)
+
+
+class DateTimeDecoder(json.JSONDecoder):
+    """JSON decoder that converts ISO-8601 strings to datetime objects."""
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(object_hook=self._decode_objects, *args, **kwargs)
+
+    def _decode_datetime(self, obj: Any) -> Any:
+        if isinstance(obj, list):
+            return list(map(self._decode_datetime, obj))
+        elif isinstance(obj, dict):
+            for key in obj:
+                obj[key] = self._decode_datetime(obj[key])
+        if isinstance(obj, str):
+            try:
+                return datetime.fromisoformat(obj.replace("Z", "+00:00"))
+            except ValueError:
+                return obj
+        return obj
+
+    def _decode_objects(self, obj: Dict[str, Any]) -> Any:
+        for key, value in obj.items():
+            obj[key] = self._decode_datetime(value)
+        return obj
+
+
+class IndexName(NamedTuple):
+    """A paired set of metadata index representations.
+
+    - `latest`: Metadata for the latest version of each dataset.
+    - `files`: Metadata for all available versions of datasets.
+
+    This abstraction is backend-agnostic and can be used with any index system,
+    such as Apache Solr cores, MongoDB collections, or SQL tables.
+
+    """
+
+    latest: str = "latest"
+    all: str = "files"
+
+
+class IndexStore:
+    """Base class for all metadata stores."""
+
+    suffix: ClassVar[str]
+    """Path suffix of the metadata store."""
+
+    driver: ClassVar[str]
+    """Intake driver."""
+
+    def __init__(
+        self,
+        path: str,
+        index_name: IndexName,
+        schema: Dict[str, SchemaField],
+        batch_size: int = 25_000,
+        mode: Literal["r", "w"] = "r",
+        storage_options: Optional[Dict[str, Any]] = None,
+        shadow: Optional[Union[str, List[str]]] = None,
+        **kwargs: Any,
+    ) -> None:
+        self.storage_options = storage_options or {}
+        self._shadow_options = (
+            shadow or [] if isinstance(shadow, (list, NoneType)) else [shadow]
+        )
+        self._ctx = mp.get_context("spawn")
+        self.queue: WriterQueueType = self._ctx.SimpleQueue()
+        self._sent = 42
+        self._fs, self._is_local_path = self.get_fs(path, **self.storage_options)
+        self._path = self._fs.unstrip_protocol(path)
+        self.schema = schema
+        self.batch_size = batch_size
+        self.index_names: Tuple[str, str] = (index_name.latest, index_name.all)
+        self.mode = mode
+        self._rows_since_flush = 0
+        self._last_flush = time.time()
+        self._paths: List[Stream] = []
+        self.max_workers: int = max(1, (os.cpu_count() or 4))
+        for name in self.index_names:
+            out_path = self.get_path(name)
+            self._paths.append(Stream(name=name, path=out_path))
+        self._timestamp_keys: Set[str] = {
+            k
+            for k, col in schema.items()
+            if getattr(getattr(col, "base_type", None), "value", None)
+            == "timestamp"
+        }
+
+    @staticmethod
+    def get_fs(
+        uri: str, **storage_options: Any
+    ) -> Tuple[fsspec.AbstractFileSystem, bool]:
+        """Get the base-url from a path."""
+        protocol, path = fsspec.core.split_protocol(uri)
+        protocol = protocol or "file"
+        add = {"anon": True} if protocol == "s3" else {}
+        storage_options = storage_options or add
+        fs = fsspec.filesystem(protocol, **storage_options)
+        return fs, protocol == "file"
+
+    @abc.abstractmethod
+    async def read(
+        self,
+        index_name: str,
+    ) -> AsyncIterator[List[Dict[str, Any]]]:
+        """Yield batches of metadata records from a specific table.
+
+        Parameters
+        ^^^^^^^^^^
+        index_name:
+            The name of the index.
+
+        Yields
+        ^^^^^^
+        List[Dict[str, Any]]:
+            Deserialised metadata records.
+        """
+        yield [{}]  # pragma: no cover
+
+    def get_path(self, path_suffix: Optional[str] = None) -> str:
+        """Construct a path name for a given suffix."""
+        path = self._path.removesuffix(self.suffix)
+        new_path = (
+            f"{path}-{path_suffix}{self.suffix}"
+            if path_suffix
+            else f"{path}{self.suffix}"
+        )
+        return new_path
+
+    def join(self) -> None:
+        """Shut down the writer task."""
+        self.queue.put(self._sent)
+        if self.proc is not None:
+            self.proc.join()
+
+    def close(self) -> None:
+        """Shut down the write worker."""
+        self.join()
+
+    @property
+    def proc(self) -> Optional["mp.process.BaseProcess"]:
+        """The writer process."""
+        raise NotImplementedError("This property must be defined.")
+
+    @abc.abstractmethod
+    def get_args(self, index_name: str) -> Dict[str, Any]:
+        """Define the intake arguments."""
+        ...  # pragma: no cover
+
+    def catalogue_storage_options(
+        self, path: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Construct the storage options for the catalogue."""
+        is_s3 = (path or "").startswith("s3://")
+        opts = {
+            k: v
+            for k, v in self.storage_options.items()
+            if k not in self._shadow_options
+        }
+        shadow_keys = {
+            "key",
+            "secret",
+            "token",
+            "username",
+            "user",
+            "password",
+            "secret_file",
+            "secretfile",
+        }
+        opts |= {"anon": True} if is_s3 and not shadow_keys & opts.keys() else {}
+        return opts
+
+
+class JSONLineWriter:
+    """Write JSONLines to disk."""
+
+    def __init__(
+        self,
+        *streams: Stream,
+        comp_level: int = 4,
+        shadow: Optional[Union[str, List[str]]] = None,
+        **storage_options: Any,
+    ) -> None:
+
+        self._comp_level = comp_level
+        self._f: Dict[str, BytesIO] = {}
+        self._streams = {s.name: s.path for s in streams}
+        self._records = 0
+        self.storage_options = storage_options
+        for _stream in streams:
+            fs, _ = IndexStore.get_fs(_stream.path, **storage_options)
+            parent = os.path.dirname(_stream.path).rstrip("/")
+            try:
+                fs.makedirs(parent, exist_ok=True)
+            except Exception:  # pragma: no cover
+                pass  # pragma: no cover
+            self._f[_stream.name] = fs.open(_stream.path, mode="wb")
+
+    @classmethod
+    def as_daemon(
+        cls,
+        queue: WriterQueueType,
+        semaphore: int,
+        *streams: Stream,
+        comp_level: int = 4,
+        **storage_options: Any,
+    ) -> None:
+        """Start the writer process as a daemon."""
+        this = cls(*streams, comp_level=comp_level, **storage_options)
+        get = queue.get
+        add = this._add
+        while True:
+            item = get()
+            if item == semaphore:
+                logger.info("Closing writer task.")
+                break
+            try:
+                add(cast(BATCH_ITEM, item))
+            except Exception as error:
+                logger.error(error)
+        this.close()
+
+    @staticmethod
+    def _encode_records(records: List[Dict[str, Any]]) -> bytes:
+        """Serialize a list of dicts into one JSONL bytes blob."""
+        parts = [orjson.dumps(rec) for rec in records]
+        return b"".join(p + b"\n" for p in parts)
+
+    def _gzip_once(self, payload: bytes) -> bytes:
+        """Compress a whole JSONL blob into a single gz member (fast)."""
+        return gzip.compress(payload, compresslevel=self._comp_level)
+
+    def _add(self, metadata_batch: List[Tuple[str, Dict[str, Any]]]) -> None:
+        """Add a batch of metadata to the gzip store."""
+        by_index: Dict[str, List[Dict[str, Any]]] = {
+            name: [] for name in self._streams
+        }
+        for index_name, metadata in metadata_batch:
+            by_index[index_name].append(metadata)
+        for index_name, records in by_index.items():
+            if not records:
+                continue
+            payload = self._encode_records(records)
+            gz = self._gzip_once(payload)
+            self._f[index_name].write(gz)
+            self._records += len(records)
+
+    def close(self) -> None:
+        """Close the files."""
+        for name, stream in self._f.items():
+            try:
+                stream.flush()
+            except Exception:
+                pass
+            stream.close()
+            if not self._records:
+                fs, _ = IndexStore.get_fs(
+                    self._streams[name], **self.storage_options
+                )
+                fs.rm(self._streams[name])
+
+
+class JSONLines(IndexStore):
+    """Write metadata to gzipped JSONLines files."""
+
+    suffix = ".json.gz"
+    driver = "intake.source.jsonfiles.JSONLinesFileSource"
+
+    def __init__(
+        self,
+        path: str,
+        index_name: IndexName,
+        schema: Dict[str, SchemaField],
+        mode: Literal["w", "r"] = "r",
+        storage_options: Optional[Dict[str, Any]] = None,
+        shadow: Optional[Union[str, List[str]]] = None,
+        batch_size: int = 25_000,
+        **kwargs: Any,
+    ):
+        super().__init__(
+            path,
+            index_name,
+            schema,
+            mode=mode,
+            shadow=shadow,
+            storage_options=storage_options,
+            batch_size=batch_size,
+        )
+        _comp_level = int(kwargs.get("comp_level", "4"))
+        self._proc: Optional["mp.process.BaseProcess"] = None
+        if mode == "w":
+            self._proc = self._ctx.Process(
+                target=JSONLineWriter.as_daemon,
+                args=(
+                    self.queue,
+                    self._sent,
+                )
+                + tuple(self._paths),
+                kwargs={**{"comp_level": _comp_level}, **self.storage_options},
+                daemon=True,
+            )
+            self._proc.start()
+
+    @property
+    def proc(self) -> Optional["mp.process.BaseProcess"]:
+        """The writer process."""
+        return self._proc
+
+    def get_args(self, index_name: str) -> Dict[str, Any]:
+        """Define the intake arguments."""
+        path = self.get_path(index_name)
+        return {
+            "urlpath": path,
+            "compression": "gzip",
+            "text_mode": True,
+            "storage_options": self.catalogue_storage_options(path),
+        }
+
+    async def read(
+        self,
+        index_name: str,
+    ) -> AsyncIterator[List[Dict[str, Any]]]:
+        """Yield batches of metadata records from a specific table.
+
+        Parameters
+        ^^^^^^^^^^
+        index_name:
+            The name of the index.
+
+        Yields
+        ^^^^^^
+        List[Dict[str, Any]]:
+            Deserialised metadata records.
+        """
+        loop = asyncio.get_running_loop()
+        ts_keys = self._timestamp_keys
+        path = self.get_path(index_name)
+        with (
+            self._fs.open(
+                path,
+                mode="rt",
+                compression="gzip",
+                encoding="utf-8",
+            ) as stream,
+            ThreadPoolExecutor(max_workers=self.max_workers) as pool,
+        ):
+            raw_lines: List[str] = []
+            async for line in create_async_iterator(stream):
+                raw_lines.append(line)
+                if len(raw_lines) >= self.batch_size:
+                    batch = await loop.run_in_executor(
+                        pool, parse_batch, raw_lines, ts_keys
+                    )
+                    yield batch
+                    raw_lines.clear()
+            if raw_lines:
+                batch = await loop.run_in_executor(
+                    pool, parse_batch, raw_lines, ts_keys
+                )
+                yield batch
+
+
+class CatalogueBackends(Enum):
+    """Define the implemented catalogue backends."""
+
+    jsonlines = JSONLines
+
+
+CatalogueBackendType: TypeAlias = Literal["jsonlines"]
+
+
+class CatalogueReader:
+    """Backend for reading the content of an intake catalogue.
+
+    Parameters
+    ^^^^^^^^^^
+    catalogue_file:
+        Path to the intake catalogue.
+    batch_size:
+        Size of the metadata chunks that should be read.
+    """
+
+    def __init__(
+        self,
+        catalogue_file: Union[str, Path],
+        batch_size: int = 2500,
+        storage_options: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        catalogue_file = str(catalogue_file)
+        storage_options = storage_options or {}
+        fs, _ = IndexStore.get_fs(catalogue_file, **storage_options)
+        path = fs.unstrip_protocol(catalogue_file)
+        with fs.open(path) as stream:
+            cat = yaml.safe_load(stream.read())
+        _schema_json = cat["metadata"]["schema"]
+        schema = {s["key"]: SchemaField(**s) for k, s in _schema_json.items()}
+        index_name = IndexName(**cat["metadata"]["index_names"])
+        cls: Type[IndexStore] = CatalogueBackends[
+            cat["metadata"]["backend"]
+        ].value
+        storage_options = cat["metadata"].get("storage_options", {})
+        self.store = cls(
+            cat["metadata"]["prefix"],
+            index_name,
+            schema,
+            mode="r",
+            batch_size=batch_size,
+            storage_options=storage_options,
+        )
+
+
+class QueueConsumer:
+    """Class that consumes the file discovery queue."""
+
+    def __init__(
+        self,
+        config: Optional[Union[str, Path]],
+        num_objects: "sharedctypes.Synchronized[Any]",
+        writer_queue: WriterQueueType,
+    ) -> None:
+        self.config = DRSConfig.load(config)
+        self._writer_queue = writer_queue
+        self.num_objects = num_objects
+
+    def _flush_batch(
+        self,
+        batch: List[Tuple[str, Dict[str, Any]]],
+    ) -> None:
+        logger.info("Ingesting %i items", len(batch))
+        try:
+            self._writer_queue.put(batch.copy())
+            with self.num_objects.get_lock():
+                self.num_objects.value += len(batch)
+        except Exception as error:  # pragma: no cover
+            logger.error(error)  # pragma: no cover
+        batch.clear()
+
+    @classmethod
+    def run_consumer_task(
+        cls,
+        queue: ConsumerQueueType,
+        writer_queue: WriterQueueType,
+        config: Optional[Union[str, Path]],
+        num_objects: "sharedctypes.Synchronized[Any]",
+        batch_size: int,
+        poison_pill: int,
+    ) -> None:
+        """Set up a consumer task waiting for incoming data to be ingested."""
+        this = cls(config, num_objects, writer_queue)
+        this_worker = os.getpid()
+        logger.info("Adding %i consumer to consumers.", this_worker)
+        batch: List[Tuple[str, Dict[str, Any]]] = []
+        append = batch.append
+        read_metadata = this.config.read_metadata
+        flush = this._flush_batch
+        get = queue.get
+        while True:
+            item = get()
+            if item == poison_pill:
+                break
+            try:
+                name, drs_type, inp = cast(Tuple[str, str, MetadataType], item)
+                metadata = read_metadata(drs_type, inp)
+            except MetadataCrawlerException as error:
+                logger.warning(error)
+                continue
+            except Exception as error:
+                logger.error(error)
+                continue
+            append((name, metadata))
+            if len(batch) >= batch_size:
+                flush(batch)
+        if batch:
+            flush(batch)
+        logger.info("Closing consumer %i", this_worker)
+
+
+class CatalogueWriter:
+    """Create intake catalogues that store metadata entries.
+
+    Parameters
+    ^^^^^^^^^^
+    yaml_path:
+        Path to the intake catalogue that should be created.
+    index_name:
+        Names of the metadata indexes.
+    data_store_prefix:
+        Prefix of the path/url where the metadata is stored.
+    batch_size:
+        Size of the metadata chunks that should be added to the data store.
+    index_schema:
+        Schema of the metadata.
+    storage_options:
+        Set additional storage options for adding metadata to the metadata store.
+    shadow:
+        'Shadow' these storage options. This is useful to hide secrets in public
+        data catalogues.
+    """
+
+    def __init__(
+        self,
+        yaml_path: str,
+        index_name: IndexName,
+        data_store_prefix: str = "metadata",
+        backend: str = "jsonlines",
+        batch_size: int = 25_000,
+        config: Optional[
+            Union[Path, str, Dict[str, Any], tomlkit.TOMLDocument]
+        ] = None,
+        n_procs: Optional[int] = None,
+        storage_options: Optional[Dict[str, Any]] = None,
+        shadow: Optional[Union[str, List[str]]] = None,
+        **kwargs: Any,
+    ) -> None:
+        self.config = DRSConfig.load(config)
+        storage_options = storage_options or {}
+        self.fs, _ = IndexStore.get_fs(yaml_path, **storage_options)
+        self.path = self.fs.unstrip_protocol(yaml_path)
+        scheme, _, _ = data_store_prefix.rpartition("://")
+        self.backend = backend
+        if not scheme and not os.path.isabs(data_store_prefix):
+            data_store_prefix = os.path.join(
+                os.path.abspath(os.path.dirname(yaml_path)), data_store_prefix
+            )
+        self.prefix = data_store_prefix
+        self.index_name = index_name
+        cls: Type[IndexStore] = CatalogueBackends[backend].value
+        self.store = cls(
+            data_store_prefix,
+            index_name,
+            self.config.index_schema,
+            mode="w",
+            storage_options=storage_options,
+            shadow=shadow,
+            **kwargs,
+        )
+        self._ctx = mp.get_context("spawn")
+        self.queue: ConsumerQueueType = self._ctx.Queue()
+        self._poison_pill = 13
+        self.num_objects: Counter = self._ctx.Value("i", 0)
+        n_procs = n_procs or min(mp.cpu_count(), 15)
+        batch_size_per_proc = max(int(batch_size / n_procs), 100)
+        self._tasks = [
+            self._ctx.Process(
+                target=QueueConsumer.run_consumer_task,
+                args=(
+                    self.queue,
+                    self.store.queue,
+                    config,
+                    self.num_objects,
+                    batch_size_per_proc,
+                    self._poison_pill,
+                ),
+            )
+            for i in range(n_procs)
+        ]
+
+    async def put(
+        self,
+        inp: MetadataType,
+        drs_type: str,
+        name: str = "",
+    ) -> None:
+        """Add items to the FIFO queue.
+
+        This method is used by the data crawling (discovery) method
+        to add the name of the catalogue, the path to the input file object
+        and a reference of the Data Reference Syntax class for this
+        type of dataset.
+
+        Parameters
+        ^^^^^^^^^^
+        inp:
+            Path and metadata of the discovered object.
+        drs_type:
+            The data type the discovered object belongs to.
+        name:
+            Name of the catalogue, if applicable. This variable depends on
+            the cataloguing system. For example, Apache Solr would use a `core`.
+        """
+        self.queue.put((name, drs_type, inp))
+
+    @property
+    def ingested_objects(self) -> int:
+        """Get the number of ingested objects."""
+        return self.num_objects.value
+
+    @property
+    def size(self) -> int:
+        """Get the size of the worker queue."""
+        return self.queue.qsize()
+
+    def join_all_tasks(self) -> None:
+        """Block the execution until all tasks are marked as done."""
+        logger.debug("Releasing consumers from their duty.")
+        for _ in self._tasks:
+            self.queue.put(self._poison_pill)
+        for task in self._tasks:
+            task.join()
+        self.store.join()
+
+    async def close(self, create_catalogue: bool = True) -> None:
+        """Close any connections."""
+        self.store.join()
+        self.store.close()
+        if create_catalogue:
+            self._create_catalogue_file()
+
+    async def delete(self) -> None:
+        """Delete all stores."""
+        await self.close(False)
+        for name in self.index_name.latest, self.index_name.all:
+            path = self.store.get_path(name)
+            self.store._fs.rm(path) if self.store._fs.exists(path) else None
+        self.fs.rm(self.path) if self.fs.exists(self.path) else None
+
+    def run_consumer(self) -> None:
+        """Set up all the consumers."""
+        for task in self._tasks:
+            task.start()
+
+    def _create_catalogue_file(self) -> None:
+        catalog = {
+            "description": (
+                f"{metadata_crawler.__name__} "
+                f"(v{metadata_crawler.__version__})"
+                f" at {datetime.now().strftime('%c')}"
+            ),
+            "metadata": {
+                "version": 1,
+                "backend": self.backend,
+                "prefix": self.prefix,
+                "storage_options": self.store.catalogue_storage_options(
+                    self.prefix
+                ),
+                "index_names": {
+                    "latest": self.index_name.latest,
+                    "all": self.index_name.all,
+                },
+                "schema": {
+                    k: json.loads(s.model_dump_json())
+                    for k, s in self.store.schema.items()
+                },
+            },
+            "sources": {
+                self.index_name.latest: {
+                    "description": "Latest metadata versions.",
+                    "driver": self.store.driver,
+                    "args": self.store.get_args(self.index_name.latest),
+                },
+                self.index_name.all: {
+                    "description": "All metadata versions only.",
+                    "driver": self.store.driver,
+                    "args": self.store.get_args(self.index_name.all),
+                },
+            },
+        }
+        with self.fs.open(self.path, "w", encoding="utf-8") as f:
+            yaml.safe_dump(
+                catalog,
+                f,
+                sort_keys=False,  # preserve our ordering
+                default_flow_style=False,
+            )