ctao-bdms-clients 0.1.0rc3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bdms/_version.py CHANGED
@@ -1,8 +1,13 @@
1
- # file generated by setuptools_scm
1
+ # file generated by setuptools-scm
2
2
  # don't change, don't track in version control
3
+
4
+ __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
5
+
3
6
  TYPE_CHECKING = False
4
7
  if TYPE_CHECKING:
5
- from typing import Tuple, Union
8
+ from typing import Tuple
9
+ from typing import Union
10
+
6
11
  VERSION_TUPLE = Tuple[Union[int, str], ...]
7
12
  else:
8
13
  VERSION_TUPLE = object
@@ -12,5 +17,5 @@ __version__: str
12
17
  __version_tuple__: VERSION_TUPLE
13
18
  version_tuple: VERSION_TUPLE
14
19
 
15
- __version__ = version = '0.1.0rc3'
16
- __version_tuple__ = version_tuple = (0, 1, 0)
20
+ __version__ = version = '0.2.0'
21
+ __version_tuple__ = version_tuple = (0, 2, 0)
@@ -0,0 +1,479 @@
1
+ """Module for ACADA data ingestion (onsite) into the BDMS system using the IngestionClient.
2
+
3
+ This module provides the IngestionClient class to manage the ingestion of ACADA data into the BDMS system.
4
+ It includes functionality for constructing FITS file paths, converting ACADA paths to Logical File Names (LFNs),
5
+ and registering replicas in Rucio.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ from contextlib import ExitStack
11
+ from pathlib import Path
12
+ from typing import Optional, Union
13
+
14
+ from astropy.io import fits
15
+ from rucio.client.accountclient import AccountClient
16
+ from rucio.client.client import Client, DIDClient
17
+ from rucio.client.replicaclient import ReplicaClient
18
+ from rucio.client.rseclient import RSEClient
19
+ from rucio.client.ruleclient import RuleClient
20
+ from rucio.client.scopeclient import ScopeClient
21
+ from rucio.common.exception import Duplicate, RucioException
22
+ from rucio.common.utils import adler32
23
+
24
+ from bdms.extract_fits_metadata import (
25
+ extract_metadata_from_data,
26
+ extract_metadata_from_headers,
27
+ )
28
+
29
+ LOGGER = logging.getLogger(__name__)
30
+
31
+
32
+ __all__ = [
33
+ "IngestionClient",
34
+ ]
35
+
36
+
class IngestionClient:
    """A client for BDMS ingestion and replication.

    This class provides methods to ingest ACADA data into the BDMS system, including converting ACADA paths to
    Logical File Names (LFNs), registering replicas in Rucio, and replicating data to offsite RSEs.

    Parameters
    ----------
    data_path : str
        Path to data directory. This is a required argument.
    rse : str
        Rucio Storage Element (RSE) name. This is a required argument.
    vo : str, optional
        Virtual organization name prefix. Defaults to "ctao".
    logger : logging.Logger, optional
        Logger instance. If None, a new logger is created.
    scope : str, optional
        Rucio scope to use for replica registration. Defaults to 'acada'.

    Raises
    ------
    FileNotFoundError
        If the specified data directory does not exist.
    ValueError
        If the specified RSE is not available in Rucio.
    RuntimeError
        If there is an error communicating with Rucio while:

        - Checking RSE availability.
        - Initializing Rucio clients (related to configuration and authentication issues).
        - Managing the Rucio scope.
    """

    def __init__(
        self,
        data_path: Union[str, os.PathLike],
        rse: str,
        vo="ctao",
        logger=None,
        scope="acada",
    ) -> None:
        self.logger = logger or LOGGER.getChild(self.__class__.__name__)
        self.vo = vo

        # Set data path (prefix); fail early if it does not exist
        self.data_path = Path(data_path)
        if not self.data_path.is_dir():
            raise FileNotFoundError(f"Data directory not found at {self.data_path}")

        self.rse = rse

        # Check RSE availability before proceeding to next steps
        self._check_rse_availability()

        # Initialize Rucio clients
        try:
            self.client = Client()
            self.replica_client = ReplicaClient()
            self.scope_client = ScopeClient()
            self.account_client = AccountClient()
            self.rse_client = RSEClient()
            self.rule_client = RuleClient()
            self.did_client = DIDClient()
        except RucioException as e:
            self.logger.error("Failed to initialize Rucio clients: %s", str(e))
            raise

        # Set the scope and ensure it exists in Rucio
        self.scope = scope
        self.user = self.account_client.whoami()["account"]
        self._add_acada_scope()

    def _check_rse_availability(self) -> None:
        """Check if the specified RSE is available in Rucio.

        Raises
        ------
        ValueError
            If the RSE is not found in Rucio.
        rucio.common.exception.RucioException
            If there is an error communicating with Rucio (e.g., network issues, authentication errors).
        """
        # A local RSEClient is used because this runs before the clients
        # are initialized in __init__.
        rse_client = RSEClient()
        available_rses = [rse["rse"] for rse in rse_client.list_rses()]
        if self.rse not in available_rses:
            raise ValueError(
                f"RSE '{self.rse}' is not available in Rucio. Available RSEs: {available_rses}"
            )
        self.logger.info("RSE '%s' is available in Rucio", self.rse)

    def _add_acada_scope(self) -> None:
        """Add the specified scope to Rucio if it doesn't already exist.

        Raises
        ------
        RuntimeError
            If the scope cannot be created or managed in Rucio.
        """
        try:
            self.scope_client.add_scope(self.user, self.scope)
        except Duplicate:
            # Scope already exists
            return
        except RucioException as e:
            self.logger.error(
                "Failed to manage scope '%s' in Rucio: %s",
                self.scope,
                str(e),
            )
            raise

    def acada_to_lfn(self, acada_path) -> str:
        """Convert an ACADA path to a BDMS Logical File Name (LFN).

        Parameters
        ----------
        acada_path : str or Path
            The ACADA file path to convert.

        Returns
        -------
        str
            The generated BDMS LFN (e.g., '/ctao/acada/DL0/LSTN-01/events/YYYY/MM/DD/file.fits.fz').

        Raises
        ------
        ValueError
            If ``acada_path`` is not an absolute path or is not within the BDMS data path (prefix) or
            does not start with the expected '<vo>/<scope>' prefix under the data path.
        """
        acada_path = Path(acada_path)

        # Validate that the path is absolute
        if not acada_path.is_absolute():
            raise ValueError("acada_path must be absolute")

        # Validate that acada_path is within data_path
        try:
            rel_path = acada_path.relative_to(self.data_path)
        except ValueError:
            # Suppress the original exception context; our message is complete.
            raise ValueError(
                f"acada_path {acada_path} is not within data_path {self.data_path}"
            ) from None

        # Validate that acada_path starts with <vo>/<scope> under data_path
        expected_prefix = self.data_path / self.vo / self.scope
        if not acada_path.is_relative_to(expected_prefix):
            raise ValueError(
                f"acada_path {acada_path} must start with {expected_prefix} (vo: {self.vo}, scope: {self.scope})"
            )

        bdms_lfn = f"/{rel_path}"
        return bdms_lfn

    def check_replica_exists(self, lfn: str) -> bool:
        """Check if a replica already exists for the given LFN on the specified RSE.

        Parameters
        ----------
        lfn : str
            The Logical File Name (LFN) to check.

        Returns
        -------
        bool
            True if the replica exists and has a valid PFN, False otherwise.

        Raises
        ------
        RuntimeError
            If a replica exists but has no PFN for the RSE, indicating an invalid replica state.
        """
        replicas = list(
            self.replica_client.list_replicas(
                dids=[{"scope": self.scope, "name": lfn}],
                rse_expression=self.rse,
            )
        )

        self.logger.debug("Existing Replicas for lfn '%r'", replicas)
        if replicas:
            replica = replicas[0]
            pfns = replica["rses"].get(self.rse, [])
            if not pfns:
                raise RuntimeError(
                    f"No PFN found for existing replica with LFN {lfn} on {self.rse}"
                )
            return True
        return False

    def add_onsite_replica(self, acada_path) -> str:
        """Register a file as a replica in Rucio on the specified RSE and retrieve its LFN.

        Parameters
        ----------
        acada_path : str or Path
            The ACADA path where the file is located.

        Returns
        -------
        str
            The Logical File Name (LFN) of the registered replica.

        Raises
        ------
        FileNotFoundError
            If the file does not exist at ``acada_path``.
        RuntimeError
            In the following cases:
            - If a replica already exists but has no PFN for the RSE (raised by `check_replica_exists`).
            - If the ``ReplicaClient.add_replica`` call fails during registration (e.g., due to a Rucio server issue).
        """
        acada_path = Path(acada_path)
        self.logger.debug("Starting ingestion for path '%s'", acada_path)

        # Validate file existence
        if not acada_path.is_file():
            raise FileNotFoundError(f"File does not exist at {acada_path}")

        # Generate LFN
        lfn = self.acada_to_lfn(acada_path=str(acada_path))
        self.logger.info("Using LFN '%s' for path '%s'", lfn, acada_path)

        # Check if the replica already exists
        if self.check_replica_exists(lfn):
            self.logger.info("Replica already exists for lfn '%s', skipping", lfn)
            return lfn

        # Proceed with registering the replica if check_replica_exists returns False
        valid, metadata = verify_and_extract_metadata(acada_path)
        metadata["valid_fits_checksum"] = valid

        # Compute rucio file metadata
        file_size = acada_path.stat().st_size
        checksum = adler32(acada_path)

        # Register the replica in Rucio. The success check is performed
        # outside the try block so a registration failure is not re-wrapped
        # into a second, nested RuntimeError message.
        try:
            success = self.replica_client.add_replica(
                rse=self.rse,
                scope=self.scope,
                name=lfn,
                bytes_=file_size,
                adler32=checksum,
            )
        except Exception as e:
            raise RuntimeError(
                f"Failed to register replica for LFN {lfn} on {self.rse}: {str(e)}"
            ) from e
        if not success:
            raise RuntimeError(
                f"Failed to register replica for LFN {lfn} on {self.rse}"
            )
        self.logger.info("Successfully registered the replica for lfn '%s'", lfn)

        if len(metadata) > 0:
            self.did_client.set_metadata_bulk(scope=self.scope, name=lfn, meta=metadata)
            self.logger.info("Set metadata of %r to %r", lfn, metadata)

        return lfn

    def add_offsite_replication_rules(
        self,
        lfn: str,
        copies: int = 1,
        lifetime: Optional[int] = None,
        offsite_rse_expression: str = "OFFSITE",
    ) -> list[str]:
        """Replicate an already-ingested ACADA data product to offsite RSEs.

        This method assumes the data product has already been ingested into the onsite RSE and is identified by the given LFN.
        It creates one or two replication rules to offsite RSEs, depending on the number of copies requested:
        - First rule: Always creates exactly 1 replica to prevent parallel transfers from the onsite RSE.
        - Second rule (if copies > 1): Creates additional replicas (equal to the requested copies), sourcing data from offsite RSEs to avoid further transfers from the onsite RSE.

        Parameters
        ----------
        lfn : str
            The Logical File Name (LFN) of the already-ingested ACADA data product.
        copies : int, optional
            The total number of offsite replicas to create. Defaults to 1.
            - If copies == 1, only one rule is created with 1 replica.
            - If copies > 1, a second rule is created with the requested number of copies, sourcing from offsite RSEs.
        lifetime : int, optional
            The lifetime of the replication rules in seconds. If None, the rules are permanent.
        offsite_rse_expression : str, optional
            The RSE expression identifying offsite Rucio Storage Elements (RSEs). Defaults to "OFFSITE".

        Returns
        -------
        List[str]
            The list of replication rule IDs created (1 or 2 rules, depending on the copies parameter).

        Raises
        ------
        RuntimeError
            If there is an error interacting with Rucio, including:
            - Failure to create a new replication rule (e.g., DuplicateRule).
        """
        # Create the DID for replication
        did = {"scope": self.scope, "name": lfn}
        dids = [did]

        # Initialize the list of rule IDs
        rule_ids = []

        # First rule: Always create exactly 1 replica to prevent parallel transfers from onsite RSE
        try:
            rule_id_offsite_1 = self.rule_client.add_replication_rule(
                dids=dids,
                rse_expression=offsite_rse_expression,
                copies=1,
                lifetime=lifetime,
                source_replica_expression=None,  # Let Rucio choose the source (onsite RSE)
            )[0]
            self.logger.debug(
                "Created first replication rule %s for DID %s to RSE expression '%s' with 1 copy, lifetime %s",
                rule_id_offsite_1,
                did,
                offsite_rse_expression,
                lifetime if lifetime is not None else "permanent",
            )
            rule_ids.append(rule_id_offsite_1)
        except RucioException as e:
            self.logger.error(
                "Failed to create first offsite replication rule for DID %s to RSE expression '%s': %s",
                did,
                offsite_rse_expression,
                str(e),
            )
            raise

        # Second rule: If more than one copy is requested, create a second rule sourcing from offsite RSEs
        if copies > 1:
            # Exclude the onsite RSE to ensure the data is sourced from an offsite RSE
            # source_replica_expression = f"*\\{onsite_rse}" (we could also consider this expression)
            source_replica_expression = offsite_rse_expression
            self.logger.debug(
                "Creating second offsite replication rule to RSE expression '%s' with %d copies, sourcing from offsite RSEs",
                offsite_rse_expression,
                copies,
            )
            try:
                rule_id_offsite_2 = self.rule_client.add_replication_rule(
                    dids=dids,
                    rse_expression=offsite_rse_expression,
                    copies=copies,  # Use requested number of copies
                    lifetime=lifetime,
                    source_replica_expression=source_replica_expression,
                )[0]
                self.logger.debug(
                    "Created second replication rule %s for DID %s to RSE expression '%s' with %d copies, source_replica_expression '%s', lifetime %s",
                    rule_id_offsite_2,
                    did,
                    offsite_rse_expression,
                    copies,
                    source_replica_expression,
                    lifetime if lifetime is not None else "permanent",
                )
                rule_ids.append(rule_id_offsite_2)
            except RucioException as e:
                self.logger.error(
                    "Failed to create second offsite replication rule for DID %s to RSE expression '%s': %s",
                    did,
                    offsite_rse_expression,
                    str(e),
                )
                raise

        self.logger.info(
            "Created %d offsite replication rule(s) for LFN '%s' to RSE expression '%s': %s",
            len(rule_ids),
            lfn,
            offsite_rse_expression,
            rule_ids,
        )
        return rule_ids
419
+
420
+ class FITSVerificationError(Exception):
421
+ """Raised when a FITS file does not pass verification."""
422
+
def verify_fits_checksum(hdul: fits.HDUList):
    """
    Verify the FITS checksums of all HDUs in the given HDUList.

    For each HDU, ``verify_checksum()`` is called, which checks the
    CHECKSUM keyword (covering header and data) when present. A missing
    CHECKSUM is logged as a warning for every HDU except the primary one.

    Raises
    ------
    FITSVerificationError: in case any of the checks are not passing
    """
    for index, hdu in enumerate(hdul):
        hdu_name = hdu.name or ""

        # verify_checksum() returns 0 = failed, 1 = ok, 2 = keyword absent
        result = hdu.verify_checksum()
        if result == 0:
            raise FITSVerificationError(
                f"CHECKSUM verification failed for HDU {index} with name {hdu_name!r}"
            )
        if result == 2 and index != 0:  # ignore primary for warning
            LOGGER.warning("No CHECKSUM in HDU %d with name %r", index, hdu_name)
def verify_and_extract_metadata(fits_path):
    """Verify checksums and extract metadata from FITS files.

    This wrapper transforms exceptions into log errors and minimizes
    the number of times the FITS file has to be opened.

    Returns a ``(valid, metadata)`` pair: ``valid`` is False when the
    file cannot be opened, fails verification, or metadata extraction
    fails; ``metadata`` holds whatever was extracted before the failure.
    """
    metadata = {}
    # ExitStack lets us keep the file open across the separate
    # verification and extraction steps while handling each failure mode
    # with its own early return.
    with ExitStack() as stack:
        try:
            hdul = stack.enter_context(fits.open(fits_path))
        except Exception as e:
            LOGGER.error("Failed to open FITS file %r: %s", fits_path, e)
            return False, metadata

        try:
            verify_fits_checksum(hdul)
        except FITSVerificationError as e:
            LOGGER.error("File %r failed FITS checksum verification: %s", fits_path, e)
            return False, metadata

        try:
            metadata = extract_metadata_from_headers(hdul)
            metadata.update(extract_metadata_from_data(fits_path))
        except Exception as e:
            LOGGER.error("Failed to extract metadata from %r: %s", fits_path, e)
            return False, metadata

    return True, metadata
@@ -0,0 +1,134 @@
1
+ """Functions to extract metadata from input files."""
2
+
3
+ import logging
4
+
5
+ import numpy as np
6
+ from protozfits import File
7
+
8
+ # Configure logger
9
+ logger = logging.getLogger(__name__)
10
+
11
+ # COMMON HEADER
12
+ start_time = "DataStream.DATE"
13
+
14
+ # COMMON DATA
15
+ origin = "DataStream.ORIGIN"
16
+ sb_id = "DataStream.sb_id"
17
+ obs_id = "DataStream.obs_id"
18
+
19
+ # -- FOR TEL_TRIG
20
+ tel_ids = "DataStream.tel_ids"
21
+
22
+ # -- FOR TEL_SUB
23
+ subarray_id = "DataStream.subarray_id"
24
+
25
+ METADATA_TEL = {
26
+ "HEADER": {
27
+ "observatory": origin,
28
+ "start_time": start_time,
29
+ "end_time": "Events.DATEEND",
30
+ },
31
+ "PAYLOAD": {
32
+ "sb_id": sb_id,
33
+ "obs_id": obs_id,
34
+ },
35
+ }
36
+
37
+ METADATA_SUB = {
38
+ "HEADER": {
39
+ "observatory": origin,
40
+ "start_time": start_time,
41
+ "end_time": "SubarrayEvents.DATEEND",
42
+ },
43
+ "PAYLOAD": {
44
+ "subarray_id": subarray_id,
45
+ "sb_id": sb_id,
46
+ "obs_id": obs_id,
47
+ },
48
+ }
49
+
50
+ METADATA_TRIG = {
51
+ "HEADER": {
52
+ "observatory": origin,
53
+ "start_time": start_time,
54
+ "end_time": "Triggers.DATEEND",
55
+ },
56
+ "PAYLOAD": {
57
+ "tel_ids": tel_ids,
58
+ "sb_id": sb_id,
59
+ "obs_id": obs_id,
60
+ },
61
+ }
62
+
63
+ #: Mapping from DataStream.PBFHEAD to the metadata items we want to collect
64
+ METADATA_SCHEMAS = {
65
+ "DL0v1.Trigger.DataStream": METADATA_TRIG,
66
+ "DL0v1.Subarray.DataStream": METADATA_SUB,
67
+ "DL0v1.Telescope.DataStream": METADATA_TEL,
68
+ }
69
+
def extract_metadata_from_headers(hdul):
    """Extract metadata from FITS headers of hdul.

    Parameters
    ----------
    hdul : astropy.io.fits.HDUList
        Opened FITS file; headers of all non-image HDUs are scanned.

    Returns
    -------
    dict
        Mapping of metadata name to header value according to the schema
        selected by the DataStream HDU's PBFHEAD keyword. Empty if no
        DataStream HDU is present or the PBFHEAD is unknown.
    """
    # Collect the headers of all table HDUs, keyed by extension name
    all_headers = {}
    for hdu in hdul:
        if hdu.is_image:
            continue
        all_headers[hdu.name] = dict(hdu.header)

    # Membership test replaces the previous try/except lookup whose
    # result was discarded.
    if "DataStream" not in all_headers:
        logger.error("No DataStream HDU found in the FITS file.")
        return {}

    pbfhead = all_headers["DataStream"]["PBFHEAD"]
    schema = METADATA_SCHEMAS.get(pbfhead)
    if schema is None:
        logger.error(
            "The PBFHEAD %r does not correspond to any known FITS type.", pbfhead
        )
        return {}

    logger.debug("Headers extracted: %s", all_headers.keys())

    metadata = {}
    for value_name, metadata_path in schema["HEADER"].items():
        # Schema paths have the form "<EXTNAME>.<HEADER_KEY>"
        extname, header_key = metadata_path.split(".")
        metadata[value_name] = all_headers[extname][header_key]

    return metadata
102
+
def extract_metadata_from_data(path):
    """Extract metadata from zFITS payload in path."""
    with File(path) as f:
        # Without a DataStream HDU there is nothing to extract
        if not hasattr(f, "DataStream"):
            return {}

        pbfhead = f.DataStream.header["PBFHEAD"]
        schema = METADATA_SCHEMAS.get(pbfhead)
        if schema is None:
            logger.error(
                "The PBFHEAD %r does not correspond to any known FITS type.", pbfhead
            )
            return {}

        metadata = {}
        for value_name, metadata_path in schema["PAYLOAD"].items():
            # Schema paths have the form "<HDU>.<COLUMN>"; read the value
            # from the first row of that HDU.
            hdu_name, column_name = metadata_path.split(".")
            first_row = getattr(f, hdu_name)[0]
            value = getattr(first_row, column_name)

            if isinstance(value, np.ndarray):
                # Convert numpy array to a Python list
                value = value.tolist()
            metadata[value_name] = value

            logger.debug(
                "Value '%s' from '%s' extracted. (renamed as '%s')",
                column_name,
                hdu_name,
                value_name,
            )
        return metadata