ctao-bdms-clients 0.2.1__py3-none-any.whl → 0.3.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bdms/acada_ingestion.py CHANGED
@@ -2,16 +2,28 @@
 
 This module provides the IngestionClient class to manage the ingestion of ACADA data into the BDMS system.
 It includes functionality for constructing FITS file paths, converting ACADA paths to Logical File Names (LFNs),
-and registering replicas in Rucio.
+registering replicas in Rucio, and extracting metadata and adding it to registered replicas. In addition, the Ingest class
+asynchronously processes ACADA data using a process pool: it discovers files through continuous polling-based monitoring
+with the watchdog library, queues them, and distributes them to worker processes for ingestion.
 """
 
 import logging
 import os
+import threading
+import time
+from concurrent.futures import Future, ProcessPoolExecutor
 from contextlib import ExitStack
+from enum import Enum
+from functools import partial
+from multiprocessing import cpu_count
 from pathlib import Path
+from queue import Empty, Queue
+from traceback import format_exception
 from typing import Optional, Union
 
 from astropy.io import fits
+from filelock import FileLock, Timeout
+from prometheus_client import Counter, Gauge
 from rucio.client.accountclient import AccountClient
 from rucio.client.client import Client, DIDClient
 from rucio.client.replicaclient import ReplicaClient
@@ -20,6 +32,8 @@ from rucio.client.ruleclient import RuleClient
 from rucio.client.scopeclient import ScopeClient
 from rucio.common.exception import Duplicate, RucioException
 from rucio.common.utils import adler32
+from watchdog.events import FileSystemEventHandler
+from watchdog.observers.polling import PollingObserver
 
 from bdms.extract_fits_metadata import (
     extract_metadata_from_data,
@@ -28,17 +42,32 @@ from bdms.extract_fits_metadata import (
 
 LOGGER = logging.getLogger(__name__)
 
+__all__ = ["IngestionClient", "FITSVerificationError", "Ingest"]
 
-__all__ = [
-    "IngestionClient",
-]
+INGEST_RUNNING_MESSAGE = "Another ingestion process is already running"
+DETECTED_NEW_TRIGGER_FILE = "Detected new trigger file"
+INGEST_SUCCESS_MESSAGE = "Successfully ingested"
+TRIGGER_SUFFIX = ".trigger"
+
+# Prometheus Metrics for monitoring
+N_TASKS_SUCCESS = Counter("n_tasks_success", "Number of successfully finished tasks.")
+N_TASKS_FAILED = Counter("n_tasks_failed", "Number of failed tasks.")
+N_TASKS_CANCELLED = Counter("n_tasks_cancelled", "Number of cancelled tasks.")
+N_TASKS_SKIPPED = Counter("n_tasks_skipped", "Number of skipped tasks.")
+N_TASKS_PROCESSED = Counter(
+    "n_tasks_processed", "Total number of tasks processed by the Ingest daemon"
+)
+TASKS_IN_QUEUE = Gauge("n_tasks_queued", "Current number of queued tasks")
+
+IngestStatus = Enum("IngestStatus", [("SUCCESS", 0), ("FAILURE", 1), ("SKIPPED", 2)])
 
 
 class IngestionClient:
     """A client for BDMS ingestion and replication.
 
     This class provides methods to ingest ACADA data into the BDMS system, including converting ACADA paths to
-    Logical File Names (LFNs), registering replicas in Rucio, and replicating data to offsite RSEs.
+    Logical File Names (LFNs), registering replicas in Rucio, extracting metadata and adding it to registered
+    replicas, and replicating data to offsite RSEs.
 
     Parameters
     ----------
@@ -78,6 +107,9 @@ class IngestionClient:
         self.logger = logger or LOGGER.getChild(self.__class__.__name__)
         self.vo = vo
 
+        if data_path is None:
+            raise ValueError("data_path must be provided and cannot be None")
+
         # Set data path (Prefix)
         self.data_path = Path(data_path)
         if not self.data_path.is_dir():
@@ -225,7 +257,7 @@ class IngestionClient:
             return True
         return False
 
-    def add_onsite_replica(self, acada_path) -> str:
+    def add_onsite_replica(self, acada_path: Union[str, Path]) -> tuple[str, bool]:
         """Register a file as a replica in Rucio on the specified RSE and retrieve its LFN.
 
         Parameters
@@ -233,13 +265,12 @@ class IngestionClient:
         acada_path : str or Path
             The ACADA path where the file is located.
 
-        rse : str, optional
-            The RSE to register the replica on. If None, uses the client's RSE (self.rse).
-
         Returns
         -------
-        str
-            The Logical File Name (LFN) of the registered replica.
+        tuple[str, bool]
+            A tuple containing:
+            - The Logical File Name (LFN) of the registered or existing replica.
+            - A boolean indicating if the replica was skipped (True) or newly ingested (False).
 
         Raises
         ------
@@ -264,7 +295,7 @@ class IngestionClient:
         # Check if the replica already exists
         if self.check_replica_exists(lfn):
             self.logger.info("Replica already exists for lfn '%s', skipping", lfn)
-            return lfn
+            return lfn, True  # Indicate the file was skipped
 
         # Proceed with registering the replica if check_replica_exists returns False
         valid, metadata = verify_and_extract_metadata(acada_path)
@@ -297,7 +328,7 @@ class IngestionClient:
         self.did_client.set_metadata_bulk(scope=self.scope, name=lfn, meta=metadata)
         self.logger.info("Set metadata of %r to %r", lfn, metadata)
 
-        return lfn
+        return lfn, False  # Indicate the file was newly ingested
 
     def add_offsite_replication_rules(
         self,
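
With this change, add_onsite_replica returns a (lfn, was_skipped) tuple instead of a bare LFN string, so existing callers have to unpack two values. A minimal sketch of an updated caller, assuming an already configured IngestionClient; the constructor arguments and the ACADA path below are placeholders:

# Hypothetical caller adapted to the new (lfn, was_skipped) return value
client = IngestionClient(data_path="/data/acada", rse="STORAGE-1")  # placeholder arguments
lfn, was_skipped = client.add_onsite_replica("/data/acada/DL0/example.fits")  # placeholder path
if was_skipped:
    # The replica was already registered; no replication rules are requested here
    print(f"already ingested as {lfn}")
else:
    # Newly registered replica: request offsite copies, as process_file does below
    client.add_offsite_replication_rules(lfn, copies=2)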
@@ -477,3 +508,439 @@ def verify_and_extract_metadata(fits_path):
     except Exception as e:
         LOGGER.error("Failed to extract metadata from %r: %s", fits_path, e)
         return False, metadata
+
+
+def process_file(
+    client: IngestionClient, file_path: str, logger=None, copies: int = 2
+) -> IngestStatus:
+    """Process a single file with IngestionClient, clean up the trigger file, and return the ingestion status.
+
+    Parameters
+    ----------
+    client : IngestionClient
+        The IngestionClient instance to handle replica registration and replication.
+    file_path : str
+        The path to the file to process.
+    logger : logging.Logger, optional
+        Logger instance. If None, uses the client's logger or a default logger.
+
+    Returns
+    -------
+    IngestStatus
+        The status of the ingestion process:
+        - SUCCESS if the file is ingested successfully.
+        - FAILURE if an error occurs during ingestion.
+        - SKIPPED if the file was already ingested.
+    """
+    logger = logger or LOGGER.getChild("Ingest")
+    trigger_file = Path(file_path + TRIGGER_SUFFIX)
+    try:
+        lfn, was_skipped = client.add_onsite_replica(file_path)
+        if was_skipped:
+            logger.info("Replica already exists for %s, skipping", file_path)
+            if trigger_file.exists():
+                trigger_file.unlink()
+                logger.debug("Removed trigger file %s", trigger_file)
+            return IngestStatus.SKIPPED
+        client.add_offsite_replication_rules(lfn, copies=copies)
+        logger.info("%s %s, LFN: %s", INGEST_SUCCESS_MESSAGE, file_path, lfn)
+        if trigger_file.exists():
+            trigger_file.unlink()
+            logger.debug("Removed trigger file %s", trigger_file)
+        return IngestStatus.SUCCESS
+    except Exception as e:
+        logger.exception("Exception in process_file for %s: %s", file_path, str(e))
+        return IngestStatus.FAILURE
+
+
+class TriggerFileHandler(FileSystemEventHandler):
+    """File system event handler for detecting and processing trigger files.
+
+    This handler monitors file system events and responds to the creation of
+    trigger files (files ending with '.trigger'). When a trigger file is detected,
+    it immediately submits the corresponding data file for ingestion processing
+    without any intermediate queuing to the worker pool.
+    """
+
+    def __init__(self, ingest_instance):
+        """Initialize the handler with an Ingest instance.
+
+        Parameters
+        ----------
+        ingest_instance : Ingest
+            The Ingest daemon instance to process files and log events.
+            Must provide a `_submit_file(file_path)` method and a `logger` attribute.
+        """
+        self.ingest = ingest_instance
+
+    def on_moved(self, event):
+        """Handle file move events, which include symlink creation.
+
+        When creating symlinks with 'ln -s', the filesystem generates a
+        FileMovedEvent instead of FileCreatedEvent. This method handles
+        trigger file detection for symlink-based triggers.
+
+        Parameters
+        ----------
+        event : watchdog.events.FileMovedEvent
+            The file move event containing source and destination paths.
+        """
+        self.ingest.logger.debug("MOVE Event received: %s", event)
+        if event.is_directory or self.ingest.stop_event.is_set():
+            return
+
+        # Check if the destination is a trigger file
+        if event.dest_path and event.dest_path.endswith(TRIGGER_SUFFIX):
+            trigger_file = Path(event.dest_path)
+            data_file = trigger_file.with_suffix("")
+
+            if not data_file.exists():
+                self.ingest.logger.error(
+                    "Data file %s for trigger %s does not exist, skipping",
+                    data_file,
+                    trigger_file,
+                )
+                return
+
+            self.ingest.logger.info(
+                "%s %s, submitting data file %s",
+                DETECTED_NEW_TRIGGER_FILE,
+                trigger_file,
+                data_file,
+            )
+
+            self.ingest._submit_file(str(data_file))
+
+
+class Ingest:
+    """Ingestion daemon service to process ACADA data products using a process pool with result handling.
+
+    Monitors a specified directory for trigger files using a polling-based observer,
+    submitting each file for ingestion to a ProcessPoolExecutor for parallel processing.
+    Uses a callback-based result handling system with structured task tracking
+    and immediate result processing. The daemon ensures compatibility with shared
+    filesystems through polling and prevents multiple instances using a lock file.
+    """
+
+    def __init__(
+        self,
+        client,
+        top_dir: Union[str, Path],
+        num_workers: int = cpu_count(),
+        lock_file_path: Union[str, Path, None] = None,
+        polling_interval: float = 1.0,
+        check_interval: float = 1.0,
+        offsite_copies: int = 2,
+    ) -> None:
+        """Initialize the ingestion daemon with configuration parameters.
+
+        Sets up the client, directory, worker count, intervals, and initializes
+        a process-safe queue and daemon state.
+        """
+        self.client = client
+        self.top_dir = Path(top_dir)
+        self.num_workers = num_workers
+        self.lock_file_path = (
+            Path(lock_file_path)
+            if lock_file_path is not None
+            else self.top_dir / "bdms_ingest.lock"
+        )
+        self.polling_interval = polling_interval
+        self.check_interval = check_interval
+        self.offsite_copies = offsite_copies
+        self.stop_event = threading.Event()
+        self.logger = LOGGER.getChild(self.__class__.__name__)
+
+        # Result handling
+        self.result_queue = Queue()
+        self.task_counter = 0
+        self.submitted_tasks = {}  # Track submitted tasks: {task_id: file_path}
+
+        # Statistics tracking
+        self.max_concurrent_tasks = 0
+        self.total_tasks_submitted = 0
+
+        # Lock instance to be held during entire daemon execution
+        self.lock = None
+
+    def _done_callback(self, future, task_id: int, file_path: str):
+        """Queue completed task result for processing.
+
+        This method is invoked immediately when a worker process finishes
+        processing a file. It queues the result for processing by the
+        dedicated result handling thread.
+
+        Parameters
+        ----------
+        future : concurrent.futures.Future
+            The completed Future object containing the task result.
+        task_id : int
+            Unique identifier for the completed task.
+        file_path : str
+            Path to the file that was processed.
+        """
+        self.result_queue.put((task_id, file_path, future))
+
+    def _submit_file(self, file_path: str):
+        """Submit a file for processing using the callback pattern.
+
+        Creates a unique task ID, submits the file to the worker pool, and
+        sets up an immediate callback for result processing.
+
+        Parameters
+        ----------
+        file_path : str
+            Path to the data file to be processed.
+        """
+        task_id = self.task_counter
+        self.task_counter += 1
+        self.total_tasks_submitted += 1
+
+        self.submitted_tasks[task_id] = file_path
+
+        # Update max concurrent tasks tracking
+        current_concurrent = len(self.submitted_tasks)
+        self.max_concurrent_tasks = max(self.max_concurrent_tasks, current_concurrent)
+
+        # Increment queue counter when task is submitted
+        TASKS_IN_QUEUE.inc()
+
+        self.logger.debug(
+            "Submitting task %d for file %s (concurrent: %d, max: %d)",
+            task_id,
+            file_path,
+            current_concurrent,
+            self.max_concurrent_tasks,
+        )
+
+        # Submit with callback using partial
+        future = self.executor.submit(
+            process_file,
+            self.client,
+            file_path,
+            logger=self.logger,
+            copies=self.offsite_copies,
+        )
+        future.add_done_callback(
+            partial(self._done_callback, task_id=task_id, file_path=file_path)
+        )
+
+    def _handle_result(
+        self,
+        task_id: int,
+        file_path: str,
+        future: "Future",
+        processed_count: int,
+        start_time: float,
+    ) -> None:
+        """Handle the result of a completed task.
+
+        This method processes the result of a completed ingestion task, performs
+        cleanup of task tracking data, calculates processing statistics, and logs
+        the outcome. It handles successful completion, cancellation, and error cases.
+
+        Parameters
+        ----------
+        task_id : int
+            Unique identifier for the completed task.
+        file_path : str
+            Path to the file that was processed.
+        future : concurrent.futures.Future
+            The completed Future object containing the task result.
+        processed_count : int
+            Total number of tasks processed so far.
+        start_time : float
+            Start time of the result processing thread for rate calculation.
+        """
+        elapsed_time = time.time() - start_time
+        rate = processed_count / elapsed_time if elapsed_time > 0 else 0
+
+        # Clean up task tracking
+        self.submitted_tasks.pop(task_id, None)
+        current_concurrent = len(self.submitted_tasks)
+        TASKS_IN_QUEUE.dec()  # Always decrement queue counter
+
+        self.logger.debug(
+            "Task %d completed, remaining concurrent: %d",
+            task_id,
+            current_concurrent,
+        )
+
+        # Process the result
+        if future.cancelled():
+            status = "cancelled"
+            N_TASKS_CANCELLED.inc()  # Increment cancellation counter
+        elif (e := future.exception()) is not None:
+            self.logger.error(
+                "Task %d failed: %s",
+                task_id,
+                "".join(format_exception(type(e), e, e.__traceback__)),
+            )
+            status = "failed"
+            N_TASKS_FAILED.inc()  # Increment failure counter
+        else:
+            result = future.result()
+            if result == IngestStatus.SUCCESS:
+                status = "success"
+                N_TASKS_SUCCESS.inc()  # Increment success counter
+            elif result == IngestStatus.SKIPPED:
+                status = "skipped"
+                N_TASKS_SKIPPED.inc()  # Increment skipped counter
+            else:
+                status = "failed"
+                N_TASKS_FAILED.inc()  # Increment failure counter
+
+        N_TASKS_PROCESSED.inc()  # Increment total processed counter
+
+        # Summary log for all cases
+        self.logger.info(
+            "Processed file %s with result %s. Rate: %.2f files/sec",
+            file_path,
+            status,
+            rate,
+        )
+
+    def _process_results(self):
+        """Process results from the result queue.
+
+        This method runs in a separate daemon thread and continuously processes
+        completed tasks from the result queue. It handles task cleanup, result
+        logging, and error reporting. The method implements the callback-based
+        result handling pattern with structured error handling and performance tracking.
+
+        The method maintains local counters for processed_count and start_time,
+        which are passed to _handle_result for rate calculation and logging.
+        The thread processes results until the stop_event is set and the queue
+        is empty, ensuring all results are handled before shutdown.
+        """
+        self.logger.info("Result processing thread started")
+        start_time = time.time()
+        processed_count = 0
+
+        try:
+            while not self.stop_event.is_set() or not self.result_queue.empty():
+                try:
+                    task_id, file_path, future = self.result_queue.get(
+                        timeout=self.check_interval
+                    )
+                except Empty:
+                    continue
+
+                try:
+                    processed_count += 1
+                    self._handle_result(
+                        task_id, file_path, future, processed_count, start_time
+                    )
+                except Exception as e:
+                    self.logger.exception(
+                        "Error processing result for task %d: %s", task_id, str(e)
+                    )
+
+        except Exception as e:
+            self.logger.exception("Fatal error in result processing thread: %s", str(e))
+        finally:
+            self.logger.info("Result processing thread stopped")
+
+    def _check_directory(self) -> None:
+        """Check if the directory is readable.
+
+        Raises
+        ------
+        RuntimeError
+            If the top directory is not accessible.
+        """
+        if not self.top_dir.is_dir() or not os.access(self.top_dir, os.R_OK):
+            self.logger.error("Cannot read directory %s", self.top_dir)
+            raise RuntimeError(f"Cannot read directory {self.top_dir}")
+
+    def run(self) -> None:
+        """Run the ingestion daemon, submitting file ingestion tasks to a process pool, and result handling.
+
+        Initializes and runs the complete ingestion system including:
+
+        1. Process checks (lock file acquisition and hold for entire runtime)
+        2. Validation of directory access
+        3. Result processing thread startup
+        4. Worker process pool creation
+        5. File system monitoring with polling observer
+        6. Graceful shutdown handling
+
+        The method blocks until a shutdown signal is received (KeyboardInterrupt)
+        or the stop_event is set. All components are properly shut down and
+        cleaned up before the method returns.
+
+        Raises
+        ------
+        RuntimeError
+            If another ingestion process is running or the directory is unreadable.
+        """
+        # Acquire lock for the entire daemon execution, preventing multiple instances
+        self.lock = FileLock(self.lock_file_path, timeout=10)
+
+        try:
+            # Acquire the lock - this will be held for the entire daemon runtime
+            self.lock.acquire(timeout=10)
+            self.logger.info("Acquired lock file: %s", self.lock.lock_file)
+        except Timeout:
+            raise RuntimeError(INGEST_RUNNING_MESSAGE)
+
+        # Write PID to the original lock file for reference
+        self.lock_file_path.write_text(str(os.getpid()))
+        self.logger.info("Written PID %d to %s", os.getpid(), self.lock_file_path)
+
+        try:
+            self._check_directory()
+
+            # Start the result processing thread
+            result_thread = threading.Thread(target=self._process_results, daemon=True)
+            result_thread.start()
+            self.logger.info("Started result processing thread")
+
+            with ProcessPoolExecutor(max_workers=self.num_workers) as executor:
+                self.executor = executor  # Store reference for _submit_file method
+                self.logger.info(
+                    "Started process pool with %d workers", self.num_workers
+                )
+
+                event_handler = TriggerFileHandler(self)
+
+                self.logger.info(
+                    "Starting continuous polling-based monitoring of directory %s with interval %s seconds",
+                    self.top_dir,
+                    self.polling_interval,
+                )
+                observer = PollingObserver(timeout=self.polling_interval)
+                observer.schedule(event_handler, str(self.top_dir), recursive=True)
+                observer.start()
+                self.logger.info("File monitoring observer started successfully")
+
+                try:
+                    while not self.stop_event.is_set():
+                        self.stop_event.wait(self.check_interval)
+                except KeyboardInterrupt:
+                    self.logger.info("Received shutdown signal, stopping daemon")
+                finally:
+                    self.stop_event.set()
+                    self.logger.info("Stopping file observer")
+                    observer.stop()
+                    observer.join()
+                    self.logger.info("Stopping result processing thread")
+                    result_thread.join()
+
+        finally:
+            # Always release the lock and clean up, even if an exception occurred
+            if self.lock and self.lock.is_locked:
+                self.lock.release()
+                self.logger.info("Released lock file")
+
+            # Clean up PID file
+            if self.lock_file_path.exists():
+                try:
+                    self.lock_file_path.unlink()
+                    self.logger.info("Removed PID file: %s", self.lock_file_path)
+                except Exception as e:
+                    self.logger.warning(
+                        "Failed to remove PID file %s: %s", self.lock_file_path, e
+                    )
+
+            self.logger.info("Stopped ingestion daemon")
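
Taken together, the additions form a runnable ingestion daemon: IngestionClient handles the Rucio side, Ingest watches top_dir for trigger files and fans work out to a process pool, and the module-level prometheus_client metrics track outcomes. A minimal start-up sketch, assuming a configured IngestionClient (its constructor arguments are placeholders) and using prometheus_client.start_http_server to expose the new metrics; the port and paths are likewise illustrative:

from prometheus_client import start_http_server

from bdms.acada_ingestion import Ingest, IngestionClient

client = IngestionClient(data_path="/data/acada", rse="STORAGE-1")  # placeholder arguments
daemon = Ingest(
    client,
    top_dir="/data/acada",  # watched recursively for files ending in ".trigger"
    num_workers=4,  # size of the ProcessPoolExecutor
    polling_interval=1.0,  # PollingObserver timeout in seconds
    offsite_copies=2,  # forwarded to add_offsite_replication_rules via process_file
)
start_http_server(8000)  # expose the n_tasks_* counters and the n_tasks_queued gauge
daemon.run()  # blocks until KeyboardInterrupt or stop_event is set

A data product is picked up when a matching <file>.trigger entry appears next to it under top_dir; the handler reacts to move events such as those produced by ln -s, and the worker removes the trigger file once ingestion succeeds or is skipped.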