ctao-bdms-clients 0.2.1__py3-none-any.whl → 0.3.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
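The tests below exercise the trigger-based ingestion daemon introduced in 0.3.0rc1 (Ingest, TriggerFileHandler, process_file, IngestStatus in bdms.acada_ingestion). As an orientation aid for reading the diff, here is a minimal sketch of how those pieces are driven, assembled only from the names, parameters, and behaviour visible in the tests themselves; the paths and values are placeholders, and the authoritative signatures are those in bdms.acada_ingestion.

    from pathlib import Path

    from bdms.acada_ingestion import Ingest, IngestionClient, IngestStatus, process_file

    storage_mount_path = Path("/storage")  # placeholder: the shared ACADA storage mount

    client = IngestionClient(
        data_path=storage_mount_path,
        rse="STORAGE-1",
        vo="ctao",
        scope="acada",
    )

    # One-shot ingestion of a single data file. Returns an IngestStatus:
    # SUCCESS (the matching trigger file is removed), SKIPPED (replica already
    # exists) or FAILURE (trigger file is left in place).
    status = process_file(client, str(storage_mount_path / "ctao" / "acada" / "run_0001.fits"))
    print(status is IngestStatus.SUCCESS)

    # Long-running daemon: watches top_dir for trigger symlinks (TRIGGER_SUFFIX),
    # submits the referenced data files to a pool of num_workers processes and
    # refuses to start if another instance already holds lock_file_path.
    ingest = Ingest(
        client=client,
        top_dir=storage_mount_path,
        num_workers=4,
        lock_file_path=storage_mount_path / "bdms_ingest.lock",
        polling_interval=0.5,
        check_interval=0.5,
    )
    # ingest.run()             # blocks until ingest.stop_event is set
    # ingest.stop_event.set()  # request shutdown from another thread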
@@ -5,10 +5,17 @@ and the replication of data between Rucio storage elements (RSEs).
5
5
  """
6
6
 
7
7
  import logging
8
+ import os
9
+ import re
8
10
  import subprocess
11
+ import threading
12
+ import time
13
+ from concurrent.futures import ProcessPoolExecutor
9
14
  from pathlib import Path
10
15
  from shutil import copy2
16
+ from urllib.request import urlopen
11
17
 
18
+ import numpy as np
12
19
  import pytest
13
20
  from astropy.io import fits
14
21
  from astropy.table import Table
@@ -18,8 +25,18 @@ from rucio.client.replicaclient import ReplicaClient
18
25
  from rucio.client.ruleclient import RuleClient
19
26
  from rucio.common.exception import RucioException
20
27
  from rucio.common.utils import adler32
21
-
22
- from bdms.acada_ingestion import IngestionClient
28
+ from watchdog.events import FileMovedEvent
29
+
30
+ from bdms.acada_ingestion import (
31
+ DETECTED_NEW_TRIGGER_FILE,
32
+ INGEST_SUCCESS_MESSAGE,
33
+ TRIGGER_SUFFIX,
34
+ Ingest,
35
+ IngestionClient,
36
+ IngestStatus,
37
+ TriggerFileHandler,
38
+ process_file,
39
+ )
23
40
  from bdms.tests.utils import reset_xrootd_permissions, wait_for_replication_status
24
41
 
25
42
  LOGGER = logging.getLogger(__name__)
@@ -28,13 +45,14 @@ ONSITE_RSE = "STORAGE-1"
28
45
  OFFSITE_RSE_1 = "STORAGE-2"
29
46
  OFFSITE_RSE_2 = "STORAGE-3"
30
47
 
48
+ TEST_FILE_TRIGGER = "test_file.trigger"
49
+
31
50
 
32
51
  def test_shared_storage(storage_mount_path: Path):
33
52
  """Test that the shared storage path is available."""
34
53
 
35
- assert (
36
- storage_mount_path.exists()
37
- ), f"Shared storage {storage_mount_path} is not available on the client"
54
+ msg = f"Shared storage {storage_mount_path} is not available on the client"
55
+ assert storage_mount_path.exists(), msg
38
56
 
39
57
 
40
58
  def trigger_judge_repairer() -> None:
@@ -83,7 +101,8 @@ def test_acada_to_lfn(storage_mount_path: Path, test_vo: str):
83
101
  )
84
102
  lfn = ingestion_client.acada_to_lfn(acada_path=acada_path)
85
103
 
86
- assert lfn == expected_lfn, f"Expected {expected_lfn}, got {lfn}"
104
+ msg = f"Expected {expected_lfn}, got {lfn}"
105
+ assert lfn == expected_lfn, msg
87
106
 
88
107
  # Test Case 2: Non-absolute acada_path (empty string)
89
108
  with pytest.raises(ValueError, match="acada_path must be absolute"):
@@ -121,7 +140,10 @@ def test_check_replica_exists(
121
140
  """Test the check_replica_exists method of IngestionClient."""
122
141
 
123
142
  ingestion_client = IngestionClient(
124
- storage_mount_path, ONSITE_RSE, scope=test_scope, vo=test_vo
143
+ data_path=storage_mount_path,
144
+ rse=ONSITE_RSE,
145
+ vo=test_vo,
146
+ scope=test_scope,
125
147
  )
126
148
 
127
149
  acada_path, _ = onsite_test_file
@@ -202,22 +224,29 @@ def test_add_onsite_replica_with_minio_fits_file(
202
224
  ):
203
225
  """Test the add_onsite_replica method of IngestionClient using a dummy file."""
204
226
 
227
+ ingestion_client = IngestionClient(
228
+ data_path=storage_mount_path,
229
+ rse=ONSITE_RSE,
230
+ vo=test_vo,
231
+ scope=test_scope,
232
+ )
233
+
205
234
  filename = str(file_location).split("/")[-1]
206
235
  acada_path = storage_mount_path / test_vo / test_scope / filename
207
236
  acada_path.parent.mkdir(parents=True, exist_ok=True)
208
237
  copy2(file_location, str(acada_path))
209
238
  reset_xrootd_permissions(storage_mount_path)
210
239
 
211
- ingestion_client = IngestionClient(
212
- storage_mount_path, ONSITE_RSE, scope=test_scope, vo=test_vo
213
- )
214
-
215
240
  # Use add_onsite_replica to register the replica
216
- lfn = ingestion_client.add_onsite_replica(acada_path=acada_path)
241
+ lfn, skipped = ingestion_client.add_onsite_replica(acada_path=acada_path)
217
242
 
218
243
  # Verify the LFN matches the expected LFN
219
244
  expected_lfn = ingestion_client.acada_to_lfn(acada_path)
220
- assert lfn == expected_lfn, f"Expected LFN {expected_lfn}, got {lfn}"
245
+ msg = f"Expected LFN {expected_lfn}, got {lfn}"
246
+ assert lfn == expected_lfn, msg
247
+
248
+ msg = "Expected the file to be newly ingested, but it was skipped"
249
+ assert not skipped, msg
221
250
 
222
251
  # Download the file using the LFN
223
252
  download_spec = {
@@ -230,18 +259,29 @@ def test_add_onsite_replica_with_minio_fits_file(
230
259
 
231
260
  # Verify the downloaded file
232
261
  download_path = tmp_path / lfn.lstrip("/")
233
- assert download_path.is_file(), f"Download failed at {download_path}"
262
+ msg = f"Download failed at {download_path}"
263
+ assert download_path.is_file(), msg
234
264
 
235
- assert adler32(download_path) == adler32(
236
- file_location
237
- ), "Downloaded file content does not match the original. "
265
+ msg = "Downloaded file content does not match the original."
266
+ assert adler32(download_path) == adler32(file_location), msg
238
267
 
239
268
  # Check that the file is not ingested again if it is already registered
240
269
  caplog.clear()
241
- lfn = ingestion_client.add_onsite_replica(acada_path=acada_path)
270
+ lfn_check, skipped_check = ingestion_client.add_onsite_replica(
271
+ acada_path=acada_path
272
+ )
273
+ msg = f"LFN mismatch on second ingestion attempt: expected {lfn}, got {lfn_check}"
274
+ assert lfn_check == lfn, msg
275
+
276
+ msg = (
277
+ "Expected the file to be skipped on second ingestion, but it was ingested again"
278
+ )
279
+ assert skipped_check, msg
280
+
281
+ msg = f"'Replica already exists for lfn '{lfn}', skipping' in caplog records"
242
282
  assert f"Replica already exists for lfn '{lfn}', skipping" in [
243
283
  r.message for r in caplog.records
244
- ]
284
+ ], msg
245
285
 
246
286
  # Retrieve metadata using the DIDClient
247
287
  did_client = Client()
@@ -251,10 +291,11 @@ def test_add_onsite_replica_with_minio_fits_file(
251
291
 
252
292
  # Verify the metadata matches the expected metadata
253
293
  for key, value in metadata_dict.items():
254
- assert retrieved_metadata.get(key) == value, (
294
+ msg = (
255
295
  f"Metadata mismatch for key '{key}'. "
256
296
  f"Expected: {value}, Got: {retrieved_metadata.get(key)}"
257
297
  )
298
+ assert retrieved_metadata.get(key) == value, msg
258
299
 
259
300
 
260
301
  def test_rses():
@@ -263,9 +304,14 @@ def test_rses():
263
304
  result = list(client.list_rses())
264
305
 
265
306
  rses = [r["rse"] for r in result]
266
- assert ONSITE_RSE in rses, f"Expected RSE {ONSITE_RSE} not found in {rses}"
267
- assert OFFSITE_RSE_1 in rses, f"Expected RSE {OFFSITE_RSE_1} not found in {rses}"
268
- assert OFFSITE_RSE_2 in rses, f"Expected RSE {OFFSITE_RSE_2} not found in {rses}"
307
+ msg = f"Expected RSE {ONSITE_RSE} not found in {rses}"
308
+ assert ONSITE_RSE in rses, msg
309
+
310
+ msg = f"Expected RSE {OFFSITE_RSE_1} not found in {rses}"
311
+ assert OFFSITE_RSE_1 in rses, msg
312
+
313
+ msg = f"Expected RSE {OFFSITE_RSE_2} not found in {rses}"
314
+ assert OFFSITE_RSE_2 in rses, msg
269
315
 
270
316
 
271
317
  @pytest.fixture
@@ -306,9 +352,8 @@ def pre_existing_lfn(
306
352
 
307
353
  # Verify the replica is registered
308
354
  replicas = list(replica_client.list_replicas(dids=[did]))
309
- assert (
310
- replicas
311
- ), f"Failed to verify pre-registration of replica for LFN {lfn} on {ONSITE_RSE}"
355
+ msg = f"Failed to verify pre-registration of replica for LFN {lfn} on {ONSITE_RSE}"
356
+ assert replicas, msg
312
357
 
313
358
  return lfn
314
359
 
@@ -326,9 +371,11 @@ def test_add_offsite_replication_rules(
326
371
  ):
327
372
  """Test the add_offsite_replication_rules method of IngestionClient."""
328
373
  ingestion_client = IngestionClient(
329
- storage_mount_path, ONSITE_RSE, scope=test_scope, vo=test_vo
374
+ data_path=storage_mount_path,
375
+ rse=ONSITE_RSE,
376
+ vo=test_vo,
377
+ scope=test_scope,
330
378
  )
331
- caplog.set_level(logging.DEBUG)
332
379
 
333
380
  # Replicate the ACADA file to two offsite RSEs
334
381
  lfn = pre_existing_lfn
@@ -356,10 +403,11 @@ def test_add_offsite_replication_rules(
356
403
  replica_client = ReplicaClient()
357
404
  replicas = next(replica_client.list_replicas(dids=[did]))
358
405
  states = replicas.get("states", {})
406
+ msg = f"Expected replica on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2} to be AVAILABLE after first rule: {states}"
359
407
  assert (
360
408
  states.get(OFFSITE_RSE_1) == "AVAILABLE"
361
409
  or states.get(OFFSITE_RSE_2) == "AVAILABLE"
362
- ), f"Expected replica on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2} to be AVAILABLE after first rule: {states}"
410
+ ), msg
363
411
 
364
412
  # Manually trigger the judge-repairer to ensure the second rule doesn't get stuck
365
413
  trigger_judge_repairer()
@@ -376,15 +424,15 @@ def test_add_offsite_replication_rules(
376
424
  did,
377
425
  states,
378
426
  )
379
- assert (
380
- states.get(ONSITE_RSE) == "AVAILABLE"
381
- ), f"Expected replica on {ONSITE_RSE} to be AVAILABLE: {states}"
382
- assert (
383
- states.get(OFFSITE_RSE_1) == "AVAILABLE"
384
- ), f"Expected replica on {OFFSITE_RSE_1} to be AVAILABLE: {states}"
385
- assert (
386
- states.get(OFFSITE_RSE_2) == "AVAILABLE"
387
- ), f"Expected replica on {OFFSITE_RSE_2} to be AVAILABLE: {states}"
427
+
428
+ msg = f"Expected replica on {ONSITE_RSE} to be AVAILABLE: {states}"
429
+ assert states.get(ONSITE_RSE) == "AVAILABLE", msg
430
+
431
+ msg = f"Expected replica on {OFFSITE_RSE_1} to be AVAILABLE: {states}"
432
+ assert states.get(OFFSITE_RSE_1) == "AVAILABLE", msg
433
+
434
+ msg = f"Expected replica on {OFFSITE_RSE_2} to be AVAILABLE: {states}"
435
+ assert states.get(OFFSITE_RSE_2) == "AVAILABLE", msg
388
436
 
389
437
  # Download the file from OFFSITE_RSE_2 to verify its content
390
438
  download_spec = {
@@ -398,12 +446,15 @@ def test_add_offsite_replication_rules(
398
446
 
399
447
  # Verify the downloaded file content
400
448
  download_path = tmp_path / lfn.lstrip("/")
401
- assert download_path.is_file(), f"Download failed at {download_path}"
449
+ msg = f"Download failed at {download_path}"
450
+ assert download_path.is_file(), msg
451
+
402
452
  downloaded_content = download_path.read_text()
403
- assert downloaded_content == test_file_content, (
453
+ msg = (
404
454
  f"Downloaded file content does not match the original. "
405
455
  f"Expected: {test_file_content}, Got: {downloaded_content}"
406
456
  )
457
+ assert downloaded_content == test_file_content, msg
407
458
 
408
459
 
409
460
  @pytest.mark.usefixtures("_auth_proxy")
@@ -418,10 +469,13 @@ def test_add_offsite_replication_rules_single_copy(
418
469
  caplog,
419
470
  ):
420
471
  """Test the add_offsite_replication_rules method of IngestionClient with a single copy (copies=1)."""
472
+
421
473
  ingestion_client = IngestionClient(
422
- storage_mount_path, ONSITE_RSE, scope=test_scope, vo=test_vo
474
+ data_path=storage_mount_path,
475
+ rse=ONSITE_RSE,
476
+ vo=test_vo,
477
+ scope=test_scope,
423
478
  )
424
- caplog.set_level(logging.DEBUG)
425
479
 
426
480
  # Replicate the ACADA file to one offsite RSE
427
481
  lfn = pre_existing_lfn
@@ -439,9 +493,9 @@ def test_add_offsite_replication_rules_single_copy(
439
493
  )
440
494
 
441
495
  # Verify that only one rule was created
442
- assert (
443
- len(rule_ids) == 1
444
- ), f"Expected exactly 1 rule ID, got {len(rule_ids)}: {rule_ids}"
496
+ msg = f"Expected exactly 1 rule ID, got {len(rule_ids)}: {rule_ids}"
497
+ assert len(rule_ids) == 1, msg
498
+
445
499
  rule_id_offsite_1 = rule_ids[0]
446
500
  rule_client = RuleClient()
447
501
 
@@ -461,9 +515,8 @@ def test_add_offsite_replication_rules_single_copy(
461
515
  offsite_replica_count = sum(
462
516
  1 for rse in [OFFSITE_RSE_1, OFFSITE_RSE_2] if states.get(rse) == "AVAILABLE"
463
517
  )
464
- assert (
465
- offsite_replica_count == 1
466
- ), f"Expected exactly 1 offsite replica (on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2}), got {offsite_replica_count}: {states}"
518
+ msg = f"Expected exactly 1 offsite replica (on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2}), got {offsite_replica_count}: {states}"
519
+ assert offsite_replica_count == 1, msg
467
520
 
468
521
  # Determine which offsite RSE the replica was created on
469
522
  target_offsite_rse = (
@@ -482,12 +535,14 @@ def test_add_offsite_replication_rules_single_copy(
482
535
 
483
536
  # Verify the downloaded file content
484
537
  download_path = tmp_path / lfn.lstrip("/")
485
- assert download_path.is_file(), f"Download failed at {download_path}"
538
+ msg = f"Download failed at {download_path}"
539
+ assert download_path.is_file(), msg
486
540
  downloaded_content = download_path.read_text()
487
- assert downloaded_content == test_file_content, (
541
+ msg = (
488
542
  f"Downloaded file content does not match the original. "
489
543
  f"Expected: {test_file_content}, Got: {downloaded_content}"
490
544
  )
545
+ assert downloaded_content == test_file_content, msg
491
546
 
492
547
 
493
548
  def test_verify_fits_file(tel_events_test_file):
@@ -524,3 +579,1140 @@ def test_verify_fits_file_invalid_checksum(broken_checksum):
524
579
  with fits.open(broken_checksum) as hdul:
525
580
  with pytest.raises(FITSVerificationError, match="CHECKSUM verification failed"):
526
581
  verify_fits_checksum(hdul)
582
+
583
+
584
+ def test_ingest_init(storage_mount_path):
585
+ """Test that Ingest initializes correctly with given parameters."""
586
+ ingestion_client = IngestionClient(
587
+ data_path=storage_mount_path,
588
+ rse=ONSITE_RSE,
589
+ vo="ctao",
590
+ scope="acada",
591
+ )
592
+
593
+ ingest = Ingest(
594
+ client=ingestion_client,
595
+ top_dir=storage_mount_path,
596
+ num_workers=3,
597
+ lock_file_path=storage_mount_path / "lockfile.lock",
598
+ polling_interval=0.5,
599
+ check_interval=0.2,
600
+ )
601
+ assert ingest.client == ingestion_client
602
+ assert ingest.top_dir == storage_mount_path
603
+ assert ingest.num_workers == 3
604
+ assert ingest.lock_file_path == storage_mount_path / "lockfile.lock"
605
+ assert ingest.polling_interval == 0.5
606
+ assert ingest.check_interval == 0.2
607
+ assert not ingest.stop_event.is_set() # check stop_event initial state
608
+ assert hasattr(ingest, "result_queue")
609
+ assert hasattr(ingest, "task_counter")
610
+ assert hasattr(ingest, "submitted_tasks")
611
+ assert ingest.task_counter == 0
612
+ assert len(ingest.submitted_tasks) == 0
613
+
614
+
615
+ def test_check_directory_valid(storage_mount_path, tmp_path, caplog):
616
+ """Test _check_directory with a valid, readable directory."""
617
+ ingestion_client = IngestionClient(
618
+ data_path=storage_mount_path,
619
+ rse=ONSITE_RSE,
620
+ vo="ctao",
621
+ scope="acada",
622
+ )
623
+
624
+ ingest_instance = Ingest(
625
+ client=ingestion_client,
626
+ top_dir=tmp_path,
627
+ num_workers=1,
628
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
629
+ polling_interval=0.5,
630
+ check_interval=0.5,
631
+ )
632
+
633
+ ingest_instance.top_dir = tmp_path
634
+ ingest_instance._check_directory()
635
+
636
+
637
+ def test_check_directory_invalid(storage_mount_path, tmp_path, caplog):
638
+ """Test _check_directory with an invalid directory."""
639
+ ingestion_client = IngestionClient(
640
+ data_path=storage_mount_path,
641
+ rse=ONSITE_RSE,
642
+ vo="ctao",
643
+ scope="acada",
644
+ logger=LOGGER,
645
+ )
646
+
647
+ invalid_dir = tmp_path / "nonexistent"
648
+
649
+ ingest_instance = Ingest(
650
+ client=ingestion_client,
651
+ top_dir=invalid_dir,
652
+ num_workers=1,
653
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
654
+ polling_interval=0.5,
655
+ check_interval=0.5,
656
+ )
657
+
658
+ with pytest.raises(RuntimeError, match=f"Cannot read directory {invalid_dir}"):
659
+ ingest_instance._check_directory()
660
+ assert f"Cannot read directory {invalid_dir}" in caplog.text
661
+
662
+
663
+ @pytest.mark.usefixtures("_auth_proxy")
664
+ def test_process_file_success(
665
+ storage_mount_path, caplog, onsite_test_file, test_vo, test_scope
666
+ ):
667
+ """Test for checking successful ingestion with trigger file clean-up, depends on IngestionClient"""
668
+ ingestion_client = IngestionClient(
669
+ data_path=storage_mount_path,
670
+ rse=ONSITE_RSE,
671
+ vo=test_vo,
672
+ scope=test_scope,
673
+ )
674
+
675
+ acada_path, _ = onsite_test_file
676
+ test_file = acada_path
677
+ trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
678
+ trigger_file.symlink_to(test_file)
679
+ result = process_file(ingestion_client, str(test_file))
680
+ assert result == IngestStatus.SUCCESS
681
+ assert not trigger_file.exists()
682
+ assert INGEST_SUCCESS_MESSAGE in caplog.text
683
+
684
+
685
+ @pytest.mark.usefixtures("_auth_proxy")
686
+ def test_process_file_skipped(
687
+ storage_mount_path, caplog, onsite_test_file, test_vo, test_scope
688
+ ):
689
+ """Test for checking skipped ingestion when replica already exists"""
690
+ ingestion_client = IngestionClient(
691
+ data_path=storage_mount_path,
692
+ rse=ONSITE_RSE,
693
+ vo=test_vo,
694
+ scope=test_scope,
695
+ )
696
+
697
+ acada_path, _ = onsite_test_file
698
+ test_file = acada_path
699
+ trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
700
+ trigger_file.symlink_to(test_file)
701
+ process_file(ingestion_client, str(test_file))
702
+ caplog.clear()
703
+ result = process_file(ingestion_client, str(test_file))
704
+ assert result == IngestStatus.SKIPPED
705
+ assert "Replica already exists" in caplog.text
706
+
707
+
708
+ @pytest.mark.usefixtures("_auth_proxy")
709
+ def test_process_file_failure(storage_mount_path, caplog, tmp_path):
710
+ """Test for checking failure for invalid file paths"""
711
+ ingestion_client = IngestionClient(
712
+ data_path=storage_mount_path,
713
+ rse=ONSITE_RSE,
714
+ vo="ctao",
715
+ scope="acada",
716
+ )
717
+
718
+ invalid_file = tmp_path / "invalid_file.fits"
719
+ invalid_file.write_text("dummy content")
720
+ trigger_file = Path(str(invalid_file) + TRIGGER_SUFFIX)
721
+ trigger_file.symlink_to(invalid_file)
722
+
723
+ # The file path is outside the data_path causing a ValueError in acada_to_lfn
724
+ result = process_file(ingestion_client, str(invalid_file))
725
+
726
+ # Verify the function returns FAILURE status instead of raising an exception
727
+ assert result == IngestStatus.FAILURE
728
+
729
+ # Check for the actual error message that gets logged
730
+ assert "Exception in process_file" in caplog.text
731
+ # Verify the file path is in the error message
732
+ assert str(invalid_file) in caplog.text
733
+
734
+ # Verify that no success message was logged
735
+ assert INGEST_SUCCESS_MESSAGE not in caplog.text
736
+
737
+ # Trigger file should still exist since ingestion failed
738
+ msg = "Trigger file should not be removed when ingestion fails"
739
+ assert trigger_file.exists(), msg
740
+
741
+
742
+ def test_trigger_file_handler_init(storage_mount_path):
743
+ """Test TriggerFileHandler initialization."""
744
+ ingestion_client = IngestionClient(
745
+ data_path=storage_mount_path,
746
+ rse=ONSITE_RSE,
747
+ vo="ctao",
748
+ scope="acada",
749
+ )
750
+
751
+ ingest_instance = Ingest(
752
+ client=ingestion_client,
753
+ top_dir=storage_mount_path,
754
+ num_workers=1,
755
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
756
+ polling_interval=0.5,
757
+ check_interval=0.5,
758
+ )
759
+
760
+ handler = TriggerFileHandler(ingest_instance)
761
+ assert handler.ingest == ingest_instance
762
+
763
+
764
+ def test_trigger_file_handler_on_moved_missing_data_file(
765
+ storage_mount_path, tmp_path, caplog
766
+ ):
767
+ """Test on_moved skips when data file is missing."""
768
+ ingestion_client = IngestionClient(
769
+ data_path=storage_mount_path,
770
+ rse=ONSITE_RSE,
771
+ vo="ctao",
772
+ scope="acada",
773
+ )
774
+
775
+ ingest_instance = Ingest(
776
+ client=ingestion_client,
777
+ top_dir=storage_mount_path,
778
+ num_workers=1,
779
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
780
+ polling_interval=0.5,
781
+ check_interval=0.5,
782
+ )
783
+
784
+ handler = TriggerFileHandler(ingest_instance)
785
+ trigger_file = tmp_path / TEST_FILE_TRIGGER
786
+ data_file = tmp_path / "test_file"
787
+
788
+ # Create symlink to non-existent data file
789
+ trigger_file.symlink_to(data_file)
790
+
791
+ # Create FileMovedEvent (simulating ln -s)
792
+ event = FileMovedEvent(src_path=str(data_file), dest_path=str(trigger_file))
793
+ handler.on_moved(event)
794
+
795
+ assert (
796
+ f"Data file {data_file} for trigger {trigger_file} does not exist, skipping"
797
+ in caplog.text
798
+ )
799
+ assert (
800
+ DETECTED_NEW_TRIGGER_FILE not in caplog.text
801
+ ) # Skips processing since the data file is missing
802
+
803
+
804
+ def test_trigger_file_handler_on_moved_success(
805
+ storage_mount_path, tmp_path, onsite_test_file, test_vo, test_scope, caplog
806
+ ):
807
+ """Test on_moved successfully processing a valid trigger file."""
808
+ ingestion_client = IngestionClient(
809
+ data_path=storage_mount_path,
810
+ rse=ONSITE_RSE,
811
+ vo=test_vo,
812
+ scope=test_scope,
813
+ )
814
+
815
+ ingest_instance = Ingest(
816
+ client=ingestion_client,
817
+ top_dir=storage_mount_path,
818
+ num_workers=1,
819
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
820
+ polling_interval=0.5,
821
+ check_interval=0.5,
822
+ )
823
+
824
+ # Create ProcessPoolExecutor for the ingest instance
825
+ with ProcessPoolExecutor(max_workers=1) as executor:
826
+ ingest_instance.executor = executor
827
+
828
+ handler = TriggerFileHandler(ingest_instance)
829
+ acada_path, _ = onsite_test_file
830
+ test_file = acada_path
831
+ trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
832
+ trigger_file.symlink_to(test_file)
833
+
834
+ # Create FileMovedEvent (simulating ln -s)
835
+ event = FileMovedEvent(src_path=str(test_file), dest_path=str(trigger_file))
836
+
837
+ # Record initial state
838
+ initial_task_counter = ingest_instance.task_counter
839
+ initial_total_tasks = ingest_instance.total_tasks_submitted
840
+ initial_submitted_tasks_count = len(ingest_instance.submitted_tasks)
841
+
842
+ handler.on_moved(event)
843
+
844
+ # Verify the expected log message
845
+ msg = f"'Detected new trigger file {trigger_file}, submitting data file {test_file}' in caplog"
846
+ assert (
847
+ f"Detected new trigger file {trigger_file}, submitting data file {test_file}"
848
+ in caplog.text
849
+ ), msg
850
+
851
+ # Verify task submission metrics were updated
852
+ assert ingest_instance.task_counter == initial_task_counter + 1
853
+ assert ingest_instance.total_tasks_submitted == initial_total_tasks + 1
854
+ assert len(ingest_instance.submitted_tasks) == initial_submitted_tasks_count + 1
855
+
856
+ # Verify the task was submitted with correct file path
857
+ submitted_task_files = list(ingest_instance.submitted_tasks.values())
858
+ assert str(test_file) in submitted_task_files
859
+
860
+ # Give some time for the task to potentially complete
861
+ time.sleep(0.5)
862
+
863
+
864
+ def test_trigger_file_handler_on_moved_stop_event_set(
865
+ storage_mount_path, tmp_path, caplog
866
+ ):
867
+ """Test on_moved skips processing when stop_event is set."""
868
+ ingestion_client = IngestionClient(
869
+ data_path=storage_mount_path,
870
+ rse=ONSITE_RSE,
871
+ vo="ctao",
872
+ scope="acada",
873
+ )
874
+
875
+ ingest_instance = Ingest(
876
+ client=ingestion_client,
877
+ top_dir=storage_mount_path,
878
+ num_workers=1,
879
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
880
+ polling_interval=0.5,
881
+ check_interval=0.5,
882
+ )
883
+
884
+ handler = TriggerFileHandler(ingest_instance)
885
+ trigger_file = tmp_path / TEST_FILE_TRIGGER
886
+ data_file = tmp_path / "test_file"
887
+ data_file.write_text("data") # Data file exists
888
+ trigger_file.symlink_to(data_file)
889
+
890
+ # Create FileMovedEvent
891
+ event = FileMovedEvent(src_path=str(data_file), dest_path=str(trigger_file))
892
+
893
+ # Set stop event
894
+ ingest_instance.stop_event.set()
895
+
896
+ # Record initial state
897
+ initial_task_counter = ingest_instance.task_counter
898
+ initial_total_tasks = ingest_instance.total_tasks_submitted
899
+
900
+ try:
901
+ handler.on_moved(event)
902
+
903
+ # Should not process anything when stop_event is set
904
+ assert ingest_instance.task_counter == initial_task_counter
905
+ assert ingest_instance.total_tasks_submitted == initial_total_tasks
906
+ assert DETECTED_NEW_TRIGGER_FILE not in caplog.text
907
+
908
+ finally:
909
+ ingest_instance.stop_event.clear() # Reset for other tests
910
+
911
+
912
+ def test_trigger_file_handler_on_moved_directory_event(
913
+ storage_mount_path, tmp_path, caplog
914
+ ):
915
+ """Test on_moved skips directory events."""
916
+ ingestion_client = IngestionClient(
917
+ data_path=storage_mount_path,
918
+ rse=ONSITE_RSE,
919
+ vo="ctao",
920
+ scope="acada",
921
+ )
922
+
923
+ ingest_instance = Ingest(
924
+ client=ingestion_client,
925
+ top_dir=storage_mount_path,
926
+ num_workers=1,
927
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
928
+ polling_interval=0.5,
929
+ check_interval=0.5,
930
+ )
931
+
932
+ handler = TriggerFileHandler(ingest_instance)
933
+ trigger_dir = tmp_path / "some_directory.trigger"
934
+ source_dir = tmp_path / "source_directory"
935
+ source_dir.mkdir()
936
+ trigger_dir.mkdir()
937
+
938
+ # Create directory move event
939
+ event = FileMovedEvent(src_path=str(source_dir), dest_path=str(trigger_dir))
940
+ event.is_directory = True # mark as directory event
941
+
942
+ # Record initial state
943
+ initial_task_counter = ingest_instance.task_counter
944
+ initial_total_tasks = ingest_instance.total_tasks_submitted
945
+
946
+ handler.on_moved(event)
947
+
948
+ # Should not process directory events
949
+ assert ingest_instance.task_counter == initial_task_counter
950
+ assert ingest_instance.total_tasks_submitted == initial_total_tasks
951
+ assert DETECTED_NEW_TRIGGER_FILE not in caplog.text
952
+
953
+
954
+ def test_trigger_file_handler_on_moved_with_actual_processing(
955
+ storage_mount_path, tmp_path, onsite_test_file, test_vo, test_scope, caplog
956
+ ):
957
+ """Test on_moved with successfully processing a valid trigger file."""
958
+ ingestion_client = IngestionClient(
959
+ data_path=storage_mount_path,
960
+ rse=ONSITE_RSE,
961
+ vo=test_vo,
962
+ scope=test_scope,
963
+ )
964
+
965
+ ingest_instance = Ingest(
966
+ client=ingestion_client,
967
+ top_dir=storage_mount_path,
968
+ num_workers=1,
969
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
970
+ polling_interval=0.5,
971
+ check_interval=0.5,
972
+ )
973
+
974
+ # Start the result processing thread manually for this test
975
+ result_thread = threading.Thread(
976
+ target=ingest_instance._process_results, daemon=True
977
+ )
978
+ result_thread.start()
979
+
980
+ with ProcessPoolExecutor(max_workers=1) as executor:
981
+ ingest_instance.executor = executor
982
+
983
+ handler = TriggerFileHandler(ingest_instance)
984
+ acada_path, _ = onsite_test_file
985
+ test_file = acada_path
986
+ trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
987
+ trigger_file.symlink_to(test_file)
988
+
989
+ # Create FileMovedEvent
990
+ event = FileMovedEvent(src_path=str(test_file), dest_path=str(trigger_file))
991
+
992
+ handler.on_moved(event)
993
+
994
+ # Wait for processing to complete
995
+ timeout = 10.0
996
+ start_time = time.time()
997
+ processed = False
998
+
999
+ while time.time() - start_time < timeout:
1000
+ # Check if task was completed (removed from submitted_tasks)
1001
+ if len(ingest_instance.submitted_tasks) == 0:
1002
+ processed = True
1003
+ break
1004
+ time.sleep(0.1)
1005
+
1006
+ # Stop the result processing thread
1007
+ ingest_instance.stop_event.set()
1008
+ result_thread.join(timeout=2.0)
1009
+
1010
+ # Verify processing occurred
1011
+ msg = "Task was not processed within timeout"
1012
+ assert processed, msg
1013
+
1014
+ msg = f"'Detected new trigger file {trigger_file}, submitting data file {test_file}' in caplog"
1015
+ assert (
1016
+ f"Detected new trigger file {trigger_file}, submitting data file {test_file}"
1017
+ in caplog.text
1018
+ ), msg
1019
+
1020
+ # Check that a result was logged (either success, failure, or error)
1021
+ result_logged = any(
1022
+ phrase in caplog.text
1023
+ for phrase in ["Processed file", "failed:", "Exception in process_file"]
1024
+ )
1025
+ msg = "No processing result was logged"
1026
+ assert result_logged, msg
1027
+
1028
+
1029
+ def test_sequential_exclusion_lock_prevention(storage_mount_path, tmp_path):
1030
+ """Test that a second daemon instance cannot start when first is already running.
1031
+
1032
+ This test validates sequential exclusion: when one ingestion daemon is already
1033
+ running and has acquired the lock, any subsequent attempt to start another
1034
+ daemon instance should fail with a clear error message.
1035
+ """
1036
+ lock_file = tmp_path / "sequential_test.pid"
1037
+
1038
+ ingestion_client = IngestionClient(
1039
+ data_path=storage_mount_path,
1040
+ rse=ONSITE_RSE,
1041
+ vo="ctao",
1042
+ scope="acada",
1043
+ )
1044
+
1045
+ # Create first instance
1046
+ instance1 = Ingest(
1047
+ client=ingestion_client,
1048
+ top_dir=tmp_path,
1049
+ lock_file_path=lock_file,
1050
+ num_workers=1,
1051
+ polling_interval=0.1,
1052
+ check_interval=0.1,
1053
+ )
1054
+
1055
+ # Create second instance with same lock file
1056
+ instance2 = Ingest(
1057
+ client=ingestion_client,
1058
+ top_dir=tmp_path,
1059
+ lock_file_path=lock_file,
1060
+ num_workers=1,
1061
+ polling_interval=0.1,
1062
+ check_interval=0.1,
1063
+ )
1064
+
1065
+ results = {}
1066
+ first_instance_started = threading.Event()
1067
+
1068
+ def run_first_instance():
1069
+ """Run first instance - should succeed and run until manually stopped."""
1070
+ try:
1071
+ # signal: about to start daemon
1072
+ first_instance_started.set()
1073
+ instance1.run()
1074
+ results["first"] = "success"
1075
+ except Exception as e:
1076
+ results["first"] = f"error: {str(e)}"
1077
+
1078
+ def run_second_instance():
1079
+ """Try to run second instance while first is running - should fail with lock conflict."""
1080
+ try:
1081
+ # Verify first instance has actually acquired the lock
1082
+ lock_acquired_timeout = 15.0
1083
+ start_wait = time.time()
1084
+ while time.time() - start_wait < lock_acquired_timeout:
1085
+ if lock_file.exists():
1086
+ break
1087
+ time.sleep(0.1)
1088
+ else:
1089
+ results["second"] = "first_instance_never_acquired_lock"
1090
+ return
1091
+
1092
+ # This should fail because first instance holds the lock
1093
+ instance2.run()
1094
+ results["second"] = "unexpected_success" # Should not reach here
1095
+ except RuntimeError as e:
1096
+ error_msg = str(e)
1097
+ if "Another ingestion process is already running" in error_msg:
1098
+ results["second"] = f"expected_lock_conflict: {str(e)}"
1099
+ else:
1100
+ results["second"] = f"unexpected_runtime_error: {str(e)}"
1101
+ except Exception as e:
1102
+ results["second"] = f"unexpected_error: {str(e)}"
1103
+
1104
+ # Start first instance with non-daemon thread
1105
+ thread1 = threading.Thread(target=run_first_instance, daemon=False)
1106
+ thread1.start()
1107
+
1108
+ # Wait for first instance to signal it's starting
1109
+ msg = "First instance failed to start"
1110
+ assert first_instance_started.wait(timeout=10), msg
1111
+
1112
+ # Give first instance time to acquire lock and initialize
1113
+ time.sleep(3.0)
1114
+
1115
+ # Verify first instance has acquired lock with content validation
1116
+ msg = "First instance should have created PID file"
1117
+ assert lock_file.exists(), msg
1118
+
1119
+ # Read PID and verify it's valid
1120
+ pid_content = lock_file.read_text().strip()
1121
+ msg = f"PID file should contain a number, got: {pid_content}"
1122
+ assert pid_content.isdigit(), msg
1123
+
1124
+ # Verify the lock file contains current process PID or a valid PID
1125
+ current_pid = os.getpid()
1126
+ stored_pid = int(pid_content)
1127
+ # The stored PID should be current process since we're running in same process
1128
+ msg = f"Expected PID {current_pid}, got {stored_pid}"
1129
+ assert stored_pid == current_pid, msg
1130
+
1131
+ # Now try to start second instance - this should fail
1132
+ thread2 = threading.Thread(target=run_second_instance, daemon=False)
1133
+ thread2.start()
1134
+
1135
+ # Wait for second instance to complete with better timeout handling
1136
+ # FileLock timeout is 10 seconds, so we give a bit more time
1137
+ thread2.join(timeout=15)
1138
+
1139
+ # Explicit check for thread completion
1140
+ if thread2.is_alive():
1141
+ # Force stop and fail the test
1142
+ instance1.stop_event.set()
1143
+ thread1.join(timeout=5)
1144
+ pytest.fail("Second instance thread did not complete within expected timeout")
1145
+
1146
+ # Stop first instance now that we've tested the lock
1147
+ instance1.stop_event.set()
1148
+ thread1.join(timeout=10)
1149
+
1150
+ # Ensure first thread also terminates
1151
+ if thread1.is_alive():
1152
+ pytest.fail("First instance thread did not terminate within timeout")
1153
+
1154
+ # Verify results
1155
+ msg = f"Second instance should have completed. Results: {results}"
1156
+ assert "second" in results, msg
1157
+
1158
+ # More specific assertion for expected lock conflict
1159
+ second_result = results["second"]
1160
+ msg = f"Second instance should have failed with lock conflict. Got: {second_result}"
1161
+ assert second_result.startswith("expected_lock_conflict"), msg
1162
+
1163
+ # Verify the error message is the expected one from Ingest class
1164
+ msg = f"Expected specific error message, got: {second_result}"
1165
+ assert "Another ingestion process is already running" in second_result, msg
1166
+
1167
+ # First instance should have run successfully (we stopped it manually)
1168
+ if "first" in results:
1169
+ msg = f"First instance should succeed, got: {results['first']}"
1170
+ assert results["first"] == "success", msg
1171
+
1172
+ # Improved cleanup verification with timeout-based checking
1173
+ cleanup_timeout = 5.0
1174
+ start_cleanup_wait = time.time()
1175
+ while time.time() - start_cleanup_wait < cleanup_timeout:
1176
+ if not lock_file.exists():
1177
+ break
1178
+ time.sleep(0.1)
1179
+
1180
+ msg = "PID file should be cleaned up after first instance stops"
1181
+ assert not lock_file.exists(), msg
1182
+
1183
+ # logging
1184
+ LOGGER.info("Sequential exclusion test completed successfully")
1185
+ LOGGER.info("First instance: %s", results.get("first", "stopped manually"))
1186
+ LOGGER.info("Second instance correctly failed with: %s", second_result)
1187
+
1188
+
1189
+ def test_concurrent_exclusion_lock_prevention(storage_mount_path, tmp_path):
1190
+ """Test FileLock behavior under true concurrent access - simultaneous daemon startup attempts.
1191
+
1192
+ This test validates real concurrent scenario where multiple daemon instances
1193
+ attempt to acquire the same lock simultaneously, simulating race conditions
1194
+ that occur in production environments.
1195
+ """
1196
+ lock_file = tmp_path / "concurrent_test.pid"
1197
+
1198
+ ingestion_client = IngestionClient(
1199
+ data_path=storage_mount_path,
1200
+ rse=ONSITE_RSE,
1201
+ vo="ctao",
1202
+ scope="acada",
1203
+ )
1204
+
1205
+ # Create both instances
1206
+ instance1 = Ingest(
1207
+ client=ingestion_client,
1208
+ top_dir=tmp_path,
1209
+ lock_file_path=lock_file,
1210
+ num_workers=1,
1211
+ polling_interval=0.1,
1212
+ check_interval=0.1,
1213
+ )
1214
+ instance2 = Ingest(
1215
+ client=ingestion_client,
1216
+ top_dir=tmp_path,
1217
+ lock_file_path=lock_file,
1218
+ num_workers=1,
1219
+ polling_interval=0.1,
1220
+ check_interval=0.1,
1221
+ )
1222
+
1223
+ results = {}
1224
+
1225
+ # Synchronization barrier - both threads wait here until released
1226
+ start_barrier = threading.Barrier(3) # 2 worker threads + 1 main thread
1227
+
1228
+ def run_instance(instance_id, instance):
1229
+ """Run instance - both will try to start simultaneously."""
1230
+ try:
1231
+ # Wait for barrier - ensures simultaneous start
1232
+ start_barrier.wait() # All threads start together!
1233
+
1234
+ instance.run()
1235
+ results[instance_id] = "success"
1236
+ except RuntimeError as e:
1237
+ if "Another ingestion process is already running" in str(e):
1238
+ results[instance_id] = f"lock_conflict: {str(e)}"
1239
+ else:
1240
+ results[instance_id] = f"unexpected_error: {str(e)}"
1241
+ except Exception as e:
1242
+ results[instance_id] = f"error: {str(e)}"
1243
+
1244
+ # Create both threads
1245
+ thread1 = threading.Thread(
1246
+ target=run_instance, args=("first", instance1), daemon=False
1247
+ )
1248
+ thread2 = threading.Thread(
1249
+ target=run_instance, args=("second", instance2), daemon=False
1250
+ )
1251
+
1252
+ # Start both threads - they will wait at the barrier
1253
+ thread1.start()
1254
+ thread2.start()
1255
+
1256
+ # Give threads time to reach barrier
1257
+ time.sleep(0.5)
1258
+
1259
+ # Release the barrier - both threads start simultaneously
1260
+ start_barrier.wait()
1261
+
1262
+ # Wait for both to complete the lock acquisition attempt
1263
+ thread1.join(timeout=15)
1264
+ thread2.join(timeout=15)
1265
+
1266
+ # Stop whichever instance succeeded
1267
+ if "first" in results and results["first"] == "success":
1268
+ instance1.stop_event.set()
1269
+ if "second" in results and results["second"] == "success":
1270
+ instance2.stop_event.set()
1271
+
1272
+ # Ensure threads complete
1273
+ if thread1.is_alive():
1274
+ instance1.stop_event.set()
1275
+ thread1.join(timeout=5)
1276
+ if thread2.is_alive():
1277
+ instance2.stop_event.set()
1278
+ thread2.join(timeout=5)
1279
+
1280
+ # Verify results - Exactly ONE should succeed, ONE should fail
1281
+ msg = f"Both instances should complete, got: {results}"
1282
+ assert len(results) == 2, msg
1283
+
1284
+ success_count = sum(1 for result in results.values() if result == "success")
1285
+ conflict_count = sum(1 for result in results.values() if "lock_conflict" in result)
1286
+
1287
+ msg = f"Exactly ONE instance should succeed, got {success_count}: {results}"
1288
+ assert success_count == 1, msg
1289
+
1290
+ msg = f"Exactly ONE instance should get lock conflict, got {conflict_count}: {results}"
1291
+ assert conflict_count == 1, msg
1292
+
1293
+ # Verify the lock conflict has correct error message
1294
+ conflict_result = [r for r in results.values() if "lock_conflict" in r][0]
1295
+ msg = "Expected 'Another ingestion process is already running' message in conflict result"
1296
+ assert "Another ingestion process is already running" in conflict_result, msg
1297
+
1298
+ # Verify cleanup
1299
+ cleanup_timeout = 5.0
1300
+ start_cleanup = time.time()
1301
+ while time.time() - start_cleanup < cleanup_timeout:
1302
+ if not lock_file.exists():
1303
+ break
1304
+ time.sleep(0.1)
1305
+ msg = "Lock file should be cleaned up"
1306
+ assert not lock_file.exists(), msg
1307
+
1308
+ LOGGER.info("True Concurrency tests: %s", results)
1309
+ LOGGER.info("Real concurrent lock acquisition tested successfully!")
1310
+
1311
+
1312
+ def acada_write_test_files(
1313
+ storage_mount_path, test_vo, test_scope, n_files=7
1314
+ ) -> list[Path]:
1315
+ """Represents ACADA writing test files to the storage mount path."""
1316
+
1317
+ test_dir = storage_mount_path / test_vo / test_scope
1318
+ test_dir.mkdir(parents=True, exist_ok=True)
1319
+
1320
+ # Create n_files dummy FITS files
1321
+ data_files = []
1322
+ rng = np.random.default_rng()
1323
+ for i in range(n_files):
1324
+ data_file = test_dir / f"testfile_{i}_20250609.fits"
1325
+ hdu = fits.PrimaryHDU(rng.random((50, 50)))
1326
+ hdu.writeto(data_file, overwrite=True, checksum=True)
1327
+ data_files.append(data_file)
1328
+
1329
+ LOGGER.info("Created test file: %s", data_file)
1330
+
1331
+ # Reset permissions before the daemon starts to avoid timing issues
1332
+ reset_xrootd_permissions(storage_mount_path)
1333
+ time.sleep(1.0) # Allow permissions to be applied
1334
+
1335
+ return data_files
1336
+
1337
+
1338
+ def acada_create_trigger_symlink(data_file, creation_results):
1339
+ """Represents creating a trigger symlink for a given data file."""
1340
+
1341
+ try:
1342
+ trigger_file = Path(str(data_file) + TRIGGER_SUFFIX)
1343
+ trigger_file.symlink_to(data_file)
1344
+ LOGGER.info("Created trigger file: %s -> %s", trigger_file, data_file)
1345
+
1346
+ # Verify creation was successful
1347
+ if trigger_file.exists() and trigger_file.is_symlink():
1348
+ creation_results.append({"file": str(data_file), "status": "success"})
1349
+ else:
1350
+ creation_results.append(
1351
+ {"file": str(data_file), "status": "creation_failed"}
1352
+ )
1353
+ except Exception as e:
1354
+ LOGGER.exception("Failed to create trigger for %s: %s", data_file, e)
1355
+ creation_results.append({"file": str(data_file), "status": f"error: {str(e)}"})
1356
+
1357
+ return creation_results
1358
+
1359
+
1360
+ def ensure_files_ingested(data_files, storage_mount_path, test_scope, timeout_s=120):
1361
+ """Ensure that all files are ingested by checking the IngestStatus."""
1362
+
1363
+ replica_client = ReplicaClient()
1364
+
1365
+ timeout_at = time.time() + timeout_s
1366
+
1367
+ data_file_entries = [
1368
+ {
1369
+ "file": str(data_file),
1370
+ "expected_lfn": f"/{data_file.relative_to(storage_mount_path)}",
1371
+ "found": False,
1372
+ }
1373
+ for data_file in data_files
1374
+ ]
1375
+
1376
+ while time.time() < timeout_at and not all(
1377
+ status["found"] for status in data_file_entries
1378
+ ):
1379
+ for data_file_entry in data_file_entries:
1380
+ if not data_file_entry["found"]:
1381
+ try:
1382
+ replicas = list(
1383
+ replica_client.list_replicas(
1384
+ dids=[
1385
+ {
1386
+ "scope": test_scope,
1387
+ "name": data_file_entry["expected_lfn"],
1388
+ }
1389
+ ]
1390
+ )
1391
+ )
1392
+ if not replicas:
1393
+ LOGGER.info(
1394
+ "No replica found for %s", data_file_entry["expected_lfn"]
1395
+ )
1396
+ else:
1397
+ LOGGER.info(
1398
+ "Replica found for %s: %s",
1399
+ data_file_entry["expected_lfn"],
1400
+ replicas[0],
1401
+ )
1402
+ data_file_entry["found"] = True
1403
+ except Exception:
1404
+ LOGGER.exception(
1405
+ "Failed to list replicas for %s",
1406
+ data_file_entry["expected_lfn"],
1407
+ )
1408
+ time.sleep(1.0)
1409
+
1410
+ if not all(status["found"] for status in data_file_entries):
1411
+ pytest.fail(f"Not all replicas found for files: {data_files}")
1412
+
1413
+
1414
+ @pytest.mark.usefixtures(
1415
+ "_auth_proxy", "lock_for_ingestion_daemon", "disable_ingestion_daemon"
1416
+ )
1417
+ @pytest.mark.verifies_usecase("UC-110-1.1.4")
1418
+ def test_ingest_parallel_submission(storage_mount_path, caplog, test_vo, test_scope):
1419
+ """Test parallel file processing: creates multiple FITS files simultaneously and verifies that the
1420
+ daemon can detect, process, and ingest them efficiently using parallel workers.
1421
+ """
1422
+ ingestion_client = IngestionClient(
1423
+ data_path=storage_mount_path,
1424
+ rse=ONSITE_RSE,
1425
+ vo=test_vo,
1426
+ scope=test_scope,
1427
+ )
1428
+
1429
+ ingest_instance = Ingest(
1430
+ client=ingestion_client,
1431
+ top_dir=storage_mount_path,
1432
+ num_workers=4,
1433
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
1434
+ polling_interval=0.5,
1435
+ check_interval=0.5,
1436
+ )
1437
+
1438
+ data_files = acada_write_test_files(storage_mount_path, test_vo, test_scope)
1439
+
1440
+ # Daemon startup with exception handling
1441
+ daemon_exception = None
1442
+ daemon_started = threading.Event()
1443
+
1444
+ def run_daemon():
1445
+ """Run daemon with exception capture."""
1446
+ nonlocal daemon_exception
1447
+ try:
1448
+ daemon_started.set() # Signal daemon thread started
1449
+ ingest_instance.run()
1450
+ except Exception as e:
1451
+ daemon_exception = e
1452
+ LOGGER.exception("Daemon failed with exception: %s", str(e))
1453
+
1454
+ # Start daemon with non-daemon thread for reliability
1455
+ daemon_thread = threading.Thread(target=run_daemon, daemon=False)
1456
+ daemon_thread.start()
1457
+
1458
+ # Wait for daemon thread to start
1459
+ msg = "Daemon thread failed to start"
1460
+ assert daemon_started.wait(timeout=10), msg
1461
+
1462
+ # Daemon initialization verification
1463
+ daemon_init_timeout = 20.0 # Increased timeout for robust initialization
1464
+ daemon_init_start = time.time()
1465
+ required_conditions = {
1466
+ "lock_acquired": False,
1467
+ "result_thread_started": False,
1468
+ "pool_started": False,
1469
+ "monitoring_started": False,
1470
+ "observer_started": False,
1471
+ }
1472
+
1473
+ while time.time() - daemon_init_start < daemon_init_timeout:
1474
+ # Check for daemon startup failure early
1475
+ if daemon_exception:
1476
+ pytest.fail(f"Daemon failed during initialization: {daemon_exception}")
1477
+
1478
+ # Check for lock acquisition (critical for daemon operation)
1479
+ if ingest_instance.lock_file_path.exists():
1480
+ required_conditions["lock_acquired"] = True
1481
+
1482
+ # Check log messages for initialization steps
1483
+ log_text = caplog.text
1484
+ if "Result processing thread started" in log_text:
1485
+ required_conditions["result_thread_started"] = True
1486
+
1487
+ # Flexible process pool verification to work with any worker count
1488
+ if re.search(r"Started process pool with \d+ workers", log_text):
1489
+ required_conditions["pool_started"] = True
1490
+
1491
+ if "Starting continuous polling-based monitoring" in log_text:
1492
+ required_conditions["monitoring_started"] = True
1493
+ if "File monitoring observer started successfully" in log_text:
1494
+ required_conditions["observer_started"] = True
1495
+
1496
+ # Check if all conditions are met
1497
+ if all(required_conditions.values()):
1498
+ break
1499
+
1500
+ time.sleep(0.2)
1501
+
1502
+ # Verify complete initialization or provide diagnostics
1503
+ missing_conditions = [k for k, v in required_conditions.items() if not v]
1504
+ if missing_conditions:
1505
+ ingest_instance.stop_event.set()
1506
+ daemon_thread.join(timeout=5)
1507
+ pytest.fail(
1508
+ f"Daemon initialization incomplete. Missing: {missing_conditions}. Check logs for errors."
1509
+ )
1510
+
1511
+ time.sleep(0.5) # some additional time to stabilize
1512
+
1513
+ # Create trigger files and track them
1514
+ trigger_files = []
1515
+ natural_start = time.time()
1516
+
1517
+ for data_file in data_files:
1518
+ trigger_file = Path(str(data_file) + TRIGGER_SUFFIX)
1519
+ trigger_file.symlink_to(data_file)
1520
+ trigger_files.append(trigger_file)
1521
+
1522
+ # Test regular detection, looking for MOVE events
1523
+ natural_detection_timeout = 30.0
1524
+ natural_start = time.time()
1525
+
1526
+ while time.time() - natural_start < natural_detection_timeout:
1527
+ # Look for actual processing
1528
+ if caplog.text.count("Detected new trigger file") > 0:
1529
+ break
1530
+ time.sleep(1.0)
1531
+
1532
+ # Count events after the loop completes
1533
+ move_events_detected = caplog.text.count("MOVE Event received")
1534
+
1535
+ # Wait for processing with concurrency monitoring
1536
+ processing_timeout = 120.0
1537
+ processing_start = time.time()
1538
+ processed_files = set()
1539
+ max_concurrent_samples = []
1540
+
1541
+ while time.time() - processing_start < processing_timeout:
1542
+ # Sample concurrent tasks frequently to catch parallelism
1543
+ current_concurrent = len(ingest_instance.submitted_tasks)
1544
+ max_concurrent_samples.append(current_concurrent)
1545
+
1546
+ # Check processing results
1547
+ for data_file in data_files:
1548
+ success_pattern = f"Processed file {data_file} with result success"
1549
+ skipped_pattern = f"Processed file {data_file} with result skipped"
1550
+
1551
+ if str(data_file) not in processed_files:
1552
+ if success_pattern in caplog.text or skipped_pattern in caplog.text:
1553
+ processed_files.add(str(data_file))
1554
+
1555
+ if len(processed_files) == 7:
1556
+ break
1557
+
1558
+ if "Fatal error in result processing thread" in caplog.text:
1559
+ break
1560
+
1561
+ time.sleep(0.1) # Sample frequently to catch concurrency
1562
+
1563
+ assert len(processed_files) == 7
1564
+
1565
+ # Record ingestion workflow completion time
1566
+ workflow_end_time = time.time()
1567
+
1568
+ # Stop the daemon
1569
+ ingest_instance.stop_event.set()
1570
+ daemon_thread.join(timeout=10)
1571
+
1572
+ if daemon_thread.is_alive():
1573
+ pytest.fail("Ingest Daemon thread did not terminate within timeout")
1574
+
1575
+ # Verify results
1576
+ msg = "Process pool startup failed"
1577
+ assert "Started process pool with 4 workers" in caplog.text, msg
1578
+
1579
+ msg = "Result processing thread startup failed"
1580
+ assert "Result processing thread started" in caplog.text, msg
1581
+
1582
+ # Verify trigger files were cleaned up during successful processing
1583
+ remaining_triggers = sum(1 for tf in trigger_files if tf.exists())
1584
+ msg = f"Expected all trigger files to be cleaned up, {remaining_triggers} remain"
1585
+ assert remaining_triggers == 0, msg
1586
+
1587
+ # Verify clean shutdown
1588
+ msg = "Lock file not cleaned up"
1589
+ assert not ingest_instance.lock_file_path.exists(), msg
1590
+
1591
+ msg = "Daemon shutdown not logged"
1592
+ assert "Stopped ingestion daemon" in caplog.text, msg
1593
+
1594
+ msg = "Result thread shutdown not logged"
1595
+ assert "Result processing thread stopped" in caplog.text, msg
1596
+
1597
+ # Clean up data files
1598
+ for data_file in data_files:
1599
+ if data_file.exists():
1600
+ data_file.unlink()
1601
+
1602
+ # Statistics
1603
+ # Ingestion workflow time: from trigger detection to ingestion with replication completion
1604
+ max_concurrent_observed = (
1605
+ max(max_concurrent_samples) if max_concurrent_samples else 0
1606
+ )
1607
+ max_concurrent_tracked = ingest_instance.max_concurrent_tasks
1608
+
1609
+ detection_to_completion_time = workflow_end_time - natural_start
1610
+ processing_rate = (
1611
+ len(processed_files) / detection_to_completion_time
1612
+ if detection_to_completion_time > 0
1613
+ else 0
1614
+ )
1615
+
1616
+ total_submitted = ingest_instance.total_tasks_submitted
1617
+ tasks_cleaned_up = len(ingest_instance.submitted_tasks) == 0
1618
+ max_concurrent_final = max(max_concurrent_tracked, max_concurrent_observed)
1619
+ parallel_achieved = max_concurrent_final >= 2
1620
+
1621
+ # Summary
1622
+ status = "parallel" if parallel_achieved else "sequential"
1623
+
1624
+ LOGGER.info("=== Parallel Ingestion Test Results ===")
1625
+ LOGGER.info(
1626
+ "Files processed: %d/7 in %.1fs",
1627
+ len(processed_files),
1628
+ detection_to_completion_time,
1629
+ )
1630
+ LOGGER.info("Processing rate: %.1f files/sec", processing_rate)
1631
+ LOGGER.info("Max concurrent tasks: %d (mode: %s)", max_concurrent_final, status)
1632
+ LOGGER.info("Total tasks submitted: %d", total_submitted)
1633
+ LOGGER.info("Task cleanup successful: %s", tasks_cleaned_up)
1634
+ LOGGER.info("Event detection: %d move events", move_events_detected)
1635
+
1636
+
1637
+ def fetch_ingestion_daemon_metrics():
1638
+ """Fetch metrics from the ingestion daemon to verify its operation."""
1639
+
1640
+ response = urlopen("http://bdms-ingestion-daemon:8000/")
1641
+
1642
+ assert response.status == 200, "Ingestion daemon metrics are not responding"
1643
+
1644
+ n_tasks_metrics = {}
1645
+ for line in response.readlines():
1646
+ line = line.decode("utf-8").strip()
1647
+ if line.startswith("n_tasks_"):
1648
+ LOGGER.info("Ingestion daemon metrics: %s", line)
1649
+ key, value = line.split(" ", 1)
1650
+ n_tasks_metrics[key] = float(value)
1651
+
1652
+ return n_tasks_metrics
1653
+
1654
+
1655
+ @pytest.mark.usefixtures(
1656
+ "_auth_proxy", "lock_for_ingestion_daemon", "enable_ingestion_daemon"
1657
+ )
1658
+ @pytest.mark.verifies_usecase("UC-110-1.1.4")
1659
+ def test_ingest_parallel_submission_with_live_daemon(storage_mount_path, test_vo):
1660
+ """Test parallel file processing with an already running daemon."""
1661
+
1662
+ # With the live test, the daemon is deployed outside of this test, so we need to pick a persistent location matching the daemon's storage mount path
1663
+ # Note that if the kind cluster creation fixture is used, the directory can be unique per test session
1664
+ # This test only checks that the files are consumed, not that they are replicated
1665
+
1666
+ test_scope = "test_scope_persistent"
1667
+
1668
+ n_tasks_metrics_before_test = fetch_ingestion_daemon_metrics()
1669
+
1670
+ for tf in (storage_mount_path / test_vo / test_scope).glob("*" + TRIGGER_SUFFIX):
1671
+ if tf.exists():
1672
+ LOGGER.info("Cleaning up existing trigger file: %s", tf)
1673
+ tf.unlink()
1674
+
1675
+ data_files = acada_write_test_files(storage_mount_path, test_vo, test_scope)
1676
+
1677
+ creation_results = []
1678
+ for data_file in data_files:
1679
+ acada_create_trigger_symlink(data_file, creation_results)
1680
+
1681
+ trigger_files = [Path(str(df) + TRIGGER_SUFFIX) for df in data_files]
1682
+
1683
+ timeout = 120.0
1684
+ start_time = time.time()
1685
+
1686
+ remaining_triggers = 0
1687
+ while time.time() - start_time < timeout:
1688
+ # Verify trigger files were cleaned up during successful processing
1689
+ remaining_triggers = sum(1 for tf in trigger_files if tf.exists())
1690
+
1691
+ if remaining_triggers == 0:
1692
+ LOGGER.info("All trigger files consumed up successfully, exiting test.")
1693
+ break
1694
+ else:
1695
+ LOGGER.info(
1696
+ "Waiting for trigger files to be cleaned up, %s remain.",
1697
+ remaining_triggers,
1698
+ )
1699
+
1700
+ time.sleep(1.0)  # Poll until the trigger files are consumed
1701
+
1702
+ assert remaining_triggers == 0, "Expected all trigger files to be consumed"
1703
+
1704
+ ensure_files_ingested(data_files, storage_mount_path, test_scope)
1705
+
1706
+ # make sure that metrics are available from the daemon
1707
+ n_tasks_metrics = fetch_ingestion_daemon_metrics()
1708
+
1709
+ assert n_tasks_metrics["n_tasks_success_created"] < time.time()
1710
+ assert n_tasks_metrics["n_tasks_processed_total"] - n_tasks_metrics_before_test[
1711
+ "n_tasks_processed_total"
1712
+ ] == len(data_files)
1713
+ assert (
1714
+ n_tasks_metrics["n_tasks_processed_total"]
1715
+ - n_tasks_metrics_before_test["n_tasks_processed_total"]
1716
+ == n_tasks_metrics["n_tasks_success_total"]
1717
+ + n_tasks_metrics["n_tasks_skipped_total"]
1718
+ ), "Ingestion daemon metrics do not match expected values"