ctao-bdms-clients 0.2.0rc1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,11 +6,16 @@ and the replication of data between Rucio storage elements (RSEs).
6
6
 
7
7
  import logging
8
8
  import os
9
+ import re
9
10
  import subprocess
10
- from datetime import datetime
11
+ import threading
12
+ import time
13
+ from concurrent.futures import ProcessPoolExecutor
11
14
  from pathlib import Path
12
- from secrets import token_hex
15
+ from shutil import copy2
16
+ from urllib.request import urlopen
13
17
 
18
+ import numpy as np
14
19
  import pytest
15
20
  from astropy.io import fits
16
21
  from astropy.table import Table
@@ -20,35 +25,34 @@ from rucio.client.replicaclient import ReplicaClient
20
25
  from rucio.client.ruleclient import RuleClient
21
26
  from rucio.common.exception import RucioException
22
27
  from rucio.common.utils import adler32
23
-
24
- from bdms.acada_ingestion import IngestionClient
25
- from bdms.tests.utils import wait_for_replication_status
28
+ from watchdog.events import FileMovedEvent
29
+
30
+ from bdms.acada_ingestion import (
31
+ DETECTED_NEW_TRIGGER_FILE,
32
+ INGEST_SUCCESS_MESSAGE,
33
+ TRIGGER_SUFFIX,
34
+ Ingest,
35
+ IngestionClient,
36
+ IngestStatus,
37
+ TriggerFileHandler,
38
+ process_file,
39
+ )
40
+ from bdms.tests.utils import reset_xrootd_permissions, wait_for_replication_status
26
41
 
27
42
  LOGGER = logging.getLogger(__name__)
28
43
 
29
- XROOTD_UID = 994
30
- XROOTD_GID = 994
31
44
  ONSITE_RSE = "STORAGE-1"
32
45
  OFFSITE_RSE_1 = "STORAGE-2"
33
46
  OFFSITE_RSE_2 = "STORAGE-3"
34
47
 
48
+ TEST_FILE_TRIGGER = "test_file.trigger"
49
+
35
50
 
36
51
  def test_shared_storage(storage_mount_path: Path):
37
52
  """Test that the shared storage path is available."""
38
53
 
39
- assert (
40
- storage_mount_path.exists()
41
- ), f"Shared storage {storage_mount_path} is not available on the client"
42
-
43
-
44
- def recursive_chown(path: Path, uid: int, gid: int):
45
- """Equivalent of unix chmod -R <uid>:<gid> <path>."""
46
- for root, dirs, files in os.walk(path):
47
- root = Path(root)
48
- for d in dirs:
49
- os.chown(root / d, uid, gid)
50
- for f in files:
51
- os.chown(root / f, uid, gid)
54
+ msg = f"Shared storage {storage_mount_path} is not available on the client"
55
+ assert storage_mount_path.exists(), msg
52
56
 
53
57
 
54
58
  def trigger_judge_repairer() -> None:
@@ -80,30 +84,6 @@ def trigger_judge_repairer() -> None:
80
84
  raise
81
85
 
82
86
 
83
- @pytest.fixture
84
- def test_file(
85
- storage_mount_path: Path, test_scope: str, test_vo: str
86
- ) -> tuple[Path, str]:
87
- """Create a dummy .fits.fz file in the shared storage for testing."""
88
-
89
- unique_id = f"{datetime.now():%Y%m%d_%H%M%S}_{token_hex(8)}"
90
- filename = f"testfile_{unique_id}.fits.fz"
91
-
92
- test_file_path = storage_mount_path / test_vo / test_scope / filename
93
- test_file_path.parent.mkdir(parents=True, exist_ok=True)
94
-
95
- # need to change file permissions of created directories so that
96
- # the xrootd still can read and write there
97
- recursive_chown(storage_mount_path / test_vo, XROOTD_UID, XROOTD_GID)
98
-
99
- # Write a small test content (simulating a .fits.fz file with minimal content for testing)
100
- test_file_content = f"FITS-like content for {unique_id}"
101
- test_file_path.write_text(test_file_content)
102
- os.chown(test_file_path, XROOTD_UID, XROOTD_GID)
103
-
104
- return test_file_path, test_file_content
105
-
106
-
107
87
  def test_acada_to_lfn(storage_mount_path: Path, test_vo: str):
108
88
  """Test the acada_to_lfn method of IngestionClient with valid and invalid inputs."""
109
89
 
@@ -121,7 +101,8 @@ def test_acada_to_lfn(storage_mount_path: Path, test_vo: str):
121
101
  )
122
102
  lfn = ingestion_client.acada_to_lfn(acada_path=acada_path)
123
103
 
124
- assert lfn == expected_lfn, f"Expected {expected_lfn}, got {lfn}"
104
+ msg = f"Expected {expected_lfn}, got {lfn}"
105
+ assert lfn == expected_lfn, msg
125
106
 
126
107
  # Test Case 2: Non-absolute acada_path (empty string)
127
108
  with pytest.raises(ValueError, match="acada_path must be absolute"):
@@ -151,15 +132,21 @@ def test_acada_to_lfn(storage_mount_path: Path, test_vo: str):
151
132
 
152
133
  @pytest.mark.usefixtures("_auth_proxy")
153
134
  def test_check_replica_exists(
154
- storage_mount_path: Path, test_scope: str, test_file: tuple[Path, str], test_vo: str
135
+ storage_mount_path: Path,
136
+ test_scope: str,
137
+ onsite_test_file: tuple[Path, str],
138
+ test_vo: str,
155
139
  ):
156
140
  """Test the check_replica_exists method of IngestionClient."""
157
141
 
158
142
  ingestion_client = IngestionClient(
159
- storage_mount_path, ONSITE_RSE, scope=test_scope, vo=test_vo
143
+ data_path=storage_mount_path,
144
+ rse=ONSITE_RSE,
145
+ vo=test_vo,
146
+ scope=test_scope,
160
147
  )
161
148
 
162
- acada_path, _ = test_file
149
+ acada_path, _ = onsite_test_file
163
150
 
164
151
  # Generate the LFN
165
152
  lfn = ingestion_client.acada_to_lfn(acada_path)
@@ -181,10 +168,54 @@ def test_check_replica_exists(
181
168
  assert not ingestion_client.check_replica_exists(nonexistent_lfn), msg
182
169
 
183
170
 
171
+ @pytest.fixture
172
+ def file_location(request):
173
+ return request.getfixturevalue(request.param)
174
+
175
+
176
+ @pytest.mark.parametrize(
177
+ ("file_location", "metadata_dict"),
178
+ [
179
+ (
180
+ "subarray_test_file",
181
+ {
182
+ "observatory": "CTA",
183
+ "start_time": "2025-02-04T21:34:05",
184
+ "end_time": "2025-02-04T21:43:12",
185
+ "subarray_id": 0,
186
+ "sb_id": 2000000066,
187
+ "obs_id": 2000000200,
188
+ },
189
+ ),
190
+ (
191
+ "tel_trigger_test_file",
192
+ {
193
+ "observatory": "CTA",
194
+ "start_time": "2025-02-04T21:34:05",
195
+ "end_time": "2025-02-04T21:43:11",
196
+ "tel_ids": [1],
197
+ "sb_id": 2000000066,
198
+ "obs_id": 2000000200,
199
+ },
200
+ ),
201
+ (
202
+ "tel_events_test_file",
203
+ {
204
+ "observatory": "CTA",
205
+ "start_time": "2025-04-01T15:25:02",
206
+ "end_time": "2025-04-01T15:25:03",
207
+ "sb_id": 0,
208
+ "obs_id": 0,
209
+ },
210
+ ),
211
+ ],
212
+ indirect=["file_location"],
213
+ )
184
214
  @pytest.mark.usefixtures("_auth_proxy")
185
215
  @pytest.mark.verifies_usecase("UC-110-1.1.1")
186
- def test_add_onsite_replica_with_dummy_file(
187
- test_file: tuple[Path, str],
216
+ def test_add_onsite_replica_with_minio_fits_file(
217
+ file_location: str,
218
+ metadata_dict: dict,
188
219
  test_scope: str,
189
220
  tmp_path: Path,
190
221
  storage_mount_path,
@@ -194,16 +225,28 @@ def test_add_onsite_replica_with_dummy_file(
194
225
  """Test the add_onsite_replica method of IngestionClient using a dummy file."""
195
226
 
196
227
  ingestion_client = IngestionClient(
197
- storage_mount_path, ONSITE_RSE, scope=test_scope, vo=test_vo
228
+ data_path=storage_mount_path,
229
+ rse=ONSITE_RSE,
230
+ vo=test_vo,
231
+ scope=test_scope,
198
232
  )
199
233
 
200
- acada_path, test_file_content = test_file
234
+ filename = str(file_location).split("/")[-1]
235
+ acada_path = storage_mount_path / test_vo / test_scope / filename
236
+ acada_path.parent.mkdir(parents=True, exist_ok=True)
237
+ copy2(file_location, str(acada_path))
238
+ reset_xrootd_permissions(storage_mount_path)
239
+
201
240
  # Use add_onsite_replica to register the replica
202
- lfn = ingestion_client.add_onsite_replica(acada_path=acada_path)
241
+ lfn, skipped = ingestion_client.add_onsite_replica(acada_path=acada_path)
203
242
 
204
243
  # Verify the LFN matches the expected LFN
205
244
  expected_lfn = ingestion_client.acada_to_lfn(acada_path)
206
- assert lfn == expected_lfn, f"Expected LFN {expected_lfn}, got {lfn}"
245
+ msg = f"Expected LFN {expected_lfn}, got {lfn}"
246
+ assert lfn == expected_lfn, msg
247
+
248
+ msg = "Expected the file to be newly ingested, but it was skipped"
249
+ assert not skipped, msg
207
250
 
208
251
  # Download the file using the LFN
209
252
  download_spec = {
@@ -216,20 +259,43 @@ def test_add_onsite_replica_with_dummy_file(
216
259
 
217
260
  # Verify the downloaded file
218
261
  download_path = tmp_path / lfn.lstrip("/")
219
- assert download_path.is_file(), f"Download failed at {download_path}"
262
+ msg = f"Download failed at {download_path}"
263
+ assert download_path.is_file(), msg
220
264
 
221
- downloaded_content = download_path.read_text()
222
- assert downloaded_content == test_file_content, (
223
- f"Downloaded file content does not match the original. "
224
- f"Expected: {test_file_content}, Got: {downloaded_content}"
225
- )
265
+ msg = "Downloaded file content does not match the original."
266
+ assert adler32(download_path) == adler32(file_location), msg
226
267
 
227
268
  # Check for don't ingest again if its already registered
228
269
  caplog.clear()
229
- lfn = ingestion_client.add_onsite_replica(acada_path=acada_path)
270
+ lfn_check, skipped_check = ingestion_client.add_onsite_replica(
271
+ acada_path=acada_path
272
+ )
273
+ msg = f"LFN mismatch on second ingestion attempt: expected {lfn}, got {lfn_check}"
274
+ assert lfn_check == lfn, msg
275
+
276
+ msg = (
277
+ "Expected the file to be skipped on second ingestion, but it was ingested again"
278
+ )
279
+ assert skipped_check, msg
280
+
281
+ msg = f"'Replica already exists for lfn '{lfn}', skipping' in caplog records"
230
282
  assert f"Replica already exists for lfn '{lfn}', skipping" in [
231
283
  r.message for r in caplog.records
232
- ]
284
+ ], msg
285
+
286
+ # Retrieve metadata using the DIDClient
287
+ did_client = Client()
288
+ retrieved_metadata = did_client.get_metadata(
289
+ scope=ingestion_client.scope, name=lfn, plugin="JSON"
290
+ )
291
+
292
+ # Verify the metadata matches the expected metadata
293
+ for key, value in metadata_dict.items():
294
+ msg = (
295
+ f"Metadata mismatch for key '{key}'. "
296
+ f"Expected: {value}, Got: {retrieved_metadata.get(key)}"
297
+ )
298
+ assert retrieved_metadata.get(key) == value, msg
233
299
 
234
300
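
Note: the content check above compares rucio.common.utils.adler32 digests instead of raw file contents. That helper returns a zero-padded 8-character hex digest; a stand-alone equivalent, as a sketch only (adler32_hex is an illustrative name, not a BDMS or Rucio function):

    import zlib

    def adler32_hex(path, chunk_size=1024 * 1024):
        """Stream a file and return its Adler-32 digest as 8 hex characters."""
        value = 1  # Adler-32 starts at 1 (RFC 1950)
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                value = zlib.adler32(chunk, value)
        return f"{value & 0xFFFFFFFF:08x}"
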
 
235
301
  def test_rses():
@@ -238,21 +304,26 @@ def test_rses():
238
304
  result = list(client.list_rses())
239
305
 
240
306
  rses = [r["rse"] for r in result]
241
- assert ONSITE_RSE in rses, f"Expected RSE {ONSITE_RSE} not found in {rses}"
242
- assert OFFSITE_RSE_1 in rses, f"Expected RSE {OFFSITE_RSE_1} not found in {rses}"
243
- assert OFFSITE_RSE_2 in rses, f"Expected RSE {OFFSITE_RSE_2} not found in {rses}"
307
+ msg = f"Expected RSE {ONSITE_RSE} not found in {rses}"
308
+ assert ONSITE_RSE in rses, msg
309
+
310
+ msg = f"Expected RSE {OFFSITE_RSE_1} not found in {rses}"
311
+ assert OFFSITE_RSE_1 in rses, msg
312
+
313
+ msg = f"Expected RSE {OFFSITE_RSE_2} not found in {rses}"
314
+ assert OFFSITE_RSE_2 in rses, msg
244
315
 
245
316
 
246
317
  @pytest.fixture
247
318
  def pre_existing_lfn(
248
- test_file: tuple[Path, str],
319
+ onsite_test_file: tuple[Path, str],
249
320
  test_scope: str,
250
321
  test_vo: str,
251
322
  ) -> str:
252
323
  """Fixture to provide an LFN for a replica pre-registered in Rucio without using IngestionClient."""
253
324
 
254
325
  # Construct the LFN manually based on the test file and scope
255
- acada_path, _ = test_file
326
+ acada_path, _ = onsite_test_file
256
327
  relative_path = str(acada_path).split(f"{test_vo}/{test_scope}/", 1)[-1]
257
328
  lfn = f"/{test_vo}/{test_scope}/{relative_path}"
258
329
  checksum = adler32(acada_path)
@@ -281,9 +352,8 @@ def pre_existing_lfn(
281
352
 
282
353
  # Verify the replica is registered
283
354
  replicas = list(replica_client.list_replicas(dids=[did]))
284
- assert (
285
- replicas
286
- ), f"Failed to verify pre-registration of replica for LFN {lfn} on {ONSITE_RSE}"
355
+ msg = f"Failed to verify pre-registration of replica for LFN {lfn} on {ONSITE_RSE}"
356
+ assert replicas, msg
287
357
 
288
358
  return lfn
289
359
 
@@ -296,20 +366,22 @@ def test_add_offsite_replication_rules(
296
366
  test_vo: str,
297
367
  storage_mount_path: Path,
298
368
  tmp_path: Path,
299
- test_file: tuple[Path, str],
369
+ onsite_test_file: tuple[Path, str],
300
370
  caplog,
301
371
  ):
302
372
  """Test the add_offsite_replication_rules method of IngestionClient."""
303
373
  ingestion_client = IngestionClient(
304
- storage_mount_path, ONSITE_RSE, scope=test_scope, vo=test_vo
374
+ data_path=storage_mount_path,
375
+ rse=ONSITE_RSE,
376
+ vo=test_vo,
377
+ scope=test_scope,
305
378
  )
306
- caplog.set_level(logging.DEBUG)
307
379
 
308
380
  # Replicate the ACADA file to two offsite RSEs
309
381
  lfn = pre_existing_lfn
310
382
  did = {"scope": test_scope, "name": lfn}
311
383
 
312
- _, test_file_content = test_file # Get the test file content
384
+ _, test_file_content = onsite_test_file # Get the test file content
313
385
 
314
386
  offsite_rse_expression = "OFFSITE"
315
387
  copies = 2
@@ -331,10 +403,11 @@ def test_add_offsite_replication_rules(
331
403
  replica_client = ReplicaClient()
332
404
  replicas = next(replica_client.list_replicas(dids=[did]))
333
405
  states = replicas.get("states", {})
406
+ msg = f"Expected replica on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2} to be AVAILABLE after first rule: {states}"
334
407
  assert (
335
408
  states.get(OFFSITE_RSE_1) == "AVAILABLE"
336
409
  or states.get(OFFSITE_RSE_2) == "AVAILABLE"
337
- ), f"Expected replica on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2} to be AVAILABLE after first rule: {states}"
410
+ ), msg
338
411
 
339
412
  # Manually trigger the judge-repairer to ensure the second rule doesn't get stuck
340
413
  trigger_judge_repairer()
@@ -351,15 +424,15 @@ def test_add_offsite_replication_rules(
351
424
  did,
352
425
  states,
353
426
  )
354
- assert (
355
- states.get(ONSITE_RSE) == "AVAILABLE"
356
- ), f"Expected replica on {ONSITE_RSE} to be AVAILABLE: {states}"
357
- assert (
358
- states.get(OFFSITE_RSE_1) == "AVAILABLE"
359
- ), f"Expected replica on {OFFSITE_RSE_1} to be AVAILABLE: {states}"
360
- assert (
361
- states.get(OFFSITE_RSE_2) == "AVAILABLE"
362
- ), f"Expected replica on {OFFSITE_RSE_2} to be AVAILABLE: {states}"
427
+
428
+ msg = f"Expected replica on {ONSITE_RSE} to be AVAILABLE: {states}"
429
+ assert states.get(ONSITE_RSE) == "AVAILABLE", msg
430
+
431
+ msg = f"Expected replica on {OFFSITE_RSE_1} to be AVAILABLE: {states}"
432
+ assert states.get(OFFSITE_RSE_1) == "AVAILABLE", msg
433
+
434
+ msg = f"Expected replica on {OFFSITE_RSE_2} to be AVAILABLE: {states}"
435
+ assert states.get(OFFSITE_RSE_2) == "AVAILABLE", msg
363
436
 
364
437
  # Download the file from OFFSITE_RSE_2 to verify its content
365
438
  download_spec = {
@@ -373,12 +446,15 @@ def test_add_offsite_replication_rules(
373
446
 
374
447
  # Verify the downloaded file content
375
448
  download_path = tmp_path / lfn.lstrip("/")
376
- assert download_path.is_file(), f"Download failed at {download_path}"
449
+ msg = f"Download failed at {download_path}"
450
+ assert download_path.is_file(), msg
451
+
377
452
  downloaded_content = download_path.read_text()
378
- assert downloaded_content == test_file_content, (
453
+ msg = (
379
454
  f"Downloaded file content does not match the original. "
380
455
  f"Expected: {test_file_content}, Got: {downloaded_content}"
381
456
  )
457
+ assert downloaded_content == test_file_content, msg
382
458
 
383
459
 
384
460
  @pytest.mark.usefixtures("_auth_proxy")
@@ -389,20 +465,23 @@ def test_add_offsite_replication_rules_single_copy(
389
465
  test_vo: str,
390
466
  storage_mount_path: Path,
391
467
  tmp_path: Path,
392
- test_file: tuple[Path, str],
468
+ onsite_test_file: tuple[Path, str],
393
469
  caplog,
394
470
  ):
395
471
  """Test the add_offsite_replication_rules method of IngestionClient with a single copy (copies=1)."""
472
+
396
473
  ingestion_client = IngestionClient(
397
- storage_mount_path, ONSITE_RSE, scope=test_scope, vo=test_vo
474
+ data_path=storage_mount_path,
475
+ rse=ONSITE_RSE,
476
+ vo=test_vo,
477
+ scope=test_scope,
398
478
  )
399
- caplog.set_level(logging.DEBUG)
400
479
 
401
480
  # Replicate the ACADA file to one offsite RSE
402
481
  lfn = pre_existing_lfn
403
482
  did = {"scope": test_scope, "name": lfn}
404
483
 
405
- _, test_file_content = test_file
484
+ _, test_file_content = onsite_test_file
406
485
 
407
486
  offsite_rse_expression = "OFFSITE"
408
487
  copies = 1
@@ -414,9 +493,9 @@ def test_add_offsite_replication_rules_single_copy(
414
493
  )
415
494
 
416
495
  # Verify that only one rule was created
417
- assert (
418
- len(rule_ids) == 1
419
- ), f"Expected exactly 1 rule ID, got {len(rule_ids)}: {rule_ids}"
496
+ msg = f"Expected exactly 1 rule ID, got {len(rule_ids)}: {rule_ids}"
497
+ assert len(rule_ids) == 1, msg
498
+
420
499
  rule_id_offsite_1 = rule_ids[0]
421
500
  rule_client = RuleClient()
422
501
 
@@ -436,9 +515,8 @@ def test_add_offsite_replication_rules_single_copy(
436
515
  offsite_replica_count = sum(
437
516
  1 for rse in [OFFSITE_RSE_1, OFFSITE_RSE_2] if states.get(rse) == "AVAILABLE"
438
517
  )
439
- assert (
440
- offsite_replica_count == 1
441
- ), f"Expected exactly 1 offsite replica (on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2}), got {offsite_replica_count}: {states}"
518
+ msg = f"Expected exactly 1 offsite replica (on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2}), got {offsite_replica_count}: {states}"
519
+ assert offsite_replica_count == 1, msg
442
520
 
443
521
  # Determine which offsite RSE the replica was created on
444
522
  target_offsite_rse = (
@@ -457,12 +535,14 @@ def test_add_offsite_replication_rules_single_copy(
457
535
 
458
536
  # Verify the downloaded file content
459
537
  download_path = tmp_path / lfn.lstrip("/")
460
- assert download_path.is_file(), f"Download failed at {download_path}"
538
+ msg = f"Download failed at {download_path}"
539
+ assert download_path.is_file(), msg
461
540
  downloaded_content = download_path.read_text()
462
- assert downloaded_content == test_file_content, (
541
+ msg = (
463
542
  f"Downloaded file content does not match the original. "
464
543
  f"Expected: {test_file_content}, Got: {downloaded_content}"
465
544
  )
545
+ assert downloaded_content == test_file_content, msg
466
546
 
467
547
 
468
548
  def test_verify_fits_file(tel_events_test_file):
@@ -499,3 +579,1140 @@ def test_verify_fits_file_invalid_checksum(broken_checksum):
499
579
  with fits.open(broken_checksum) as hdul:
500
580
  with pytest.raises(FITSVerificationError, match="CHECKSUM verification failed"):
501
581
  verify_fits_checksum(hdul)
582
+
583
+
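
Note: verify_fits_checksum and FITSVerificationError are BDMS helpers whose module is not shown in this diff. The astropy mechanism they build on can be exercised directly; a sketch, where "example.fits" stands in for any FITS file written with checksum=True:

    import warnings
    from astropy.io import fits

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # checksum=True makes astropy verify CHECKSUM/DATASUM as HDU data is read
        with fits.open("example.fits", checksum=True) as hdul:
            _ = [hdu.data for hdu in hdul]  # force every HDU to be read
    bad = [w for w in caught if "checksum" in str(w.message).lower()]
    assert not bad, f"CHECKSUM verification failed: {bad}"
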
584
+ def test_ingest_init(storage_mount_path):
585
+ """Test that Ingest initializes correctly with given parameters."""
586
+ ingestion_client = IngestionClient(
587
+ data_path=storage_mount_path,
588
+ rse=ONSITE_RSE,
589
+ vo="ctao",
590
+ scope="acada",
591
+ )
592
+
593
+ ingest = Ingest(
594
+ client=ingestion_client,
595
+ top_dir=storage_mount_path,
596
+ num_workers=3,
597
+ lock_file_path=storage_mount_path / "lockfile.lock",
598
+ polling_interval=0.5,
599
+ check_interval=0.2,
600
+ )
601
+ assert ingest.client == ingestion_client
602
+ assert ingest.top_dir == storage_mount_path
603
+ assert ingest.num_workers == 3
604
+ assert ingest.lock_file_path == storage_mount_path / "lockfile.lock"
605
+ assert ingest.polling_interval == 0.5
606
+ assert ingest.check_interval == 0.2
607
+ assert not ingest.stop_event.is_set() # check stop_event initial state
608
+ assert hasattr(ingest, "result_queue")
609
+ assert hasattr(ingest, "task_counter")
610
+ assert hasattr(ingest, "submitted_tasks")
611
+ assert ingest.task_counter == 0
612
+ assert len(ingest.submitted_tasks) == 0
613
+
614
+
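
Note: the attributes asserted above are the constructor surface the rest of these tests rely on. Based only on what the tests show, a typical launch looks like the sketch below; run() blocks until stop_event is set, which is why the tests drive it from a separate thread:

    from pathlib import Path

    from bdms.acada_ingestion import Ingest, IngestionClient

    client = IngestionClient(
        data_path=Path("/storage"), rse="STORAGE-1", vo="ctao", scope="acada"
    )
    ingest = Ingest(
        client=client,
        top_dir=Path("/storage"),
        num_workers=4,
        lock_file_path=Path("/storage/bdms_ingest.lock"),
        polling_interval=1.0,
        check_interval=1.0,
    )
    ingest.run()  # stop from another thread with ingest.stop_event.set()
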
615
+ def test_check_directory_valid(storage_mount_path, tmp_path, caplog):
616
+ """Test _check_directory with a valid, readable directory."""
617
+ ingestion_client = IngestionClient(
618
+ data_path=storage_mount_path,
619
+ rse=ONSITE_RSE,
620
+ vo="ctao",
621
+ scope="acada",
622
+ )
623
+
624
+ ingest_instance = Ingest(
625
+ client=ingestion_client,
626
+ top_dir=tmp_path,
627
+ num_workers=1,
628
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
629
+ polling_interval=0.5,
630
+ check_interval=0.5,
631
+ )
632
+
633
+ ingest_instance.top_dir = tmp_path
634
+ ingest_instance._check_directory()
635
+
636
+
637
+ def test_check_directory_invalid(storage_mount_path, tmp_path, caplog):
638
+ """Test _check_directory with an invalid directory."""
639
+ ingestion_client = IngestionClient(
640
+ data_path=storage_mount_path,
641
+ rse=ONSITE_RSE,
642
+ vo="ctao",
643
+ scope="acada",
644
+ logger=LOGGER,
645
+ )
646
+
647
+ invalid_dir = tmp_path / "nonexistent"
648
+
649
+ ingest_instance = Ingest(
650
+ client=ingestion_client,
651
+ top_dir=invalid_dir,
652
+ num_workers=1,
653
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
654
+ polling_interval=0.5,
655
+ check_interval=0.5,
656
+ )
657
+
658
+ with pytest.raises(RuntimeError, match=f"Cannot read directory {invalid_dir}"):
659
+ ingest_instance._check_directory()
660
+ assert f"Cannot read directory {invalid_dir}" in caplog.text
661
+
662
+
663
+ @pytest.mark.usefixtures("_auth_proxy")
664
+ def test_process_file_success(
665
+ storage_mount_path, caplog, onsite_test_file, test_vo, test_scope
666
+ ):
667
+ """Test for checking successful ingestion with trigger file clean-up, depends on IngestionClient"""
668
+ ingestion_client = IngestionClient(
669
+ data_path=storage_mount_path,
670
+ rse=ONSITE_RSE,
671
+ vo=test_vo,
672
+ scope=test_scope,
673
+ )
674
+
675
+ acada_path, _ = onsite_test_file
676
+ test_file = acada_path
677
+ trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
678
+ trigger_file.symlink_to(test_file)
679
+ result = process_file(ingestion_client, str(test_file))
680
+ assert result == IngestStatus.SUCCESS
681
+ assert not trigger_file.exists()
682
+ assert INGEST_SUCCESS_MESSAGE in caplog.text
683
+
684
+
685
+ @pytest.mark.usefixtures("_auth_proxy")
686
+ def test_process_file_skipped(
687
+ storage_mount_path, caplog, onsite_test_file, test_vo, test_scope
688
+ ):
689
+ """Test for checking skipped ingestion when replica already exists"""
690
+ ingestion_client = IngestionClient(
691
+ data_path=storage_mount_path,
692
+ rse=ONSITE_RSE,
693
+ vo=test_vo,
694
+ scope=test_scope,
695
+ )
696
+
697
+ acada_path, _ = onsite_test_file
698
+ test_file = acada_path
699
+ trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
700
+ trigger_file.symlink_to(test_file)
701
+ process_file(ingestion_client, str(test_file))
702
+ caplog.clear()
703
+ result = process_file(ingestion_client, str(test_file))
704
+ assert result == IngestStatus.SKIPPED
705
+ assert "Replica already exists" in caplog.text
706
+
707
+
708
+ @pytest.mark.usefixtures("_auth_proxy")
709
+ def test_process_file_failure(storage_mount_path, caplog, tmp_path):
710
+ """Test for checking failure for invalid file paths"""
711
+ ingestion_client = IngestionClient(
712
+ data_path=storage_mount_path,
713
+ rse=ONSITE_RSE,
714
+ vo="ctao",
715
+ scope="acada",
716
+ )
717
+
718
+ invalid_file = tmp_path / "invalid_file.fits"
719
+ invalid_file.write_text("dummy content")
720
+ trigger_file = Path(str(invalid_file) + TRIGGER_SUFFIX)
721
+ trigger_file.symlink_to(invalid_file)
722
+
723
+ # The file path is outside the data_path causing a ValueError in acada_to_lfn
724
+ result = process_file(ingestion_client, str(invalid_file))
725
+
726
+ # Verify the function returns FAILURE status instead of raising an exception
727
+ assert result == IngestStatus.FAILURE
728
+
729
+ # Check for the actual error message that gets logged
730
+ assert "Exception in process_file" in caplog.text
731
+ # Verify the file path is in the error message
732
+ assert str(invalid_file) in caplog.text
733
+
734
+ # Verify that no success message was logged
735
+ assert INGEST_SUCCESS_MESSAGE not in caplog.text
736
+
737
+ # Trigger file should still exist since ingestion failed
738
+ msg = "Trigger file should not be removed when ingestion fails"
739
+ assert trigger_file.exists(), msg
740
+
741
+
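
Note: taken together, the three tests above pin down the process_file contract as these tests exercise it (statuses from bdms.acada_ingestion.IngestStatus):

    from bdms.acada_ingestion import IngestStatus, process_file

    # client is an IngestionClient configured as in the tests above
    status = process_file(client, "/storage/ctao/acada/some_file.fits")
    # IngestStatus.SUCCESS -> replica registered; the .trigger symlink is removed
    # IngestStatus.SKIPPED -> a replica already existed for the derived LFN
    # IngestStatus.FAILURE -> the exception is logged, not raised; the .trigger file is kept
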
742
+ def test_trigger_file_handler_init(storage_mount_path):
743
+ """Test TriggerFileHandler initialization."""
744
+ ingestion_client = IngestionClient(
745
+ data_path=storage_mount_path,
746
+ rse=ONSITE_RSE,
747
+ vo="ctao",
748
+ scope="acada",
749
+ )
750
+
751
+ ingest_instance = Ingest(
752
+ client=ingestion_client,
753
+ top_dir=storage_mount_path,
754
+ num_workers=1,
755
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
756
+ polling_interval=0.5,
757
+ check_interval=0.5,
758
+ )
759
+
760
+ handler = TriggerFileHandler(ingest_instance)
761
+ assert handler.ingest == ingest_instance
762
+
763
+
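
Note: TriggerFileHandler reacts to the filesystem move event produced by `ln -s <data> <data>.trigger`, which the tests below simulate with watchdog's FileMovedEvent. A minimal handler of the same shape, as a sketch only (not the BDMS implementation):

    from watchdog.events import FileSystemEventHandler
    from watchdog.observers import Observer

    class SketchTriggerHandler(FileSystemEventHandler):
        def on_moved(self, event):
            # ignore directories and anything that is not a *.trigger file
            if event.is_directory or not str(event.dest_path).endswith(".trigger"):
                return
            data_file = str(event.dest_path)[: -len(".trigger")]
            print(f"would submit {data_file} for ingestion")

    observer = Observer()
    observer.schedule(SketchTriggerHandler(), path="/storage", recursive=True)
    observer.start()
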
764
+ def test_trigger_file_handler_on_moved_missing_data_file(
765
+ storage_mount_path, tmp_path, caplog
766
+ ):
767
+ """Test on_moved skips when data file is missing."""
768
+ ingestion_client = IngestionClient(
769
+ data_path=storage_mount_path,
770
+ rse=ONSITE_RSE,
771
+ vo="ctao",
772
+ scope="acada",
773
+ )
774
+
775
+ ingest_instance = Ingest(
776
+ client=ingestion_client,
777
+ top_dir=storage_mount_path,
778
+ num_workers=1,
779
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
780
+ polling_interval=0.5,
781
+ check_interval=0.5,
782
+ )
783
+
784
+ handler = TriggerFileHandler(ingest_instance)
785
+ trigger_file = tmp_path / TEST_FILE_TRIGGER
786
+ data_file = tmp_path / "test_file"
787
+
788
+ # Create symlink to non-existent data file
789
+ trigger_file.symlink_to(data_file)
790
+
791
+ # Create FileMovedEvent (simulating ln -s)
792
+ event = FileMovedEvent(src_path=str(data_file), dest_path=str(trigger_file))
793
+ handler.on_moved(event)
794
+
795
+ assert (
796
+ f"Data file {data_file} for trigger {trigger_file} does not exist, skipping"
797
+ in caplog.text
798
+ )
799
+ assert (
800
+ DETECTED_NEW_TRIGGER_FILE not in caplog.text
801
+ ) # Skips processing since the data file is missing
802
+
803
+
804
+ def test_trigger_file_handler_on_moved_success(
805
+ storage_mount_path, tmp_path, onsite_test_file, test_vo, test_scope, caplog
806
+ ):
807
+ """Test on_moved successfully processing a valid trigger file."""
808
+ ingestion_client = IngestionClient(
809
+ data_path=storage_mount_path,
810
+ rse=ONSITE_RSE,
811
+ vo=test_vo,
812
+ scope=test_scope,
813
+ )
814
+
815
+ ingest_instance = Ingest(
816
+ client=ingestion_client,
817
+ top_dir=storage_mount_path,
818
+ num_workers=1,
819
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
820
+ polling_interval=0.5,
821
+ check_interval=0.5,
822
+ )
823
+
824
+ # Create ProcessPoolExecutor for the ingest instance
825
+ with ProcessPoolExecutor(max_workers=1) as executor:
826
+ ingest_instance.executor = executor
827
+
828
+ handler = TriggerFileHandler(ingest_instance)
829
+ acada_path, _ = onsite_test_file
830
+ test_file = acada_path
831
+ trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
832
+ trigger_file.symlink_to(test_file)
833
+
834
+ # Create FileMovedEvent (simulating ln -s)
835
+ event = FileMovedEvent(src_path=str(test_file), dest_path=str(trigger_file))
836
+
837
+ # Record initial state
838
+ initial_task_counter = ingest_instance.task_counter
839
+ initial_total_tasks = ingest_instance.total_tasks_submitted
840
+ initial_submitted_tasks_count = len(ingest_instance.submitted_tasks)
841
+
842
+ handler.on_moved(event)
843
+
844
+ # Verify the expected log message
845
+ msg = f"'Detected new trigger file {trigger_file}, submitting data file {test_file}' in caplog"
846
+ assert (
847
+ f"Detected new trigger file {trigger_file}, submitting data file {test_file}"
848
+ in caplog.text
849
+ ), msg
850
+
851
+ # Verify task submission metrics were updated
852
+ assert ingest_instance.task_counter == initial_task_counter + 1
853
+ assert ingest_instance.total_tasks_submitted == initial_total_tasks + 1
854
+ assert len(ingest_instance.submitted_tasks) == initial_submitted_tasks_count + 1
855
+
856
+ # Verify the task was submitted with correct file path
857
+ submitted_task_files = list(ingest_instance.submitted_tasks.values())
858
+ assert str(test_file) in submitted_task_files
859
+
860
+ # Give some time for the task to potentially complete
861
+ time.sleep(0.5)
862
+
863
+
864
+ def test_trigger_file_handler_on_moved_stop_event_set(
865
+ storage_mount_path, tmp_path, caplog
866
+ ):
867
+ """Test on_moved skips processing when stop_event is set."""
868
+ ingestion_client = IngestionClient(
869
+ data_path=storage_mount_path,
870
+ rse=ONSITE_RSE,
871
+ vo="ctao",
872
+ scope="acada",
873
+ )
874
+
875
+ ingest_instance = Ingest(
876
+ client=ingestion_client,
877
+ top_dir=storage_mount_path,
878
+ num_workers=1,
879
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
880
+ polling_interval=0.5,
881
+ check_interval=0.5,
882
+ )
883
+
884
+ handler = TriggerFileHandler(ingest_instance)
885
+ trigger_file = tmp_path / TEST_FILE_TRIGGER
886
+ data_file = tmp_path / "test_file"
887
+ data_file.write_text("data") # Data file exists
888
+ trigger_file.symlink_to(data_file)
889
+
890
+ # Create FileMovedEvent
891
+ event = FileMovedEvent(src_path=str(data_file), dest_path=str(trigger_file))
892
+
893
+ # Set stop event
894
+ ingest_instance.stop_event.set()
895
+
896
+ # Record initial state
897
+ initial_task_counter = ingest_instance.task_counter
898
+ initial_total_tasks = ingest_instance.total_tasks_submitted
899
+
900
+ try:
901
+ handler.on_moved(event)
902
+
903
+ # Should not process anything when stop_event is set
904
+ assert ingest_instance.task_counter == initial_task_counter
905
+ assert ingest_instance.total_tasks_submitted == initial_total_tasks
906
+ assert DETECTED_NEW_TRIGGER_FILE not in caplog.text
907
+
908
+ finally:
909
+ ingest_instance.stop_event.clear() # Reset for other tests
910
+
911
+
912
+ def test_trigger_file_handler_on_moved_directory_event(
913
+ storage_mount_path, tmp_path, caplog
914
+ ):
915
+ """Test on_moved skips directory events."""
916
+ ingestion_client = IngestionClient(
917
+ data_path=storage_mount_path,
918
+ rse=ONSITE_RSE,
919
+ vo="ctao",
920
+ scope="acada",
921
+ )
922
+
923
+ ingest_instance = Ingest(
924
+ client=ingestion_client,
925
+ top_dir=storage_mount_path,
926
+ num_workers=1,
927
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
928
+ polling_interval=0.5,
929
+ check_interval=0.5,
930
+ )
931
+
932
+ handler = TriggerFileHandler(ingest_instance)
933
+ trigger_dir = tmp_path / "some_directory.trigger"
934
+ source_dir = tmp_path / "source_directory"
935
+ source_dir.mkdir()
936
+ trigger_dir.mkdir()
937
+
938
+ # Create directory move event
939
+ event = FileMovedEvent(src_path=str(source_dir), dest_path=str(trigger_dir))
940
+ event.is_directory = True # mark as directory event
941
+
942
+ # Record initial state
943
+ initial_task_counter = ingest_instance.task_counter
944
+ initial_total_tasks = ingest_instance.total_tasks_submitted
945
+
946
+ handler.on_moved(event)
947
+
948
+ # Should not process directory events
949
+ assert ingest_instance.task_counter == initial_task_counter
950
+ assert ingest_instance.total_tasks_submitted == initial_total_tasks
951
+ assert DETECTED_NEW_TRIGGER_FILE not in caplog.text
952
+
953
+
954
+ def test_trigger_file_handler_on_moved_with_actual_processing(
955
+ storage_mount_path, tmp_path, onsite_test_file, test_vo, test_scope, caplog
956
+ ):
957
+ """Test on_moved with successfully processing a valid trigger file."""
958
+ ingestion_client = IngestionClient(
959
+ data_path=storage_mount_path,
960
+ rse=ONSITE_RSE,
961
+ vo=test_vo,
962
+ scope=test_scope,
963
+ )
964
+
965
+ ingest_instance = Ingest(
966
+ client=ingestion_client,
967
+ top_dir=storage_mount_path,
968
+ num_workers=1,
969
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
970
+ polling_interval=0.5,
971
+ check_interval=0.5,
972
+ )
973
+
974
+ # Start the result processing thread manually for this test
975
+ result_thread = threading.Thread(
976
+ target=ingest_instance._process_results, daemon=True
977
+ )
978
+ result_thread.start()
979
+
980
+ with ProcessPoolExecutor(max_workers=1) as executor:
981
+ ingest_instance.executor = executor
982
+
983
+ handler = TriggerFileHandler(ingest_instance)
984
+ acada_path, _ = onsite_test_file
985
+ test_file = acada_path
986
+ trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
987
+ trigger_file.symlink_to(test_file)
988
+
989
+ # Create FileMovedEvent
990
+ event = FileMovedEvent(src_path=str(test_file), dest_path=str(trigger_file))
991
+
992
+ handler.on_moved(event)
993
+
994
+ # Wait for processing to complete
995
+ timeout = 10.0
996
+ start_time = time.time()
997
+ processed = False
998
+
999
+ while time.time() - start_time < timeout:
1000
+ # Check if task was completed (removed from submitted_tasks)
1001
+ if len(ingest_instance.submitted_tasks) == 0:
1002
+ processed = True
1003
+ break
1004
+ time.sleep(0.1)
1005
+
1006
+ # Stop the result processing thread
1007
+ ingest_instance.stop_event.set()
1008
+ result_thread.join(timeout=2.0)
1009
+
1010
+ # Verify processing occurred
1011
+ msg = "Task was not processed within timeout"
1012
+ assert processed, msg
1013
+
1014
+ msg = f"'Detected new trigger file {trigger_file}, submitting data file {test_file}' in caplog"
1015
+ assert (
1016
+ f"Detected new trigger file {trigger_file}, submitting data file {test_file}"
1017
+ in caplog.text
1018
+ ), msg
1019
+
1020
+ # Check that a result was logged (either success, failure, or error)
1021
+ result_logged = any(
1022
+ phrase in caplog.text
1023
+ for phrase in ["Processed file", "failed:", "Exception in process_file"]
1024
+ )
1025
+ msg = "No processing result was logged"
1026
+ assert result_logged, msg
1027
+
1028
+
1029
+ def test_sequential_exclusion_lock_prevention(storage_mount_path, tmp_path):
1030
+ """Test that a second daemon instance cannot start when first is already running.
1031
+
1032
+ This test validates sequential exclusion: when one ingestion daemon is already
1033
+ running and has acquired the lock, any subsequent attempt to start another
1034
+ daemon instance should fail with a clear error message.
1035
+ """
1036
+ lock_file = tmp_path / "sequential_test.pid"
1037
+
1038
+ ingestion_client = IngestionClient(
1039
+ data_path=storage_mount_path,
1040
+ rse=ONSITE_RSE,
1041
+ vo="ctao",
1042
+ scope="acada",
1043
+ )
1044
+
1045
+ # Create first instance
1046
+ instance1 = Ingest(
1047
+ client=ingestion_client,
1048
+ top_dir=tmp_path,
1049
+ lock_file_path=lock_file,
1050
+ num_workers=1,
1051
+ polling_interval=0.1,
1052
+ check_interval=0.1,
1053
+ )
1054
+
1055
+ # Create second instance with same lock file
1056
+ instance2 = Ingest(
1057
+ client=ingestion_client,
1058
+ top_dir=tmp_path,
1059
+ lock_file_path=lock_file,
1060
+ num_workers=1,
1061
+ polling_interval=0.1,
1062
+ check_interval=0.1,
1063
+ )
1064
+
1065
+ results = {}
1066
+ first_instance_started = threading.Event()
1067
+
1068
+ def run_first_instance():
1069
+ """Run first instance - should succeed and run until manually stopped."""
1070
+ try:
1071
+ # signal: about to start daemon
1072
+ first_instance_started.set()
1073
+ instance1.run()
1074
+ results["first"] = "success"
1075
+ except Exception as e:
1076
+ results["first"] = f"error: {str(e)}"
1077
+
1078
+ def run_second_instance():
1079
+ """Try to run second instance while first is running - should fail with lock conflict."""
1080
+ try:
1081
+ # Verify first instance has actually acquired the lock
1082
+ lock_acquired_timeout = 15.0
1083
+ start_wait = time.time()
1084
+ while time.time() - start_wait < lock_acquired_timeout:
1085
+ if lock_file.exists():
1086
+ break
1087
+ time.sleep(0.1)
1088
+ else:
1089
+ results["second"] = "first_instance_never_acquired_lock"
1090
+ return
1091
+
1092
+ # This should fail because first instance holds the lock
1093
+ instance2.run()
1094
+ results["second"] = "unexpected_success" # Should not reach here
1095
+ except RuntimeError as e:
1096
+ error_msg = str(e)
1097
+ if "Another ingestion process is already running" in error_msg:
1098
+ results["second"] = f"expected_lock_conflict: {str(e)}"
1099
+ else:
1100
+ results["second"] = f"unexpected_runtime_error: {str(e)}"
1101
+ except Exception as e:
1102
+ results["second"] = f"unexpected_error: {str(e)}"
1103
+
1104
+ # Start first instance with non-daemon thread
1105
+ thread1 = threading.Thread(target=run_first_instance, daemon=False)
1106
+ thread1.start()
1107
+
1108
+ # Wait for first instance to signal it's starting
1109
+ msg = "First instance failed to start"
1110
+ assert first_instance_started.wait(timeout=10), msg
1111
+
1112
+ # Give first instance time to acquire lock and initialize
1113
+ time.sleep(3.0)
1114
+
1115
+ # Verify first instance has acquired lock with content validation
1116
+ msg = "First instance should have created PID file"
1117
+ assert lock_file.exists(), msg
1118
+
1119
+ # Read PID and verify it's valid
1120
+ pid_content = lock_file.read_text().strip()
1121
+ msg = f"PID file should contain a number, got: {pid_content}"
1122
+ assert pid_content.isdigit(), msg
1123
+
1124
+ # Verify the lock file contains current process PID or a valid PID
1125
+ current_pid = os.getpid()
1126
+ stored_pid = int(pid_content)
1127
+ # The stored PID should be current process since we're running in same process
1128
+ msg = f"Expected PID {current_pid}, got {stored_pid}"
1129
+ assert stored_pid == current_pid, msg
1130
+
1131
+ # Now try to start second instance - this should fail
1132
+ thread2 = threading.Thread(target=run_second_instance, daemon=False)
1133
+ thread2.start()
1134
+
1135
+ # Wait for second instance to complete with better timeout handling
1136
+ # FileLock timeout is 10 seconds, so we give a bit more time
1137
+ thread2.join(timeout=15)
1138
+
1139
+ # Explicit check for thread completion
1140
+ if thread2.is_alive():
1141
+ # Force stop and fail the test
1142
+ instance1.stop_event.set()
1143
+ thread1.join(timeout=5)
1144
+ pytest.fail("Second instance thread did not complete within expected timeout")
1145
+
1146
+ # Stop first instance now that we've tested the lock
1147
+ instance1.stop_event.set()
1148
+ thread1.join(timeout=10)
1149
+
1150
+ # Ensure first thread also terminates
1151
+ if thread1.is_alive():
1152
+ pytest.fail("First instance thread did not terminate within timeout")
1153
+
1154
+ # Verify results
1155
+ msg = f"Second instance should have completed. Results: {results}"
1156
+ assert "second" in results, msg
1157
+
1158
+ # More specific assertion for expected lock conflict
1159
+ second_result = results["second"]
1160
+ msg = f"Second instance should have failed with lock conflict. Got: {second_result}"
1161
+ assert second_result.startswith("expected_lock_conflict"), msg
1162
+
1163
+ # Verify the error message is the expected one from Ingest class
1164
+ msg = f"Expected specific error message, got: {second_result}"
1165
+ assert "Another ingestion process is already running" in second_result, msg
1166
+
1167
+ # First instance should have run successfully (we stopped it manually)
1168
+ if "first" in results:
1169
+ msg = f"First instance should succeed, got: {results['first']}"
1170
+ assert results["first"] == "success", msg
1171
+
1172
+ # Improved cleanup verification with timeout-based checking
1173
+ cleanup_timeout = 5.0
1174
+ start_cleanup_wait = time.time()
1175
+ while time.time() - start_cleanup_wait < cleanup_timeout:
1176
+ if not lock_file.exists():
1177
+ break
1178
+ time.sleep(0.1)
1179
+
1180
+ msg = "PID file should be cleaned up after first instance stops"
1181
+ assert not lock_file.exists(), msg
1182
+
1183
+ # logging
1184
+ LOGGER.info("Sequential exclusion test completed successfully")
1185
+ LOGGER.info("First instance: %s", results.get("first", "stopped manually"))
1186
+ LOGGER.info("Second instance correctly failed with: %s", second_result)
1187
+
1188
+
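
Note: the "Another ingestion process is already running" error and the 10-second lock timeout referenced above point to a file-lock guard around daemon startup. A single-instance guard of that shape, as a sketch assuming the py-filelock package (the actual BDMS locking code is not shown in this diff; run_daemon_loop is hypothetical):

    import os

    from filelock import FileLock, Timeout

    pid_file = "/storage/bdms_ingest.lock"
    try:
        with FileLock(pid_file + ".filelock", timeout=10):
            with open(pid_file, "w") as f:
                f.write(str(os.getpid()))  # the tests read this PID back
            run_daemon_loop()  # hypothetical main loop
    except Timeout:
        raise RuntimeError("Another ingestion process is already running")
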
1189
+ def test_concurrent_exclusion_lock_prevention(storage_mount_path, tmp_path):
1190
+ """Test FileLock behavior under true concurrent access - simultaneous daemon startup attempts.
1191
+
1192
+ This test validates real concurrent scenario where multiple daemon instances
1193
+ attempt to acquire the same lock simultaneously, simulating race conditions
1194
+ that occur in production environments.
1195
+ """
1196
+ lock_file = tmp_path / "concurrent_test.pid"
1197
+
1198
+ ingestion_client = IngestionClient(
1199
+ data_path=storage_mount_path,
1200
+ rse=ONSITE_RSE,
1201
+ vo="ctao",
1202
+ scope="acada",
1203
+ )
1204
+
1205
+ # Create both instances
1206
+ instance1 = Ingest(
1207
+ client=ingestion_client,
1208
+ top_dir=tmp_path,
1209
+ lock_file_path=lock_file,
1210
+ num_workers=1,
1211
+ polling_interval=0.1,
1212
+ check_interval=0.1,
1213
+ )
1214
+ instance2 = Ingest(
1215
+ client=ingestion_client,
1216
+ top_dir=tmp_path,
1217
+ lock_file_path=lock_file,
1218
+ num_workers=1,
1219
+ polling_interval=0.1,
1220
+ check_interval=0.1,
1221
+ )
1222
+
1223
+ results = {}
1224
+
1225
+ # Synchronization barrier - both threads wait here until released
1226
+ start_barrier = threading.Barrier(3) # 2 worker threads + 1 main thread
1227
+
1228
+ def run_instance(instance_id, instance):
1229
+ """Run instance - both will try to start simultaneously."""
1230
+ try:
1231
+ # Wait for barrier - ensures simultaneous start
1232
+ start_barrier.wait() # All threads start together!
1233
+
1234
+ instance.run()
1235
+ results[instance_id] = "success"
1236
+ except RuntimeError as e:
1237
+ if "Another ingestion process is already running" in str(e):
1238
+ results[instance_id] = f"lock_conflict: {str(e)}"
1239
+ else:
1240
+ results[instance_id] = f"unexpected_error: {str(e)}"
1241
+ except Exception as e:
1242
+ results[instance_id] = f"error: {str(e)}"
1243
+
1244
+ # Create both threads
1245
+ thread1 = threading.Thread(
1246
+ target=run_instance, args=("first", instance1), daemon=False
1247
+ )
1248
+ thread2 = threading.Thread(
1249
+ target=run_instance, args=("second", instance2), daemon=False
1250
+ )
1251
+
1252
+ # Start both threads - they will wait at the barrier
1253
+ thread1.start()
1254
+ thread2.start()
1255
+
1256
+ # Give threads time to reach barrier
1257
+ time.sleep(0.5)
1258
+
1259
+ # Release the barrier - both threads start simultaneously
1260
+ start_barrier.wait()
1261
+
1262
+ # Wait for both to complete the lock acquisition attempt
1263
+ thread1.join(timeout=15)
1264
+ thread2.join(timeout=15)
1265
+
1266
+ # Stop whichever instance succeeded
1267
+ if "first" in results and results["first"] == "success":
1268
+ instance1.stop_event.set()
1269
+ if "second" in results and results["second"] == "success":
1270
+ instance2.stop_event.set()
1271
+
1272
+ # Ensure threads complete
1273
+ if thread1.is_alive():
1274
+ instance1.stop_event.set()
1275
+ thread1.join(timeout=5)
1276
+ if thread2.is_alive():
1277
+ instance2.stop_event.set()
1278
+ thread2.join(timeout=5)
1279
+
1280
+ # Verify results - Exactly ONE should succeed, ONE should fail
1281
+ msg = f"Both instances should complete, got: {results}"
1282
+ assert len(results) == 2, msg
1283
+
1284
+ success_count = sum(1 for result in results.values() if result == "success")
1285
+ conflict_count = sum(1 for result in results.values() if "lock_conflict" in result)
1286
+
1287
+ msg = f"Exactly ONE instance should succeed, got {success_count}: {results}"
1288
+ assert success_count == 1, msg
1289
+
1290
+ msg = f"Exactly ONE instance should get lock conflict, got {conflict_count}: {results}"
1291
+ assert conflict_count == 1, msg
1292
+
1293
+ # Verify the lock conflict has correct error message
1294
+ conflict_result = [r for r in results.values() if "lock_conflict" in r][0]
1295
+ msg = "Expected 'Another ingestion process is already running' message in conflict result"
1296
+ assert "Another ingestion process is already running" in conflict_result, msg
1297
+
1298
+ # Verify cleanup
1299
+ cleanup_timeout = 5.0
1300
+ start_cleanup = time.time()
1301
+ while time.time() - start_cleanup < cleanup_timeout:
1302
+ if not lock_file.exists():
1303
+ break
1304
+ time.sleep(0.1)
1305
+ msg = "Lock file should be cleaned up"
1306
+ assert not lock_file.exists(), msg
1307
+
1308
+ LOGGER.info("True Concurrency tests: %s", results)
1309
+ LOGGER.info("Real concurrent lock acquisition tested successfully!")
1310
+
1311
+
1312
+ def acada_write_test_files(
1313
+ storage_mount_path, test_vo, test_scope, n_files=7
1314
+ ) -> list[Path]:
1315
+ """Represents ACADA writing test files to the storage mount path."""
1316
+
1317
+ test_dir = storage_mount_path / test_vo / test_scope
1318
+ test_dir.mkdir(parents=True, exist_ok=True)
1319
+
1320
+ # Create n_files dummy FITS files
1321
+ data_files = []
1322
+ rng = np.random.default_rng()
1323
+ for i in range(n_files):
1324
+ data_file = test_dir / f"testfile_{i}_20250609.fits"
1325
+ hdu = fits.PrimaryHDU(rng.random((50, 50)))
1326
+ hdu.writeto(data_file, overwrite=True, checksum=True)
1327
+ data_files.append(data_file)
1328
+
1329
+ LOGGER.info("Created test file: %s", data_file)
1330
+
1331
+ # Reset permissions before the daemon starts to avoid timing issues
1332
+ reset_xrootd_permissions(storage_mount_path)
1333
+ time.sleep(1.0) # Allow permissions to be applied
1334
+
1335
+ return data_files
1336
+
1337
+
1338
+ def acada_create_trigger_symlink(data_file, creation_results):
1339
+ """Represents creating a trigger symlink for a given data file."""
1340
+
1341
+ try:
1342
+ trigger_file = Path(str(data_file) + TRIGGER_SUFFIX)
1343
+ trigger_file.symlink_to(data_file)
1344
+ LOGGER.info("Created trigger file: %s -> %s", trigger_file, data_file)
1345
+
1346
+ # Verify creation was successful
1347
+ if trigger_file.exists() and trigger_file.is_symlink():
1348
+ creation_results.append({"file": str(data_file), "status": "success"})
1349
+ else:
1350
+ creation_results.append(
1351
+ {"file": str(data_file), "status": "creation_failed"}
1352
+ )
1353
+ except Exception as e:
1354
+ LOGGER.exception("Failed to create trigger for %s: %s", data_file, e)
1355
+ creation_results.append({"file": str(data_file), "status": f"error: {str(e)}"})
1356
+
1357
+ return creation_results
1358
+
1359
+
1360
+ def ensure_files_ingested(data_files, storage_mount_path, test_scope, timeout_s=120):
1361
+ """Ensure that all files are ingested by checking the IngestStatus."""
1362
+
1363
+ replica_client = ReplicaClient()
1364
+
1365
+ timeout_at = time.time() + timeout_s
1366
+
1367
+ data_file_entries = [
1368
+ {
1369
+ "file": str(data_file),
1370
+ "expected_lfn": f"/{data_file.relative_to(storage_mount_path)}",
1371
+ "found": False,
1372
+ }
1373
+ for data_file in data_files
1374
+ ]
1375
+
1376
+ while time.time() < timeout_at and not all(
1377
+ status["found"] for status in data_file_entries
1378
+ ):
1379
+ for data_file_entry in data_file_entries:
1380
+ if not data_file_entry["found"]:
1381
+ try:
1382
+ replicas = list(
1383
+ replica_client.list_replicas(
1384
+ dids=[
1385
+ {
1386
+ "scope": test_scope,
1387
+ "name": data_file_entry["expected_lfn"],
1388
+ }
1389
+ ]
1390
+ )
1391
+ )
1392
+ if not replicas:
1393
+ LOGGER.info(
1394
+ "No replica found for %s", data_file_entry["expected_lfn"]
1395
+ )
1396
+ else:
1397
+ LOGGER.info(
1398
+ "Replica found for %s: %s",
1399
+ data_file_entry["expected_lfn"],
1400
+ replicas[0],
1401
+ )
1402
+ data_file_entry["found"] = True
1403
+ except Exception:
1404
+ LOGGER.exception(
1405
+ "Failed to list replicas for %s",
1406
+ data_file_entry["expected_lfn"],
1407
+ )
1408
+ time.sleep(1.0)
1409
+
1410
+ if not all(status["found"] for status in data_file_entries):
1411
+ pytest.fail(f"Not all replicas found for files: {data_files}")
1412
+
1413
+
1414
+ @pytest.mark.usefixtures(
1415
+ "_auth_proxy", "lock_for_ingestion_daemon", "disable_ingestion_daemon"
1416
+ )
1417
+ @pytest.mark.verifies_usecase("UC-110-1.1.4")
1418
+ def test_ingest_parallel_submission(storage_mount_path, caplog, test_vo, test_scope):
1419
+ """Test parallel file processing: creates multiple FITS files simultaneously and verifies that the
1420
+ daemon can detect, process, and ingest them efficiently using parallel workers.
1421
+ """
1422
+ ingestion_client = IngestionClient(
1423
+ data_path=storage_mount_path,
1424
+ rse=ONSITE_RSE,
1425
+ vo=test_vo,
1426
+ scope=test_scope,
1427
+ )
1428
+
1429
+ ingest_instance = Ingest(
1430
+ client=ingestion_client,
1431
+ top_dir=storage_mount_path,
1432
+ num_workers=4,
1433
+ lock_file_path=storage_mount_path / "bdms_ingest.lock",
1434
+ polling_interval=0.5,
1435
+ check_interval=0.5,
1436
+ )
1437
+
1438
+ data_files = acada_write_test_files(storage_mount_path, test_vo, test_scope)
1439
+
1440
+ # Daemon startup with exception handling
1441
+ daemon_exception = None
1442
+ daemon_started = threading.Event()
1443
+
1444
+ def run_daemon():
1445
+ """Run daemon with exception capture."""
1446
+ nonlocal daemon_exception
1447
+ try:
1448
+ daemon_started.set() # Signal daemon thread started
1449
+ ingest_instance.run()
1450
+ except Exception as e:
1451
+ daemon_exception = e
1452
+ LOGGER.exception("Daemon failed with exception: %s", str(e))
1453
+
1454
+ # Start daemon with non-daemon thread for reliability
1455
+ daemon_thread = threading.Thread(target=run_daemon, daemon=False)
1456
+ daemon_thread.start()
1457
+
1458
+ # Wait for daemon thread to start
1459
+ msg = "Daemon thread failed to start"
1460
+ assert daemon_started.wait(timeout=10), msg
1461
+
1462
+ # Daemon initialization verification
1463
+ daemon_init_timeout = 20.0 # Increased timeout for robust initialization
1464
+ daemon_init_start = time.time()
1465
+ required_conditions = {
1466
+ "lock_acquired": False,
1467
+ "result_thread_started": False,
1468
+ "pool_started": False,
1469
+ "monitoring_started": False,
1470
+ "observer_started": False,
1471
+ }
1472
+
1473
+ while time.time() - daemon_init_start < daemon_init_timeout:
1474
+ # Check for daemon startup failure early
1475
+ if daemon_exception:
1476
+ pytest.fail(f"Daemon failed during initialization: {daemon_exception}")
1477
+
1478
+ # Check for lock acquisition (critical for daemon operation)
1479
+ if ingest_instance.lock_file_path.exists():
1480
+ required_conditions["lock_acquired"] = True
1481
+
1482
+ # Check log messages for initialization steps
1483
+ log_text = caplog.text
1484
+ if "Result processing thread started" in log_text:
1485
+ required_conditions["result_thread_started"] = True
1486
+
1487
+ # Flexible process pool verification to work with any worker count
1488
+ if re.search(r"Started process pool with \d+ workers", log_text):
1489
+ required_conditions["pool_started"] = True
1490
+
1491
+ if "Starting continuous polling-based monitoring" in log_text:
1492
+ required_conditions["monitoring_started"] = True
1493
+ if "File monitoring observer started successfully" in log_text:
1494
+ required_conditions["observer_started"] = True
1495
+
1496
+ # Check if all conditions are met
1497
+ if all(required_conditions.values()):
1498
+ break
1499
+
1500
+ time.sleep(0.2)
1501
+
1502
+ # Verify complete initialization or provide diagnostics
1503
+ missing_conditions = [k for k, v in required_conditions.items() if not v]
1504
+ if missing_conditions:
1505
+ ingest_instance.stop_event.set()
1506
+ daemon_thread.join(timeout=5)
1507
+ pytest.fail(
1508
+ f"Daemon initialization incomplete. Missing: {missing_conditions}. Check logs for errors."
1509
+ )
1510
+
1511
+ time.sleep(0.5) # some additional time to stabilize
1512
+
1513
+ # Create trigger files and track them
1514
+ trigger_files = []
1515
+ natural_start = time.time()
1516
+
1517
+ for data_file in data_files:
1518
+ trigger_file = Path(str(data_file) + TRIGGER_SUFFIX)
1519
+ trigger_file.symlink_to(data_file)
1520
+ trigger_files.append(trigger_file)
1521
+
1522
+ # Test natural detection of the trigger files, looking for MOVE events
1523
+ natural_detection_timeout = 30.0
1524
+ natural_start = time.time()
1525
+
1526
+ while time.time() - natural_start < natural_detection_timeout:
1527
+ # Look for actual processing
1528
+ if caplog.text.count("Detected new trigger file") > 0:
1529
+ break
1530
+ time.sleep(1.0)
1531
+
1532
+ # Count events after the loop completes
1533
+ move_events_detected = caplog.text.count("MOVE Event received")
1534
+
1535
+ # Wait for processing with concurrency monitoring
1536
+ processing_timeout = 120.0
1537
+ processing_start = time.time()
1538
+ processed_files = set()
1539
+ max_concurrent_samples = []
1540
+
1541
+ while time.time() - processing_start < processing_timeout:
1542
+ # Sample concurrent tasks frequently to catch parallelism
1543
+ current_concurrent = len(ingest_instance.submitted_tasks)
1544
+ max_concurrent_samples.append(current_concurrent)
1545
+
1546
+ # Check processing results
1547
+ for data_file in data_files:
1548
+ success_pattern = f"Processed file {data_file} with result success"
1549
+ skipped_pattern = f"Processed file {data_file} with result skipped"
1550
+
1551
+ if str(data_file) not in processed_files:
1552
+ if success_pattern in caplog.text or skipped_pattern in caplog.text:
1553
+ processed_files.add(str(data_file))
1554
+
1555
+ if len(processed_files) == 7:
1556
+ break
1557
+
1558
+ if "Fatal error in result processing thread" in caplog.text:
1559
+ break
1560
+
1561
+ time.sleep(0.1) # Sample frequently to catch concurrency
1562
+
1563
+ assert len(processed_files) == 7
1564
+
1565
+ # Record ingestion workflow completion time
1566
+ workflow_end_time = time.time()
1567
+
1568
+ # Stop the daemon
1569
+ ingest_instance.stop_event.set()
1570
+ daemon_thread.join(timeout=10)
1571
+
1572
+ if daemon_thread.is_alive():
1573
+ pytest.fail("Ingest Daemon thread did not terminate within timeout")
1574
+
1575
+ # Verify results
1576
+ msg = "Process pool startup failed"
1577
+ assert "Started process pool with 4 workers" in caplog.text, msg
1578
+
1579
+ msg = "Result processing thread startup failed"
1580
+ assert "Result processing thread started" in caplog.text, msg
1581
+
1582
+ # Verify trigger files were cleaned up during successful processing
1583
+ remaining_triggers = sum(1 for tf in trigger_files if tf.exists())
1584
+ msg = f"Expected all trigger files to be cleaned up, {remaining_triggers} remain"
1585
+ assert remaining_triggers == 0, msg
1586
+
1587
+ # Verify clean shutdown
1588
+ msg = "Lock file not cleaned up"
1589
+ assert not ingest_instance.lock_file_path.exists(), msg
1590
+
1591
+ msg = "Daemon shutdown not logged"
1592
+ assert "Stopped ingestion daemon" in caplog.text, msg
1593
+
1594
+ msg = "Result thread shutdown not logged"
1595
+ assert "Result processing thread stopped" in caplog.text, msg
1596
+
1597
+ # Clean up data files
1598
+ for data_file in data_files:
1599
+ if data_file.exists():
1600
+ data_file.unlink()
1601
+
1602
+ # Statistics
1603
+ # Ingestion workflow time: from trigger detection to ingestion with replication completion
1604
+ max_concurrent_observed = (
1605
+ max(max_concurrent_samples) if max_concurrent_samples else 0
1606
+ )
1607
+ max_concurrent_tracked = ingest_instance.max_concurrent_tasks
1608
+
1609
+ detection_to_completion_time = workflow_end_time - natural_start
1610
+ processing_rate = (
1611
+ len(processed_files) / detection_to_completion_time
1612
+ if detection_to_completion_time > 0
1613
+ else 0
1614
+ )
1615
+
1616
+ total_submitted = ingest_instance.total_tasks_submitted
1617
+ tasks_cleaned_up = len(ingest_instance.submitted_tasks) == 0
1618
+ max_concurrent_final = max(max_concurrent_tracked, max_concurrent_observed)
1619
+ parallel_achieved = max_concurrent_final >= 2
1620
+
1621
+ # Summary
1622
+ status = "parallel" if parallel_achieved else "sequential"
1623
+
1624
+ LOGGER.info("=== Parallel Ingestion Test Results ===")
1625
+ LOGGER.info(
1626
+ "Files processed: %d/7 in %.1fs",
1627
+ len(processed_files),
1628
+ detection_to_completion_time,
1629
+ )
1630
+ LOGGER.info("Processing rate: %.1f files/sec", processing_rate)
1631
+ LOGGER.info("Max concurrent tasks: %d (mode: %s)", max_concurrent_final, status)
1632
+ LOGGER.info("Total tasks submitted: %d", total_submitted)
1633
+ LOGGER.info("Task cleanup successful: %s", tasks_cleaned_up)
1634
+ LOGGER.info("Event detection: %d move events", move_events_detected)
1635
+
1636
+
1637
+ def fetch_ingestion_daemon_metrics():
1638
+ """Fetch metrics from the ingestion daemon to verify its operation."""
1639
+
1640
+ response = urlopen("http://bdms-ingestion-daemon:8000/")
1641
+
1642
+ assert response.status == 200, "Ingestion daemon metrics are not responding"
1643
+
1644
+ n_tasks_metrics = {}
1645
+ for line in response.readlines():
1646
+ line = line.decode("utf-8").strip()
1647
+ if line.startswith("n_tasks_"):
1648
+ LOGGER.info("Ingestion daemon metrics: %s", line)
1649
+ key, value = line.split(" ", 1)
1650
+ n_tasks_metrics[key] = float(value)
1651
+
1652
+ return n_tasks_metrics
1653
+
1654
+
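
Note: fetch_ingestion_daemon_metrics assumes the Prometheus text exposition format, where each sample is a plain `name value` line; gauges like n_tasks_success_created conventionally carry Unix creation timestamps, which is what the `< time.time()` assertion below relies on. Illustrative scrape output the parser would accept, with made-up values:

    # HELP n_tasks_processed_total Total number of ingestion tasks processed
    # TYPE n_tasks_processed_total counter
    n_tasks_processed_total 7.0
    n_tasks_success_total 6.0
    n_tasks_skipped_total 1.0
    n_tasks_success_created 1.7496e+09
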
1655
+ @pytest.mark.usefixtures(
1656
+ "_auth_proxy", "lock_for_ingestion_daemon", "enable_ingestion_daemon"
1657
+ )
1658
+ @pytest.mark.verifies_usecase("UC-110-1.1.4")
1659
+ def test_ingest_parallel_submission_with_live_daemon(storage_mount_path, test_vo):
1660
+ """Test parallel file processing with an already running daemon."""
1661
+
1662
+ # in the live test, the daemon is deployed outside of this test, so we need to pick a persistent location matching the daemon's storage mount path
1663
+ # note that if kind cluster creation fixture is used, the directory can be unique per test session
1664
+ # this test only checks that the files are consumed, not that they are replicated
1665
+
1666
+ test_scope = "test_scope_persistent"
1667
+
1668
+ n_tasks_metrics_before_test = fetch_ingestion_daemon_metrics()
1669
+
1670
+ for tf in (storage_mount_path / test_vo / test_scope).glob("*" + TRIGGER_SUFFIX):
1671
+ if tf.exists():
1672
+ LOGGER.info("Cleaning up existing trigger file: %s", tf)
1673
+ tf.unlink()
1674
+
1675
+ data_files = acada_write_test_files(storage_mount_path, test_vo, test_scope)
1676
+
1677
+ creation_results = []
1678
+ for data_file in data_files:
1679
+ acada_create_trigger_symlink(data_file, creation_results)
1680
+
1681
+ trigger_files = [Path(str(df) + TRIGGER_SUFFIX) for df in data_files]
1682
+
1683
+ timeout = 120.0
1684
+ start_time = time.time()
1685
+
1686
+ remaining_triggers = 0
1687
+ while time.time() - start_time < timeout:
1688
+ # Verify trigger files were cleaned up during successful processing
1689
+ remaining_triggers = sum(1 for tf in trigger_files if tf.exists())
1690
+
1691
+ if remaining_triggers == 0:
1692
+ LOGGER.info("All trigger files consumed up successfully, exiting test.")
1693
+ break
1694
+ else:
1695
+ LOGGER.info(
1696
+ "Waiting for trigger files to be cleaned up, %s remain.",
1697
+ remaining_triggers,
1698
+ )
1699
+
1700
+ time.sleep(1.0)  # Poll once per second until the triggers are consumed
1701
+
1702
+ assert remaining_triggers == 0, "Expected all trigger files to be consumed"
1703
+
1704
+ ensure_files_ingested(data_files, storage_mount_path, test_scope)
1705
+
1706
+ # make sure that metrics are available from the daemon
1707
+ n_tasks_metrics = fetch_ingestion_daemon_metrics()
1708
+
1709
+ assert n_tasks_metrics["n_tasks_success_created"] < time.time()
1710
+ assert n_tasks_metrics["n_tasks_processed_total"] - n_tasks_metrics_before_test[
1711
+ "n_tasks_processed_total"
1712
+ ] == len(data_files)
1713
+ assert (
1714
+ n_tasks_metrics["n_tasks_processed_total"]
1715
+ - n_tasks_metrics_before_test["n_tasks_processed_total"]
1716
+ == n_tasks_metrics["n_tasks_success_total"]
1717
+ + n_tasks_metrics["n_tasks_skipped_total"]
1718
+ ), "Ingestion daemon metrics do not match expected values"