ctao-bdms-clients 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bdms/_version.py +2 -2
- bdms/acada_ingest_cli.py +400 -0
- bdms/acada_ingestion.py +480 -13
- bdms/tests/conftest.py +132 -12
- bdms/tests/test_acada_ingest_cli.py +279 -0
- bdms/tests/test_acada_ingestion.py +1242 -50
- bdms/tests/test_dpps_rel_0_0.py +6 -0
- bdms/tests/utils.py +11 -1
- {ctao_bdms_clients-0.2.1.dist-info → ctao_bdms_clients-0.3.0.dist-info}/METADATA +5 -1
- ctao_bdms_clients-0.3.0.dist-info/RECORD +23 -0
- ctao_bdms_clients-0.3.0.dist-info/entry_points.txt +2 -0
- ctao_bdms_clients-0.2.1.dist-info/RECORD +0 -20
- {ctao_bdms_clients-0.2.1.dist-info → ctao_bdms_clients-0.3.0.dist-info}/WHEEL +0 -0
- {ctao_bdms_clients-0.2.1.dist-info → ctao_bdms_clients-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {ctao_bdms_clients-0.2.1.dist-info → ctao_bdms_clients-0.3.0.dist-info}/top_level.txt +0 -0
@@ -5,10 +5,17 @@ and the replication of data between Rucio storage elements (RSEs).
|
|
5
5
|
"""
|
6
6
|
|
7
7
|
import logging
|
8
|
+
import os
|
9
|
+
import re
|
8
10
|
import subprocess
|
11
|
+
import threading
|
12
|
+
import time
|
13
|
+
from concurrent.futures import ProcessPoolExecutor
|
9
14
|
from pathlib import Path
|
10
15
|
from shutil import copy2
|
16
|
+
from urllib.request import urlopen
|
11
17
|
|
18
|
+
import numpy as np
|
12
19
|
import pytest
|
13
20
|
from astropy.io import fits
|
14
21
|
from astropy.table import Table
|
@@ -18,8 +25,18 @@ from rucio.client.replicaclient import ReplicaClient
|
|
18
25
|
from rucio.client.ruleclient import RuleClient
|
19
26
|
from rucio.common.exception import RucioException
|
20
27
|
from rucio.common.utils import adler32
|
21
|
-
|
22
|
-
|
28
|
+
from watchdog.events import FileMovedEvent
|
29
|
+
|
30
|
+
from bdms.acada_ingestion import (
|
31
|
+
DETECTED_NEW_TRIGGER_FILE,
|
32
|
+
INGEST_SUCCESS_MESSAGE,
|
33
|
+
TRIGGER_SUFFIX,
|
34
|
+
Ingest,
|
35
|
+
IngestionClient,
|
36
|
+
IngestStatus,
|
37
|
+
TriggerFileHandler,
|
38
|
+
process_file,
|
39
|
+
)
|
23
40
|
from bdms.tests.utils import reset_xrootd_permissions, wait_for_replication_status
|
24
41
|
|
25
42
|
LOGGER = logging.getLogger(__name__)
|
@@ -28,13 +45,14 @@ ONSITE_RSE = "STORAGE-1"
|
|
28
45
|
OFFSITE_RSE_1 = "STORAGE-2"
|
29
46
|
OFFSITE_RSE_2 = "STORAGE-3"
|
30
47
|
|
48
|
+
TEST_FILE_TRIGGER = "test_file.trigger"
|
49
|
+
|
31
50
|
|
32
51
|
def test_shared_storage(storage_mount_path: Path):
|
33
52
|
"""Test that the shared storage path is available."""
|
34
53
|
|
35
|
-
|
36
|
-
|
37
|
-
), f"Shared storage {storage_mount_path} is not available on the client"
|
54
|
+
msg = f"Shared storage {storage_mount_path} is not available on the client"
|
55
|
+
assert storage_mount_path.exists(), msg
|
38
56
|
|
39
57
|
|
40
58
|
def trigger_judge_repairer() -> None:
|
@@ -83,7 +101,8 @@ def test_acada_to_lfn(storage_mount_path: Path, test_vo: str):
|
|
83
101
|
)
|
84
102
|
lfn = ingestion_client.acada_to_lfn(acada_path=acada_path)
|
85
103
|
|
86
|
-
|
104
|
+
msg = f"Expected {expected_lfn}, got {lfn}"
|
105
|
+
assert lfn == expected_lfn, msg
|
87
106
|
|
88
107
|
# Test Case 2: Non-absolute acada_path (empty string)
|
89
108
|
with pytest.raises(ValueError, match="acada_path must be absolute"):
|
@@ -121,7 +140,10 @@ def test_check_replica_exists(
|
|
121
140
|
"""Test the check_replica_exists method of IngestionClient."""
|
122
141
|
|
123
142
|
ingestion_client = IngestionClient(
|
124
|
-
storage_mount_path,
|
143
|
+
data_path=storage_mount_path,
|
144
|
+
rse=ONSITE_RSE,
|
145
|
+
vo=test_vo,
|
146
|
+
scope=test_scope,
|
125
147
|
)
|
126
148
|
|
127
149
|
acada_path, _ = onsite_test_file
|
@@ -202,22 +224,29 @@ def test_add_onsite_replica_with_minio_fits_file(
|
|
202
224
|
):
|
203
225
|
"""Test the add_onsite_replica method of IngestionClient using a dummy file."""
|
204
226
|
|
227
|
+
ingestion_client = IngestionClient(
|
228
|
+
data_path=storage_mount_path,
|
229
|
+
rse=ONSITE_RSE,
|
230
|
+
vo=test_vo,
|
231
|
+
scope=test_scope,
|
232
|
+
)
|
233
|
+
|
205
234
|
filename = str(file_location).split("/")[-1]
|
206
235
|
acada_path = storage_mount_path / test_vo / test_scope / filename
|
207
236
|
acada_path.parent.mkdir(parents=True, exist_ok=True)
|
208
237
|
copy2(file_location, str(acada_path))
|
209
238
|
reset_xrootd_permissions(storage_mount_path)
|
210
239
|
|
211
|
-
ingestion_client = IngestionClient(
|
212
|
-
storage_mount_path, ONSITE_RSE, scope=test_scope, vo=test_vo
|
213
|
-
)
|
214
|
-
|
215
240
|
# Use add_onsite_replica to register the replica
|
216
|
-
lfn = ingestion_client.add_onsite_replica(acada_path=acada_path)
|
241
|
+
lfn, skipped = ingestion_client.add_onsite_replica(acada_path=acada_path)
|
217
242
|
|
218
243
|
# Verify the LFN matches the expected LFN
|
219
244
|
expected_lfn = ingestion_client.acada_to_lfn(acada_path)
|
220
|
-
|
245
|
+
msg = f"Expected LFN {expected_lfn}, got {lfn}"
|
246
|
+
assert lfn == expected_lfn, msg
|
247
|
+
|
248
|
+
msg = "Expected the file to be newly ingested, but it was skipped"
|
249
|
+
assert not skipped, msg
|
221
250
|
|
222
251
|
# Download the file using the LFN
|
223
252
|
download_spec = {
|
@@ -230,18 +259,29 @@ def test_add_onsite_replica_with_minio_fits_file(
|
|
230
259
|
|
231
260
|
# Verify the downloaded file
|
232
261
|
download_path = tmp_path / lfn.lstrip("/")
|
233
|
-
|
262
|
+
msg = f"Download failed at {download_path}"
|
263
|
+
assert download_path.is_file(), msg
|
234
264
|
|
235
|
-
|
236
|
-
|
237
|
-
), "Downloaded file content does not match the original. "
|
265
|
+
msg = "Downloaded file content does not match the original."
|
266
|
+
assert adler32(download_path) == adler32(file_location), msg
|
238
267
|
|
239
268
|
# Check for don't ingest again if its already registered
|
240
269
|
caplog.clear()
|
241
|
-
|
270
|
+
lfn_check, skipped_check = ingestion_client.add_onsite_replica(
|
271
|
+
acada_path=acada_path
|
272
|
+
)
|
273
|
+
msg = f"LFN mismatch on second ingestion attempt: expected {lfn}, got {lfn_check}"
|
274
|
+
assert lfn_check == lfn, msg
|
275
|
+
|
276
|
+
msg = (
|
277
|
+
"Expected the file to be skipped on second ingestion, but it was ingested again"
|
278
|
+
)
|
279
|
+
assert skipped_check, msg
|
280
|
+
|
281
|
+
msg = f"'Replica already exists for lfn '{lfn}', skipping' in caplog records"
|
242
282
|
assert f"Replica already exists for lfn '{lfn}', skipping" in [
|
243
283
|
r.message for r in caplog.records
|
244
|
-
]
|
284
|
+
], msg
|
245
285
|
|
246
286
|
# Retrieve metadata using the DIDClient
|
247
287
|
did_client = Client()
|
@@ -251,10 +291,11 @@ def test_add_onsite_replica_with_minio_fits_file(
|
|
251
291
|
|
252
292
|
# Verify the metadata matches the expected metadata
|
253
293
|
for key, value in metadata_dict.items():
|
254
|
-
|
294
|
+
msg = (
|
255
295
|
f"Metadata mismatch for key '{key}'. "
|
256
296
|
f"Expected: {value}, Got: {retrieved_metadata.get(key)}"
|
257
297
|
)
|
298
|
+
assert retrieved_metadata.get(key) == value, msg
|
258
299
|
|
259
300
|
|
260
301
|
def test_rses():
|
@@ -263,9 +304,14 @@ def test_rses():
|
|
263
304
|
result = list(client.list_rses())
|
264
305
|
|
265
306
|
rses = [r["rse"] for r in result]
|
266
|
-
|
267
|
-
assert
|
268
|
-
|
307
|
+
msg = f"Expected RSE {ONSITE_RSE} not found in {rses}"
|
308
|
+
assert ONSITE_RSE in rses, msg
|
309
|
+
|
310
|
+
msg = f"Expected RSE {OFFSITE_RSE_1} not found in {rses}"
|
311
|
+
assert OFFSITE_RSE_1 in rses, msg
|
312
|
+
|
313
|
+
msg = f"Expected RSE {OFFSITE_RSE_2} not found in {rses}"
|
314
|
+
assert OFFSITE_RSE_2 in rses, msg
|
269
315
|
|
270
316
|
|
271
317
|
@pytest.fixture
|
@@ -306,9 +352,8 @@ def pre_existing_lfn(
|
|
306
352
|
|
307
353
|
# Verify the replica is registered
|
308
354
|
replicas = list(replica_client.list_replicas(dids=[did]))
|
309
|
-
|
310
|
-
|
311
|
-
), f"Failed to verify pre-registration of replica for LFN {lfn} on {ONSITE_RSE}"
|
355
|
+
msg = f"Failed to verify pre-registration of replica for LFN {lfn} on {ONSITE_RSE}"
|
356
|
+
assert replicas, msg
|
312
357
|
|
313
358
|
return lfn
|
314
359
|
|
@@ -326,9 +371,11 @@ def test_add_offsite_replication_rules(
|
|
326
371
|
):
|
327
372
|
"""Test the add_offsite_replication_rules method of IngestionClient."""
|
328
373
|
ingestion_client = IngestionClient(
|
329
|
-
storage_mount_path,
|
374
|
+
data_path=storage_mount_path,
|
375
|
+
rse=ONSITE_RSE,
|
376
|
+
vo=test_vo,
|
377
|
+
scope=test_scope,
|
330
378
|
)
|
331
|
-
caplog.set_level(logging.DEBUG)
|
332
379
|
|
333
380
|
# Replicate the ACADA file to two offsite RSEs
|
334
381
|
lfn = pre_existing_lfn
|
@@ -356,10 +403,11 @@ def test_add_offsite_replication_rules(
|
|
356
403
|
replica_client = ReplicaClient()
|
357
404
|
replicas = next(replica_client.list_replicas(dids=[did]))
|
358
405
|
states = replicas.get("states", {})
|
406
|
+
msg = f"Expected replica on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2} to be AVAILABLE after first rule: {states}"
|
359
407
|
assert (
|
360
408
|
states.get(OFFSITE_RSE_1) == "AVAILABLE"
|
361
409
|
or states.get(OFFSITE_RSE_2) == "AVAILABLE"
|
362
|
-
),
|
410
|
+
), msg
|
363
411
|
|
364
412
|
# Manually trigger the judge-repairer to ensure the second rule doesn't get stuck
|
365
413
|
trigger_judge_repairer()
|
@@ -376,15 +424,15 @@ def test_add_offsite_replication_rules(
|
|
376
424
|
did,
|
377
425
|
states,
|
378
426
|
)
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
427
|
+
|
428
|
+
msg = f"Expected replica on {ONSITE_RSE} to be AVAILABLE: {states}"
|
429
|
+
assert states.get(ONSITE_RSE) == "AVAILABLE", msg
|
430
|
+
|
431
|
+
msg = f"Expected replica on {OFFSITE_RSE_1} to be AVAILABLE: {states}"
|
432
|
+
assert states.get(OFFSITE_RSE_1) == "AVAILABLE", msg
|
433
|
+
|
434
|
+
msg = f"Expected replica on {OFFSITE_RSE_2} to be AVAILABLE: {states}"
|
435
|
+
assert states.get(OFFSITE_RSE_2) == "AVAILABLE", msg
|
388
436
|
|
389
437
|
# Download the file from OFFSITE_RSE_2 to verify its content
|
390
438
|
download_spec = {
|
@@ -398,12 +446,15 @@ def test_add_offsite_replication_rules(
|
|
398
446
|
|
399
447
|
# Verify the downloaded file content
|
400
448
|
download_path = tmp_path / lfn.lstrip("/")
|
401
|
-
|
449
|
+
msg = f"Download failed at {download_path}"
|
450
|
+
assert download_path.is_file(), msg
|
451
|
+
|
402
452
|
downloaded_content = download_path.read_text()
|
403
|
-
|
453
|
+
msg = (
|
404
454
|
f"Downloaded file content does not match the original. "
|
405
455
|
f"Expected: {test_file_content}, Got: {downloaded_content}"
|
406
456
|
)
|
457
|
+
assert downloaded_content == test_file_content, msg
|
407
458
|
|
408
459
|
|
409
460
|
@pytest.mark.usefixtures("_auth_proxy")
|
@@ -418,10 +469,13 @@ def test_add_offsite_replication_rules_single_copy(
|
|
418
469
|
caplog,
|
419
470
|
):
|
420
471
|
"""Test the add_offsite_replication_rules method of IngestionClient with a single copy (copies=1)."""
|
472
|
+
|
421
473
|
ingestion_client = IngestionClient(
|
422
|
-
storage_mount_path,
|
474
|
+
data_path=storage_mount_path,
|
475
|
+
rse=ONSITE_RSE,
|
476
|
+
vo=test_vo,
|
477
|
+
scope=test_scope,
|
423
478
|
)
|
424
|
-
caplog.set_level(logging.DEBUG)
|
425
479
|
|
426
480
|
# Replicate the ACADA file to one offsite RSE
|
427
481
|
lfn = pre_existing_lfn
|
@@ -439,9 +493,9 @@ def test_add_offsite_replication_rules_single_copy(
|
|
439
493
|
)
|
440
494
|
|
441
495
|
# Verify that only one rule was created
|
442
|
-
|
443
|
-
|
444
|
-
|
496
|
+
msg = f"Expected exactly 1 rule ID, got {len(rule_ids)}: {rule_ids}"
|
497
|
+
assert len(rule_ids) == 1, msg
|
498
|
+
|
445
499
|
rule_id_offsite_1 = rule_ids[0]
|
446
500
|
rule_client = RuleClient()
|
447
501
|
|
@@ -461,9 +515,8 @@ def test_add_offsite_replication_rules_single_copy(
|
|
461
515
|
offsite_replica_count = sum(
|
462
516
|
1 for rse in [OFFSITE_RSE_1, OFFSITE_RSE_2] if states.get(rse) == "AVAILABLE"
|
463
517
|
)
|
464
|
-
|
465
|
-
|
466
|
-
), f"Expected exactly 1 offsite replica (on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2}), got {offsite_replica_count}: {states}"
|
518
|
+
msg = f"Expected exactly 1 offsite replica (on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2}), got {offsite_replica_count}: {states}"
|
519
|
+
assert offsite_replica_count == 1, msg
|
467
520
|
|
468
521
|
# Determine which offsite RSE the replica was created on
|
469
522
|
target_offsite_rse = (
|
@@ -482,12 +535,14 @@ def test_add_offsite_replication_rules_single_copy(
|
|
482
535
|
|
483
536
|
# Verify the downloaded file content
|
484
537
|
download_path = tmp_path / lfn.lstrip("/")
|
485
|
-
|
538
|
+
msg = f"Download failed at {download_path}"
|
539
|
+
assert download_path.is_file(), msg
|
486
540
|
downloaded_content = download_path.read_text()
|
487
|
-
|
541
|
+
msg = (
|
488
542
|
f"Downloaded file content does not match the original. "
|
489
543
|
f"Expected: {test_file_content}, Got: {downloaded_content}"
|
490
544
|
)
|
545
|
+
assert downloaded_content == test_file_content, msg
|
491
546
|
|
492
547
|
|
493
548
|
def test_verify_fits_file(tel_events_test_file):
|
@@ -524,3 +579,1140 @@ def test_verify_fits_file_invalid_checksum(broken_checksum):
|
|
524
579
|
with fits.open(broken_checksum) as hdul:
|
525
580
|
with pytest.raises(FITSVerificationError, match="CHECKSUM verification failed"):
|
526
581
|
verify_fits_checksum(hdul)
|
582
|
+
|
583
|
+
|
584
|
+
def test_ingest_init(storage_mount_path):
|
585
|
+
"""Test that Ingest initializes correctly with given parameters."""
|
586
|
+
ingestion_client = IngestionClient(
|
587
|
+
data_path=storage_mount_path,
|
588
|
+
rse=ONSITE_RSE,
|
589
|
+
vo="ctao",
|
590
|
+
scope="acada",
|
591
|
+
)
|
592
|
+
|
593
|
+
ingest = Ingest(
|
594
|
+
client=ingestion_client,
|
595
|
+
top_dir=storage_mount_path,
|
596
|
+
num_workers=3,
|
597
|
+
lock_file_path=storage_mount_path / "lockfile.lock",
|
598
|
+
polling_interval=0.5,
|
599
|
+
check_interval=0.2,
|
600
|
+
)
|
601
|
+
assert ingest.client == ingestion_client
|
602
|
+
assert ingest.top_dir == storage_mount_path
|
603
|
+
assert ingest.num_workers == 3
|
604
|
+
assert ingest.lock_file_path == storage_mount_path / "lockfile.lock"
|
605
|
+
assert ingest.polling_interval == 0.5
|
606
|
+
assert ingest.check_interval == 0.2
|
607
|
+
assert not ingest.stop_event.is_set() # check stop_event initial state
|
608
|
+
assert hasattr(ingest, "result_queue")
|
609
|
+
assert hasattr(ingest, "task_counter")
|
610
|
+
assert hasattr(ingest, "submitted_tasks")
|
611
|
+
assert ingest.task_counter == 0
|
612
|
+
assert len(ingest.submitted_tasks) == 0
|
613
|
+
|
614
|
+
|
615
|
+
def test_check_directory_valid(storage_mount_path, tmp_path, caplog):
|
616
|
+
"""Test _check_directory with a valid, readable directory."""
|
617
|
+
ingestion_client = IngestionClient(
|
618
|
+
data_path=storage_mount_path,
|
619
|
+
rse=ONSITE_RSE,
|
620
|
+
vo="ctao",
|
621
|
+
scope="acada",
|
622
|
+
)
|
623
|
+
|
624
|
+
ingest_instance = Ingest(
|
625
|
+
client=ingestion_client,
|
626
|
+
top_dir=tmp_path,
|
627
|
+
num_workers=1,
|
628
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
629
|
+
polling_interval=0.5,
|
630
|
+
check_interval=0.5,
|
631
|
+
)
|
632
|
+
|
633
|
+
ingest_instance.top_dir = tmp_path
|
634
|
+
ingest_instance._check_directory()
|
635
|
+
|
636
|
+
|
637
|
+
def test_check_directory_invalid(storage_mount_path, tmp_path, caplog):
|
638
|
+
"""Test _check_directory with an invalid directory."""
|
639
|
+
ingestion_client = IngestionClient(
|
640
|
+
data_path=storage_mount_path,
|
641
|
+
rse=ONSITE_RSE,
|
642
|
+
vo="ctao",
|
643
|
+
scope="acada",
|
644
|
+
logger=LOGGER,
|
645
|
+
)
|
646
|
+
|
647
|
+
invalid_dir = tmp_path / "nonexistent"
|
648
|
+
|
649
|
+
ingest_instance = Ingest(
|
650
|
+
client=ingestion_client,
|
651
|
+
top_dir=invalid_dir,
|
652
|
+
num_workers=1,
|
653
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
654
|
+
polling_interval=0.5,
|
655
|
+
check_interval=0.5,
|
656
|
+
)
|
657
|
+
|
658
|
+
with pytest.raises(RuntimeError, match=f"Cannot read directory {invalid_dir}"):
|
659
|
+
ingest_instance._check_directory()
|
660
|
+
assert f"Cannot read directory {invalid_dir}" in caplog.text
|
661
|
+
|
662
|
+
|
663
|
+
@pytest.mark.usefixtures("_auth_proxy")
|
664
|
+
def test_process_file_success(
|
665
|
+
storage_mount_path, caplog, onsite_test_file, test_vo, test_scope
|
666
|
+
):
|
667
|
+
"""Test for checking successful ingestion with trigger file clean-up, depends on IngestionClient"""
|
668
|
+
ingestion_client = IngestionClient(
|
669
|
+
data_path=storage_mount_path,
|
670
|
+
rse=ONSITE_RSE,
|
671
|
+
vo=test_vo,
|
672
|
+
scope=test_scope,
|
673
|
+
)
|
674
|
+
|
675
|
+
acada_path, _ = onsite_test_file
|
676
|
+
test_file = acada_path
|
677
|
+
trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
|
678
|
+
trigger_file.symlink_to(test_file)
|
679
|
+
result = process_file(ingestion_client, str(test_file))
|
680
|
+
assert result == IngestStatus.SUCCESS
|
681
|
+
assert not trigger_file.exists()
|
682
|
+
assert INGEST_SUCCESS_MESSAGE in caplog.text
|
683
|
+
|
684
|
+
|
685
|
+
@pytest.mark.usefixtures("_auth_proxy")
|
686
|
+
def test_process_file_skipped(
|
687
|
+
storage_mount_path, caplog, onsite_test_file, test_vo, test_scope
|
688
|
+
):
|
689
|
+
"""Test for checking skipped ingestion when replica already exists"""
|
690
|
+
ingestion_client = IngestionClient(
|
691
|
+
data_path=storage_mount_path,
|
692
|
+
rse=ONSITE_RSE,
|
693
|
+
vo=test_vo,
|
694
|
+
scope=test_scope,
|
695
|
+
)
|
696
|
+
|
697
|
+
acada_path, _ = onsite_test_file
|
698
|
+
test_file = acada_path
|
699
|
+
trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
|
700
|
+
trigger_file.symlink_to(test_file)
|
701
|
+
process_file(ingestion_client, str(test_file))
|
702
|
+
caplog.clear()
|
703
|
+
result = process_file(ingestion_client, str(test_file))
|
704
|
+
assert result == IngestStatus.SKIPPED
|
705
|
+
assert "Replica already exists" in caplog.text
|
706
|
+
|
707
|
+
|
708
|
+
@pytest.mark.usefixtures("_auth_proxy")
|
709
|
+
def test_process_file_failure(storage_mount_path, caplog, tmp_path):
|
710
|
+
"""Test for checking failure for invalid file paths"""
|
711
|
+
ingestion_client = IngestionClient(
|
712
|
+
data_path=storage_mount_path,
|
713
|
+
rse=ONSITE_RSE,
|
714
|
+
vo="ctao",
|
715
|
+
scope="acada",
|
716
|
+
)
|
717
|
+
|
718
|
+
invalid_file = tmp_path / "invalid_file.fits"
|
719
|
+
invalid_file.write_text("dummy content")
|
720
|
+
trigger_file = Path(str(invalid_file) + TRIGGER_SUFFIX)
|
721
|
+
trigger_file.symlink_to(invalid_file)
|
722
|
+
|
723
|
+
# The file path is outside the data_path causing a ValueError in acada_to_lfn
|
724
|
+
result = process_file(ingestion_client, str(invalid_file))
|
725
|
+
|
726
|
+
# Verify the function returns FAILURE status instead of raising an exception
|
727
|
+
assert result == IngestStatus.FAILURE
|
728
|
+
|
729
|
+
# Check for the actual error message that gets logged
|
730
|
+
assert "Exception in process_file" in caplog.text
|
731
|
+
# Verify the file path is in the error message
|
732
|
+
assert str(invalid_file) in caplog.text
|
733
|
+
|
734
|
+
# Verify that no success message was logged
|
735
|
+
assert INGEST_SUCCESS_MESSAGE not in caplog.text
|
736
|
+
|
737
|
+
# Trigger file should still exist since ingestion failed
|
738
|
+
msg = "Trigger file should not be removed when ingestion fails"
|
739
|
+
assert trigger_file.exists(), msg
|
740
|
+
|
741
|
+
|
742
|
+
def test_trigger_file_handler_init(storage_mount_path):
|
743
|
+
"""Test TriggerFileHandler initialization."""
|
744
|
+
ingestion_client = IngestionClient(
|
745
|
+
data_path=storage_mount_path,
|
746
|
+
rse=ONSITE_RSE,
|
747
|
+
vo="ctao",
|
748
|
+
scope="acada",
|
749
|
+
)
|
750
|
+
|
751
|
+
ingest_instance = Ingest(
|
752
|
+
client=ingestion_client,
|
753
|
+
top_dir=storage_mount_path,
|
754
|
+
num_workers=1,
|
755
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
756
|
+
polling_interval=0.5,
|
757
|
+
check_interval=0.5,
|
758
|
+
)
|
759
|
+
|
760
|
+
handler = TriggerFileHandler(ingest_instance)
|
761
|
+
assert handler.ingest == ingest_instance
|
762
|
+
|
763
|
+
|
764
|
+
def test_trigger_file_handler_on_moved_missing_data_file(
|
765
|
+
storage_mount_path, tmp_path, caplog
|
766
|
+
):
|
767
|
+
"""Test on_moved skips when data file is missing."""
|
768
|
+
ingestion_client = IngestionClient(
|
769
|
+
data_path=storage_mount_path,
|
770
|
+
rse=ONSITE_RSE,
|
771
|
+
vo="ctao",
|
772
|
+
scope="acada",
|
773
|
+
)
|
774
|
+
|
775
|
+
ingest_instance = Ingest(
|
776
|
+
client=ingestion_client,
|
777
|
+
top_dir=storage_mount_path,
|
778
|
+
num_workers=1,
|
779
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
780
|
+
polling_interval=0.5,
|
781
|
+
check_interval=0.5,
|
782
|
+
)
|
783
|
+
|
784
|
+
handler = TriggerFileHandler(ingest_instance)
|
785
|
+
trigger_file = tmp_path / TEST_FILE_TRIGGER
|
786
|
+
data_file = tmp_path / "test_file"
|
787
|
+
|
788
|
+
# Create symlink to non-existent data file
|
789
|
+
trigger_file.symlink_to(data_file)
|
790
|
+
|
791
|
+
# Create FileMovedEvent (simulating ln -s)
|
792
|
+
event = FileMovedEvent(src_path=str(data_file), dest_path=str(trigger_file))
|
793
|
+
handler.on_moved(event)
|
794
|
+
|
795
|
+
assert (
|
796
|
+
f"Data file {data_file} for trigger {trigger_file} does not exist, skipping"
|
797
|
+
in caplog.text
|
798
|
+
)
|
799
|
+
assert (
|
800
|
+
DETECTED_NEW_TRIGGER_FILE not in caplog.text
|
801
|
+
) # Skips processing since the data file is missing
|
802
|
+
|
803
|
+
|
804
|
+
def test_trigger_file_handler_on_moved_success(
|
805
|
+
storage_mount_path, tmp_path, onsite_test_file, test_vo, test_scope, caplog
|
806
|
+
):
|
807
|
+
"""Test on_moved successfully processing a valid trigger file."""
|
808
|
+
ingestion_client = IngestionClient(
|
809
|
+
data_path=storage_mount_path,
|
810
|
+
rse=ONSITE_RSE,
|
811
|
+
vo=test_vo,
|
812
|
+
scope=test_scope,
|
813
|
+
)
|
814
|
+
|
815
|
+
ingest_instance = Ingest(
|
816
|
+
client=ingestion_client,
|
817
|
+
top_dir=storage_mount_path,
|
818
|
+
num_workers=1,
|
819
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
820
|
+
polling_interval=0.5,
|
821
|
+
check_interval=0.5,
|
822
|
+
)
|
823
|
+
|
824
|
+
# Create ProcessPoolExecutor for the ingest instance
|
825
|
+
with ProcessPoolExecutor(max_workers=1) as executor:
|
826
|
+
ingest_instance.executor = executor
|
827
|
+
|
828
|
+
handler = TriggerFileHandler(ingest_instance)
|
829
|
+
acada_path, _ = onsite_test_file
|
830
|
+
test_file = acada_path
|
831
|
+
trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
|
832
|
+
trigger_file.symlink_to(test_file)
|
833
|
+
|
834
|
+
# Create FileMovedEvent (simulating ln -s)
|
835
|
+
event = FileMovedEvent(src_path=str(test_file), dest_path=str(trigger_file))
|
836
|
+
|
837
|
+
# Record initial state
|
838
|
+
initial_task_counter = ingest_instance.task_counter
|
839
|
+
initial_total_tasks = ingest_instance.total_tasks_submitted
|
840
|
+
initial_submitted_tasks_count = len(ingest_instance.submitted_tasks)
|
841
|
+
|
842
|
+
handler.on_moved(event)
|
843
|
+
|
844
|
+
# Verify the expected log message
|
845
|
+
msg = f"'Detected new trigger file {trigger_file}, submitting data file {test_file}' in caplog"
|
846
|
+
assert (
|
847
|
+
f"Detected new trigger file {trigger_file}, submitting data file {test_file}"
|
848
|
+
in caplog.text
|
849
|
+
), msg
|
850
|
+
|
851
|
+
# Verify task submission metrics were updated
|
852
|
+
assert ingest_instance.task_counter == initial_task_counter + 1
|
853
|
+
assert ingest_instance.total_tasks_submitted == initial_total_tasks + 1
|
854
|
+
assert len(ingest_instance.submitted_tasks) == initial_submitted_tasks_count + 1
|
855
|
+
|
856
|
+
# Verify the task was submitted with correct file path
|
857
|
+
submitted_task_files = list(ingest_instance.submitted_tasks.values())
|
858
|
+
assert str(test_file) in submitted_task_files
|
859
|
+
|
860
|
+
# Give some time for the task to potentially complete
|
861
|
+
time.sleep(0.5)
|
862
|
+
|
863
|
+
|
864
|
+
def test_trigger_file_handler_on_moved_stop_event_set(
|
865
|
+
storage_mount_path, tmp_path, caplog
|
866
|
+
):
|
867
|
+
"""Test on_moved skips processing when stop_event is set."""
|
868
|
+
ingestion_client = IngestionClient(
|
869
|
+
data_path=storage_mount_path,
|
870
|
+
rse=ONSITE_RSE,
|
871
|
+
vo="ctao",
|
872
|
+
scope="acada",
|
873
|
+
)
|
874
|
+
|
875
|
+
ingest_instance = Ingest(
|
876
|
+
client=ingestion_client,
|
877
|
+
top_dir=storage_mount_path,
|
878
|
+
num_workers=1,
|
879
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
880
|
+
polling_interval=0.5,
|
881
|
+
check_interval=0.5,
|
882
|
+
)
|
883
|
+
|
884
|
+
handler = TriggerFileHandler(ingest_instance)
|
885
|
+
trigger_file = tmp_path / TEST_FILE_TRIGGER
|
886
|
+
data_file = tmp_path / "test_file"
|
887
|
+
data_file.write_text("data") # Data file exists
|
888
|
+
trigger_file.symlink_to(data_file)
|
889
|
+
|
890
|
+
# Create FileMovedEvent
|
891
|
+
event = FileMovedEvent(src_path=str(data_file), dest_path=str(trigger_file))
|
892
|
+
|
893
|
+
# Set stop event
|
894
|
+
ingest_instance.stop_event.set()
|
895
|
+
|
896
|
+
# Record initial state
|
897
|
+
initial_task_counter = ingest_instance.task_counter
|
898
|
+
initial_total_tasks = ingest_instance.total_tasks_submitted
|
899
|
+
|
900
|
+
try:
|
901
|
+
handler.on_moved(event)
|
902
|
+
|
903
|
+
# Should not process anything when stop_event is set
|
904
|
+
assert ingest_instance.task_counter == initial_task_counter
|
905
|
+
assert ingest_instance.total_tasks_submitted == initial_total_tasks
|
906
|
+
assert DETECTED_NEW_TRIGGER_FILE not in caplog.text
|
907
|
+
|
908
|
+
finally:
|
909
|
+
ingest_instance.stop_event.clear() # Reset for other tests
|
910
|
+
|
911
|
+
|
912
|
+
def test_trigger_file_handler_on_moved_directory_event(
|
913
|
+
storage_mount_path, tmp_path, caplog
|
914
|
+
):
|
915
|
+
"""Test on_moved skips directory events."""
|
916
|
+
ingestion_client = IngestionClient(
|
917
|
+
data_path=storage_mount_path,
|
918
|
+
rse=ONSITE_RSE,
|
919
|
+
vo="ctao",
|
920
|
+
scope="acada",
|
921
|
+
)
|
922
|
+
|
923
|
+
ingest_instance = Ingest(
|
924
|
+
client=ingestion_client,
|
925
|
+
top_dir=storage_mount_path,
|
926
|
+
num_workers=1,
|
927
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
928
|
+
polling_interval=0.5,
|
929
|
+
check_interval=0.5,
|
930
|
+
)
|
931
|
+
|
932
|
+
handler = TriggerFileHandler(ingest_instance)
|
933
|
+
trigger_dir = tmp_path / "some_directory.trigger"
|
934
|
+
source_dir = tmp_path / "source_directory"
|
935
|
+
source_dir.mkdir()
|
936
|
+
trigger_dir.mkdir()
|
937
|
+
|
938
|
+
# Create directory move event
|
939
|
+
event = FileMovedEvent(src_path=str(source_dir), dest_path=str(trigger_dir))
|
940
|
+
event.is_directory = True # mark as directory event
|
941
|
+
|
942
|
+
# Record initial state
|
943
|
+
initial_task_counter = ingest_instance.task_counter
|
944
|
+
initial_total_tasks = ingest_instance.total_tasks_submitted
|
945
|
+
|
946
|
+
handler.on_moved(event)
|
947
|
+
|
948
|
+
# Should not process directory events
|
949
|
+
assert ingest_instance.task_counter == initial_task_counter
|
950
|
+
assert ingest_instance.total_tasks_submitted == initial_total_tasks
|
951
|
+
assert DETECTED_NEW_TRIGGER_FILE not in caplog.text
|
952
|
+
|
953
|
+
|
954
|
+
def test_trigger_file_handler_on_moved_with_actual_processing(
|
955
|
+
storage_mount_path, tmp_path, onsite_test_file, test_vo, test_scope, caplog
|
956
|
+
):
|
957
|
+
"""Test on_moved with successfully processing a valid trigger file."""
|
958
|
+
ingestion_client = IngestionClient(
|
959
|
+
data_path=storage_mount_path,
|
960
|
+
rse=ONSITE_RSE,
|
961
|
+
vo=test_vo,
|
962
|
+
scope=test_scope,
|
963
|
+
)
|
964
|
+
|
965
|
+
ingest_instance = Ingest(
|
966
|
+
client=ingestion_client,
|
967
|
+
top_dir=storage_mount_path,
|
968
|
+
num_workers=1,
|
969
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
970
|
+
polling_interval=0.5,
|
971
|
+
check_interval=0.5,
|
972
|
+
)
|
973
|
+
|
974
|
+
# Start the result processing thread manually for this test
|
975
|
+
result_thread = threading.Thread(
|
976
|
+
target=ingest_instance._process_results, daemon=True
|
977
|
+
)
|
978
|
+
result_thread.start()
|
979
|
+
|
980
|
+
with ProcessPoolExecutor(max_workers=1) as executor:
|
981
|
+
ingest_instance.executor = executor
|
982
|
+
|
983
|
+
handler = TriggerFileHandler(ingest_instance)
|
984
|
+
acada_path, _ = onsite_test_file
|
985
|
+
test_file = acada_path
|
986
|
+
trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
|
987
|
+
trigger_file.symlink_to(test_file)
|
988
|
+
|
989
|
+
# Create FileMovedEvent
|
990
|
+
event = FileMovedEvent(src_path=str(test_file), dest_path=str(trigger_file))
|
991
|
+
|
992
|
+
handler.on_moved(event)
|
993
|
+
|
994
|
+
# Wait for processing to complete
|
995
|
+
timeout = 10.0
|
996
|
+
start_time = time.time()
|
997
|
+
processed = False
|
998
|
+
|
999
|
+
while time.time() - start_time < timeout:
|
1000
|
+
# Check if task was completed (removed from submitted_tasks)
|
1001
|
+
if len(ingest_instance.submitted_tasks) == 0:
|
1002
|
+
processed = True
|
1003
|
+
break
|
1004
|
+
time.sleep(0.1)
|
1005
|
+
|
1006
|
+
# Stop the result processing thread
|
1007
|
+
ingest_instance.stop_event.set()
|
1008
|
+
result_thread.join(timeout=2.0)
|
1009
|
+
|
1010
|
+
# Verify processing occurred
|
1011
|
+
msg = "Task was not processed within timeout"
|
1012
|
+
assert processed, msg
|
1013
|
+
|
1014
|
+
msg = f"'Detected new trigger file {trigger_file}, submitting data file {test_file}' in caplog"
|
1015
|
+
assert (
|
1016
|
+
f"Detected new trigger file {trigger_file}, submitting data file {test_file}"
|
1017
|
+
in caplog.text
|
1018
|
+
), msg
|
1019
|
+
|
1020
|
+
# Check that a result was logged (either success, failure, or error)
|
1021
|
+
result_logged = any(
|
1022
|
+
phrase in caplog.text
|
1023
|
+
for phrase in ["Processed file", "failed:", "Exception in process_file"]
|
1024
|
+
)
|
1025
|
+
msg = "No processing result was logged"
|
1026
|
+
assert result_logged, msg
|
1027
|
+
|
1028
|
+
|
1029
|
+
def test_sequential_exclusion_lock_prevention(storage_mount_path, tmp_path):
    """Test that a second daemon instance cannot start when first is already running.

    This test validates sequential exclusion: when one ingestion daemon is already
    running and has acquired the lock, any subsequent attempt to start another
    daemon instance should fail with a clear error message.
    """
    lock_file = tmp_path / "sequential_test.pid"

    ingestion_client = IngestionClient(
        data_path=storage_mount_path,
        rse=ONSITE_RSE,
        vo="ctao",
        scope="acada",
    )

    # Both instances deliberately share the same lock file so that the
    # second one must run into the exclusion lock held by the first.
    shared_kwargs = dict(
        client=ingestion_client,
        top_dir=tmp_path,
        lock_file_path=lock_file,
        num_workers=1,
        polling_interval=0.1,
        check_interval=0.1,
    )
    instance1 = Ingest(**shared_kwargs)
    instance2 = Ingest(**shared_kwargs)

    results = {}
    first_instance_started = threading.Event()

    def run_first_instance():
        """Run first instance - should succeed and run until manually stopped."""
        try:
            first_instance_started.set()  # signal: about to start daemon
            instance1.run()
            results["first"] = "success"
        except Exception as e:
            results["first"] = f"error: {str(e)}"

    def run_second_instance():
        """Try to run second instance while first is running - should fail with lock conflict."""
        try:
            # Wait until the first instance has actually written its PID file.
            lock_deadline = time.time() + 15.0
            while time.time() < lock_deadline:
                if lock_file.exists():
                    break
                time.sleep(0.1)
            else:
                results["second"] = "first_instance_never_acquired_lock"
                return

            # This must fail because the first instance still holds the lock.
            instance2.run()
            results["second"] = "unexpected_success"  # Should not reach here
        except RuntimeError as e:
            error_msg = str(e)
            if "Another ingestion process is already running" in error_msg:
                results["second"] = f"expected_lock_conflict: {str(e)}"
            else:
                results["second"] = f"unexpected_runtime_error: {str(e)}"
        except Exception as e:
            results["second"] = f"unexpected_error: {str(e)}"

    # Start the first instance on a non-daemon thread for reliable join/cleanup.
    thread1 = threading.Thread(target=run_first_instance, daemon=False)
    thread1.start()

    msg = "First instance failed to start"
    assert first_instance_started.wait(timeout=10), msg

    # Allow the first instance time to acquire the lock and initialize.
    time.sleep(3.0)

    msg = "First instance should have created PID file"
    assert lock_file.exists(), msg

    # The lock file content must be a valid PID.
    pid_content = lock_file.read_text().strip()
    msg = f"PID file should contain a number, got: {pid_content}"
    assert pid_content.isdigit(), msg

    # The daemon runs inside this test process, so the stored PID is ours.
    current_pid = os.getpid()
    stored_pid = int(pid_content)
    msg = f"Expected PID {current_pid}, got {stored_pid}"
    assert stored_pid == current_pid, msg

    # Now attempt to start the second instance - this should fail.
    thread2 = threading.Thread(target=run_second_instance, daemon=False)
    thread2.start()

    # FileLock timeout is 10 seconds, so give a bit more time than that.
    thread2.join(timeout=15)

    if thread2.is_alive():
        # Force stop the first instance and fail the test.
        instance1.stop_event.set()
        thread1.join(timeout=5)
        pytest.fail("Second instance thread did not complete within expected timeout")

    # Stop the first instance now that the lock behaviour has been exercised.
    instance1.stop_event.set()
    thread1.join(timeout=10)

    if thread1.is_alive():
        pytest.fail("First instance thread did not terminate within timeout")

    msg = f"Second instance should have completed. Results: {results}"
    assert "second" in results, msg

    second_result = results["second"]
    msg = f"Second instance should have failed with lock conflict. Got: {second_result}"
    assert second_result.startswith("expected_lock_conflict"), msg

    # The error must be the specific message raised by the Ingest class.
    msg = f"Expected specific error message, got: {second_result}"
    assert "Another ingestion process is already running" in second_result, msg

    # The first instance should have completed cleanly after the manual stop.
    if "first" in results:
        msg = f"First instance should succeed, got: {results['first']}"
        assert results["first"] == "success", msg

    # Poll briefly for the PID file to disappear after shutdown.
    cleanup_deadline = time.time() + 5.0
    while time.time() < cleanup_deadline:
        if not lock_file.exists():
            break
        time.sleep(0.1)

    msg = "PID file should be cleaned up after first instance stops"
    assert not lock_file.exists(), msg

    LOGGER.info("Sequential exclusion test completed successfully")
    LOGGER.info("First instance: %s", results.get("first", "stopped manually"))
    LOGGER.info("Second instance correctly failed with: %s", second_result)
def test_concurrent_exclusion_lock_prevention(storage_mount_path, tmp_path):
    """Test FileLock behavior under true concurrent access - simultaneous daemon startup attempts.

    This test validates real concurrent scenario where multiple daemon instances
    attempt to acquire the same lock simultaneously, simulating race conditions
    that occur in production environments.
    """
    lock_file = tmp_path / "concurrent_test.pid"

    ingestion_client = IngestionClient(
        data_path=storage_mount_path,
        rse=ONSITE_RSE,
        vo="ctao",
        scope="acada",
    )

    # Two instances configured identically against the same lock file.
    shared_kwargs = dict(
        client=ingestion_client,
        top_dir=tmp_path,
        lock_file_path=lock_file,
        num_workers=1,
        polling_interval=0.1,
        check_interval=0.1,
    )
    instance1 = Ingest(**shared_kwargs)
    instance2 = Ingest(**shared_kwargs)

    results = {}

    # Synchronization barrier - both workers block here until the main
    # thread joins, so the two lock acquisitions race for real.
    start_barrier = threading.Barrier(3)  # 2 worker threads + 1 main thread

    def run_instance(instance_id, instance):
        """Run instance - both will try to start simultaneously."""
        try:
            start_barrier.wait()  # all threads start together
            instance.run()
            results[instance_id] = "success"
        except RuntimeError as e:
            if "Another ingestion process is already running" in str(e):
                results[instance_id] = f"lock_conflict: {str(e)}"
            else:
                results[instance_id] = f"unexpected_error: {str(e)}"
        except Exception as e:
            results[instance_id] = f"error: {str(e)}"

    thread1 = threading.Thread(
        target=run_instance, args=("first", instance1), daemon=False
    )
    thread2 = threading.Thread(
        target=run_instance, args=("second", instance2), daemon=False
    )

    # Both threads park at the barrier after starting.
    thread1.start()
    thread2.start()

    # Let both threads reach the barrier before releasing it.
    time.sleep(0.5)

    # Release the barrier - both threads race for the lock simultaneously.
    start_barrier.wait()

    # Wait for both lock-acquisition attempts to resolve.
    thread1.join(timeout=15)
    thread2.join(timeout=15)

    # Stop whichever instance won the race.
    if results.get("first") == "success":
        instance1.stop_event.set()
    if results.get("second") == "success":
        instance2.stop_event.set()

    # Make sure neither thread is left running.
    for instance, thread in ((instance1, thread1), (instance2, thread2)):
        if thread.is_alive():
            instance.stop_event.set()
            thread.join(timeout=5)

    # Exactly ONE instance must succeed and exactly ONE must conflict.
    msg = f"Both instances should complete, got: {results}"
    assert len(results) == 2, msg

    success_count = sum(1 for result in results.values() if result == "success")
    conflict_count = sum(1 for result in results.values() if "lock_conflict" in result)

    msg = f"Exactly ONE instance should succeed, got {success_count}: {results}"
    assert success_count == 1, msg

    msg = f"Exactly ONE instance should get lock conflict, got {conflict_count}: {results}"
    assert conflict_count == 1, msg

    # The losing instance must have reported the canonical error message.
    conflict_result = [r for r in results.values() if "lock_conflict" in r][0]
    msg = "Expected 'Another ingestion process is already running' message in conflict result"
    assert "Another ingestion process is already running" in conflict_result, msg

    # Poll briefly for the lock file to be removed on shutdown.
    cleanup_deadline = time.time() + 5.0
    while time.time() < cleanup_deadline:
        if not lock_file.exists():
            break
        time.sleep(0.1)
    msg = "Lock file should be cleaned up"
    assert not lock_file.exists(), msg

    LOGGER.info("True Concurrency tests: %s", results)
    LOGGER.info("Real concurrent lock acquisition tested successfully!")
def acada_write_test_files(
    storage_mount_path, test_vo, test_scope, n_files=7
) -> list[Path]:
    """Represents ACADA writing test files to the storage mount path.

    Writes ``n_files`` small random FITS files (with checksums) into
    ``<storage_mount_path>/<test_vo>/<test_scope>`` and returns their paths.
    """
    target_dir = storage_mount_path / test_vo / test_scope
    target_dir.mkdir(parents=True, exist_ok=True)

    rng = np.random.default_rng()
    data_files = []
    for index in range(n_files):
        data_file = target_dir / f"testfile_{index}_20250609.fits"
        # Small 50x50 random image, written with a FITS checksum.
        fits.PrimaryHDU(rng.random((50, 50))).writeto(
            data_file, overwrite=True, checksum=True
        )
        data_files.append(data_file)
        LOGGER.info("Created test file: %s", data_file)

    # Reset permissions before any daemon starts to avoid timing issues.
    reset_xrootd_permissions(storage_mount_path)
    time.sleep(1.0)  # Allow permissions to be applied

    return data_files
def acada_create_trigger_symlink(data_file, creation_results):
    """Represents creating a trigger symlink for a given data file.

    Appends a status record for *data_file* to *creation_results* and
    returns the (mutated) list.
    """
    try:
        trigger_path = Path(str(data_file) + TRIGGER_SUFFIX)
        trigger_path.symlink_to(data_file)
        LOGGER.info("Created trigger file: %s -> %s", trigger_path, data_file)

        # Double-check the symlink actually landed on disk.
        if trigger_path.exists() and trigger_path.is_symlink():
            creation_results.append({"file": str(data_file), "status": "success"})
        else:
            creation_results.append(
                {"file": str(data_file), "status": "creation_failed"}
            )
    except Exception as e:
        # Best-effort helper: record the failure instead of raising.
        LOGGER.exception("Failed to create trigger for %s: %s", data_file, e)
        creation_results.append({"file": str(data_file), "status": f"error: {str(e)}"})

    return creation_results
def ensure_files_ingested(data_files, storage_mount_path, test_scope, timeout_s=120):
    """Ensure that all files are ingested by checking the IngestStatus.

    Polls Rucio for a replica of every file in *data_files* until all are
    found or *timeout_s* seconds elapse; fails the test on timeout.
    """
    replica_client = ReplicaClient()

    deadline = time.time() + timeout_s

    # One tracking record per data file; "found" flips once a replica shows up.
    entries = [
        {
            "file": str(data_file),
            "expected_lfn": f"/{data_file.relative_to(storage_mount_path)}",
            "found": False,
        }
        for data_file in data_files
    ]

    while time.time() < deadline and not all(e["found"] for e in entries):
        for entry in entries:
            if entry["found"]:
                continue
            try:
                replicas = list(
                    replica_client.list_replicas(
                        dids=[
                            {
                                "scope": test_scope,
                                "name": entry["expected_lfn"],
                            }
                        ]
                    )
                )
            except Exception:
                # Transient Rucio errors are logged and retried next pass.
                LOGGER.exception(
                    "Failed to list replicas for %s",
                    entry["expected_lfn"],
                )
                continue

            if not replicas:
                LOGGER.info("No replica found for %s", entry["expected_lfn"])
            else:
                LOGGER.info(
                    "Replica found for %s: %s",
                    entry["expected_lfn"],
                    replicas[0],
                )
                entry["found"] = True
        time.sleep(1.0)

    if not all(e["found"] for e in entries):
        pytest.fail(f"Not all replicas found for files: {data_files}")
@pytest.mark.usefixtures(
    "_auth_proxy", "lock_for_ingestion_daemon", "disable_ingestion_daemon"
)
@pytest.mark.verifies_usecase("UC-110-1.1.4")
def test_ingest_parallel_submission(storage_mount_path, caplog, test_vo, test_scope):
    """Test parallel file processing: creates multiple FITS files simultaneously and verifies that the
    daemon can detect, process, and ingest them efficiently using parallel workers.
    """
    ingestion_client = IngestionClient(
        data_path=storage_mount_path,
        rse=ONSITE_RSE,
        vo=test_vo,
        scope=test_scope,
    )

    ingest_instance = Ingest(
        client=ingestion_client,
        top_dir=storage_mount_path,
        num_workers=4,
        lock_file_path=storage_mount_path / "bdms_ingest.lock",
        polling_interval=0.5,
        check_interval=0.5,
    )

    data_files = acada_write_test_files(storage_mount_path, test_vo, test_scope)
    # Fix: derive the expected count from the fixture instead of the magic
    # number 7, so changing n_files in acada_write_test_files cannot silently
    # break this test.
    n_expected = len(data_files)

    # Daemon startup with exception capture so init failures surface in asserts.
    daemon_exception = None
    daemon_started = threading.Event()

    def run_daemon():
        """Run daemon with exception capture."""
        nonlocal daemon_exception
        try:
            daemon_started.set()  # Signal daemon thread started
            ingest_instance.run()
        except Exception as e:
            daemon_exception = e
            LOGGER.exception("Daemon failed with exception: %s", str(e))

    # Start daemon with non-daemon thread for reliability
    daemon_thread = threading.Thread(target=run_daemon, daemon=False)
    daemon_thread.start()

    msg = "Daemon thread failed to start"
    assert daemon_started.wait(timeout=10), msg

    # Daemon initialization verification: wait until every startup milestone
    # is visible (lock file on disk plus the expected log messages).
    daemon_init_timeout = 20.0  # generous timeout for robust initialization
    daemon_init_start = time.time()
    required_conditions = {
        "lock_acquired": False,
        "result_thread_started": False,
        "pool_started": False,
        "monitoring_started": False,
        "observer_started": False,
    }

    while time.time() - daemon_init_start < daemon_init_timeout:
        # Surface daemon startup failures early.
        if daemon_exception:
            pytest.fail(f"Daemon failed during initialization: {daemon_exception}")

        # Lock acquisition is critical for daemon operation.
        if ingest_instance.lock_file_path.exists():
            required_conditions["lock_acquired"] = True

        log_text = caplog.text
        if "Result processing thread started" in log_text:
            required_conditions["result_thread_started"] = True

        # Flexible pattern works with any configured worker count.
        if re.search(r"Started process pool with \d+ workers", log_text):
            required_conditions["pool_started"] = True

        if "Starting continuous polling-based monitoring" in log_text:
            required_conditions["monitoring_started"] = True
        if "File monitoring observer started successfully" in log_text:
            required_conditions["observer_started"] = True

        if all(required_conditions.values()):
            break

        time.sleep(0.2)

    # Verify complete initialization or provide diagnostics.
    missing_conditions = [k for k, v in required_conditions.items() if not v]
    if missing_conditions:
        ingest_instance.stop_event.set()
        daemon_thread.join(timeout=5)
        pytest.fail(
            f"Daemon initialization incomplete. Missing: {missing_conditions}. Check logs for errors."
        )

    time.sleep(0.5)  # some additional time to stabilize

    # Create trigger symlinks; the daemon should pick them up on its own.
    trigger_files = []
    for data_file in data_files:
        trigger_file = Path(str(data_file) + TRIGGER_SUFFIX)
        trigger_file.symlink_to(data_file)
        trigger_files.append(trigger_file)

    # Wait for natural detection (MOVE events / trigger-detection log lines).
    natural_detection_timeout = 30.0
    natural_start = time.time()

    while time.time() - natural_start < natural_detection_timeout:
        if caplog.text.count("Detected new trigger file") > 0:
            break
        time.sleep(1.0)

    # Count events after the detection loop completes.
    move_events_detected = caplog.text.count("MOVE Event received")

    # Wait for processing while sampling concurrency.
    processing_timeout = 120.0
    processing_start = time.time()
    processed_files = set()
    max_concurrent_samples = []

    while time.time() - processing_start < processing_timeout:
        # Sample concurrent tasks frequently to catch parallelism.
        max_concurrent_samples.append(len(ingest_instance.submitted_tasks))

        # Check processing results per file (success or skipped both count).
        for data_file in data_files:
            success_pattern = f"Processed file {data_file} with result success"
            skipped_pattern = f"Processed file {data_file} with result skipped"

            if str(data_file) not in processed_files:
                if success_pattern in caplog.text or skipped_pattern in caplog.text:
                    processed_files.add(str(data_file))

        if len(processed_files) == n_expected:
            break

        if "Fatal error in result processing thread" in caplog.text:
            break

        time.sleep(0.1)  # Sample frequently to catch concurrency

    assert len(processed_files) == n_expected

    # Record ingestion workflow completion time.
    workflow_end_time = time.time()

    # Stop the daemon.
    ingest_instance.stop_event.set()
    daemon_thread.join(timeout=10)

    if daemon_thread.is_alive():
        pytest.fail("Ingest Daemon thread did not terminate within timeout")

    # Verify results.
    msg = "Process pool startup failed"
    assert "Started process pool with 4 workers" in caplog.text, msg

    msg = "Result processing thread startup failed"
    assert "Result processing thread started" in caplog.text, msg

    # Trigger files must have been cleaned up during successful processing.
    remaining_triggers = sum(1 for tf in trigger_files if tf.exists())
    msg = f"Expected all trigger files to be cleaned up, {remaining_triggers} remain"
    assert remaining_triggers == 0, msg

    # Verify clean shutdown.
    msg = "Lock file not cleaned up"
    assert not ingest_instance.lock_file_path.exists(), msg

    msg = "Daemon shutdown not logged"
    assert "Stopped ingestion daemon" in caplog.text, msg

    msg = "Result thread shutdown not logged"
    assert "Result processing thread stopped" in caplog.text, msg

    # Clean up data files.
    for data_file in data_files:
        if data_file.exists():
            data_file.unlink()

    # Statistics: time from trigger detection to ingestion completion.
    max_concurrent_observed = (
        max(max_concurrent_samples) if max_concurrent_samples else 0
    )
    max_concurrent_tracked = ingest_instance.max_concurrent_tasks

    detection_to_completion_time = workflow_end_time - natural_start
    processing_rate = (
        len(processed_files) / detection_to_completion_time
        if detection_to_completion_time > 0
        else 0
    )

    total_submitted = ingest_instance.total_tasks_submitted
    tasks_cleaned_up = len(ingest_instance.submitted_tasks) == 0
    max_concurrent_final = max(max_concurrent_tracked, max_concurrent_observed)
    parallel_achieved = max_concurrent_final >= 2

    # Summary
    status = "parallel" if parallel_achieved else "sequential"

    LOGGER.info("=== Parallel Ingestion Test Results ===")
    LOGGER.info(
        "Files processed: %d/%d in %.1fs",
        len(processed_files),
        n_expected,
        detection_to_completion_time,
    )
    LOGGER.info("Processing rate: %.1f files/sec", processing_rate)
    LOGGER.info("Max concurrent tasks: %d (mode: %s)", max_concurrent_final, status)
    LOGGER.info("Total tasks submitted: %d", total_submitted)
    LOGGER.info("Task cleanup successful: %s", tasks_cleaned_up)
    LOGGER.info("Event detection: %d move events", move_events_detected)
def fetch_ingestion_daemon_metrics():
    """Fetch metrics from the ingestion daemon to verify its operation.

    Returns
    -------
    dict
        Mapping of ``n_tasks_*`` metric names to their float values, parsed
        from the daemon's Prometheus-style metrics endpoint.
    """
    # Fix: close the HTTP response deterministically; the original leaked the
    # connection because urlopen's result was never closed.
    with urlopen("http://bdms-ingestion-daemon:8000/") as response:
        assert response.status == 200, "Ingestion daemon metrics are not responding"

        n_tasks_metrics = {}
        for raw_line in response.readlines():
            line = raw_line.decode("utf-8").strip()
            if line.startswith("n_tasks_"):
                LOGGER.info("Ingestion daemon metrics: %s", line)
                # Metric lines are "<name> <value>"; split once on the first space.
                key, value = line.split(" ", 1)
                n_tasks_metrics[key] = float(value)

    return n_tasks_metrics
@pytest.mark.usefixtures(
    "_auth_proxy", "lock_for_ingestion_daemon", "enable_ingestion_daemon"
)
@pytest.mark.verifies_usecase("UC-110-1.1.4")
def test_ingest_parallel_submission_with_live_daemon(storage_mount_path, test_vo):
    """Test parallel file processing with an already running daemon."""

    # With a live daemon (deployed outside this test) we must use a persistent
    # scope matching the daemon's storage mount path. Note that if the kind
    # cluster creation fixture is used, the directory can be unique per test
    # session. This test only checks that the files are consumed, not that
    # they are replicated.
    test_scope = "test_scope_persistent"

    n_tasks_metrics_before_test = fetch_ingestion_daemon_metrics()

    # Remove stale trigger files left over from earlier runs.
    for tf in (storage_mount_path / test_vo / test_scope).glob("*" + TRIGGER_SUFFIX):
        if tf.exists():
            LOGGER.info("Cleaning up existing trigger file: %s", tf)
            tf.unlink()

    data_files = acada_write_test_files(storage_mount_path, test_vo, test_scope)

    creation_results = []
    for data_file in data_files:
        acada_create_trigger_symlink(data_file, creation_results)

    trigger_files = [Path(str(df) + TRIGGER_SUFFIX) for df in data_files]

    deadline = time.time() + 120.0

    # Poll until the daemon has consumed every trigger file or we time out.
    remaining_triggers = 0
    while time.time() < deadline:
        remaining_triggers = sum(1 for tf in trigger_files if tf.exists())

        if remaining_triggers == 0:
            LOGGER.info("All trigger files consumed up successfully, exiting test.")
            break
        else:
            LOGGER.info(
                "Waiting for trigger files to be cleaned up, %s remain.",
                remaining_triggers,
            )

        time.sleep(1.0)

    assert remaining_triggers == 0, "Expected all trigger files to be consumed up"

    ensure_files_ingested(data_files, storage_mount_path, test_scope)

    # Make sure metrics are available from the daemon and self-consistent.
    n_tasks_metrics = fetch_ingestion_daemon_metrics()

    assert n_tasks_metrics["n_tasks_success_created"] < time.time()
    assert n_tasks_metrics["n_tasks_processed_total"] - n_tasks_metrics_before_test[
        "n_tasks_processed_total"
    ] == len(data_files)
    assert (
        n_tasks_metrics["n_tasks_processed_total"]
        - n_tasks_metrics_before_test["n_tasks_processed_total"]
        == n_tasks_metrics["n_tasks_success_total"]
        + n_tasks_metrics["n_tasks_skipped_total"]
    ), "Ingestion daemon metrics do not match expected values"