ctao-bdms-clients 0.2.0rc1__py3-none-any.whl → 0.3.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bdms/_version.py +2 -2
- bdms/acada_ingest_cli.py +400 -0
- bdms/acada_ingestion.py +528 -17
- bdms/extract_fits_metadata.py +134 -0
- bdms/tests/conftest.py +157 -14
- bdms/tests/test_acada_ingest_cli.py +279 -0
- bdms/tests/test_acada_ingestion.py +1315 -98
- bdms/tests/test_basic_rucio_functionality.py +0 -1
- bdms/tests/test_dpps_rel_0_0.py +6 -0
- bdms/tests/test_extract_fits_metadata.py +97 -0
- bdms/tests/test_onsite_storage.py +16 -35
- bdms/tests/utils.py +28 -0
- {ctao_bdms_clients-0.2.0rc1.dist-info → ctao_bdms_clients-0.3.0rc1.dist-info}/METADATA +8 -2
- ctao_bdms_clients-0.3.0rc1.dist-info/RECORD +23 -0
- {ctao_bdms_clients-0.2.0rc1.dist-info → ctao_bdms_clients-0.3.0rc1.dist-info}/WHEEL +1 -1
- ctao_bdms_clients-0.3.0rc1.dist-info/entry_points.txt +2 -0
- ctao_bdms_clients-0.2.0rc1.dist-info/RECORD +0 -18
- {ctao_bdms_clients-0.2.0rc1.dist-info → ctao_bdms_clients-0.3.0rc1.dist-info}/licenses/LICENSE +0 -0
- {ctao_bdms_clients-0.2.0rc1.dist-info → ctao_bdms_clients-0.3.0rc1.dist-info}/top_level.txt +0 -0
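The tests in this diff exercise a reworked `IngestionClient` API: the constructor is now called with keyword arguments (`data_path`, `rse`, `vo`, `scope`), and `add_onsite_replica` returns an `(lfn, skipped)` pair instead of a bare LFN. A minimal usage sketch inferred from the tests below; the mount path and RSE name are placeholder values, not defaults shipped with the package:

```python
from pathlib import Path

from bdms.acada_ingestion import IngestionClient

# Placeholder values; the test suite supplies these via fixtures.
client = IngestionClient(
    data_path=Path("/storage"),  # shared on-site storage mount
    rse="STORAGE-1",             # on-site Rucio storage element
    vo="ctao",
    scope="acada",
)

# New in 0.3.0rc1: returns the LFN plus a flag indicating whether
# ingestion was skipped because a replica already exists.
lfn, skipped = client.add_onsite_replica(
    acada_path=Path("/storage/ctao/acada/testfile.fits")
)
```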
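Several of the new metadata tests rely on pytest's indirect parametrization: the `file_location` fixture resolves each parametrized name to another fixture via `request.getfixturevalue`. A self-contained illustration of the idiom, with fixture names invented for the example:

```python
import pytest


@pytest.fixture
def small_value():
    return 1


@pytest.fixture
def large_value():
    return 10**6


@pytest.fixture
def value(request):
    # request.param is the *name* of another fixture; resolve it lazily.
    return request.getfixturevalue(request.param)


@pytest.mark.parametrize("value", ["small_value", "large_value"], indirect=["value"])
def test_positive(value):
    assert value > 0
```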
@@ -6,11 +6,16 @@ and the replication of data between Rucio storage elements (RSEs).
 
 import logging
 import os
+import re
 import subprocess
-
+import threading
+import time
+from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
-from
+from shutil import copy2
+from urllib.request import urlopen
 
+import numpy as np
 import pytest
 from astropy.io import fits
 from astropy.table import Table
@@ -20,35 +25,34 @@ from rucio.client.replicaclient import ReplicaClient
 from rucio.client.ruleclient import RuleClient
 from rucio.common.exception import RucioException
 from rucio.common.utils import adler32
-
-
-from bdms.
+from watchdog.events import FileMovedEvent
+
+from bdms.acada_ingestion import (
+    DETECTED_NEW_TRIGGER_FILE,
+    INGEST_SUCCESS_MESSAGE,
+    TRIGGER_SUFFIX,
+    Ingest,
+    IngestionClient,
+    IngestStatus,
+    TriggerFileHandler,
+    process_file,
+)
+from bdms.tests.utils import reset_xrootd_permissions, wait_for_replication_status
 
 LOGGER = logging.getLogger(__name__)
 
-XROOTD_UID = 994
-XROOTD_GID = 994
 ONSITE_RSE = "STORAGE-1"
 OFFSITE_RSE_1 = "STORAGE-2"
 OFFSITE_RSE_2 = "STORAGE-3"
 
+TEST_FILE_TRIGGER = "test_file.trigger"
+
 
 def test_shared_storage(storage_mount_path: Path):
     """Test that the shared storage path is available."""
 
-    assert (
-        storage_mount_path.exists()
-    ), f"Shared storage {storage_mount_path} is not available on the client"
-
-
-def recursive_chown(path: Path, uid: int, gid: int):
-    """Equivalent of unix chmod -R <uid>:<gid> <path>."""
-    for root, dirs, files in os.walk(path):
-        root = Path(root)
-        for d in dirs:
-            os.chown(root / d, uid, gid)
-        for f in files:
-            os.chown(root / f, uid, gid)
+    msg = f"Shared storage {storage_mount_path} is not available on the client"
+    assert storage_mount_path.exists(), msg
 
 
 def trigger_judge_repairer() -> None:
@@ -80,30 +84,6 @@ def trigger_judge_repairer() -> None:
         raise
 
 
-@pytest.fixture
-def test_file(
-    storage_mount_path: Path, test_scope: str, test_vo: str
-) -> tuple[Path, str]:
-    """Create a dummy .fits.fz file in the shared storage for testing."""
-
-    unique_id = f"{datetime.now():%Y%m%d_%H%M%S}_{token_hex(8)}"
-    filename = f"testfile_{unique_id}.fits.fz"
-
-    test_file_path = storage_mount_path / test_vo / test_scope / filename
-    test_file_path.parent.mkdir(parents=True, exist_ok=True)
-
-    # need to change file permissions of created directories so that
-    # the xrootd still can read and write there
-    recursive_chown(storage_mount_path / test_vo, XROOTD_UID, XROOTD_GID)
-
-    # Write a small test content (simulating a .fits.fz file with minimal content for testing)
-    test_file_content = f"FITS-like content for {unique_id}"
-    test_file_path.write_text(test_file_content)
-    os.chown(test_file_path, XROOTD_UID, XROOTD_GID)
-
-    return test_file_path, test_file_content
-
-
 def test_acada_to_lfn(storage_mount_path: Path, test_vo: str):
     """Test the acada_to_lfn method of IngestionClient with valid and invalid inputs."""
 
@@ -121,7 +101,8 @@ def test_acada_to_lfn(storage_mount_path: Path, test_vo: str):
     )
     lfn = ingestion_client.acada_to_lfn(acada_path=acada_path)
 
-    assert lfn == expected_lfn, f"Expected {expected_lfn}, got {lfn}"
+    msg = f"Expected {expected_lfn}, got {lfn}"
+    assert lfn == expected_lfn, msg
 
     # Test Case 2: Non-absolute acada_path (empty string)
     with pytest.raises(ValueError, match="acada_path must be absolute"):
@@ -151,15 +132,21 @@
 
 @pytest.mark.usefixtures("_auth_proxy")
 def test_check_replica_exists(
-    storage_mount_path: Path,
+    storage_mount_path: Path,
+    test_scope: str,
+    onsite_test_file: tuple[Path, str],
+    test_vo: str,
 ):
     """Test the check_replica_exists method of IngestionClient."""
 
     ingestion_client = IngestionClient(
-        storage_mount_path,
+        data_path=storage_mount_path,
+        rse=ONSITE_RSE,
+        vo=test_vo,
+        scope=test_scope,
     )
 
-    acada_path, _ =
+    acada_path, _ = onsite_test_file
 
     # Generate the LFN
     lfn = ingestion_client.acada_to_lfn(acada_path)
@@ -181,10 +168,54 @@
     assert not ingestion_client.check_replica_exists(nonexistent_lfn), msg
 
 
+@pytest.fixture
+def file_location(request):
+    return request.getfixturevalue(request.param)
+
+
+@pytest.mark.parametrize(
+    ("file_location", "metadata_dict"),
+    [
+        (
+            "subarray_test_file",
+            {
+                "observatory": "CTA",
+                "start_time": "2025-02-04T21:34:05",
+                "end_time": "2025-02-04T21:43:12",
+                "subarray_id": 0,
+                "sb_id": 2000000066,
+                "obs_id": 2000000200,
+            },
+        ),
+        (
+            "tel_trigger_test_file",
+            {
+                "observatory": "CTA",
+                "start_time": "2025-02-04T21:34:05",
+                "end_time": "2025-02-04T21:43:11",
+                "tel_ids": [1],
+                "sb_id": 2000000066,
+                "obs_id": 2000000200,
+            },
+        ),
+        (
+            "tel_events_test_file",
+            {
+                "observatory": "CTA",
+                "start_time": "2025-04-01T15:25:02",
+                "end_time": "2025-04-01T15:25:03",
+                "sb_id": 0,
+                "obs_id": 0,
+            },
+        ),
+    ],
+    indirect=["file_location"],
+)
 @pytest.mark.usefixtures("_auth_proxy")
 @pytest.mark.verifies_usecase("UC-110-1.1.1")
-def test_add_onsite_replica_with_dummy_file(
-
+def test_add_onsite_replica_with_minio_fits_file(
+    file_location: str,
+    metadata_dict: dict,
     test_scope: str,
     tmp_path: Path,
     storage_mount_path,
@@ -194,16 +225,28 @@ def test_add_onsite_replica_with_dummy_file(
     """Test the add_onsite_replica method of IngestionClient using a dummy file."""
 
     ingestion_client = IngestionClient(
-        storage_mount_path,
+        data_path=storage_mount_path,
+        rse=ONSITE_RSE,
+        vo=test_vo,
+        scope=test_scope,
     )
 
-
+    filename = str(file_location).split("/")[-1]
+    acada_path = storage_mount_path / test_vo / test_scope / filename
+    acada_path.parent.mkdir(parents=True, exist_ok=True)
+    copy2(file_location, str(acada_path))
+    reset_xrootd_permissions(storage_mount_path)
+
     # Use add_onsite_replica to register the replica
-    lfn = ingestion_client.add_onsite_replica(acada_path=acada_path)
+    lfn, skipped = ingestion_client.add_onsite_replica(acada_path=acada_path)
 
     # Verify the LFN matches the expected LFN
     expected_lfn = ingestion_client.acada_to_lfn(acada_path)
-    assert lfn == expected_lfn, f"Expected LFN {expected_lfn}, got {lfn}"
+    msg = f"Expected LFN {expected_lfn}, got {lfn}"
+    assert lfn == expected_lfn, msg
+
+    msg = "Expected the file to be newly ingested, but it was skipped"
+    assert not skipped, msg
 
     # Download the file using the LFN
     download_spec = {
@@ -216,20 +259,43 @@
 
     # Verify the downloaded file
     download_path = tmp_path / lfn.lstrip("/")
-    assert download_path.is_file(), f"Download failed at {download_path}"
+    msg = f"Download failed at {download_path}"
+    assert download_path.is_file(), msg
 
-    downloaded_content = download_path.read_text()
-    assert downloaded_content == test_file_content, (
-        f"Downloaded file content does not match the original. "
-        f"Expected: {test_file_content}, Got: {downloaded_content}"
-    )
+    msg = "Downloaded file content does not match the original."
+    assert adler32(download_path) == adler32(file_location), msg
 
     # Check for don't ingest again if its already registered
     caplog.clear()
-
+    lfn_check, skipped_check = ingestion_client.add_onsite_replica(
+        acada_path=acada_path
+    )
+    msg = f"LFN mismatch on second ingestion attempt: expected {lfn}, got {lfn_check}"
+    assert lfn_check == lfn, msg
+
+    msg = (
+        "Expected the file to be skipped on second ingestion, but it was ingested again"
+    )
+    assert skipped_check, msg
+
+    msg = f"'Replica already exists for lfn '{lfn}', skipping' in caplog records"
     assert f"Replica already exists for lfn '{lfn}', skipping" in [
         r.message for r in caplog.records
-    ]
+    ], msg
+
+    # Retrieve metadata using the DIDClient
+    did_client = Client()
+    retrieved_metadata = did_client.get_metadata(
+        scope=ingestion_client.scope, name=lfn, plugin="JSON"
+    )
+
+    # Verify the metadata matches the expected metadata
+    for key, value in metadata_dict.items():
+        msg = (
+            f"Metadata mismatch for key '{key}'. "
+            f"Expected: {value}, Got: {retrieved_metadata.get(key)}"
+        )
+        assert retrieved_metadata.get(key) == value, msg
 
 
 def test_rses():
@@ -238,21 +304,26 @@ def test_rses():
     result = list(client.list_rses())
 
     rses = [r["rse"] for r in result]
-
-    assert
-
+    msg = f"Expected RSE {ONSITE_RSE} not found in {rses}"
+    assert ONSITE_RSE in rses, msg
+
+    msg = f"Expected RSE {OFFSITE_RSE_1} not found in {rses}"
+    assert OFFSITE_RSE_1 in rses, msg
+
+    msg = f"Expected RSE {OFFSITE_RSE_2} not found in {rses}"
+    assert OFFSITE_RSE_2 in rses, msg
 
 
 @pytest.fixture
 def pre_existing_lfn(
-
+    onsite_test_file: tuple[Path, str],
     test_scope: str,
     test_vo: str,
 ) -> str:
     """Fixture to provide an LFN for a replica pre-registered in Rucio without using IngestionClient."""
 
     # Construct the LFN manually based on the test file and scope
-    acada_path, _ =
+    acada_path, _ = onsite_test_file
     relative_path = str(acada_path).split(f"{test_vo}/{test_scope}/", 1)[-1]
     lfn = f"/{test_vo}/{test_scope}/{relative_path}"
     checksum = adler32(acada_path)
@@ -281,9 +352,8 @@
 
     # Verify the replica is registered
     replicas = list(replica_client.list_replicas(dids=[did]))
-    assert (
-        replicas
-    ), f"Failed to verify pre-registration of replica for LFN {lfn} on {ONSITE_RSE}"
+    msg = f"Failed to verify pre-registration of replica for LFN {lfn} on {ONSITE_RSE}"
+    assert replicas, msg
 
     return lfn
 
@@ -296,20 +366,22 @@ def test_add_offsite_replication_rules(
     test_vo: str,
     storage_mount_path: Path,
     tmp_path: Path,
-
+    onsite_test_file: tuple[Path, str],
     caplog,
 ):
     """Test the add_offsite_replication_rules method of IngestionClient."""
     ingestion_client = IngestionClient(
-        storage_mount_path,
+        data_path=storage_mount_path,
+        rse=ONSITE_RSE,
+        vo=test_vo,
+        scope=test_scope,
     )
-    caplog.set_level(logging.DEBUG)
 
     # Replicate the ACADA file to two offsite RSEs
     lfn = pre_existing_lfn
     did = {"scope": test_scope, "name": lfn}
 
-    _, test_file_content =
+    _, test_file_content = onsite_test_file  # Get the test file content
 
     offsite_rse_expression = "OFFSITE"
     copies = 2
@@ -331,10 +403,11 @@
     replica_client = ReplicaClient()
     replicas = next(replica_client.list_replicas(dids=[did]))
     states = replicas.get("states", {})
+    msg = f"Expected replica on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2} to be AVAILABLE after first rule: {states}"
     assert (
         states.get(OFFSITE_RSE_1) == "AVAILABLE"
         or states.get(OFFSITE_RSE_2) == "AVAILABLE"
-    ),
+    ), msg
 
     # Manually trigger the judge-repairer to ensure the second rule doesn't get stuck
     trigger_judge_repairer()
@@ -351,15 +424,15 @@
         did,
         states,
     )
-    assert (
-        states.get(ONSITE_RSE) == "AVAILABLE"
-    ), f"Expected replica on {ONSITE_RSE} to be AVAILABLE: {states}"
-    assert (
-        states.get(OFFSITE_RSE_1) == "AVAILABLE"
-    ), f"Expected replica on {OFFSITE_RSE_1} to be AVAILABLE: {states}"
-    assert (
-        states.get(OFFSITE_RSE_2) == "AVAILABLE"
-    ), f"Expected replica on {OFFSITE_RSE_2} to be AVAILABLE: {states}"
+
+    msg = f"Expected replica on {ONSITE_RSE} to be AVAILABLE: {states}"
+    assert states.get(ONSITE_RSE) == "AVAILABLE", msg
+
+    msg = f"Expected replica on {OFFSITE_RSE_1} to be AVAILABLE: {states}"
+    assert states.get(OFFSITE_RSE_1) == "AVAILABLE", msg
+
+    msg = f"Expected replica on {OFFSITE_RSE_2} to be AVAILABLE: {states}"
+    assert states.get(OFFSITE_RSE_2) == "AVAILABLE", msg
 
     # Download the file from OFFSITE_RSE_2 to verify its content
     download_spec = {
@@ -373,12 +446,15 @@
 
     # Verify the downloaded file content
     download_path = tmp_path / lfn.lstrip("/")
-    assert download_path.is_file(), f"Download failed at {download_path}"
+    msg = f"Download failed at {download_path}"
+    assert download_path.is_file(), msg
+
     downloaded_content = download_path.read_text()
-    assert downloaded_content == test_file_content, (
+    msg = (
         f"Downloaded file content does not match the original. "
         f"Expected: {test_file_content}, Got: {downloaded_content}"
     )
+    assert downloaded_content == test_file_content, msg
 
 
 @pytest.mark.usefixtures("_auth_proxy")
@@ -389,20 +465,23 @@ def test_add_offsite_replication_rules_single_copy(
     test_vo: str,
     storage_mount_path: Path,
     tmp_path: Path,
-
+    onsite_test_file: tuple[Path, str],
    caplog,
 ):
     """Test the add_offsite_replication_rules method of IngestionClient with a single copy (copies=1)."""
+
     ingestion_client = IngestionClient(
-        storage_mount_path,
+        data_path=storage_mount_path,
+        rse=ONSITE_RSE,
+        vo=test_vo,
+        scope=test_scope,
     )
-    caplog.set_level(logging.DEBUG)
 
     # Replicate the ACADA file to one offsite RSE
     lfn = pre_existing_lfn
     did = {"scope": test_scope, "name": lfn}
 
-    _, test_file_content =
+    _, test_file_content = onsite_test_file
 
     offsite_rse_expression = "OFFSITE"
     copies = 1
@@ -414,9 +493,9 @@
     )
 
     # Verify that only one rule was created
-    assert (
-        len(rule_ids) == 1
-    ), f"Expected exactly 1 rule ID, got {len(rule_ids)}: {rule_ids}"
+    msg = f"Expected exactly 1 rule ID, got {len(rule_ids)}: {rule_ids}"
+    assert len(rule_ids) == 1, msg
+
     rule_id_offsite_1 = rule_ids[0]
     rule_client = RuleClient()
 
@@ -436,9 +515,8 @@
     offsite_replica_count = sum(
         1 for rse in [OFFSITE_RSE_1, OFFSITE_RSE_2] if states.get(rse) == "AVAILABLE"
     )
-    assert (
-        offsite_replica_count == 1
-    ), f"Expected exactly 1 offsite replica (on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2}), got {offsite_replica_count}: {states}"
+    msg = f"Expected exactly 1 offsite replica (on either {OFFSITE_RSE_1} or {OFFSITE_RSE_2}), got {offsite_replica_count}: {states}"
+    assert offsite_replica_count == 1, msg
 
     # Determine which offsite RSE the replica was created on
     target_offsite_rse = (
@@ -457,12 +535,14 @@
 
     # Verify the downloaded file content
     download_path = tmp_path / lfn.lstrip("/")
-    assert download_path.is_file(), f"Download failed at {download_path}"
+    msg = f"Download failed at {download_path}"
+    assert download_path.is_file(), msg
     downloaded_content = download_path.read_text()
-    assert downloaded_content == test_file_content, (
+    msg = (
         f"Downloaded file content does not match the original. "
         f"Expected: {test_file_content}, Got: {downloaded_content}"
     )
+    assert downloaded_content == test_file_content, msg
 
 
 def test_verify_fits_file(tel_events_test_file):
@@ -499,3 +579,1140 @@ def test_verify_fits_file_invalid_checksum(broken_checksum):
|
|
499
579
|
with fits.open(broken_checksum) as hdul:
|
500
580
|
with pytest.raises(FITSVerificationError, match="CHECKSUM verification failed"):
|
501
581
|
verify_fits_checksum(hdul)
|
582
|
+
|
583
|
+
|
584
|
+
def test_ingest_init(storage_mount_path):
|
585
|
+
"""Test that Ingest initializes correctly with given parameters."""
|
586
|
+
ingestion_client = IngestionClient(
|
587
|
+
data_path=storage_mount_path,
|
588
|
+
rse=ONSITE_RSE,
|
589
|
+
vo="ctao",
|
590
|
+
scope="acada",
|
591
|
+
)
|
592
|
+
|
593
|
+
ingest = Ingest(
|
594
|
+
client=ingestion_client,
|
595
|
+
top_dir=storage_mount_path,
|
596
|
+
num_workers=3,
|
597
|
+
lock_file_path=storage_mount_path / "lockfile.lock",
|
598
|
+
polling_interval=0.5,
|
599
|
+
check_interval=0.2,
|
600
|
+
)
|
601
|
+
assert ingest.client == ingestion_client
|
602
|
+
assert ingest.top_dir == storage_mount_path
|
603
|
+
assert ingest.num_workers == 3
|
604
|
+
assert ingest.lock_file_path == storage_mount_path / "lockfile.lock"
|
605
|
+
assert ingest.polling_interval == 0.5
|
606
|
+
assert ingest.check_interval == 0.2
|
607
|
+
assert not ingest.stop_event.is_set() # check stop_event initial state
|
608
|
+
assert hasattr(ingest, "result_queue")
|
609
|
+
assert hasattr(ingest, "task_counter")
|
610
|
+
assert hasattr(ingest, "submitted_tasks")
|
611
|
+
assert ingest.task_counter == 0
|
612
|
+
assert len(ingest.submitted_tasks) == 0
|
613
|
+
|
614
|
+
|
615
|
+
def test_check_directory_valid(storage_mount_path, tmp_path, caplog):
|
616
|
+
"""Test _check_directory with a valid, readable directory."""
|
617
|
+
ingestion_client = IngestionClient(
|
618
|
+
data_path=storage_mount_path,
|
619
|
+
rse=ONSITE_RSE,
|
620
|
+
vo="ctao",
|
621
|
+
scope="acada",
|
622
|
+
)
|
623
|
+
|
624
|
+
ingest_instance = Ingest(
|
625
|
+
client=ingestion_client,
|
626
|
+
top_dir=tmp_path,
|
627
|
+
num_workers=1,
|
628
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
629
|
+
polling_interval=0.5,
|
630
|
+
check_interval=0.5,
|
631
|
+
)
|
632
|
+
|
633
|
+
ingest_instance.top_dir = tmp_path
|
634
|
+
ingest_instance._check_directory()
|
635
|
+
|
636
|
+
|
637
|
+
def test_check_directory_invalid(storage_mount_path, tmp_path, caplog):
|
638
|
+
"""Test _check_directory with an invalid directory."""
|
639
|
+
ingestion_client = IngestionClient(
|
640
|
+
data_path=storage_mount_path,
|
641
|
+
rse=ONSITE_RSE,
|
642
|
+
vo="ctao",
|
643
|
+
scope="acada",
|
644
|
+
logger=LOGGER,
|
645
|
+
)
|
646
|
+
|
647
|
+
invalid_dir = tmp_path / "nonexistent"
|
648
|
+
|
649
|
+
ingest_instance = Ingest(
|
650
|
+
client=ingestion_client,
|
651
|
+
top_dir=invalid_dir,
|
652
|
+
num_workers=1,
|
653
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
654
|
+
polling_interval=0.5,
|
655
|
+
check_interval=0.5,
|
656
|
+
)
|
657
|
+
|
658
|
+
with pytest.raises(RuntimeError, match=f"Cannot read directory {invalid_dir}"):
|
659
|
+
ingest_instance._check_directory()
|
660
|
+
assert f"Cannot read directory {invalid_dir}" in caplog.text
|
661
|
+
|
662
|
+
|
663
|
+
@pytest.mark.usefixtures("_auth_proxy")
|
664
|
+
def test_process_file_success(
|
665
|
+
storage_mount_path, caplog, onsite_test_file, test_vo, test_scope
|
666
|
+
):
|
667
|
+
"""Test for checking successful ingestion with trigger file clean-up, depends on IngestionClient"""
|
668
|
+
ingestion_client = IngestionClient(
|
669
|
+
data_path=storage_mount_path,
|
670
|
+
rse=ONSITE_RSE,
|
671
|
+
vo=test_vo,
|
672
|
+
scope=test_scope,
|
673
|
+
)
|
674
|
+
|
675
|
+
acada_path, _ = onsite_test_file
|
676
|
+
test_file = acada_path
|
677
|
+
trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
|
678
|
+
trigger_file.symlink_to(test_file)
|
679
|
+
result = process_file(ingestion_client, str(test_file))
|
680
|
+
assert result == IngestStatus.SUCCESS
|
681
|
+
assert not trigger_file.exists()
|
682
|
+
assert INGEST_SUCCESS_MESSAGE in caplog.text
|
683
|
+
|
684
|
+
|
685
|
+
@pytest.mark.usefixtures("_auth_proxy")
|
686
|
+
def test_process_file_skipped(
|
687
|
+
storage_mount_path, caplog, onsite_test_file, test_vo, test_scope
|
688
|
+
):
|
689
|
+
"""Test for checking skipped ingestion when replica already exists"""
|
690
|
+
ingestion_client = IngestionClient(
|
691
|
+
data_path=storage_mount_path,
|
692
|
+
rse=ONSITE_RSE,
|
693
|
+
vo=test_vo,
|
694
|
+
scope=test_scope,
|
695
|
+
)
|
696
|
+
|
697
|
+
acada_path, _ = onsite_test_file
|
698
|
+
test_file = acada_path
|
699
|
+
trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
|
700
|
+
trigger_file.symlink_to(test_file)
|
701
|
+
process_file(ingestion_client, str(test_file))
|
702
|
+
caplog.clear()
|
703
|
+
result = process_file(ingestion_client, str(test_file))
|
704
|
+
assert result == IngestStatus.SKIPPED
|
705
|
+
assert "Replica already exists" in caplog.text
|
706
|
+
|
707
|
+
|
708
|
+
@pytest.mark.usefixtures("_auth_proxy")
|
709
|
+
def test_process_file_failure(storage_mount_path, caplog, tmp_path):
|
710
|
+
"""Test for checking failure for invalid file paths"""
|
711
|
+
ingestion_client = IngestionClient(
|
712
|
+
data_path=storage_mount_path,
|
713
|
+
rse=ONSITE_RSE,
|
714
|
+
vo="ctao",
|
715
|
+
scope="acada",
|
716
|
+
)
|
717
|
+
|
718
|
+
invalid_file = tmp_path / "invalid_file.fits"
|
719
|
+
invalid_file.write_text("dummy content")
|
720
|
+
trigger_file = Path(str(invalid_file) + TRIGGER_SUFFIX)
|
721
|
+
trigger_file.symlink_to(invalid_file)
|
722
|
+
|
723
|
+
# The file path is outside the data_path causing a ValueError in acada_to_lfn
|
724
|
+
result = process_file(ingestion_client, str(invalid_file))
|
725
|
+
|
726
|
+
# Verify the function returns FAILURE status instead of raising an exception
|
727
|
+
assert result == IngestStatus.FAILURE
|
728
|
+
|
729
|
+
# Check for the actual error message that gets logged
|
730
|
+
assert "Exception in process_file" in caplog.text
|
731
|
+
# Verify the file path is in the error message
|
732
|
+
assert str(invalid_file) in caplog.text
|
733
|
+
|
734
|
+
# Verify that no success message was logged
|
735
|
+
assert INGEST_SUCCESS_MESSAGE not in caplog.text
|
736
|
+
|
737
|
+
# Trigger file should still exist since ingestion failed
|
738
|
+
msg = "Trigger file should not be removed when ingestion fails"
|
739
|
+
assert trigger_file.exists(), msg
|
740
|
+
|
741
|
+
|
742
|
+
def test_trigger_file_handler_init(storage_mount_path):
|
743
|
+
"""Test TriggerFileHandler initialization."""
|
744
|
+
ingestion_client = IngestionClient(
|
745
|
+
data_path=storage_mount_path,
|
746
|
+
rse=ONSITE_RSE,
|
747
|
+
vo="ctao",
|
748
|
+
scope="acada",
|
749
|
+
)
|
750
|
+
|
751
|
+
ingest_instance = Ingest(
|
752
|
+
client=ingestion_client,
|
753
|
+
top_dir=storage_mount_path,
|
754
|
+
num_workers=1,
|
755
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
756
|
+
polling_interval=0.5,
|
757
|
+
check_interval=0.5,
|
758
|
+
)
|
759
|
+
|
760
|
+
handler = TriggerFileHandler(ingest_instance)
|
761
|
+
assert handler.ingest == ingest_instance
|
762
|
+
|
763
|
+
|
764
|
+
def test_trigger_file_handler_on_moved_missing_data_file(
|
765
|
+
storage_mount_path, tmp_path, caplog
|
766
|
+
):
|
767
|
+
"""Test on_moved skips when data file is missing."""
|
768
|
+
ingestion_client = IngestionClient(
|
769
|
+
data_path=storage_mount_path,
|
770
|
+
rse=ONSITE_RSE,
|
771
|
+
vo="ctao",
|
772
|
+
scope="acada",
|
773
|
+
)
|
774
|
+
|
775
|
+
ingest_instance = Ingest(
|
776
|
+
client=ingestion_client,
|
777
|
+
top_dir=storage_mount_path,
|
778
|
+
num_workers=1,
|
779
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
780
|
+
polling_interval=0.5,
|
781
|
+
check_interval=0.5,
|
782
|
+
)
|
783
|
+
|
784
|
+
handler = TriggerFileHandler(ingest_instance)
|
785
|
+
trigger_file = tmp_path / TEST_FILE_TRIGGER
|
786
|
+
data_file = tmp_path / "test_file"
|
787
|
+
|
788
|
+
# Create symlink to non-existent data file
|
789
|
+
trigger_file.symlink_to(data_file)
|
790
|
+
|
791
|
+
# Create FileMovedEvent (simulating ln -s)
|
792
|
+
event = FileMovedEvent(src_path=str(data_file), dest_path=str(trigger_file))
|
793
|
+
handler.on_moved(event)
|
794
|
+
|
795
|
+
assert (
|
796
|
+
f"Data file {data_file} for trigger {trigger_file} does not exist, skipping"
|
797
|
+
in caplog.text
|
798
|
+
)
|
799
|
+
assert (
|
800
|
+
DETECTED_NEW_TRIGGER_FILE not in caplog.text
|
801
|
+
) # Skips processing since the data file is missing
|
802
|
+
|
803
|
+
|
804
|
+
def test_trigger_file_handler_on_moved_success(
|
805
|
+
storage_mount_path, tmp_path, onsite_test_file, test_vo, test_scope, caplog
|
806
|
+
):
|
807
|
+
"""Test on_moved successfully processing a valid trigger file."""
|
808
|
+
ingestion_client = IngestionClient(
|
809
|
+
data_path=storage_mount_path,
|
810
|
+
rse=ONSITE_RSE,
|
811
|
+
vo=test_vo,
|
812
|
+
scope=test_scope,
|
813
|
+
)
|
814
|
+
|
815
|
+
ingest_instance = Ingest(
|
816
|
+
client=ingestion_client,
|
817
|
+
top_dir=storage_mount_path,
|
818
|
+
num_workers=1,
|
819
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
820
|
+
polling_interval=0.5,
|
821
|
+
check_interval=0.5,
|
822
|
+
)
|
823
|
+
|
824
|
+
# Create ProcessPoolExecutor for the ingest instance
|
825
|
+
with ProcessPoolExecutor(max_workers=1) as executor:
|
826
|
+
ingest_instance.executor = executor
|
827
|
+
|
828
|
+
handler = TriggerFileHandler(ingest_instance)
|
829
|
+
acada_path, _ = onsite_test_file
|
830
|
+
test_file = acada_path
|
831
|
+
trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
|
832
|
+
trigger_file.symlink_to(test_file)
|
833
|
+
|
834
|
+
# Create FileMovedEvent (simulating ln -s)
|
835
|
+
event = FileMovedEvent(src_path=str(test_file), dest_path=str(trigger_file))
|
836
|
+
|
837
|
+
# Record initial state
|
838
|
+
initial_task_counter = ingest_instance.task_counter
|
839
|
+
initial_total_tasks = ingest_instance.total_tasks_submitted
|
840
|
+
initial_submitted_tasks_count = len(ingest_instance.submitted_tasks)
|
841
|
+
|
842
|
+
handler.on_moved(event)
|
843
|
+
|
844
|
+
# Verify the expected log message
|
845
|
+
msg = f"'Detected new trigger file {trigger_file}, submitting data file {test_file}' in caplog"
|
846
|
+
assert (
|
847
|
+
f"Detected new trigger file {trigger_file}, submitting data file {test_file}"
|
848
|
+
in caplog.text
|
849
|
+
), msg
|
850
|
+
|
851
|
+
# Verify task submission metrics were updated
|
852
|
+
assert ingest_instance.task_counter == initial_task_counter + 1
|
853
|
+
assert ingest_instance.total_tasks_submitted == initial_total_tasks + 1
|
854
|
+
assert len(ingest_instance.submitted_tasks) == initial_submitted_tasks_count + 1
|
855
|
+
|
856
|
+
# Verify the task was submitted with correct file path
|
857
|
+
submitted_task_files = list(ingest_instance.submitted_tasks.values())
|
858
|
+
assert str(test_file) in submitted_task_files
|
859
|
+
|
860
|
+
# Give some time for the task to potentially complete
|
861
|
+
time.sleep(0.5)
|
862
|
+
|
863
|
+
|
864
|
+
def test_trigger_file_handler_on_moved_stop_event_set(
|
865
|
+
storage_mount_path, tmp_path, caplog
|
866
|
+
):
|
867
|
+
"""Test on_moved skips processing when stop_event is set."""
|
868
|
+
ingestion_client = IngestionClient(
|
869
|
+
data_path=storage_mount_path,
|
870
|
+
rse=ONSITE_RSE,
|
871
|
+
vo="ctao",
|
872
|
+
scope="acada",
|
873
|
+
)
|
874
|
+
|
875
|
+
ingest_instance = Ingest(
|
876
|
+
client=ingestion_client,
|
877
|
+
top_dir=storage_mount_path,
|
878
|
+
num_workers=1,
|
879
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
880
|
+
polling_interval=0.5,
|
881
|
+
check_interval=0.5,
|
882
|
+
)
|
883
|
+
|
884
|
+
handler = TriggerFileHandler(ingest_instance)
|
885
|
+
trigger_file = tmp_path / TEST_FILE_TRIGGER
|
886
|
+
data_file = tmp_path / "test_file"
|
887
|
+
data_file.write_text("data") # Data file exists
|
888
|
+
trigger_file.symlink_to(data_file)
|
889
|
+
|
890
|
+
# Create FileMovedEvent
|
891
|
+
event = FileMovedEvent(src_path=str(data_file), dest_path=str(trigger_file))
|
892
|
+
|
893
|
+
# Set stop event
|
894
|
+
ingest_instance.stop_event.set()
|
895
|
+
|
896
|
+
# Record initial state
|
897
|
+
initial_task_counter = ingest_instance.task_counter
|
898
|
+
initial_total_tasks = ingest_instance.total_tasks_submitted
|
899
|
+
|
900
|
+
try:
|
901
|
+
handler.on_moved(event)
|
902
|
+
|
903
|
+
# Should not process anything when stop_event is set
|
904
|
+
assert ingest_instance.task_counter == initial_task_counter
|
905
|
+
assert ingest_instance.total_tasks_submitted == initial_total_tasks
|
906
|
+
assert DETECTED_NEW_TRIGGER_FILE not in caplog.text
|
907
|
+
|
908
|
+
finally:
|
909
|
+
ingest_instance.stop_event.clear() # Reset for other tests
|
910
|
+
|
911
|
+
|
912
|
+
def test_trigger_file_handler_on_moved_directory_event(
|
913
|
+
storage_mount_path, tmp_path, caplog
|
914
|
+
):
|
915
|
+
"""Test on_moved skips directory events."""
|
916
|
+
ingestion_client = IngestionClient(
|
917
|
+
data_path=storage_mount_path,
|
918
|
+
rse=ONSITE_RSE,
|
919
|
+
vo="ctao",
|
920
|
+
scope="acada",
|
921
|
+
)
|
922
|
+
|
923
|
+
ingest_instance = Ingest(
|
924
|
+
client=ingestion_client,
|
925
|
+
top_dir=storage_mount_path,
|
926
|
+
num_workers=1,
|
927
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
928
|
+
polling_interval=0.5,
|
929
|
+
check_interval=0.5,
|
930
|
+
)
|
931
|
+
|
932
|
+
handler = TriggerFileHandler(ingest_instance)
|
933
|
+
trigger_dir = tmp_path / "some_directory.trigger"
|
934
|
+
source_dir = tmp_path / "source_directory"
|
935
|
+
source_dir.mkdir()
|
936
|
+
trigger_dir.mkdir()
|
937
|
+
|
938
|
+
# Create directory move event
|
939
|
+
event = FileMovedEvent(src_path=str(source_dir), dest_path=str(trigger_dir))
|
940
|
+
event.is_directory = True # mark as directory event
|
941
|
+
|
942
|
+
# Record initial state
|
943
|
+
initial_task_counter = ingest_instance.task_counter
|
944
|
+
initial_total_tasks = ingest_instance.total_tasks_submitted
|
945
|
+
|
946
|
+
handler.on_moved(event)
|
947
|
+
|
948
|
+
# Should not process directory events
|
949
|
+
assert ingest_instance.task_counter == initial_task_counter
|
950
|
+
assert ingest_instance.total_tasks_submitted == initial_total_tasks
|
951
|
+
assert DETECTED_NEW_TRIGGER_FILE not in caplog.text
|
952
|
+
|
953
|
+
|
954
|
+
def test_trigger_file_handler_on_moved_with_actual_processing(
|
955
|
+
storage_mount_path, tmp_path, onsite_test_file, test_vo, test_scope, caplog
|
956
|
+
):
|
957
|
+
"""Test on_moved with successfully processing a valid trigger file."""
|
958
|
+
ingestion_client = IngestionClient(
|
959
|
+
data_path=storage_mount_path,
|
960
|
+
rse=ONSITE_RSE,
|
961
|
+
vo=test_vo,
|
962
|
+
scope=test_scope,
|
963
|
+
)
|
964
|
+
|
965
|
+
ingest_instance = Ingest(
|
966
|
+
client=ingestion_client,
|
967
|
+
top_dir=storage_mount_path,
|
968
|
+
num_workers=1,
|
969
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
970
|
+
polling_interval=0.5,
|
971
|
+
check_interval=0.5,
|
972
|
+
)
|
973
|
+
|
974
|
+
# Start the result processing thread manually for this test
|
975
|
+
result_thread = threading.Thread(
|
976
|
+
target=ingest_instance._process_results, daemon=True
|
977
|
+
)
|
978
|
+
result_thread.start()
|
979
|
+
|
980
|
+
with ProcessPoolExecutor(max_workers=1) as executor:
|
981
|
+
ingest_instance.executor = executor
|
982
|
+
|
983
|
+
handler = TriggerFileHandler(ingest_instance)
|
984
|
+
acada_path, _ = onsite_test_file
|
985
|
+
test_file = acada_path
|
986
|
+
trigger_file = Path(str(test_file) + TRIGGER_SUFFIX)
|
987
|
+
trigger_file.symlink_to(test_file)
|
988
|
+
|
989
|
+
# Create FileMovedEvent
|
990
|
+
event = FileMovedEvent(src_path=str(test_file), dest_path=str(trigger_file))
|
991
|
+
|
992
|
+
handler.on_moved(event)
|
993
|
+
|
994
|
+
# Wait for processing to complete
|
995
|
+
timeout = 10.0
|
996
|
+
start_time = time.time()
|
997
|
+
processed = False
|
998
|
+
|
999
|
+
while time.time() - start_time < timeout:
|
1000
|
+
# Check if task was completed (removed from submitted_tasks)
|
1001
|
+
if len(ingest_instance.submitted_tasks) == 0:
|
1002
|
+
processed = True
|
1003
|
+
break
|
1004
|
+
time.sleep(0.1)
|
1005
|
+
|
1006
|
+
# Stop the result processing thread
|
1007
|
+
ingest_instance.stop_event.set()
|
1008
|
+
result_thread.join(timeout=2.0)
|
1009
|
+
|
1010
|
+
# Verify processing occurred
|
1011
|
+
msg = "Task was not processed within timeout"
|
1012
|
+
assert processed, msg
|
1013
|
+
|
1014
|
+
msg = f"'Detected new trigger file {trigger_file}, submitting data file {test_file}' in caplog"
|
1015
|
+
assert (
|
1016
|
+
f"Detected new trigger file {trigger_file}, submitting data file {test_file}"
|
1017
|
+
in caplog.text
|
1018
|
+
), msg
|
1019
|
+
|
1020
|
+
# Check that a result was logged (either success, failure, or error)
|
1021
|
+
result_logged = any(
|
1022
|
+
phrase in caplog.text
|
1023
|
+
for phrase in ["Processed file", "failed:", "Exception in process_file"]
|
1024
|
+
)
|
1025
|
+
msg = "No processing result was logged"
|
1026
|
+
assert result_logged, msg
|
1027
|
+
|
1028
|
+
|
1029
|
+
def test_sequential_exclusion_lock_prevention(storage_mount_path, tmp_path):
|
1030
|
+
"""Test that a second daemon instance cannot start when first is already running.
|
1031
|
+
|
1032
|
+
This test validates sequential exclusion: when one ingestion daemon is already
|
1033
|
+
running and has acquired the lock, any subsequent attempt to start another
|
1034
|
+
daemon instance should fail with a clear error message.
|
1035
|
+
"""
|
1036
|
+
lock_file = tmp_path / "sequential_test.pid"
|
1037
|
+
|
1038
|
+
ingestion_client = IngestionClient(
|
1039
|
+
data_path=storage_mount_path,
|
1040
|
+
rse=ONSITE_RSE,
|
1041
|
+
vo="ctao",
|
1042
|
+
scope="acada",
|
1043
|
+
)
|
1044
|
+
|
1045
|
+
# Create first instance
|
1046
|
+
instance1 = Ingest(
|
1047
|
+
client=ingestion_client,
|
1048
|
+
top_dir=tmp_path,
|
1049
|
+
lock_file_path=lock_file,
|
1050
|
+
num_workers=1,
|
1051
|
+
polling_interval=0.1,
|
1052
|
+
check_interval=0.1,
|
1053
|
+
)
|
1054
|
+
|
1055
|
+
# Create second instance with same lock file
|
1056
|
+
instance2 = Ingest(
|
1057
|
+
client=ingestion_client,
|
1058
|
+
top_dir=tmp_path,
|
1059
|
+
lock_file_path=lock_file,
|
1060
|
+
num_workers=1,
|
1061
|
+
polling_interval=0.1,
|
1062
|
+
check_interval=0.1,
|
1063
|
+
)
|
1064
|
+
|
1065
|
+
results = {}
|
1066
|
+
first_instance_started = threading.Event()
|
1067
|
+
|
1068
|
+
def run_first_instance():
|
1069
|
+
"""Run first instance - should succeed and run until manually stopped."""
|
1070
|
+
try:
|
1071
|
+
# signal: about to start daemon
|
1072
|
+
first_instance_started.set()
|
1073
|
+
instance1.run()
|
1074
|
+
results["first"] = "success"
|
1075
|
+
except Exception as e:
|
1076
|
+
results["first"] = f"error: {str(e)}"
|
1077
|
+
|
1078
|
+
def run_second_instance():
|
1079
|
+
"""Try to run second instance while first is running - should fail with lock conflict."""
|
1080
|
+
try:
|
1081
|
+
# Verify first instance has actually acquired the lock
|
1082
|
+
lock_acquired_timeout = 15.0
|
1083
|
+
start_wait = time.time()
|
1084
|
+
while time.time() - start_wait < lock_acquired_timeout:
|
1085
|
+
if lock_file.exists():
|
1086
|
+
break
|
1087
|
+
time.sleep(0.1)
|
1088
|
+
else:
|
1089
|
+
results["second"] = "first_instance_never_acquired_lock"
|
1090
|
+
return
|
1091
|
+
|
1092
|
+
# This should fail because first instance holds the lock
|
1093
|
+
instance2.run()
|
1094
|
+
results["second"] = "unexpected_success" # Should not reach here
|
1095
|
+
except RuntimeError as e:
|
1096
|
+
error_msg = str(e)
|
1097
|
+
if "Another ingestion process is already running" in error_msg:
|
1098
|
+
results["second"] = f"expected_lock_conflict: {str(e)}"
|
1099
|
+
else:
|
1100
|
+
results["second"] = f"unexpected_runtime_error: {str(e)}"
|
1101
|
+
except Exception as e:
|
1102
|
+
results["second"] = f"unexpected_error: {str(e)}"
|
1103
|
+
|
1104
|
+
# Start first instance with non-daemon thread
|
1105
|
+
thread1 = threading.Thread(target=run_first_instance, daemon=False)
|
1106
|
+
thread1.start()
|
1107
|
+
|
1108
|
+
# Wait for first instance to signal it's starting
|
1109
|
+
msg = "First instance failed to start"
|
1110
|
+
assert first_instance_started.wait(timeout=10), msg
|
1111
|
+
|
1112
|
+
# Give first instance time to acquire lock and initialize
|
1113
|
+
time.sleep(3.0)
|
1114
|
+
|
1115
|
+
# Verify first instance has acquired lock with content validation
|
1116
|
+
msg = "First instance should have created PID file"
|
1117
|
+
assert lock_file.exists(), msg
|
1118
|
+
|
1119
|
+
# Read PID and verify it's valid
|
1120
|
+
pid_content = lock_file.read_text().strip()
|
1121
|
+
msg = f"PID file should contain a number, got: {pid_content}"
|
1122
|
+
assert pid_content.isdigit(), msg
|
1123
|
+
|
1124
|
+
# Verify the lock file contains current process PID or a valid PID
|
1125
|
+
current_pid = os.getpid()
|
1126
|
+
stored_pid = int(pid_content)
|
1127
|
+
# The stored PID should be current process since we're running in same process
|
1128
|
+
msg = f"Expected PID {current_pid}, got {stored_pid}"
|
1129
|
+
assert stored_pid == current_pid, msg
|
1130
|
+
|
1131
|
+
# Now try to start second instance - this should fail
|
1132
|
+
thread2 = threading.Thread(target=run_second_instance, daemon=False)
|
1133
|
+
thread2.start()
|
1134
|
+
|
1135
|
+
# Wait for second instance to complete with better timeout handling
|
1136
|
+
# FileLock timeout is 10 seconds, so we give a bit more time
|
1137
|
+
thread2.join(timeout=15)
|
1138
|
+
|
1139
|
+
# Explicit check for thread completion
|
1140
|
+
if thread2.is_alive():
|
1141
|
+
# Force stop and fail the test
|
1142
|
+
instance1.stop_event.set()
|
1143
|
+
thread1.join(timeout=5)
|
1144
|
+
pytest.fail("Second instance thread did not complete within expected timeout")
|
1145
|
+
|
1146
|
+
# Stop first instance now that we've tested the lock
|
1147
|
+
instance1.stop_event.set()
|
1148
|
+
thread1.join(timeout=10)
|
1149
|
+
|
1150
|
+
# Ensure first thread also terminates
|
1151
|
+
if thread1.is_alive():
|
1152
|
+
pytest.fail("First instance thread did not terminate within timeout")
|
1153
|
+
|
1154
|
+
# Verify results
|
1155
|
+
msg = f"Second instance should have completed. Results: {results}"
|
1156
|
+
assert "second" in results, msg
|
1157
|
+
|
1158
|
+
# More specific assertion for expected lock conflict
|
1159
|
+
second_result = results["second"]
|
1160
|
+
msg = f"Second instance should have failed with lock conflict. Got: {second_result}"
|
1161
|
+
assert second_result.startswith("expected_lock_conflict"), msg
|
1162
|
+
|
1163
|
+
# Verify the error message is the expected one from Ingest class
|
1164
|
+
msg = f"Expected specific error message, got: {second_result}"
|
1165
|
+
assert "Another ingestion process is already running" in second_result, msg
|
1166
|
+
|
1167
|
+
# First instance should have run successfully (we stopped it manually)
|
1168
|
+
if "first" in results:
|
1169
|
+
msg = f"First instance should succeed, got: {results['first']}"
|
1170
|
+
assert results["first"] == "success", msg
|
1171
|
+
|
1172
|
+
# Improved cleanup verification with timeout-based checking
|
1173
|
+
cleanup_timeout = 5.0
|
1174
|
+
start_cleanup_wait = time.time()
|
1175
|
+
while time.time() - start_cleanup_wait < cleanup_timeout:
|
1176
|
+
if not lock_file.exists():
|
1177
|
+
break
|
1178
|
+
time.sleep(0.1)
|
1179
|
+
|
1180
|
+
msg = "PID file should be cleaned up after first instance stops"
|
1181
|
+
assert not lock_file.exists(), msg
|
1182
|
+
|
1183
|
+
# logging
|
1184
|
+
LOGGER.info("Sequential exclusion test completed successfully")
|
1185
|
+
LOGGER.info("First instance: %s", results.get("first", "stopped manually"))
|
1186
|
+
LOGGER.info("Second instance correctly failed with: %s", second_result)
|
1187
|
+
|
1188
|
+
|
1189
|
+
def test_concurrent_exclusion_lock_prevention(storage_mount_path, tmp_path):
|
1190
|
+
"""Test FileLock behavior under true concurrent access - simultaneous daemon startup attempts.
|
1191
|
+
|
1192
|
+
This test validates real concurrent scenario where multiple daemon instances
|
1193
|
+
attempt to acquire the same lock simultaneously, simulating race conditions
|
1194
|
+
that occur in production environments.
|
1195
|
+
"""
|
1196
|
+
lock_file = tmp_path / "concurrent_test.pid"
|
1197
|
+
|
1198
|
+
ingestion_client = IngestionClient(
|
1199
|
+
data_path=storage_mount_path,
|
1200
|
+
rse=ONSITE_RSE,
|
1201
|
+
vo="ctao",
|
1202
|
+
scope="acada",
|
1203
|
+
)
|
1204
|
+
|
1205
|
+
# Create both instances
|
1206
|
+
instance1 = Ingest(
|
1207
|
+
client=ingestion_client,
|
1208
|
+
top_dir=tmp_path,
|
1209
|
+
lock_file_path=lock_file,
|
1210
|
+
num_workers=1,
|
1211
|
+
polling_interval=0.1,
|
1212
|
+
check_interval=0.1,
|
1213
|
+
)
|
1214
|
+
instance2 = Ingest(
|
1215
|
+
client=ingestion_client,
|
1216
|
+
top_dir=tmp_path,
|
1217
|
+
lock_file_path=lock_file,
|
1218
|
+
num_workers=1,
|
1219
|
+
polling_interval=0.1,
|
1220
|
+
check_interval=0.1,
|
1221
|
+
)
|
1222
|
+
|
1223
|
+
results = {}
|
1224
|
+
|
1225
|
+
# Synchronization barrier - both threads wait here until released
|
1226
|
+
start_barrier = threading.Barrier(3) # 2 worker threads + 1 main thread
|
1227
|
+
|
1228
|
+
def run_instance(instance_id, instance):
|
1229
|
+
"""Run instance - both will try to start simultaneously."""
|
1230
|
+
try:
|
1231
|
+
# Wait for barrier - ensures simultaneous start
|
1232
|
+
start_barrier.wait() # All threads start together!
|
1233
|
+
|
1234
|
+
instance.run()
|
1235
|
+
results[instance_id] = "success"
|
1236
|
+
except RuntimeError as e:
|
1237
|
+
if "Another ingestion process is already running" in str(e):
|
1238
|
+
results[instance_id] = f"lock_conflict: {str(e)}"
|
1239
|
+
else:
|
1240
|
+
results[instance_id] = f"unexpected_error: {str(e)}"
|
1241
|
+
except Exception as e:
|
1242
|
+
results[instance_id] = f"error: {str(e)}"
|
1243
|
+
|
1244
|
+
# Create both threads
|
1245
|
+
thread1 = threading.Thread(
|
1246
|
+
target=run_instance, args=("first", instance1), daemon=False
|
1247
|
+
)
|
1248
|
+
thread2 = threading.Thread(
|
1249
|
+
target=run_instance, args=("second", instance2), daemon=False
|
1250
|
+
)
|
1251
|
+
|
1252
|
+
# Start both threads - they will wait at the barrier
|
1253
|
+
thread1.start()
|
1254
|
+
thread2.start()
|
1255
|
+
|
1256
|
+
# Give threads time to reach barrier
|
1257
|
+
time.sleep(0.5)
|
1258
|
+
|
1259
|
+
# Release the barrier - both threads start simultaneously
|
1260
|
+
start_barrier.wait()
|
1261
|
+
|
1262
|
+
# Wait for both to complete the lock acquisition attempt
|
1263
|
+
thread1.join(timeout=15)
|
1264
|
+
thread2.join(timeout=15)
|
1265
|
+
|
1266
|
+
# Stop whichever instance succeeded
|
1267
|
+
if "first" in results and results["first"] == "success":
|
1268
|
+
instance1.stop_event.set()
|
1269
|
+
if "second" in results and results["second"] == "success":
|
1270
|
+
instance2.stop_event.set()
|
1271
|
+
|
1272
|
+
# Ensure threads complete
|
1273
|
+
if thread1.is_alive():
|
1274
|
+
instance1.stop_event.set()
|
1275
|
+
thread1.join(timeout=5)
|
1276
|
+
if thread2.is_alive():
|
1277
|
+
instance2.stop_event.set()
|
1278
|
+
thread2.join(timeout=5)
|
1279
|
+
|
1280
|
+
# Verify results - Exactly ONE should succeed, ONE should fail
|
1281
|
+
msg = f"Both instances should complete, got: {results}"
|
1282
|
+
assert len(results) == 2, msg
|
1283
|
+
|
1284
|
+
success_count = sum(1 for result in results.values() if result == "success")
|
1285
|
+
conflict_count = sum(1 for result in results.values() if "lock_conflict" in result)
|
1286
|
+
|
1287
|
+
msg = f"Exactly ONE instance should succeed, got {success_count}: {results}"
|
1288
|
+
assert success_count == 1, msg
|
1289
|
+
|
1290
|
+
msg = f"Exactly ONE instance should get lock conflict, got {conflict_count}: {results}"
|
1291
|
+
assert conflict_count == 1, msg
|
1292
|
+
|
1293
|
+
# Verify the lock conflict has correct error message
|
1294
|
+
conflict_result = [r for r in results.values() if "lock_conflict" in r][0]
|
1295
|
+
msg = "Expected 'Another ingestion process is already running' message in conflict result"
|
1296
|
+
assert "Another ingestion process is already running" in conflict_result, msg
|
1297
|
+
|
1298
|
+
# Verify cleanup
|
1299
|
+
cleanup_timeout = 5.0
|
1300
|
+
start_cleanup = time.time()
|
1301
|
+
while time.time() - start_cleanup < cleanup_timeout:
|
1302
|
+
if not lock_file.exists():
|
1303
|
+
break
|
1304
|
+
time.sleep(0.1)
|
1305
|
+
msg = "Lock file should be cleaned up"
|
1306
|
+
assert not lock_file.exists(), msg
|
1307
|
+
|
1308
|
+
LOGGER.info("True Concurrency tests: %s", results)
|
1309
|
+
LOGGER.info("Real concurrent lock acquisition tested successfully!")
|
1310
|
+
|
1311
|
+
|
1312
|
+
def acada_write_test_files(
|
1313
|
+
storage_mount_path, test_vo, test_scope, n_files=7
|
1314
|
+
) -> list[Path]:
|
1315
|
+
"""Represents ACADA writing test files to the storage mount path."""
|
1316
|
+
|
1317
|
+
test_dir = storage_mount_path / test_vo / test_scope
|
1318
|
+
test_dir.mkdir(parents=True, exist_ok=True)
|
1319
|
+
|
1320
|
+
# Create seven dummy FITS files
|
1321
|
+
data_files = []
|
1322
|
+
rng = np.random.default_rng()
|
1323
|
+
for i in range(n_files):
|
1324
|
+
data_file = test_dir / f"testfile_{i}_20250609.fits"
|
1325
|
+
hdu = fits.PrimaryHDU(rng.random((50, 50)))
|
1326
|
+
hdu.writeto(data_file, overwrite=True, checksum=True)
|
1327
|
+
data_files.append(data_file)
|
1328
|
+
|
1329
|
+
LOGGER.info("Created test file: %s", data_file)
|
1330
|
+
|
1331
|
+
# Move permission reset before daemon start to avoid timing issues
|
1332
|
+
reset_xrootd_permissions(storage_mount_path)
|
1333
|
+
time.sleep(1.0) # Allow permissions to be applied
|
1334
|
+
|
1335
|
+
return data_files
|
1336
|
+
|
1337
|
+
|
1338
|
+
def acada_create_trigger_symlink(data_file, creation_results):
|
1339
|
+
"""Represents creating a trigger symlink for a given data file."""
|
1340
|
+
|
1341
|
+
try:
|
1342
|
+
trigger_file = Path(str(data_file) + TRIGGER_SUFFIX)
|
1343
|
+
trigger_file.symlink_to(data_file)
|
1344
|
+
LOGGER.info("Created trigger file: %s -> %s", trigger_file, data_file)
|
1345
|
+
|
1346
|
+
# Verify creation was successful
|
1347
|
+
if trigger_file.exists() and trigger_file.is_symlink():
|
1348
|
+
creation_results.append({"file": str(data_file), "status": "success"})
|
1349
|
+
else:
|
1350
|
+
creation_results.append(
|
1351
|
+
{"file": str(data_file), "status": "creation_failed"}
|
1352
|
+
)
|
1353
|
+
except Exception as e:
|
1354
|
+
LOGGER.exception("Failed to create trigger for %s: %s", data_file, e)
|
1355
|
+
creation_results.append({"file": str(data_file), "status": f"error: {str(e)}"})
|
1356
|
+
|
1357
|
+
return creation_results
|
1358
|
+
|
1359
|
+
|
1360
|
+
def ensure_files_ingested(data_files, storage_mount_path, test_scope, timeout_s=120):
|
1361
|
+
"""Ensure that all files are ingested by checking the IngestStatus."""
|
1362
|
+
|
1363
|
+
replica_client = ReplicaClient()
|
1364
|
+
|
1365
|
+
timeout_at = time.time() + timeout_s
|
1366
|
+
|
1367
|
+
data_file_entries = [
|
1368
|
+
{
|
1369
|
+
"file": str(data_file),
|
1370
|
+
"expected_lfn": f"/{data_file.relative_to(storage_mount_path)}",
|
1371
|
+
"found": False,
|
1372
|
+
}
|
1373
|
+
for data_file in data_files
|
1374
|
+
]
|
1375
|
+
|
1376
|
+
while time.time() < timeout_at and not all(
|
1377
|
+
status["found"] for status in data_file_entries
|
1378
|
+
):
|
1379
|
+
for data_file_entry in data_file_entries:
|
1380
|
+
if not data_file_entry["found"]:
|
1381
|
+
try:
|
1382
|
+
replicas = list(
|
1383
|
+
replica_client.list_replicas(
|
1384
|
+
dids=[
|
1385
|
+
{
|
1386
|
+
"scope": test_scope,
|
1387
|
+
"name": data_file_entry["expected_lfn"],
|
1388
|
+
}
|
1389
|
+
]
|
1390
|
+
)
|
1391
|
+
)
|
1392
|
+
if not replicas:
|
1393
|
+
LOGGER.info(
|
1394
|
+
"No replica found for %s", data_file_entry["expected_lfn"]
|
1395
|
+
)
|
1396
|
+
else:
|
1397
|
+
LOGGER.info(
|
1398
|
+
"Replica found for %s: %s",
|
1399
|
+
data_file_entry["expected_lfn"],
|
1400
|
+
replicas[0],
|
1401
|
+
)
|
1402
|
+
data_file_entry["found"] = True
|
1403
|
+
except Exception:
|
1404
|
+
LOGGER.exception(
|
1405
|
+
"Failed to list replicas for %s",
|
1406
|
+
data_file_entry["expected_lfn"],
|
1407
|
+
)
|
1408
|
+
time.sleep(1.0)
|
1409
|
+
|
1410
|
+
if not all(status["found"] for status in data_file_entries):
|
1411
|
+
pytest.fail(f"Not all replicas found for files: {data_files}")
|
1412
|
+
|
1413
|
+
|
1414
|
+
@pytest.mark.usefixtures(
|
1415
|
+
"_auth_proxy", "lock_for_ingestion_daemon", "disable_ingestion_daemon"
|
1416
|
+
)
|
1417
|
+
@pytest.mark.verifies_usecase("UC-110-1.1.4")
|
1418
|
+
def test_ingest_parallel_submission(storage_mount_path, caplog, test_vo, test_scope):
|
1419
|
+
"""Test parallel file processing: creates multiple FITS files simultaneously and verifies that the
|
1420
|
+
daemon can detect, process, and ingest them efficiently using parallel workers.
|
1421
|
+
"""
|
1422
|
+
ingestion_client = IngestionClient(
|
1423
|
+
data_path=storage_mount_path,
|
1424
|
+
rse=ONSITE_RSE,
|
1425
|
+
vo=test_vo,
|
1426
|
+
scope=test_scope,
|
1427
|
+
)
|
1428
|
+
|
1429
|
+
ingest_instance = Ingest(
|
1430
|
+
client=ingestion_client,
|
1431
|
+
top_dir=storage_mount_path,
|
1432
|
+
num_workers=4,
|
1433
|
+
lock_file_path=storage_mount_path / "bdms_ingest.lock",
|
1434
|
+
polling_interval=0.5,
|
1435
|
+
check_interval=0.5,
|
1436
|
+
)
|
1437
|
+
|
1438
|
+
data_files = acada_write_test_files(storage_mount_path, test_vo, test_scope)
|
1439
|
+
|
1440
|
+
# Daemon startup with exception handling
|
1441
|
+
daemon_exception = None
|
1442
|
+
daemon_started = threading.Event()
|
1443
|
+
|
1444
|
+
def run_daemon():
|
1445
|
+
"""Run daemon with exception capture."""
|
1446
|
+
nonlocal daemon_exception
|
1447
|
+
try:
|
1448
|
+
daemon_started.set() # Signal daemon thread started
|
1449
|
+
ingest_instance.run()
|
1450
|
+
except Exception as e:
|
1451
|
+
daemon_exception = e
|
1452
|
+
LOGGER.exception("Daemon failed with exception: %s", str(e))
|
1453
|
+
|
1454
|
+
# Start daemon with non-daemon thread for reliability
|
1455
|
+
daemon_thread = threading.Thread(target=run_daemon, daemon=False)
|
1456
|
+
daemon_thread.start()
|
1457
|
+
|
1458
|
+
# Wait for daemon thread to start
|
1459
|
+
msg = "Daemon thread failed to start"
|
1460
|
+
assert daemon_started.wait(timeout=10), msg
|
1461
|
+
|
1462
|
+
# Daemon initialization verification
|
1463
|
+
daemon_init_timeout = 20.0 # Increased timeout for robust initialization
|
1464
|
+
daemon_init_start = time.time()
|
1465
|
+
required_conditions = {
|
1466
|
+
"lock_acquired": False,
|
1467
|
+
"result_thread_started": False,
|
1468
|
+
"pool_started": False,
|
1469
|
+
"monitoring_started": False,
|
1470
|
+
"observer_started": False,
|
1471
|
+
}
|
1472
|
+
|
1473
|
+
while time.time() - daemon_init_start < daemon_init_timeout:
|
1474
|
+
# Check for daemon startup failure early
|
1475
|
+
if daemon_exception:
|
1476
|
+
pytest.fail(f"Daemon failed during initialization: {daemon_exception}")
|
1477
|
+
|
1478
|
+
# Check for lock acquisition (critical for daemon operation)
|
1479
|
+
if ingest_instance.lock_file_path.exists():
|
1480
|
+
required_conditions["lock_acquired"] = True
|
1481
|
+
|
1482
|
+
# Check log messages for initialization steps
|
1483
|
+
log_text = caplog.text
|
1484
|
+
if "Result processing thread started" in log_text:
|
1485
|
+
required_conditions["result_thread_started"] = True
|
1486
|
+
|
1487
|
+
# Flexible process pool verification to work with any worker count
|
1488
|
+
if re.search(r"Started process pool with \d+ workers", log_text):
|
1489
|
+
required_conditions["pool_started"] = True
|
1490
|
+
|
1491
|
+
if "Starting continuous polling-based monitoring" in log_text:
|
1492
|
+
required_conditions["monitoring_started"] = True
|
1493
|
+
if "File monitoring observer started successfully" in log_text:
|
1494
|
+
required_conditions["observer_started"] = True
|
1495
|
+
|
1496
|
+
# Check if all conditions are met
|
1497
|
+
if all(required_conditions.values()):
|
1498
|
+
break
|
1499
|
+
|
1500
|
+
time.sleep(0.2)
|
1501
|
+
|
1502
|
+
# Verify complete initialization or provide diagnostics
|
1503
|
+
missing_conditions = [k for k, v in required_conditions.items() if not v]
|
1504
|
+
if missing_conditions:
|
1505
|
+
ingest_instance.stop_event.set()
|
1506
|
+
daemon_thread.join(timeout=5)
|
1507
|
+
pytest.fail(
|
1508
|
+
f"Daemon initialization incomplete. Missing: {missing_conditions}. Check logs for errors."
|
1509
|
+
)
|
1510
|
+
|
1511
|
+
time.sleep(0.5) # some additional time to stabilize
|
1512
|
+
|
1513
|
+
# Create trigger files and also track
|
1514
|
+
trigger_files = []
|
1515
|
+
natural_start = time.time()
|
1516
|
+
|
1517
|
+
for data_file in data_files:
|
1518
|
+
trigger_file = Path(str(data_file) + TRIGGER_SUFFIX)
|
1519
|
+
trigger_file.symlink_to(data_file)
|
1520
|
+
trigger_files.append(trigger_file)
|
1521
|
+
|
1522
|
+
# Test regular detection, looking for MOVE events
|
1523
|
+
natural_detection_timeout = 30.0
|
1524
|
+
natural_start = time.time()
|
1525
|
+
|
1526
|
+
+    while time.time() - natural_start < natural_detection_timeout:
+        # Look for actual processing
+        if caplog.text.count("Detected new trigger file") > 0:
+            break
+        time.sleep(1.0)
+
+    # Count events after the loop completes
+    move_events_detected = caplog.text.count("MOVE Event received")
+
+    # Wait for processing with concurrency monitoring
+    processing_timeout = 120.0
+    processing_start = time.time()
+    processed_files = set()
+    max_concurrent_samples = []
+
+    while time.time() - processing_start < processing_timeout:
+        # Sample concurrent tasks frequently to catch parallelism
+        current_concurrent = len(ingest_instance.submitted_tasks)
+        max_concurrent_samples.append(current_concurrent)
+
+        # Check processing results
+        for data_file in data_files:
+            success_pattern = f"Processed file {data_file} with result success"
+            skipped_pattern = f"Processed file {data_file} with result skipped"
+
+            if str(data_file) not in processed_files:
+                if success_pattern in caplog.text or skipped_pattern in caplog.text:
+                    processed_files.add(str(data_file))
+
+        if len(processed_files) == len(data_files):
+            break
+
+        if "Fatal error in result processing thread" in caplog.text:
+            break
+
+        time.sleep(0.1)  # Sample frequently to catch concurrency
+
+    msg = f"Only {len(processed_files)} of {len(data_files)} files were processed"
+    assert len(processed_files) == len(data_files), msg
+
+    # Record ingestion workflow completion time
+    workflow_end_time = time.time()
+
+    # Stop the daemon
+    ingest_instance.stop_event.set()
+    daemon_thread.join(timeout=10)
+
+    if daemon_thread.is_alive():
+        pytest.fail("Ingest daemon thread did not terminate within timeout")
+
+    # Verify results
+    msg = "Process pool startup failed"
+    assert "Started process pool with 4 workers" in caplog.text, msg
+
+    msg = "Result processing thread startup failed"
+    assert "Result processing thread started" in caplog.text, msg
+
+    # Verify trigger files were cleaned up during successful processing
+    remaining_triggers = sum(1 for tf in trigger_files if tf.exists())
+    msg = f"Expected all trigger files to be cleaned up, {remaining_triggers} remain"
+    assert remaining_triggers == 0, msg
+
+    # Verify clean shutdown
+    msg = "Lock file not cleaned up"
+    assert not ingest_instance.lock_file_path.exists(), msg
+
+    msg = "Daemon shutdown not logged"
+    assert "Stopped ingestion daemon" in caplog.text, msg
+
+    msg = "Result thread shutdown not logged"
+    assert "Result processing thread stopped" in caplog.text, msg
+
+    # Clean up data files
+    for data_file in data_files:
+        if data_file.exists():
+            data_file.unlink()
+
+    # Statistics
+    # Ingestion workflow time: from trigger detection to ingestion with replication completion
+    max_concurrent_observed = (
+        max(max_concurrent_samples) if max_concurrent_samples else 0
+    )
+    max_concurrent_tracked = ingest_instance.max_concurrent_tasks
+
+    detection_to_completion_time = workflow_end_time - natural_start
+    processing_rate = (
+        len(processed_files) / detection_to_completion_time
+        if detection_to_completion_time > 0
+        else 0
+    )
+
+    total_submitted = ingest_instance.total_tasks_submitted
+    tasks_cleaned_up = len(ingest_instance.submitted_tasks) == 0
+    max_concurrent_final = max(max_concurrent_tracked, max_concurrent_observed)
+    parallel_achieved = max_concurrent_final >= 2
+
+    # Summary
+    status = "parallel" if parallel_achieved else "sequential"
+
+    LOGGER.info("=== Parallel Ingestion Test Results ===")
+    LOGGER.info(
+        "Files processed: %d/%d in %.1fs",
+        len(processed_files),
+        len(data_files),
+        detection_to_completion_time,
+    )
+    LOGGER.info("Processing rate: %.1f files/sec", processing_rate)
+    LOGGER.info("Max concurrent tasks: %d (mode: %s)", max_concurrent_final, status)
+    LOGGER.info("Total tasks submitted: %d", total_submitted)
+    LOGGER.info("Task cleanup successful: %s", tasks_cleaned_up)
+    LOGGER.info("Event detection: %d move events", move_events_detected)
+
+
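The initialization check in the test above is a hand-rolled poll-until-ready loop over named conditions. The same technique can be factored into a generic helper along these lines; this is a minimal sketch under the assumption that each condition can be expressed as a zero-argument predicate, and `wait_for_conditions` is a hypothetical name, not part of bdms:

import time

def wait_for_conditions(conditions, timeout=20.0, interval=0.2):
    """Poll named zero-argument predicates until all pass or the timeout expires.

    Returns the list of condition names that never became true
    (an empty list means success).
    """
    satisfied = dict.fromkeys(conditions, False)
    deadline = time.time() + timeout
    while time.time() < deadline:
        for name, predicate in conditions.items():
            # Once a condition has been observed true, it stays satisfied.
            if not satisfied[name] and predicate():
                satisfied[name] = True
        if all(satisfied.values()):
            return []
        time.sleep(interval)
    return [name for name, ok in satisfied.items() if not ok]

A caller would pass predicates such as {"lock_acquired": ingest_instance.lock_file_path.exists} and fail the test on a non-empty return value, mirroring the loop in the test above.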
+def fetch_ingestion_daemon_metrics():
+    """Fetch metrics from the ingestion daemon to verify its operation."""
+
+    response = urlopen("http://bdms-ingestion-daemon:8000/")
+
+    assert response.status == 200, "Ingestion daemon metrics are not responding"
+
+    n_tasks_metrics = {}
+    for line in response.readlines():
+        line = line.decode("utf-8").strip()
+        if line.startswith("n_tasks_"):
+            LOGGER.info("Ingestion daemon metrics: %s", line)
+            key, value = line.split(" ", 1)
+            n_tasks_metrics[key] = float(value)
+
+    return n_tasks_metrics
+
+
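The parser above reads the daemon's metrics endpoint line by line and keeps only the `n_tasks_*` entries, which is consistent with the Prometheus text exposition format: one `name value` pair per line, with `#`-prefixed HELP/TYPE comment lines. A minimal sketch of the kind of payload it would consume follows; the metric values and HELP text are invented for illustration:

# Hypothetical metrics payload in Prometheus text format (values invented).
sample_payload = b"""\
# HELP n_tasks_processed_total Total number of ingestion tasks processed
# TYPE n_tasks_processed_total counter
n_tasks_processed_total 42.0
n_tasks_success_total 40.0
n_tasks_skipped_total 2.0
"""

n_tasks_metrics = {}
for line in sample_payload.decode("utf-8").splitlines():
    line = line.strip()
    if line.startswith("n_tasks_"):
        # Same parsing rule as the function above: split on the first space.
        key, value = line.split(" ", 1)
        n_tasks_metrics[key] = float(value)

assert n_tasks_metrics["n_tasks_skipped_total"] == 2.0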
+@pytest.mark.usefixtures(
+    "_auth_proxy", "lock_for_ingestion_daemon", "enable_ingestion_daemon"
+)
+@pytest.mark.verifies_usecase("UC-110-1.1.4")
+def test_ingest_parallel_submission_with_live_daemon(storage_mount_path, test_vo):
+    """Test parallel file processing with an already running daemon."""
+
+    # In the live test, the daemon is deployed outside of this test, so we need to pick a persistent location matching the daemon's storage mount path.
+    # Note that if the kind cluster creation fixture is used, the directory can be unique per test session.
+    # This test only checks that the files are consumed, not that they are replicated.
+
+    test_scope = "test_scope_persistent"
+
+    n_tasks_metrics_before_test = fetch_ingestion_daemon_metrics()
+
+    for tf in (storage_mount_path / test_vo / test_scope).glob("*" + TRIGGER_SUFFIX):
+        if tf.exists():
+            LOGGER.info("Cleaning up existing trigger file: %s", tf)
+            tf.unlink()
+
+    data_files = acada_write_test_files(storage_mount_path, test_vo, test_scope)
+
+    creation_results = []
+    for data_file in data_files:
+        acada_create_trigger_symlink(data_file, creation_results)
+
+    trigger_files = [Path(str(df) + TRIGGER_SUFFIX) for df in data_files]
+
+    timeout = 120.0
+    start_time = time.time()
+
+    remaining_triggers = 0
+    while time.time() - start_time < timeout:
+        # Verify trigger files were cleaned up during successful processing
+        remaining_triggers = sum(1 for tf in trigger_files if tf.exists())
+
+        if remaining_triggers == 0:
+            LOGGER.info("All trigger files consumed successfully, exiting test.")
+            break
+        else:
+            LOGGER.info(
+                "Waiting for trigger files to be cleaned up, %s remain.",
+                remaining_triggers,
+            )
+
+        time.sleep(1.0)  # Poll once per second
+
+    assert remaining_triggers == 0, "Expected all trigger files to be consumed"
+
+    ensure_files_ingested(data_files, storage_mount_path, test_scope)
+
+    # Make sure that metrics are available from the daemon
+    n_tasks_metrics = fetch_ingestion_daemon_metrics()
+
+    assert n_tasks_metrics["n_tasks_success_created"] < time.time()
+    assert n_tasks_metrics["n_tasks_processed_total"] - n_tasks_metrics_before_test[
+        "n_tasks_processed_total"
+    ] == len(data_files)
+    assert (
+        n_tasks_metrics["n_tasks_processed_total"]
+        - n_tasks_metrics_before_test["n_tasks_processed_total"]
+        == n_tasks_metrics["n_tasks_success_total"]
+        + n_tasks_metrics["n_tasks_skipped_total"]
+    ), "Ingestion daemon metrics do not match expected values"
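The closing assertions encode a simple accounting identity: every task processed during the run must be accounted for as either a success or a skip. Note that the last comparison uses the absolute success and skip counters, so it implicitly assumes those counters start from zero for the test run. A worked sketch with invented counter snapshots:

# Invented counter snapshots illustrating the accounting identity checked above.
before = {"n_tasks_processed_total": 0.0}
after = {
    "n_tasks_processed_total": 7.0,
    "n_tasks_success_total": 5.0,
    "n_tasks_skipped_total": 2.0,
}

processed_during_test = (
    after["n_tasks_processed_total"] - before["n_tasks_processed_total"]
)
assert processed_during_test == 7.0  # one task per ingested data file
# Every processed task ended as either a success or a skip:
assert processed_during_test == (
    after["n_tasks_success_total"] + after["n_tasks_skipped_total"]
)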