np_codeocean 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
np_codeocean/utils.py CHANGED
@@ -1,563 +1,671 @@
1
- from __future__ import annotations
2
-
3
- import contextlib
4
- import csv
5
- import datetime
6
- import functools
7
- import itertools
8
- import json
9
- import logging
10
- import os
11
- import pathlib
12
- import re
13
- from typing import Any, Generator, Iterable, Literal
14
- import typing_extensions
15
-
16
- import np_config
17
- import np_tools
18
- import npc_ephys
19
- import npc_sync
20
- import npc_session
21
- import numpy as np
22
- import polars as pl
23
- import requests
24
- from aind_codeocean_pipeline_monitor.models import PipelineMonitorSettings
25
- from aind_data_transfer_service.models.core import (
26
- SubmitJobRequestV2,
27
- Task,
28
- UploadJobConfigsV2,
29
- )
30
- from aind_data_schema_models.modalities import Modality
31
- from aind_data_schema_models.platforms import Platform
32
- from aind_slurm_rest_v2.models.v0040_job_desc_msg import (
33
- V0040JobDescMsg,
34
- )
35
-
36
- logger = logging.getLogger(__name__)
37
-
38
- AINDPlatform = Literal['ecephys', 'behavior']
39
-
40
- AIND_DATA_TRANSFER_SERVICE = "http://aind-data-transfer-service"
41
- DEV_SERVICE = "http://aind-data-transfer-service-dev"
42
- HPC_UPLOAD_JOB_EMAIL = "ben.hardcastle@alleninstitute.org"
43
- ACQ_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
44
-
45
- AIND_METADATA_NAMES: tuple[str, ...] = ('session', 'data_description', 'procedures', 'processing', 'rig', 'subject')
46
-
47
- # In the future, default slurm settings can be stored in a job_type in AWS Param Store
48
- # see http://aind-data-transfer-service/job_params for current job_types
49
- _DEFAULT_EPHYS_SLURM_SETTINGS_JSON = {
50
- "memory_per_cpu": {
51
- "set": True,
52
- "number": 8000
53
- },
54
- "minimum_cpus_per_node": 12, # 6 probes * (lfp + ap)
55
- "partition": "aind",
56
- "tasks": 1,
57
- "time_limit": {
58
- "set": True,
59
- "number": 15 * 60
60
- },
61
- "environment": [
62
- "PATH=/bin:/usr/bin/:/usr/local/bin/",
63
- "LD_LIBRARY_PATH=/lib/:/lib64/:/usr/local/lib"
64
- ],
65
- "maximum_nodes": 1,
66
- "minimum_nodes": 1,
67
- "current_working_directory": "."
68
- }
69
- """Increased timelimit and cpus for running ephys compression on the hpc"""
70
- DEFAULT_EPHYS_SLURM_SETTINGS = V0040JobDescMsg.model_validate(
71
- {
72
- **_DEFAULT_EPHYS_SLURM_SETTINGS_JSON,
73
- "qos": "production",
74
- "standard_error": "/allen/aind/scratch/svc_aind_airflow/prod/logs/%x_%j_error.out",
75
- "standard_output": "/allen/aind/scratch/svc_aind_airflow/prod/logs/%x_%j.out",
76
- }
77
- )
78
- DEFAULT_EPHYS_SLURM_SETTINGS_DEV = V0040JobDescMsg.model_validate(
79
- {
80
- **_DEFAULT_EPHYS_SLURM_SETTINGS_JSON,
81
- "qos": "dev",
82
- "standard_error": "/allen/aind/scratch/svc_aind_airflow/dev/logs/%x_%j_error.out",
83
- "standard_output": "/allen/aind/scratch/svc_aind_airflow/dev/logs/%x_%j.out",
84
- }
85
- )
86
- DEFAULT_EPHYS_IMAGE = {
87
- "image": "ghcr.io/allenneuraldynamics/aind-ephys-transformation",
88
- "image_version": "0.2.1",
89
- "command_script": "#!/bin/bash \nsingularity exec --cleanenv docker://%IMAGE:%IMAGE_VERSION python -m aind_ephys_transformation.ephys_job --job-settings ' %JOB_SETTINGS '",
90
- }
91
-
92
- class SyncFileNotFoundError(FileNotFoundError):
93
- pass
94
-
95
- @functools.cache
96
- def get_project_config() -> dict[str, Any]:
97
- """Config for this project"""
98
- return np_config.fetch('/projects/np_codeocean')
99
-
100
- def set_npc_lims_credentials() -> None:
101
- creds = np_config.fetch('/projects/np_codeocean/npc_lims')
102
- for k, v in creds.items():
103
- os.environ.setdefault(k, v)
104
-
105
- def get_home() -> pathlib.Path:
106
- if os.name == 'nt':
107
- return pathlib.Path(os.environ['USERPROFILE'])
108
- return pathlib.Path(os.environ['HOME'])
109
-
110
- def is_behavior_video_file(path: pathlib.Path) -> bool:
111
- if path.is_dir() or path.suffix not in ('.mp4', '.avi', '.json'):
112
- return False
113
- with contextlib.suppress(ValueError):
114
- _ = npc_session.extract_mvr_camera_name(path.as_posix())
115
- return True
116
- return False
117
-
118
- def is_surface_channel_recording(path_name: str) -> bool:
119
- """
120
- >>> import np_session
121
- >>> session = np_session.Session("//allen/programs/mindscope/workgroups/dynamicrouting/PilotEphys/Task 2 pilot/DRpilot_690706_20231129_surface_channels")
122
- >>> is_surface_channel_recording(session.npexp_path.as_posix())
123
- True
124
- """
125
- return 'surface_channels' in path_name.lower()
126
-
127
- def cleanup_ephys_symlinks(toplevel_dir: pathlib.Path) -> None:
128
- """After creating symlinks to the ephys data, run this to make any necessary
129
- modifications prior to upload.
130
-
131
- Provided dir path should be a directory containing all ephys data in
132
- subfolders (e.g. directory containing "Record Node 10x" folders)
133
-
134
- Only deletes symlinks or writes new files in place of symlinks - does not
135
- modify original data.
136
-
137
- Rules:
138
- - if any continuous.dat files are unreadable: remove them and their containing folders
139
- - if any probes were recorded on multiple record nodes: just keep the first
140
- - if continuous.dat files are missing (ie. excluded because probes weren't
141
- inserted, or we removed symlinks in previous steps): update metadata files
142
- """
143
- remove_unreadable_ephys_data(toplevel_dir)
144
- remove_duplicate_ephys_data(toplevel_dir)
145
- cleanup_ephys_metadata(toplevel_dir)
146
-
147
- def remove_unreadable_ephys_data(toplevel_dir: pathlib.Path) -> None:
148
-
149
- for continuous_dir in ephys_continuous_dir_generator(toplevel_dir):
150
- events_dir = continuous_dir.parent.parent / 'events' / continuous_dir.name / 'TTL'
151
- filenames = ('continuous.dat', 'timestamps.npy', 'sample_numbers.npy')
152
- dirs = (continuous_dir, ) + ((events_dir,) if events_dir.exists() else ())
153
- mark_for_removal = False
154
- for d in dirs:
155
- if not d.exists():
156
- continue
157
- for filename in filenames:
158
- if filename == 'continuous.dat' and d.name == 'TTL':
159
- continue # no continuous.dat expected in TTL events
160
- file = d / filename
161
- if not (file.is_symlink() or file.exists()):
162
- logger.warning(f'Critical file not found {file}, insufficient data for processing')
163
- mark_for_removal = True
164
- break
165
- try:
166
- data = np.memmap(decode_symlink_path(file), dtype="int16" if 'timestamps' not in file.name else "float64", mode="r")
167
- except Exception as exc:
168
- logger.warning(f'Failed to read {file}: {exc!r}')
169
- mark_for_removal = True
170
- break
171
- if data.size == 0:
172
- logger.warning(f'Empty file {file}')
173
- mark_for_removal = True
174
- break
175
- logger.debug(f'Found readable, non-empty data in {file}')
176
- if mark_for_removal:
177
- break
178
- if mark_for_removal:
179
- logger.warning(f'Removing {continuous_dir} and its contents')
180
- remove_folder_of_symlinks(continuous_dir)
181
- logger.warning(f'Removing {events_dir.parent} and its contents')
182
- remove_folder_of_symlinks(events_dir.parent)
183
-
184
- def remove_duplicate_ephys_data(toplevel_dir: pathlib.Path) -> None:
185
- logger.info('Checking for duplicate ephys data...')
186
- paths = sorted(ephys_continuous_dir_generator(toplevel_dir))
187
- experiments = set(re.findall(r'/experiment(\d+)/', path.as_posix())[0] for path in paths)
188
- logger.debug(f'Found {len(experiments)} experiments')
189
- for experiment in experiments:
190
- exp_paths = sorted(path for path in paths if f'experiment{experiment}' in path.as_posix())
191
- recordings = set(re.findall(r'/recording(\d+)/', path.as_posix())[0] for path in exp_paths)
192
- logger.debug(f'Found {len(recordings)} recordings in experiment{experiment}')
193
- for recording in recordings:
194
- recording_paths = sorted(path for path in exp_paths if f'recording{recording}' in path.as_posix())
195
- probes = []
196
- # import pdb; pdb.set_trace()
197
- for continuous_dir in recording_paths:
198
- try:
199
- probe = npc_session.ProbeRecord(continuous_dir.name)
200
- except ValueError:
201
- continue
202
- suffix = continuous_dir.name.split('-')[-1]
203
- assert suffix in ('AP', 'LFP')
204
- recording_name = f"{probe}-{suffix}"
205
- if recording_name in probes:
206
- logger.info(f'Duplicate {recording_name = } found in {continuous_dir.parent.parent} - removing')
207
- remove_folder_of_symlinks(continuous_dir)
208
- else:
209
- probes.append(recording_name)
210
-
211
- def remove_folder_of_symlinks(folder: pathlib.Path) -> None:
212
- """Recursive deletion of all files in dir tree, with a check that each is a
213
- symlink."""
214
- for path in folder.rglob('*'):
215
- if path.is_dir():
216
- remove_folder_of_symlinks(path)
217
- else:
218
- assert path.is_symlink(), f'Expected {path} to be a symlink'
219
- path.unlink(missing_ok=True)
220
- with contextlib.suppress(FileNotFoundError):
221
- folder.rmdir()
222
-
223
- def ephys_recording_dir_generator(toplevel_dir: pathlib.Path) -> Generator[pathlib.Path, None, None]:
224
- for recording_dir in toplevel_dir.rglob('recording[0-9]*'):
225
- if recording_dir.is_dir():
226
- yield recording_dir
227
-
228
- def ephys_continuous_dir_generator(toplevel_dir: pathlib.Path) -> Generator[pathlib.Path, None, None]:
229
- for recording_dir in ephys_recording_dir_generator(toplevel_dir):
230
- parent = recording_dir / 'continuous'
231
- if not parent.exists():
232
- continue
233
- for continuous_dir in parent.iterdir():
234
- if continuous_dir.is_dir():
235
- yield continuous_dir
236
-
237
- def ephys_structure_oebin_generator(toplevel_dir: pathlib.Path) -> Generator[pathlib.Path, None, None]:
238
- for recording_dir in ephys_recording_dir_generator(toplevel_dir):
239
- oebin_path = recording_dir / 'structure.oebin'
240
- if not (oebin_path.is_symlink() or oebin_path.exists()):
241
- # symlinks that are created for the hpc use posix paths, and aren't
242
- # readable on windows, so .exists() returns False: use .is_symlink() instead
243
- logger.warning(f'No structure.oebin found in {recording_dir}')
244
- continue
245
- yield oebin_path
246
-
247
- def cleanup_ephys_metadata(toplevel_dir: pathlib.Path) -> None:
248
- logger.debug('Checking structure.oebin for missing folders...')
249
- for oebin_path in ephys_structure_oebin_generator(toplevel_dir):
250
- oebin_obj = np_tools.read_oebin(decode_symlink_path(oebin_path))
251
- logger.debug(f'Checking {oebin_path} against actual folders...')
252
- any_removed = False
253
- for subdir_name in ('events', 'continuous'):
254
- subdir = oebin_path.parent / subdir_name
255
- # iterate over copy of list so as to not disrupt iteration when elements are removed
256
- for device in [device for device in oebin_obj[subdir_name]]:
257
- if not (subdir / device['folder_name']).exists():
258
- logger.info(f'{device["folder_name"]} not found in {subdir}, removing from structure.oebin')
259
- oebin_obj[subdir_name].remove(device)
260
- any_removed = True
261
- if any_removed:
262
- oebin_path.unlink()
263
- oebin_path.write_text(json.dumps(oebin_obj, indent=4))
264
- logger.debug('Overwrote symlink to structure.oebin with corrected structure.oebin')
265
-
266
- def write_corrected_ephys_timestamps(
267
- ephys_dir: pathlib.Path,
268
- behavior_dir: pathlib.Path,
269
- ) -> None:
270
- for path in itertools.chain(behavior_dir.glob('*.h5'), behavior_dir.glob('*.sync')):
271
- with contextlib.suppress(Exception):
272
- sync_dataset = npc_sync.SyncDataset(path)
273
- _ = sync_dataset.line_labels
274
- logger.info(f'Found valid sync file {path.as_posix()}')
275
- break
276
- else:
277
- raise SyncFileNotFoundError(f'No valid sync file found in {behavior_dir.as_posix()}')
278
-
279
- timing_on_pxi = (
280
- timing
281
- for timing in npc_ephys.get_ephys_timing_on_pxi(
282
- ephys_dir.glob("**/experiment*/recording*"),
283
- )
284
- )
285
- timing_on_sync = (
286
- npc_ephys.get_ephys_timing_on_sync(
287
- sync=sync_dataset,
288
- devices=timing_on_pxi,
289
- )
290
- )
291
- npc_ephys.overwrite_timestamps(timing_on_sync)
292
- logger.info(f'Corrected timestamps in {ephys_dir}')
293
-
294
- def decode_symlink_path(oebin_path: pathlib.Path) -> pathlib.Path:
295
- if not oebin_path.is_symlink():
296
- return oebin_path
297
- return np_config.normalize_path(oebin_path.readlink())
298
-
299
- def is_csv_in_hpc_upload_queue(csv_path: pathlib.Path, upload_service_url: str = AIND_DATA_TRANSFER_SERVICE) -> bool:
300
- """Check if an upload job has been submitted to the hpc upload queue.
301
-
302
- - currently assumes one job per csv
303
- - does not check status (job may be FINISHED rather than RUNNING)
304
-
305
- >>> is_csv_in_hpc_upload_queue("//allen/programs/mindscope/workgroups/np-exp/codeocean/DRpilot_664851_20231114/upload.csv")
306
- False
307
- """
308
- # get subject-id, acq-datetime from csv
309
- df = pl.read_csv(csv_path, eol_char='\r')
310
- for col in df.get_columns():
311
- if col.name.startswith('subject') and col.name.endswith('id'):
312
- subject = npc_session.SubjectRecord(col[0])
313
- continue
314
- if col.name.startswith('acq') and 'datetime' in col.name.lower():
315
- dt = npc_session.DatetimeRecord(col[0])
316
- continue
317
- if col.name == 'platform':
318
- platform = col[0]
319
- continue
320
- return is_session_in_hpc_queue(subject=subject, acq_datetime=dt.dt, platform=platform, upload_service_url=upload_service_url)
321
-
322
- def is_session_in_hpc_queue(subject: int | str, acq_datetime: str | datetime.datetime, platform: str | None = None, upload_service_url: str = AIND_DATA_TRANSFER_SERVICE) -> bool:
323
- """
324
- >>> is_session_in_hpc_queue(366122, datetime.datetime(2023, 11, 14, 0, 0, 0))
325
- False
326
- >>> is_session_in_hpc_queue(702136, datetime.datetime(2024, 3, 4, 13, 21, 35))
327
- True
328
- """
329
- if not isinstance(acq_datetime, datetime.datetime):
330
- acq_datetime = datetime.datetime.strptime(acq_datetime, ACQ_DATETIME_FORMAT)
331
- partial_session_id = f"{subject}_{acq_datetime.strftime(ACQ_DATETIME_FORMAT).replace(' ', '_').replace(':', '-')}"
332
- if platform:
333
- partial_session_id = f"{platform}_{partial_session_id}"
334
-
335
- jobs_response = requests.get(f"{upload_service_url}/jobs")
336
- jobs_response.raise_for_status()
337
- return partial_session_id in jobs_response.content.decode()
338
-
339
- def is_job_in_hpc_upload_queue(job: UploadJobConfigsV2, upload_service_url: str = AIND_DATA_TRANSFER_SERVICE) -> bool:
340
- return is_session_in_hpc_queue(job.subject_id, job.acq_datetime, job.platform.abbreviation, upload_service_url)
341
-
342
- def write_upload_csv(
343
- content: dict[str, Any],
344
- output_path: pathlib.Path,
345
- ) -> pathlib.Path:
346
- logger.info(f'Creating upload job file {output_path}')
347
- with open(output_path, 'w') as f:
348
- w = csv.writer(f, lineterminator='')
349
- w.writerow(content.keys())
350
- w.writerow('\n')
351
- w.writerow(content.values())
352
- return output_path
353
-
354
- def create_upload_job_configs_v2(
355
- project_name: str,
356
- platform: str,
357
- subject_id: str,
358
- force_cloud_sync: bool,
359
- modalities: dict[str, str],
360
- acq_datetime: datetime.datetime,
361
- user_email: str = HPC_UPLOAD_JOB_EMAIL,
362
- job_type: str = "default",
363
- metadata_dir: str | None = None,
364
- codeocean_pipeline_settings: dict[str, PipelineMonitorSettings] | None = None,
365
- check_timestamps: bool = True, # default in transfer service is True: checks timestamps have been corrected via flag file
366
- test: bool = False,
367
- **extra_UploadJobConfigsV2_params: Any,
368
- ) -> UploadJobConfigsV2:
369
- """Create a UploadJobConfigsV2 model. Modalities should be provided in format
370
- {modality_abbr: input_source}. job_type refers to the default or custom
371
- presets used for compression and Code Ocean pipelines.
372
- """
373
- # Each task in airflow can be configured individually
374
- # force_cloud_sync
375
- check_s3_folder_exists_task = Task(skip_task=True) if force_cloud_sync else None
376
- # metadata_dir
377
- gather_preliminary_metadata_task = (
378
- Task(job_settings={"metadata_dir": metadata_dir})
379
- if metadata_dir is not None
380
- else None
381
- )
382
- # modality transformation settings
383
- modality_transformation_settings_tasks = dict() # {modality_abbr: Task}
384
- if 'modalities' in extra_UploadJobConfigsV2_params:
385
- raise ValueError('modalities should not be passed as a parameter in extra_BasicUploadJobConfigs_params')
386
- for modality_abbr, input_source in modalities.items():
387
- job_settings: dict[str, Any] = {
388
- "input_source": input_source,
389
- "output_directory": "%OUTPUT_LOCATION",
390
- }
391
- # Ecephys compression settings are currently hardcoded
392
- # In the future, these can be stored in AWS param store as part of a "job_type"
393
- if (modality_abbr == Modality.ECEPHYS.abbreviation):
394
- if not check_timestamps:
395
- job_settings['check_timestamps'] = False
396
- image_resources = (DEFAULT_EPHYS_SLURM_SETTINGS_DEV if test else DEFAULT_EPHYS_SLURM_SETTINGS).model_dump(mode="json", exclude_none=True)
397
- modality_task = Task(
398
- skip_task=False,
399
- job_settings=job_settings,
400
- image_resources=image_resources,
401
- **DEFAULT_EPHYS_IMAGE,
402
- )
403
- else:
404
- modality_task = Task(
405
- job_settings=job_settings,
406
- )
407
- modality_transformation_settings_tasks[modality_abbr] = modality_task
408
-
409
- # Code Ocean pipeline settings
410
- # You can manually specify up to one pipeline conf per modality.
411
- # These will override any pipelines defined by the job_type.
412
- # In the future, these can be stored in AWS param store as part of a "job_type"
413
- codeocean_pipeline_settings_tasks = dict() # {modality_abbr: Task}
414
- if codeocean_pipeline_settings is not None:
415
- codeocean_pipeline_settings_tasks = {
416
- k: Task(
417
- job_settings={ "pipeline_monitor_settings": v.model_dump(mode="json", exclude_none=True)})
418
- for k, v in codeocean_pipeline_settings.items()
419
- }
420
- tasks = {
421
- "check_s3_folder_exists": check_s3_folder_exists_task,
422
- "gather_preliminary_metadata": gather_preliminary_metadata_task,
423
- "modality_transformation_settings": modality_transformation_settings_tasks,
424
- "codeocean_pipeline_settings": codeocean_pipeline_settings_tasks,
425
- }
426
- return UploadJobConfigsV2(
427
- job_type=job_type,
428
- platform=Platform.from_abbreviation(platform),
429
- project_name=project_name,
430
- subject_id=subject_id,
431
- acq_datetime=acq_datetime,
432
- modalities=[
433
- Modality.from_abbreviation(m) for m in modality_transformation_settings_tasks.keys()
434
- ],
435
- tasks={k: v for k, v in tasks.items() if v is not None},
436
- user_email=user_email,
437
- **extra_UploadJobConfigsV2_params,
438
- )
439
-
440
- def put_jobs_for_hpc_upload(
441
- upload_jobs: UploadJobConfigsV2 | Iterable[UploadJobConfigsV2],
442
- upload_service_url: str = AIND_DATA_TRANSFER_SERVICE,
443
- user_email: str = HPC_UPLOAD_JOB_EMAIL,
444
- email_notification_types: Iterable[Literal["begin", "end", "fail", "retry", "all"]] = ('fail',),
445
- dry_run: bool = False,
446
- save_path: pathlib.Path | None = None,
447
- **extra_model_kwargs: Any,
448
- ) -> None:
449
- """Submit one or more jobs to the aind-data-transfer-service, for
450
- upload to S3 on the hpc.
451
-
452
- - accepts one or more aind_data_transfer_service UploadJobConfigsV2 models
453
- - assembles a SubmitJobRequestV2 model
454
- - excludes jobs for sessions that are already in the upload queue
455
- - accepts additional parameters for SubmitJobRequestV2 as kwargs
456
- - submits json via http request
457
- - optionally saves the json file as a record
458
- """
459
- if isinstance(upload_jobs, UploadJobConfigsV2):
460
- upload_jobs = (upload_jobs, )
461
- submit_request = SubmitJobRequestV2(
462
- upload_jobs=[job for job in upload_jobs if not is_job_in_hpc_upload_queue(job)],
463
- user_email=user_email,
464
- email_notification_types=email_notification_types,
465
- **extra_model_kwargs,
466
- )
467
- post_request_content = submit_request.model_dump(mode="json", exclude_none=True)
468
- if save_path:
469
- save_path.write_text(submit_request.model_dump_json(round_trip=True, exclude_none=True, indent=4), errors='ignore')
470
- if dry_run:
471
- logger.warning(f'Dry run: not submitting {len(upload_jobs)} upload job(s) to {upload_service_url}')
472
- return
473
-
474
- # Uncomment to perform a validation check:
475
- # validate_json_response: requests.Response = requests.post(
476
- # url=f"{upload_service_url}/api/v2/validate_json",
477
- # json=post_request_content,
478
- # )
479
- # validate_json_response.raise_for_status()
480
-
481
- # Submit the jobs
482
- post_json_response: requests.Response = requests.post(
483
- url=f"{upload_service_url}/api/v2/submit_jobs",
484
- json=post_request_content,
485
- )
486
- logger.info(f"Submitted {len(upload_jobs)} upload job(s) to {upload_service_url}")
487
- post_json_response.raise_for_status()
488
-
489
- @typing_extensions.deprecated("Uses old, pre-v1 endpoints: use put_jobs_for_hpc_upload in combination with create_upload_job_configs_v2")
490
- def put_csv_for_hpc_upload(
491
- csv_path: pathlib.Path,
492
- upload_service_url: str = AIND_DATA_TRANSFER_SERVICE,
493
- hpc_upload_job_email: str = HPC_UPLOAD_JOB_EMAIL,
494
- dry_run: bool = False,
495
- ) -> None:
496
- """Submit a single job upload csv to the aind-data-transfer-service, for
497
- upload to S3 on the hpc.
498
-
499
- - gets validated version of csv
500
- - checks session is not already being uploaded
501
- - submits csv via http request
502
- """
503
- def _raise_for_status(response: requests.Response) -> None:
504
- """pydantic validation errors are returned as strings that can be eval'd
505
- to get the real error class + message."""
506
- if response.status_code != 200:
507
- try:
508
- response.json()['data']['errors']
509
- except (KeyError, IndexError, requests.exceptions.JSONDecodeError, SyntaxError) as exc1:
510
- try:
511
- response.raise_for_status()
512
- except requests.exceptions.HTTPError as exc2:
513
- raise exc2 from exc1
514
-
515
- with open(csv_path, 'rb') as f:
516
- validate_csv_response = requests.post(
517
- url=f"{upload_service_url}/api/validate_csv",
518
- files=dict(file=f),
519
- )
520
- _raise_for_status(validate_csv_response)
521
- logger.debug(f"Validated response: {validate_csv_response.json()}")
522
- if is_csv_in_hpc_upload_queue(csv_path, upload_service_url):
523
- logger.warning(f"Job already submitted for {csv_path}")
524
- return
525
- if dry_run:
526
- logger.info(f'Dry run: not submitting {csv_path} to hpc upload queue at {upload_service_url}.')
527
- return
528
- post_csv_response = requests.post(
529
- url=f"{upload_service_url}/api/submit_hpc_jobs",
530
- json=dict(
531
- jobs=[
532
- dict(
533
- hpc_settings=json.dumps({"time_limit": 60 * 15, "mail_user": hpc_upload_job_email}),
534
- upload_job_settings=validate_csv_response.json()["data"]["jobs"][0],
535
- script="",
536
- )
537
- ]
538
- ),
539
- )
540
- logger.info(f"Submitted {csv_path} to hpc upload queue at {upload_service_url}")
541
- _raise_for_status(post_csv_response)
542
-
543
-
544
- def ensure_posix(path: str | pathlib.Path) -> str:
545
- posix = pathlib.Path(path).as_posix()
546
- if posix.startswith('//'):
547
- posix = posix[1:]
548
- return posix
549
-
550
-
551
- def convert_symlinks_to_posix(toplevel_dir: str | pathlib.Path) -> None:
552
- """Convert all symlinks in `root_dir` (recursively) to POSIX paths. This is a
553
- necessary last step before submitting uploads to run on the HPC.
554
- """
555
- for path in pathlib.Path(toplevel_dir).rglob('*'):
556
- if path.is_symlink():
557
- posix_target = path.readlink().as_posix().removeprefix('//?/UNC')
558
- path.unlink()
559
- np_tools.symlink(src=ensure_posix(posix_target), dest=path)
560
-
561
- if __name__ == '__main__':
562
- import doctest
563
- doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL)
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import csv
5
+ import datetime
6
+ import functools
7
+ import itertools
8
+ import json
9
+ import logging
10
+ import os
11
+ import pathlib
12
+ import re
13
+ from collections.abc import Generator, Iterable
14
+ from typing import Any, Literal
15
+
16
+ import np_config
17
+ import np_tools
18
+ import npc_ephys
19
+ import npc_session
20
+ import npc_sync
21
+ import numpy as np
22
+ import polars as pl
23
+ import requests
24
+ import typing_extensions
25
+ from aind_codeocean_pipeline_monitor.models import PipelineMonitorSettings
26
+ from aind_data_schema_models.modalities import Modality
27
+ from aind_data_schema_models.platforms import Platform
28
+ from aind_data_transfer_service.models.core import (
29
+ SubmitJobRequestV2,
30
+ Task,
31
+ UploadJobConfigsV2,
32
+ )
33
+ from aind_slurm_rest_v2.models.v0040_job_desc_msg import (
34
+ V0040JobDescMsg,
35
+ )
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+ AINDPlatform = Literal["ecephys", "behavior"]
40
+
41
+ AIND_DATA_TRANSFER_SERVICE = "http://aind-data-transfer-service"
42
+ DEV_SERVICE = "http://aind-data-transfer-service-dev"
43
+ HPC_UPLOAD_JOB_EMAIL = "ben.hardcastle@alleninstitute.org"
44
+ ACQ_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
45
+
46
+ AIND_METADATA_NAMES: tuple[str, ...] = (
47
+ "session",
48
+ "data_description",
49
+ "procedures",
50
+ "processing",
51
+ "rig",
52
+ "subject",
53
+ )
54
+
55
+ # In the future, default slurm settings can be stored in a job_type in AWS Param Store
56
+ # see http://aind-data-transfer-service/job_params for current job_types
57
+ _DEFAULT_EPHYS_SLURM_SETTINGS_JSON = {
58
+ "memory_per_cpu": {"set": True, "number": 8000},
59
+ "minimum_cpus_per_node": 12, # 6 probes * (lfp + ap)
60
+ "partition": "aind",
61
+ "tasks": 1,
62
+ "time_limit": {"set": True, "number": 15 * 60},
63
+ "environment": [
64
+ "PATH=/bin:/usr/bin/:/usr/local/bin/",
65
+ "LD_LIBRARY_PATH=/lib/:/lib64/:/usr/local/lib",
66
+ ],
67
+ "maximum_nodes": 1,
68
+ "minimum_nodes": 1,
69
+ "current_working_directory": ".",
70
+ }
71
+ """Increased timelimit and cpus for running ephys compression on the hpc"""
72
+ DEFAULT_EPHYS_SLURM_SETTINGS = V0040JobDescMsg.model_validate(
73
+ {
74
+ **_DEFAULT_EPHYS_SLURM_SETTINGS_JSON,
75
+ "qos": "production",
76
+ "standard_error": "/allen/aind/scratch/svc_aind_airflow/prod/logs/%x_%j_error.out",
77
+ "standard_output": "/allen/aind/scratch/svc_aind_airflow/prod/logs/%x_%j.out",
78
+ }
79
+ )
80
+ DEFAULT_EPHYS_SLURM_SETTINGS_DEV = V0040JobDescMsg.model_validate(
81
+ {
82
+ **_DEFAULT_EPHYS_SLURM_SETTINGS_JSON,
83
+ "qos": "dev",
84
+ "standard_error": "/allen/aind/scratch/svc_aind_airflow/dev/logs/%x_%j_error.out",
85
+ "standard_output": "/allen/aind/scratch/svc_aind_airflow/dev/logs/%x_%j.out",
86
+ }
87
+ )
88
+ DEFAULT_EPHYS_IMAGE = {
89
+ "image": "ghcr.io/allenneuraldynamics/aind-ephys-transformation",
90
+ "image_version": "0.2.1",
91
+ "command_script": "#!/bin/bash \nsingularity exec --cleanenv docker://%IMAGE:%IMAGE_VERSION python -m aind_ephys_transformation.ephys_job --job-settings ' %JOB_SETTINGS '",
92
+ }
93
+
94
+
95
+ class SyncFileNotFoundError(FileNotFoundError):
96
+ pass
97
+
98
+
99
+ @functools.cache
100
+ def get_project_config() -> dict[str, Any]:
101
+ """Config for this project"""
102
+ return np_config.fetch("/projects/np_codeocean")
103
+
104
+
105
+ def set_npc_lims_credentials() -> None:
106
+ creds = np_config.fetch("/projects/np_codeocean/npc_lims")
107
+ for k, v in creds.items():
108
+ os.environ.setdefault(k, v)
109
+
110
+
111
+ def get_home() -> pathlib.Path:
112
+ if os.name == "nt":
113
+ return pathlib.Path(os.environ["USERPROFILE"])
114
+ return pathlib.Path(os.environ["HOME"])
115
+
116
+
117
+ def is_behavior_video_file(path: pathlib.Path) -> bool:
118
+ if path.is_dir() or path.suffix not in (".mp4", ".avi", ".json"):
119
+ return False
120
+ with contextlib.suppress(ValueError):
121
+ _ = npc_session.extract_mvr_camera_name(path.as_posix())
122
+ return True
123
+ return False
124
+
125
+
126
+ def is_surface_channel_recording(path_name: str) -> bool:
127
+ """
128
+ >>> import np_session
129
+ >>> session = np_session.Session("//allen/programs/mindscope/workgroups/dynamicrouting/PilotEphys/Task 2 pilot/DRpilot_690706_20231129_surface_channels")
130
+ >>> is_surface_channel_recording(session.npexp_path.as_posix())
131
+ True
132
+ """
133
+ return "surface_channels" in path_name.lower()
134
+
135
+
136
+ def cleanup_ephys_symlinks(toplevel_dir: pathlib.Path) -> None:
137
+ """After creating symlinks to the ephys data, run this to make any necessary
138
+ modifications prior to upload.
139
+
140
+ Provided dir path should be a directory containing all ephys data in
141
+ subfolders (e.g. directory containing "Record Node 10x" folders)
142
+
143
+ Only deletes symlinks or writes new files in place of symlinks - does not
144
+ modify original data.
145
+
146
+ Rules:
147
+ - if any continuous.dat files are unreadable: remove them and their containing folders
148
+ - if any probes were recorded on multiple record nodes: just keep the first
149
+ - if continuous.dat files are missing (ie. excluded because probes weren't
150
+ inserted, or we removed symlinks in previous steps): update metadata files
151
+ """
152
+ remove_unreadable_ephys_data(toplevel_dir)
153
+ remove_duplicate_ephys_data(toplevel_dir)
154
+ cleanup_ephys_metadata(toplevel_dir)
155
+
156
+
157
+ def remove_unreadable_ephys_data(toplevel_dir: pathlib.Path) -> None:
158
+
159
+ for continuous_dir in ephys_continuous_dir_generator(toplevel_dir):
160
+ events_dir = (
161
+ continuous_dir.parent.parent / "events" / continuous_dir.name / "TTL"
162
+ )
163
+ filenames = ("continuous.dat", "timestamps.npy", "sample_numbers.npy")
164
+ dirs = (continuous_dir,) + ((events_dir,) if events_dir.exists() else ())
165
+ mark_for_removal = False
166
+ for d in dirs:
167
+ if not d.exists():
168
+ continue
169
+ for filename in filenames:
170
+ if filename == "continuous.dat" and d.name == "TTL":
171
+ continue # no continuous.dat expected in TTL events
172
+ file = d / filename
173
+ if not (file.is_symlink() or file.exists()):
174
+ logger.warning(
175
+ f"Critical file not found {file}, insufficient data for processing"
176
+ )
177
+ mark_for_removal = True
178
+ break
179
+ try:
180
+ data = np.memmap(
181
+ decode_symlink_path(file),
182
+ dtype="int16" if "timestamps" not in file.name else "float64",
183
+ mode="r",
184
+ )
185
+ except Exception as exc:
186
+ logger.warning(f"Failed to read {file}: {exc!r}")
187
+ mark_for_removal = True
188
+ break
189
+ if data.size == 0:
190
+ logger.warning(f"Empty file {file}")
191
+ mark_for_removal = True
192
+ break
193
+ logger.debug(f"Found readable, non-empty data in {file}")
194
+ if mark_for_removal:
195
+ break
196
+ if mark_for_removal:
197
+ logger.warning(f"Removing {continuous_dir} and its contents")
198
+ remove_folder_of_symlinks(continuous_dir)
199
+ logger.warning(f"Removing {events_dir.parent} and its contents")
200
+ remove_folder_of_symlinks(events_dir.parent)
201
+
202
+
203
+ def remove_duplicate_ephys_data(toplevel_dir: pathlib.Path) -> None:
204
+ logger.info("Checking for duplicate ephys data...")
205
+ paths = sorted(ephys_continuous_dir_generator(toplevel_dir))
206
+ experiments = set(
207
+ re.findall(r"/experiment(\d+)/", path.as_posix())[0] for path in paths
208
+ )
209
+ logger.debug(f"Found {len(experiments)} experiments")
210
+ for experiment in experiments:
211
+ exp_paths = sorted(
212
+ path for path in paths if f"experiment{experiment}" in path.as_posix()
213
+ )
214
+ recordings = set(
215
+ re.findall(r"/recording(\d+)/", path.as_posix())[0] for path in exp_paths
216
+ )
217
+ logger.debug(f"Found {len(recordings)} recordings in experiment{experiment}")
218
+ for recording in recordings:
219
+ recording_paths = sorted(
220
+ path for path in exp_paths if f"recording{recording}" in path.as_posix()
221
+ )
222
+ probes = []
223
+ # import pdb; pdb.set_trace()
224
+ for continuous_dir in recording_paths:
225
+ try:
226
+ probe = npc_session.ProbeRecord(continuous_dir.name)
227
+ except ValueError:
228
+ continue
229
+ suffix = continuous_dir.name.split("-")[-1]
230
+ assert suffix in ("AP", "LFP")
231
+ recording_name = f"{probe}-{suffix}"
232
+ if recording_name in probes:
233
+ logger.info(
234
+ f"Duplicate {recording_name = } found in {continuous_dir.parent.parent} - removing"
235
+ )
236
+ remove_folder_of_symlinks(continuous_dir)
237
+ else:
238
+ probes.append(recording_name)
239
+
240
+
241
+ def remove_folder_of_symlinks(folder: pathlib.Path) -> None:
242
+ """Recursive deletion of all files in dir tree, with a check that each is a
243
+ symlink."""
244
+ for path in folder.rglob("*"):
245
+ if path.is_dir():
246
+ remove_folder_of_symlinks(path)
247
+ else:
248
+ assert path.is_symlink(), f"Expected {path} to be a symlink"
249
+ path.unlink(missing_ok=True)
250
+ with contextlib.suppress(FileNotFoundError):
251
+ folder.rmdir()
252
+
253
+
254
+ def ephys_recording_dir_generator(
255
+ toplevel_dir: pathlib.Path,
256
+ ) -> Generator[pathlib.Path, None, None]:
257
+ for recording_dir in toplevel_dir.rglob("recording[0-9]*"):
258
+ if recording_dir.is_dir():
259
+ yield recording_dir
260
+
261
+
262
+ def ephys_continuous_dir_generator(
263
+ toplevel_dir: pathlib.Path,
264
+ ) -> Generator[pathlib.Path, None, None]:
265
+ for recording_dir in ephys_recording_dir_generator(toplevel_dir):
266
+ parent = recording_dir / "continuous"
267
+ if not parent.exists():
268
+ continue
269
+ for continuous_dir in parent.iterdir():
270
+ if continuous_dir.is_dir():
271
+ yield continuous_dir
272
+
273
+
274
+ def ephys_structure_oebin_generator(
275
+ toplevel_dir: pathlib.Path,
276
+ ) -> Generator[pathlib.Path, None, None]:
277
+ for recording_dir in ephys_recording_dir_generator(toplevel_dir):
278
+ oebin_path = recording_dir / "structure.oebin"
279
+ if not (oebin_path.is_symlink() or oebin_path.exists()):
280
+ # symlinks that are created for the hpc use posix paths, and aren't
281
+ # readable on windows, so .exists() returns False: use .is_symlink() instead
282
+ logger.warning(f"No structure.oebin found in {recording_dir}")
283
+ continue
284
+ yield oebin_path
285
+
286
+
287
+ def cleanup_ephys_metadata(toplevel_dir: pathlib.Path) -> None:
288
+ logger.debug("Checking structure.oebin for missing folders...")
289
+ for oebin_path in ephys_structure_oebin_generator(toplevel_dir):
290
+ oebin_obj = np_tools.read_oebin(decode_symlink_path(oebin_path))
291
+ logger.debug(f"Checking {oebin_path} against actual folders...")
292
+ any_removed = False
293
+ for subdir_name in ("events", "continuous"):
294
+ subdir = oebin_path.parent / subdir_name
295
+ # iterate over copy of list so as to not disrupt iteration when elements are removed
296
+ for device in [device for device in oebin_obj[subdir_name]]:
297
+ if not (subdir / device["folder_name"]).exists():
298
+ logger.info(
299
+ f'{device["folder_name"]} not found in {subdir}, removing from structure.oebin'
300
+ )
301
+ oebin_obj[subdir_name].remove(device)
302
+ any_removed = True
303
+ if any_removed:
304
+ oebin_path.unlink()
305
+ oebin_path.write_text(json.dumps(oebin_obj, indent=4))
306
+ logger.debug(
307
+ "Overwrote symlink to structure.oebin with corrected structure.oebin"
308
+ )
309
+
310
+
311
+ def write_corrected_ephys_timestamps(
312
+ ephys_dir: pathlib.Path,
313
+ behavior_dir: pathlib.Path,
314
+ ) -> None:
315
+ for path in itertools.chain(behavior_dir.glob("*.h5"), behavior_dir.glob("*.sync")):
316
+ with contextlib.suppress(Exception):
317
+ sync_dataset = npc_sync.SyncDataset(path)
318
+ _ = sync_dataset.line_labels
319
+ logger.info(f"Found valid sync file {path.as_posix()}")
320
+ break
321
+ else:
322
+ raise SyncFileNotFoundError(
323
+ f"No valid sync file found in {behavior_dir.as_posix()}"
324
+ )
325
+
326
+ timing_on_pxi = (
327
+ timing
328
+ for timing in npc_ephys.get_ephys_timing_on_pxi(
329
+ ephys_dir.glob("**/experiment*/recording*"),
330
+ )
331
+ )
332
+ timing_on_sync = npc_ephys.get_ephys_timing_on_sync(
333
+ sync=sync_dataset,
334
+ devices=timing_on_pxi,
335
+ )
336
+ npc_ephys.overwrite_timestamps(timing_on_sync)
337
+ logger.info(f"Corrected timestamps in {ephys_dir}")
338
+
339
+
340
+ def decode_symlink_path(oebin_path: pathlib.Path) -> pathlib.Path:
341
+ if not oebin_path.is_symlink():
342
+ return oebin_path
343
+ return np_config.normalize_path(oebin_path.readlink())
344
+
345
+
346
+ def is_csv_in_hpc_upload_queue(
347
+ csv_path: pathlib.Path, upload_service_url: str = AIND_DATA_TRANSFER_SERVICE
348
+ ) -> bool:
349
+ """Check if an upload job has been submitted to the hpc upload queue.
350
+
351
+ - currently assumes one job per csv
352
+ - does not check status (job may be FINISHED rather than RUNNING)
353
+
354
+ >>> is_csv_in_hpc_upload_queue("//allen/programs/mindscope/workgroups/np-exp/codeocean/DRpilot_664851_20231114/upload.csv")
355
+ False
356
+ """
357
+ # get subject-id, acq-datetime from csv
358
+ df = pl.read_csv(csv_path, eol_char="\r")
359
+ for col in df.get_columns():
360
+ if col.name.startswith("subject") and col.name.endswith("id"):
361
+ subject = npc_session.SubjectRecord(col[0])
362
+ continue
363
+ if col.name.startswith("acq") and "datetime" in col.name.lower():
364
+ dt = npc_session.DatetimeRecord(col[0])
365
+ continue
366
+ if col.name == "platform":
367
+ platform = col[0]
368
+ continue
369
+ return is_session_in_hpc_queue(
370
+ subject=subject,
371
+ acq_datetime=dt.dt,
372
+ platform=platform,
373
+ upload_service_url=upload_service_url,
374
+ )
375
+
376
+
377
+ def is_session_in_hpc_queue(
378
+ subject: int | str,
379
+ acq_datetime: str | datetime.datetime,
380
+ platform: str | None = None,
381
+ upload_service_url: str = AIND_DATA_TRANSFER_SERVICE,
382
+ ) -> bool:
383
+ """
384
+ >>> is_session_in_hpc_queue(366122, datetime.datetime(2023, 11, 14, 0, 0, 0))
385
+ False
386
+ >>> is_session_in_hpc_queue(702136, datetime.datetime(2024, 3, 4, 13, 21, 35))
387
+ True
388
+ """
389
+ if not isinstance(acq_datetime, datetime.datetime):
390
+ acq_datetime = datetime.datetime.strptime(acq_datetime, ACQ_DATETIME_FORMAT)
391
+ partial_session_id = f"{subject}_{acq_datetime.strftime(ACQ_DATETIME_FORMAT).replace(' ', '_').replace(':', '-')}"
392
+ if platform:
393
+ partial_session_id = f"{platform}_{partial_session_id}"
394
+
395
+ jobs_response = requests.get(f"{upload_service_url}/jobs")
396
+ jobs_response.raise_for_status()
397
+ return partial_session_id in jobs_response.content.decode()
398
+
399
+
400
+ def is_job_in_hpc_upload_queue(
401
+ job: UploadJobConfigsV2, upload_service_url: str = AIND_DATA_TRANSFER_SERVICE
402
+ ) -> bool:
403
+ return is_session_in_hpc_queue(
404
+ job.subject_id, job.acq_datetime, job.platform.abbreviation, upload_service_url
405
+ )
406
+
407
+
408
+ def write_upload_csv(
409
+ content: dict[str, Any],
410
+ output_path: pathlib.Path,
411
+ ) -> pathlib.Path:
412
+ logger.info(f"Creating upload job file {output_path}")
413
+ with open(output_path, "w") as f:
414
+ w = csv.writer(f, lineterminator="")
415
+ w.writerow(content.keys())
416
+ w.writerow("\n")
417
+ w.writerow(content.values())
418
+ return output_path
419
+
420
+
421
+ def create_upload_job_configs_v2(
422
+ project_name: str,
423
+ platform: str,
424
+ subject_id: str,
425
+ force_cloud_sync: bool,
426
+ modalities: dict[str, str],
427
+ acq_datetime: datetime.datetime,
428
+ user_email: str = HPC_UPLOAD_JOB_EMAIL,
429
+ job_type: str = "default",
430
+ metadata_dir: str | None = None,
431
+ codeocean_pipeline_settings: dict[str, PipelineMonitorSettings] | None = None,
432
+ check_timestamps: bool = True, # default in transfer service is True: checks timestamps have been corrected via flag file
433
+ test: bool = False,
434
+ **extra_UploadJobConfigsV2_params: Any,
435
+ ) -> UploadJobConfigsV2:
436
+ """Create a UploadJobConfigsV2 model. Modalities should be provided in format
437
+ {modality_abbr: input_source}. job_type refers to the default or custom
438
+ presets used for compression and Code Ocean pipelines.
439
+ """
440
+ # Each task in airflow can be configured individually
441
+ # force_cloud_sync
442
+ check_s3_folder_exists_task = Task(skip_task=True) if force_cloud_sync else None
443
+ # metadata_dir
444
+ gather_preliminary_metadata_task = (
445
+ Task(job_settings={"metadata_dir": metadata_dir})
446
+ if metadata_dir is not None
447
+ else None
448
+ )
449
+ # modality transformation settings
450
+ modality_transformation_settings_tasks = dict() # {modality_abbr: Task}
451
+ if "modalities" in extra_UploadJobConfigsV2_params:
452
+ raise ValueError(
453
+ "modalities should not be passed as a parameter in extra_BasicUploadJobConfigs_params"
454
+ )
455
+ for modality_abbr, input_source in modalities.items():
456
+ job_settings: dict[str, Any] = {
457
+ "input_source": input_source,
458
+ "output_directory": "%OUTPUT_LOCATION",
459
+ }
460
+ # Ecephys compression settings are currently hardcoded
461
+ # In the future, these can be stored in AWS param store as part of a "job_type"
462
+ if modality_abbr == Modality.ECEPHYS.abbreviation:
463
+ if not check_timestamps:
464
+ job_settings["check_timestamps"] = False
465
+ image_resources = (
466
+ DEFAULT_EPHYS_SLURM_SETTINGS_DEV
467
+ if test
468
+ else DEFAULT_EPHYS_SLURM_SETTINGS
469
+ ).model_dump(mode="json", exclude_none=True)
470
+ modality_task = Task(
471
+ skip_task=False,
472
+ job_settings=job_settings,
473
+ image_resources=image_resources,
474
+ **DEFAULT_EPHYS_IMAGE,
475
+ )
476
+ else:
477
+ modality_task = Task(
478
+ job_settings=job_settings,
479
+ )
480
+ modality_transformation_settings_tasks[modality_abbr] = modality_task
481
+
482
+ # Code Ocean pipeline settings
483
+ # You can manually specify up to one pipeline conf per modality.
484
+ # These will override any pipelines defined by the job_type.
485
+ # In the future, these can be stored in AWS param store as part of a "job_type"
486
+ codeocean_pipeline_settings_tasks = dict() # {modality_abbr: Task}
487
+ if codeocean_pipeline_settings is not None:
488
+ codeocean_pipeline_settings_tasks = {
489
+ k: Task(
490
+ job_settings={
491
+ "pipeline_monitor_settings": v.model_dump(
492
+ mode="json", exclude_none=True
493
+ )
494
+ }
495
+ )
496
+ for k, v in codeocean_pipeline_settings.items()
497
+ }
498
+ tasks = {
499
+ "check_s3_folder_exists": check_s3_folder_exists_task,
500
+ "gather_preliminary_metadata": gather_preliminary_metadata_task,
501
+ "modality_transformation_settings": modality_transformation_settings_tasks,
502
+ "codeocean_pipeline_settings": codeocean_pipeline_settings_tasks,
503
+ }
504
+ return UploadJobConfigsV2(
505
+ job_type=job_type,
506
+ platform=Platform.from_abbreviation(platform),
507
+ project_name=project_name,
508
+ subject_id=subject_id,
509
+ acq_datetime=acq_datetime,
510
+ modalities=[
511
+ Modality.from_abbreviation(m)
512
+ for m in modality_transformation_settings_tasks.keys()
513
+ ],
514
+ tasks={k: v for k, v in tasks.items() if v is not None},
515
+ user_email=user_email,
516
+ **extra_UploadJobConfigsV2_params,
517
+ )
518
+
519
+
520
+ def put_jobs_for_hpc_upload(
521
+ upload_jobs: UploadJobConfigsV2 | Iterable[UploadJobConfigsV2],
522
+ upload_service_url: str = AIND_DATA_TRANSFER_SERVICE,
523
+ user_email: str = HPC_UPLOAD_JOB_EMAIL,
524
+ email_notification_types: Iterable[
525
+ Literal["begin", "end", "fail", "retry", "all"]
526
+ ] = ("fail",),
527
+ dry_run: bool = False,
528
+ save_path: pathlib.Path | None = None,
529
+ **extra_model_kwargs: Any,
530
+ ) -> None:
531
+ """Submit one or more jobs to the aind-data-transfer-service, for
532
+ upload to S3 on the hpc.
533
+
534
+ - accepts one or more aind_data_transfer_service UploadJobConfigsV2 models
535
+ - assembles a SubmitJobRequestV2 model
536
+ - excludes jobs for sessions that are already in the upload queue
537
+ - accepts additional parameters for SubmitJobRequestV2 as kwargs
538
+ - submits json via http request
539
+ - optionally saves the json file as a record
540
+ """
541
+ if isinstance(upload_jobs, UploadJobConfigsV2):
542
+ upload_jobs = (upload_jobs,)
543
+ submit_request = SubmitJobRequestV2(
544
+ upload_jobs=[job for job in upload_jobs if not is_job_in_hpc_upload_queue(job)],
545
+ user_email=user_email,
546
+ email_notification_types=email_notification_types,
547
+ **extra_model_kwargs,
548
+ )
549
+ post_request_content = submit_request.model_dump(mode="json", exclude_none=True)
550
+ if save_path:
551
+ save_path.write_text(
552
+ submit_request.model_dump_json(
553
+ round_trip=True, exclude_none=True, indent=4
554
+ ),
555
+ errors="ignore",
556
+ )
557
+ if dry_run:
558
+ logger.warning(
559
+ f"Dry run: not submitting {len(upload_jobs)} upload job(s) to {upload_service_url}"
560
+ )
561
+ return
562
+
563
+ # Uncomment to perform a validation check:
564
+ # validate_json_response: requests.Response = requests.post(
565
+ # url=f"{upload_service_url}/api/v2/validate_json",
566
+ # json=post_request_content,
567
+ # )
568
+ # validate_json_response.raise_for_status()
569
+
570
+ # Submit the jobs
571
+ post_json_response: requests.Response = requests.post(
572
+ url=f"{upload_service_url}/api/v2/submit_jobs",
573
+ json=post_request_content,
574
+ )
575
+ logger.info(f"Submitted {len(upload_jobs)} upload job(s) to {upload_service_url}")
576
+ post_json_response.raise_for_status()
577
+
578
+
579
+ @typing_extensions.deprecated(
580
+ "Uses old, pre-v1 endpoints: use put_jobs_for_hpc_upload in combination with create_upload_job_configs_v2"
581
+ )
582
+ def put_csv_for_hpc_upload(
583
+ csv_path: pathlib.Path,
584
+ upload_service_url: str = AIND_DATA_TRANSFER_SERVICE,
585
+ hpc_upload_job_email: str = HPC_UPLOAD_JOB_EMAIL,
586
+ dry_run: bool = False,
587
+ ) -> None:
588
+ """Submit a single job upload csv to the aind-data-transfer-service, for
589
+ upload to S3 on the hpc.
590
+
591
+ - gets validated version of csv
592
+ - checks session is not already being uploaded
593
+ - submits csv via http request
594
+ """
595
+
596
+ def _raise_for_status(response: requests.Response) -> None:
597
+ """pydantic validation errors are returned as strings that can be eval'd
598
+ to get the real error class + message."""
599
+ if response.status_code != 200:
600
+ try:
601
+ response.json()["data"]["errors"]
602
+ except (
603
+ KeyError,
604
+ IndexError,
605
+ requests.exceptions.JSONDecodeError,
606
+ SyntaxError,
607
+ ) as exc1:
608
+ try:
609
+ response.raise_for_status()
610
+ except requests.exceptions.HTTPError as exc2:
611
+ raise exc2 from exc1
612
+
613
+ with open(csv_path, "rb") as f:
614
+ validate_csv_response = requests.post(
615
+ url=f"{upload_service_url}/api/validate_csv",
616
+ files=dict(file=f),
617
+ )
618
+ _raise_for_status(validate_csv_response)
619
+ logger.debug(f"Validated response: {validate_csv_response.json()}")
620
+ if is_csv_in_hpc_upload_queue(csv_path, upload_service_url):
621
+ logger.warning(f"Job already submitted for {csv_path}")
622
+ return
623
+ if dry_run:
624
+ logger.info(
625
+ f"Dry run: not submitting {csv_path} to hpc upload queue at {upload_service_url}."
626
+ )
627
+ return
628
+ post_csv_response = requests.post(
629
+ url=f"{upload_service_url}/api/submit_hpc_jobs",
630
+ json=dict(
631
+ jobs=[
632
+ dict(
633
+ hpc_settings=json.dumps(
634
+ {"time_limit": 60 * 15, "mail_user": hpc_upload_job_email}
635
+ ),
636
+ upload_job_settings=validate_csv_response.json()["data"]["jobs"][0],
637
+ script="",
638
+ )
639
+ ]
640
+ ),
641
+ )
642
+ logger.info(f"Submitted {csv_path} to hpc upload queue at {upload_service_url}")
643
+ _raise_for_status(post_csv_response)
644
+
645
+
646
+ def ensure_posix(path: str | pathlib.Path) -> str:
647
+ posix = pathlib.Path(path).as_posix()
648
+ if posix.startswith("//"):
649
+ posix = posix[1:]
650
+ return posix
651
+
652
+
653
+ def convert_symlinks_to_posix(toplevel_dir: str | pathlib.Path) -> None:
654
+ """Convert all symlinks in `root_dir` (recursively) to POSIX paths. This is a
655
+ necessary last step before submitting uploads to run on the HPC.
656
+ """
657
+ for path in pathlib.Path(toplevel_dir).rglob("*"):
658
+ if path.is_symlink():
659
+ posix_target = path.readlink().as_posix().removeprefix("//?/UNC")
660
+ path.unlink()
661
+ np_tools.symlink(src=ensure_posix(posix_target), dest=path)
662
+
663
+
664
+ if __name__ == "__main__":
665
+ import doctest
666
+
667
+ doctest.testmod(
668
+ optionflags=doctest.ELLIPSIS
669
+ | doctest.NORMALIZE_WHITESPACE
670
+ | doctest.IGNORE_EXCEPTION_DETAIL
671
+ )
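
As context for the helpers above, a minimal sketch (not part of the package) of how the pre-upload preparation functions might be chained; the staging directory and subfolder names are hypothetical placeholders:

import pathlib

from np_codeocean.utils import (
    cleanup_ephys_symlinks,
    convert_symlinks_to_posix,
    write_corrected_ephys_timestamps,
)

upload_root = pathlib.Path("//allen/path/to/upload_staging")  # hypothetical staging dir of symlinks
ephys_dir = upload_root / "ecephys"      # hypothetical subfolder containing "Record Node 10x" dirs
behavior_dir = upload_root / "behavior"  # hypothetical subfolder containing the sync (.h5/.sync) file

# Drop unreadable or duplicate recordings and patch structure.oebin to match;
# only symlinks are removed or rewritten, so original data is untouched.
cleanup_ephys_symlinks(ephys_dir)

# Align ephys timestamps to the sync file found in behavior_dir
# (raises SyncFileNotFoundError if no valid sync file is present).
write_corrected_ephys_timestamps(ephys_dir, behavior_dir)

# Described in the module as the necessary last step before HPC submission:
# rewrite every symlink target as a POSIX path.
convert_symlinks_to_posix(upload_root)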
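
Likewise, a minimal sketch of the v2 submission flow, combining create_upload_job_configs_v2 with put_jobs_for_hpc_upload (put_csv_for_hpc_upload is deprecated in favor of this path); the project name, subject ID, acquisition datetime and input path are hypothetical placeholders:

import datetime
import pathlib

from aind_data_schema_models.modalities import Modality

from np_codeocean.utils import (
    DEV_SERVICE,
    create_upload_job_configs_v2,
    put_jobs_for_hpc_upload,
)

job = create_upload_job_configs_v2(
    project_name="Dynamic Routing",   # hypothetical project name
    platform="ecephys",               # must be a valid Platform abbreviation
    subject_id="366122",              # hypothetical subject ID
    force_cloud_sync=False,
    modalities={
        # {modality_abbr: input_source}, as the docstring describes
        Modality.ECEPHYS.abbreviation: "//allen/path/to/ephys_symlinks",  # hypothetical input source
    },
    acq_datetime=datetime.datetime(2024, 3, 4, 13, 21, 35),
    test=True,  # use the dev slurm settings for the ephys compression task
)

# dry_run=True logs what would be submitted without POSTing to the service;
# save_path keeps a JSON record of the SubmitJobRequestV2 payload.
put_jobs_for_hpc_upload(
    job,
    upload_service_url=DEV_SERVICE,
    dry_run=True,
    save_path=pathlib.Path("upload_request.json"),  # hypothetical output location
)

Jobs whose sessions already appear in the upload queue are filtered out by put_jobs_for_hpc_upload before the request is assembled.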