np_codeocean 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,413 +1,483 @@
- from __future__ import annotations
-
- import argparse
- import concurrent.futures
- import contextlib
- import datetime
- import logging
- import logging.config
- import logging.handlers
- import multiprocessing.synchronize
- import pathlib
- import multiprocessing
- import multiprocessing.managers
- import threading
- import time
- import warnings
- from pathlib import Path
-
- import h5py
- import tqdm
- import np_codeocean
- import np_codeocean.utils
- from np_codeocean.metadata import core as metadata_core
- import np_config
- import np_session
- import np_tools
- import npc_lims
- import npc_session
- import npc_sessions # this is heavy, but has the logic for hdf5 -> session.json
- from aind_data_schema.core.rig import Rig
- from npc_lims.exceptions import NoSessionInfo
-
- import np_codeocean
- import np_codeocean.utils
-
- # Disable divide by zero or NaN warnings
- warnings.filterwarnings("ignore", category=RuntimeWarning)
-
- def reset_log_file() -> None:
-     log = get_log_file()
-     log.parent.mkdir(exist_ok=True)
-     with contextlib.suppress(OSError):
-         log.unlink(missing_ok=True)
-
- def get_log_file() -> pathlib.Path:
-     folder = pathlib.Path("//allen/programs/mindscope/workgroups/np-exp") / "codeocean-logs"
-     folder.mkdir(exist_ok=True)
-     return folder / f"{pathlib.Path(__file__).stem}_{datetime.datetime.now().strftime('%Y-%m-%d')}.log"
-
- logging.basicConfig(
-     filename=get_log_file().as_posix(),
-     level=logging.INFO,
-     format="%(asctime)s | %(name)s | %(levelname)s | PID: %(process)d | TID: %(thread)d | %(message)s",
-     datefmt="%Y-%d-%m %H:%M:%S",
- )
- logger = logging.getLogger(__name__)
-
- RIG_ROOM_MAPPING = np_config.fetch('/rigs/room_numbers')
- HDF5_REPO = pathlib.Path('//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data')
- SESSION_FOLDER_DIRS = (
-     pathlib.Path('//allen/programs/mindscope/workgroups/dynamicrouting/PilotEphys/Task 2 pilot'),
-     pathlib.Path('//allen/programs/mindscope/workgroups/templeton/TTOC/pilot recordings'),
- )
-
- EXCLUDED_SUBJECT_IDS = (0, 366122, 555555, 000000, 598796, 603810, 599657)
- TASK_HDF5_GLOB = "DynamicRouting1*.hdf5"
- RIG_IGNORE_PREFIXES = ("NP", "OG")
-
- DEFAULT_HPC_UPLOAD_JOB_EMAIL = "ben.hardcastle@alleninstitute.org"
-
- DEFAULT_DELAY_BETWEEN_UPLOADS = 40
-
-
- class SessionNotUploadedError(ValueError):
-     pass
-
- class UploadLimitReachedError(RuntimeError):
-     pass
-
- def reformat_rig_model_rig_id(rig_id: str, modification_date: datetime.date) -> str:
-     rig_record = npc_session.RigRecord(rig_id)
-     if not rig_record.is_behavior_cluster_rig:
-         raise ValueError(
-             f"Only behavior boxes are supported: {rig_id=}")
-     room_number = RIG_ROOM_MAPPING.get(rig_record.behavior_cluster_id, "UNKNOWN")
-     return rig_record.as_aind_data_schema_rig_id(str(room_number), modification_date)
-
-
- def extract_modification_date(rig: Rig) -> datetime.date:
-     _, _, date_str = rig.rig_id.split("_")
-     if len(date_str) == 6:
-         return datetime.datetime.strptime(date_str, "%y%m%d").date()
-     elif len(date_str) == 8:
-         return datetime.datetime.strptime(date_str, "%Y%m%d").date()
-     else:
-         raise ValueError(f"Unsupported date format: {date_str}")
-
- def add_metadata(
-     task_source: pathlib.Path,
-     dest: pathlib.Path,
-     rig_storage_directory: pathlib.Path,
- ):
-     """Adds `aind-data-schema` rig and session metadata to a session directory.
-     """
-     # we need to patch due to this bug not getting addressed: https://github.com/AllenInstitute/npc_sessions/pull/103
-     # npc_sessions.Session._aind_rig_id = property(aind_rig_id_patch)
-     npc_sessions.Session(task_source) \
-         ._aind_session_metadata.write_standard_file(dest)
-
-     session_metadata_path = dest / "session.json"
-     rig_metadata_path = metadata_core.copy_task_rig(
-         task_source,
-         dest / "rig.json",
-         rig_storage_directory,
-     )
-     if not rig_metadata_path:
-         raise FileNotFoundError("Failed to copy task rig.")
-
-     rig_metadata = Rig.model_validate_json(rig_metadata_path.read_text())
-     modification_date = datetime.date(2024, 4, 1) # keep cluster rigs static for now
-     rig_metadata.modification_date = modification_date
-     rig_metadata.rig_id = reformat_rig_model_rig_id(rig_metadata.rig_id, modification_date)
-     rig_metadata.write_standard_file(dest) # assumes this will work out to dest/rig.json
-
-     metadata_core.update_session_from_rig(
-         session_metadata_path,
-         rig_metadata_path,
-         session_metadata_path,
-     )
-
-
- def upload(
-     task_source: Path,
-     test: bool = False,
-     force_cloud_sync: bool = False,
-     debug: bool = False,
-     dry_run: bool = False,
-     hpc_upload_job_email: str = DEFAULT_HPC_UPLOAD_JOB_EMAIL,
-     delay: int = DEFAULT_DELAY_BETWEEN_UPLOADS,
-     lock: threading.Lock | None = None,
-     stop_event: threading.Event | None = None,
- ) -> None:
-     """
-     Notes
-     -----
-     - task_source Path is expected to have the following naming convention:
-       //allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/<SUBJECT_ID>/<SESSION_ID>.hdf5
-     """
-     if debug:
-         logger.setLevel(logging.DEBUG)
-
-     if stop_event and stop_event.is_set():
-         logger.debug("Stopping due to stop event")
-         return
-
-     extracted_subject_id = npc_session.extract_subject(task_source.stem)
-     if extracted_subject_id is None:
-         raise SessionNotUploadedError(f"Failed to extract subject ID from {task_source}")
-     logger.debug(f"Extracted subject id: {extracted_subject_id}")
-     # we don't want to upload files from folders that don't correspond to labtracks IDs, like `sound`, or `*_test`
-     if not task_source.parent.name.isdigit():
-         raise SessionNotUploadedError(
-             f"{task_source.parent.name=} is not a labtracks MID"
-         )
-
-     if extracted_subject_id in EXCLUDED_SUBJECT_IDS:
-         raise SessionNotUploadedError(
-             f"{extracted_subject_id=} is in {EXCLUDED_SUBJECT_IDS=}"
-         )
-
-     upload_root = np_session.NPEXP_ROOT / ("codeocean-dev" if test else "codeocean")
-     session_dir = upload_root / f"{extracted_subject_id}_{npc_session.extract_isoformat_date(task_source.stem)}"
-
-     np_codeocean.utils.set_npc_lims_credentials()
-     try:
-         session_info = npc_lims.get_session_info(task_source.stem)
-     except NoSessionInfo:
-         raise SessionNotUploadedError(f"{task_source.name} not in Sam's spreadsheets (yet) - cannot deduce project etc.") from None
-
-     # if session has been already been uploaded, skip it
-     if not (force_cloud_sync or test) and session_info.is_uploaded: # note: session_info.is_uploaded doesnt work for uploads to dev service
-         raise SessionNotUploadedError(
-             f" {task_source.name} is already uploaded. Use --force-cloud-sync to re-upload."
-         )
-
-     # in the transfer-service airflow dag, jobs have failed after creating a folder
-     # on S3, but before a data asset is created in codeocean (likely due to codeocean
-     # being down):
-     # in that case, our `is_uploaded` check would return False, but in airflow,
-     # there's a `check_s3_folder_exists` task, which will fail since the folder
-     # already exists.
-     # To avoid this second failure, we can force a re-upload, regardless of
-     # whether the folder exists on S3 or not
-     force_cloud_sync = True
-
-     rig_name = ""
-     rig_name = session_info.training_info.get("rig_name", "")
-     if not rig_name:
-         with h5py.File(task_source, 'r') as file, contextlib.suppress(KeyError):
-             rig_name = file['rigName'][()].decode('utf-8')
-
-     if any(rig_name.startswith(i) for i in RIG_IGNORE_PREFIXES):
-         raise SessionNotUploadedError(
-             f"Not uploading {task_source} because rig_id starts with one of {RIG_IGNORE_PREFIXES!r}"
-         )
-
-     if stop_event and stop_event.is_set():
-         logger.debug("Stopping due to stop event")
-         return
-
-     logger.debug(f"Session upload directory: {session_dir}")
-
-     # external systems start getting modified here.
-     session_dir.mkdir(exist_ok=True)
-     metadata_dir = session_dir / 'aind_metadata'
-     metadata_dir.mkdir(exist_ok=True)
-     behavior_modality_dir = session_dir / "behavior"
-     behavior_modality_dir.mkdir(exist_ok=True)
-
-     rig_storage_directory = np_codeocean.get_project_config()["rig_metadata_dir"]
-     logger.debug(f"Rig storage directory: {rig_storage_directory}")
-     add_metadata(
-         task_source,
-         metadata_dir,
-         rig_storage_directory=rig_storage_directory,
-     )
-
-     np_tools.symlink(
-         np_codeocean.utils.ensure_posix(task_source),
-         behavior_modality_dir / task_source.name,
-     )
-
-     upload_job_path = np_config.normalize_path(session_dir / 'upload.json')
-
-     upload_service_url = np_codeocean.utils.DEV_SERVICE \
-         if test else np_codeocean.utils.AIND_DATA_TRANSFER_SERVICE
-
-     if stop_event and stop_event.is_set():
-         logger.debug("Stopping due to stop event")
-         return
-
-     if lock is not None:
-         with lock:
-             if stop_event and stop_event.is_set():
-                 logger.debug("Stopping due to stop event")
-                 return
-             if delay > 0:
-                 logger.info(f"Pausing {delay} seconds before creating upload request")
-                 time.sleep(delay)
-
-     logger.info(f"Submitting {session_dir.name} to {upload_service_url}")
-
-     acq_datetime_str = npc_session.extract_isoformat_datetime(task_source.stem)
-     if not acq_datetime_str:
-         raise SessionNotUploadedError(f"Could not extract acquisition datetime from {task_source.stem}")
-     np_codeocean.utils.put_jobs_for_hpc_upload(
-         upload_jobs=np_codeocean.utils.create_upload_job_configs_v2(
-             subject_id=str(extracted_subject_id),
-             acq_datetime=datetime.datetime.fromisoformat(acq_datetime_str),
-             project_name='Dynamic Routing',
-             platform='behavior',
-             modalities={
-                 'behavior': np_config.normalize_path(behavior_modality_dir).as_posix()
-             },
-             metadata_dir=np_config.normalize_path(metadata_dir).as_posix(),
-             force_cloud_sync=force_cloud_sync,
-             user_email=hpc_upload_job_email,
-             test=test,
-         ),
-         upload_service_url=upload_service_url,
-         user_email=hpc_upload_job_email,
-         dry_run=dry_run,
-         save_path=upload_job_path,
-     )
-
- def upload_batch(
-     batch_dir: pathlib.Path,
-     test: bool = False,
-     force_cloud_sync: bool = False,
-     debug: bool = False,
-     dry_run: bool = False,
-     hpc_upload_job_email: str = DEFAULT_HPC_UPLOAD_JOB_EMAIL,
-     delay: int = DEFAULT_DELAY_BETWEEN_UPLOADS,
-     chronological_order: bool = False,
-     batch_limit: int | None = None, # number of sessions to process, not upload
-     ignore_errors: bool = True,
- ) -> None:
-     if test:
-         batch_limit = 3
-
-     logger.addHandler(qh := logging.handlers.QueueHandler(queue := multiprocessing.Queue()))
-     listener = logging.handlers.QueueListener(queue, qh)
-     listener.start()
-     sorted_files = tuple(
-         sorted(
-             batch_dir.rglob(TASK_HDF5_GLOB),
-             key=lambda p: npc_session.extract_isoformat_date(p.name), # type: ignore[return-value]
-             reverse=not chronological_order,
-         )
-     ) # to fix tqdm we need the length of files: len(futures_dict) doesn't work for some reason
-     upload_count = 0
-     batch_count = 0
-     future_to_task_source: dict[concurrent.futures.Future, pathlib.Path] = {}
-     with (
-         multiprocessing.Manager() as manager,
-         concurrent.futures.ProcessPoolExecutor(max_workers=None) as executor,
-     ):
-         sessions_remaining = manager.Value('i', batch_limit or -1)
-         """Counts down and stops at zero. Set to -1 for no limit"""
-         lock = manager.Lock()
-         stop_event = manager.Event()
-         for task_source in sorted_files:
-             future = executor.submit(
-                 upload,
-                 task_source=task_source,
-                 test=test,
-                 force_cloud_sync=force_cloud_sync,
-                 debug=debug,
-                 dry_run=dry_run,
-                 hpc_upload_job_email=hpc_upload_job_email,
-                 delay=delay,
-                 lock=lock,
-                 stop_event=stop_event,
-             )
-             future_to_task_source[future] = task_source
-         with tqdm.tqdm(total=len(sorted_files), desc="Checking status and uploading new sessions") as pbar:
-             for future in concurrent.futures.as_completed(future_to_task_source):
-                 try:
-                     _ = future.result()
-                 except SessionNotUploadedError as exc: # any other errors will be raised: prefer to fail fast when we have 12k files to process
-                     logger.debug('Skipping upload of %s due to %r' % (future_to_task_source[future], exc))
-                 except Exception as e:
-                     logger.exception(e)
-                     if not ignore_errors:
-                         pbar.close()
-                         raise e
-                 else:
-                     upload_count += 1
-                 finally:
-                     pbar.update(1) # as_completed will iterate out of order, so update tqdm progress manually
-                     batch_count += 1
-                     if batch_limit is not None and batch_count >= batch_limit:
-                         pbar.close()
-                         msg = f"Reached {batch_limit = }: stopping pending and ongoing tasks"
-                         logger.info(msg)
-                         print(msg)
-                         stop_event.set()
-                         executor.shutdown(wait=True, cancel_futures=True)
-                         break
-     pbar.close()
-     msg = f"Batch upload complete: {upload_count} session(s) uploaded"
-     logger.info(msg)
-     print(msg)
-     listener.stop()
-
- def parse_args() -> argparse.Namespace:
-     parser = argparse.ArgumentParser()
-     parser.add_argument('--task-source', type=pathlib.Path, default=HDF5_REPO, help="Path to a single DynamicRouting1*.hdf5 file or a directory containing them (rglob will be used to find files in all subfolder levels)")
-     parser.add_argument('--test', action="store_true")
-     parser.add_argument('--force-cloud-sync', action="store_true")
-     parser.add_argument('--debug', action="store_true")
-     parser.add_argument('--dry-run', action="store_true")
-     parser.add_argument('--email', type=str, help=f"[optional] specify email address for hpc upload job updates. Default is {np_codeocean.utils.HPC_UPLOAD_JOB_EMAIL}")
-     parser.add_argument('--delay', type=int, help=f"wait time (sec) between job submissions in batch mode, to avoid overloadig upload service. Default is {DEFAULT_DELAY_BETWEEN_UPLOADS}", default=DEFAULT_DELAY_BETWEEN_UPLOADS)
-     parser.add_argument('--chronological', action="store_true", help="[batch mode only] Upload files in chronological order (oldest first) - default is newest first")
-     parser.add_argument('--batch-limit', type=int, help="[batch mode only] Limit the number of files to upload in batch mode")
-     parser.add_argument('--fail-fast', dest="ignore_errors", action="store_false", help="[batch mode only] If a session fails to upload, raise the error - default is to log error and continue with other sessions")
-     return parser.parse_args()
-
-
- def main() -> None:
-     reset_log_file()
-     args = parse_args()
-     logger.info(f"Parsed args: {args!r}")
-     if not args.task_source.is_dir():
-         logger.info(f"Uploading in single file mode: {args.task_source}")
-         upload(
-             args.task_source,
-             test=args.test,
-             force_cloud_sync=args.force_cloud_sync,
-             debug=args.debug,
-             dry_run=args.dry_run,
-             hpc_upload_job_email=args.email,
-         )
-     else:
-         logger.info(f"Uploading in batch mode: {args.task_source}")
-         upload_batch(
-             batch_dir=args.task_source,
-             test=args.test,
-             force_cloud_sync=args.force_cloud_sync,
-             debug=args.debug,
-             dry_run=args.dry_run,
-             hpc_upload_job_email=args.email,
-             delay=args.delay,
-             chronological_order=args.chronological,
-             batch_limit=args.batch_limit,
-             ignore_errors=args.ignore_errors,
-         )
-
-
- if __name__ == '__main__':
-     main()
-     # upload(
-     #     task_source=pathlib.Path("//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/714753/DynamicRouting1_714753_20240703_114241.hdf5"),
-     #     test=True,
-     # )
-     # upload(
-     #     task_source=Path("//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/659250/DynamicRouting1_659250_20230322_151236.hdf5"),
-     #     test=True,
-     #     force_cloud_sync=True,
-     #     debug=True,
-     #     dry_run=False,
-     # )
+ from __future__ import annotations
+
+ import argparse
+ import concurrent.futures
+ import contextlib
+ import datetime
+ import logging
+ import logging.config
+ import logging.handlers
+ import multiprocessing
+ import multiprocessing.managers
+ import multiprocessing.synchronize
+ import pathlib
+ import threading
+ import time
+ import warnings
+ from pathlib import Path
+
+ import h5py
+ import np_config
+ import np_session
+ import np_tools
+ import npc_lims
+ import npc_session
+ import npc_sessions # this is heavy, but has the logic for hdf5 -> session.json
+ import tqdm
+ from aind_data_schema.core.rig import Rig
+ from npc_lims.exceptions import NoSessionInfo
+
+ import np_codeocean
+ import np_codeocean.utils
+ from np_codeocean.metadata import core as metadata_core
+
+ # Disable divide by zero or NaN warnings
+ warnings.filterwarnings("ignore", category=RuntimeWarning)
+
+
+ def reset_log_file() -> None:
+     log = get_log_file()
+     log.parent.mkdir(exist_ok=True)
+     with contextlib.suppress(OSError):
+         log.unlink(missing_ok=True)
+
+
+ def get_log_file() -> pathlib.Path:
+     folder = (
+         pathlib.Path("//allen/programs/mindscope/workgroups/np-exp") / "codeocean-logs"
+     )
+     folder.mkdir(exist_ok=True)
+     return (
+         folder
+         / f"{pathlib.Path(__file__).stem}_{datetime.datetime.now().strftime('%Y-%m-%d')}.log"
+     )
+
+
+ logging.basicConfig(
+     filename=get_log_file().as_posix(),
+     level=logging.INFO,
+     format="%(asctime)s | %(name)s | %(levelname)s | PID: %(process)d | TID: %(thread)d | %(message)s",
+     datefmt="%Y-%d-%m %H:%M:%S",
+ )
+ logger = logging.getLogger(__name__)
+
+ RIG_ROOM_MAPPING = np_config.fetch("/rigs/room_numbers")
+ HDF5_REPO = pathlib.Path(
+     "//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data"
+ )
+ SESSION_FOLDER_DIRS = (
+     pathlib.Path(
+         "//allen/programs/mindscope/workgroups/dynamicrouting/PilotEphys/Task 2 pilot"
+     ),
+     pathlib.Path(
+         "//allen/programs/mindscope/workgroups/templeton/TTOC/pilot recordings"
+     ),
+ )
+
+ EXCLUDED_SUBJECT_IDS = (0, 366122, 555555, 000000, 598796, 603810, 599657)
+ TASK_HDF5_GLOB = "DynamicRouting1*.hdf5"
+ RIG_IGNORE_PREFIXES = ("NP", "OG")
+
+ DEFAULT_HPC_UPLOAD_JOB_EMAIL = "ben.hardcastle@alleninstitute.org"
+
+ DEFAULT_DELAY_BETWEEN_UPLOADS = 40
+
+
+ class SessionNotUploadedError(ValueError):
+     pass
+
+
+ class UploadLimitReachedError(RuntimeError):
+     pass
+
+
+ def reformat_rig_model_rig_id(rig_id: str, modification_date: datetime.date) -> str:
+     rig_record = npc_session.RigRecord(rig_id)
+     if not rig_record.is_behavior_cluster_rig:
+         raise ValueError(f"Only behavior boxes are supported: {rig_id=}")
+     room_number = RIG_ROOM_MAPPING.get(rig_record.behavior_cluster_id, "UNKNOWN")
+     return rig_record.as_aind_data_schema_rig_id(str(room_number), modification_date)
+
+
+ def extract_modification_date(rig: Rig) -> datetime.date:
+     _, _, date_str = rig.rig_id.split("_")
+     if len(date_str) == 6:
+         return datetime.datetime.strptime(date_str, "%y%m%d").date()
+     elif len(date_str) == 8:
+         return datetime.datetime.strptime(date_str, "%Y%m%d").date()
+     else:
+         raise ValueError(f"Unsupported date format: {date_str}")
+
+
+ def add_metadata(
+     task_source: pathlib.Path,
+     dest: pathlib.Path,
+     rig_storage_directory: pathlib.Path,
+ ):
+     """Adds `aind-data-schema` rig and session metadata to a session directory."""
+     # we need to patch due to this bug not getting addressed: https://github.com/AllenInstitute/npc_sessions/pull/103
+     # npc_sessions.Session._aind_rig_id = property(aind_rig_id_patch)
+     npc_sessions.Session(task_source)._aind_session_metadata.write_standard_file(dest)
+
+     session_metadata_path = dest / "session.json"
+     rig_metadata_path = metadata_core.copy_task_rig(
+         task_source,
+         dest / "rig.json",
+         rig_storage_directory,
+     )
+     if not rig_metadata_path:
+         raise FileNotFoundError("Failed to copy task rig.")
+
+     rig_metadata = Rig.model_validate_json(rig_metadata_path.read_text())
+     modification_date = datetime.date(2024, 4, 1) # keep cluster rigs static for now
+     rig_metadata.modification_date = modification_date
+     rig_metadata.rig_id = reformat_rig_model_rig_id(
+         rig_metadata.rig_id, modification_date
+     )
+     rig_metadata.write_standard_file(
+         dest
+     ) # assumes this will work out to dest/rig.json
+
+     metadata_core.update_session_from_rig(
+         session_metadata_path,
+         rig_metadata_path,
+         session_metadata_path,
+     )
+
+
+ def upload(
+     task_source: Path,
+     test: bool = False,
+     force_cloud_sync: bool = False,
+     debug: bool = False,
+     dry_run: bool = False,
+     hpc_upload_job_email: str = DEFAULT_HPC_UPLOAD_JOB_EMAIL,
+     delay: int = DEFAULT_DELAY_BETWEEN_UPLOADS,
+     lock: threading.Lock | None = None,
+     stop_event: threading.Event | None = None,
+ ) -> None:
+     """
+     Notes
+     -----
+     - task_source Path is expected to have the following naming convention:
+       //allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/<SUBJECT_ID>/<SESSION_ID>.hdf5
+     """
+     if debug:
+         logger.setLevel(logging.DEBUG)
+
+     if stop_event and stop_event.is_set():
+         logger.debug("Stopping due to stop event")
+         return
+
+     extracted_subject_id = npc_session.extract_subject(task_source.stem)
+     if extracted_subject_id is None:
+         raise SessionNotUploadedError(
+             f"Failed to extract subject ID from {task_source}"
+         )
+     logger.debug(f"Extracted subject id: {extracted_subject_id}")
+     # we don't want to upload files from folders that don't correspond to labtracks IDs, like `sound`, or `*_test`
+     if not task_source.parent.name.isdigit():
+         raise SessionNotUploadedError(
+             f"{task_source.parent.name=} is not a labtracks MID"
+         )
+
+     if extracted_subject_id in EXCLUDED_SUBJECT_IDS:
+         raise SessionNotUploadedError(
+             f"{extracted_subject_id=} is in {EXCLUDED_SUBJECT_IDS=}"
+         )
+
+     upload_root = np_session.NPEXP_ROOT / ("codeocean-dev" if test else "codeocean")
+     session_dir = (
+         upload_root
+         / f"{extracted_subject_id}_{npc_session.extract_isoformat_date(task_source.stem)}"
+     )
+
+     np_codeocean.utils.set_npc_lims_credentials()
+     try:
+         session_info = npc_lims.get_session_info(task_source.stem)
+     except NoSessionInfo:
+         raise SessionNotUploadedError(
+             f"{task_source.name} not in Sam's spreadsheets (yet) - cannot deduce project etc."
+         ) from None
+
+     # if session has been already been uploaded, skip it
+     if (
+         not (force_cloud_sync or test) and session_info.is_uploaded
+     ): # note: session_info.is_uploaded doesnt work for uploads to dev service
+         raise SessionNotUploadedError(
+             f" {task_source.name} is already uploaded. Use --force-cloud-sync to re-upload."
+         )
+
+     # in the transfer-service airflow dag, jobs have failed after creating a folder
+     # on S3, but before a data asset is created in codeocean (likely due to codeocean
+     # being down):
+     # in that case, our `is_uploaded` check would return False, but in airflow,
+     # there's a `check_s3_folder_exists` task, which will fail since the folder
+     # already exists.
+     # To avoid this second failure, we can force a re-upload, regardless of
+     # whether the folder exists on S3 or not
+     force_cloud_sync = True
+
+     rig_name = ""
+     rig_name = session_info.training_info.get("rig_name", "")
+     if not rig_name:
+         with h5py.File(task_source, "r") as file, contextlib.suppress(KeyError):
+             rig_name = file["rigName"][()].decode("utf-8")
+
+     if any(rig_name.startswith(i) for i in RIG_IGNORE_PREFIXES):
+         raise SessionNotUploadedError(
+             f"Not uploading {task_source} because rig_id starts with one of {RIG_IGNORE_PREFIXES!r}"
+         )
+
+     if stop_event and stop_event.is_set():
+         logger.debug("Stopping due to stop event")
+         return
+
+     logger.debug(f"Session upload directory: {session_dir}")
+
+     # external systems start getting modified here.
+     session_dir.mkdir(exist_ok=True)
+     metadata_dir = session_dir / "aind_metadata"
+     metadata_dir.mkdir(exist_ok=True)
+     behavior_modality_dir = session_dir / "behavior"
+     behavior_modality_dir.mkdir(exist_ok=True)
+
+     rig_storage_directory = np_codeocean.get_project_config()["rig_metadata_dir"]
+     logger.debug(f"Rig storage directory: {rig_storage_directory}")
+     add_metadata(
+         task_source,
+         metadata_dir,
+         rig_storage_directory=rig_storage_directory,
+     )
+
+     np_tools.symlink(
+         np_codeocean.utils.ensure_posix(task_source),
+         behavior_modality_dir / task_source.name,
+     )
+
+     upload_job_path = np_config.normalize_path(session_dir / "upload.json")
+
+     upload_service_url = (
+         np_codeocean.utils.DEV_SERVICE
+         if test
+         else np_codeocean.utils.AIND_DATA_TRANSFER_SERVICE
+     )
+
+     if stop_event and stop_event.is_set():
+         logger.debug("Stopping due to stop event")
+         return
+
+     if lock is not None:
+         with lock:
+             if stop_event and stop_event.is_set():
+                 logger.debug("Stopping due to stop event")
+                 return
+             if delay > 0:
+                 logger.info(f"Pausing {delay} seconds before creating upload request")
+                 time.sleep(delay)
+
+     logger.info(f"Submitting {session_dir.name} to {upload_service_url}")
+
+     acq_datetime_str = npc_session.extract_isoformat_datetime(task_source.stem)
+     if not acq_datetime_str:
+         raise SessionNotUploadedError(
+             f"Could not extract acquisition datetime from {task_source.stem}"
+         )
+     np_codeocean.utils.put_jobs_for_hpc_upload(
+         upload_jobs=np_codeocean.utils.create_upload_job_configs_v2(
+             subject_id=str(extracted_subject_id),
+             acq_datetime=datetime.datetime.fromisoformat(acq_datetime_str),
+             project_name="Dynamic Routing",
+             platform="behavior",
+             modalities={
+                 "behavior": np_config.normalize_path(behavior_modality_dir).as_posix()
+             },
+             metadata_dir=np_config.normalize_path(metadata_dir).as_posix(),
+             force_cloud_sync=force_cloud_sync,
+             user_email=hpc_upload_job_email,
+             test=test,
+         ),
+         upload_service_url=upload_service_url,
+         user_email=hpc_upload_job_email,
+         dry_run=dry_run,
+         save_path=upload_job_path,
+     )
+
+
+ def upload_batch(
+     batch_dir: pathlib.Path,
+     test: bool = False,
+     force_cloud_sync: bool = False,
+     debug: bool = False,
+     dry_run: bool = False,
+     hpc_upload_job_email: str = DEFAULT_HPC_UPLOAD_JOB_EMAIL,
+     delay: int = DEFAULT_DELAY_BETWEEN_UPLOADS,
+     chronological_order: bool = False,
+     batch_limit: int | None = None, # number of sessions to process, not upload
+     ignore_errors: bool = True,
+ ) -> None:
+     if test:
+         batch_limit = 3
+
+     logger.addHandler(
+         qh := logging.handlers.QueueHandler(queue := multiprocessing.Queue())
+     )
+     listener = logging.handlers.QueueListener(queue, qh)
+     listener.start()
+     sorted_files = tuple(
+         sorted(
+             batch_dir.rglob(TASK_HDF5_GLOB),
+             key=lambda p: npc_session.extract_isoformat_date(p.name), # type: ignore[return-value]
+             reverse=not chronological_order,
+         )
+     ) # to fix tqdm we need the length of files: len(futures_dict) doesn't work for some reason
+     upload_count = 0
+     batch_count = 0
+     future_to_task_source: dict[concurrent.futures.Future, pathlib.Path] = {}
+     with (
+         multiprocessing.Manager() as manager,
+         concurrent.futures.ProcessPoolExecutor(max_workers=None) as executor,
+     ):
+         sessions_remaining = manager.Value("i", batch_limit or -1)
+         """Counts down and stops at zero. Set to -1 for no limit"""
+         lock = manager.Lock()
+         stop_event = manager.Event()
+         for task_source in sorted_files:
+             future = executor.submit(
+                 upload,
+                 task_source=task_source,
+                 test=test,
+                 force_cloud_sync=force_cloud_sync,
+                 debug=debug,
+                 dry_run=dry_run,
+                 hpc_upload_job_email=hpc_upload_job_email,
+                 delay=delay,
+                 lock=lock,
+                 stop_event=stop_event,
+             )
+             future_to_task_source[future] = task_source
+         with tqdm.tqdm(
+             total=len(sorted_files), desc="Checking status and uploading new sessions"
+         ) as pbar:
+             for future in concurrent.futures.as_completed(future_to_task_source):
+                 try:
+                     _ = future.result()
+                 except (
+                     SessionNotUploadedError
+                 ) as exc: # any other errors will be raised: prefer to fail fast when we have 12k files to process
+                     logger.debug(
+                         "Skipping upload of %s due to %r"
+                         % (future_to_task_source[future], exc)
+                     )
+                 except Exception as e:
+                     logger.exception(e)
+                     if not ignore_errors:
+                         pbar.close()
+                         raise e
+                 else:
+                     upload_count += 1
+                 finally:
+                     pbar.update(
+                         1
+                     ) # as_completed will iterate out of order, so update tqdm progress manually
+                     batch_count += 1
+                     if batch_limit is not None and batch_count >= batch_limit:
+                         pbar.close()
+                         msg = f"Reached {batch_limit = }: stopping pending and ongoing tasks"
+                         logger.info(msg)
+                         print(msg)
+                         stop_event.set()
+                         executor.shutdown(wait=True, cancel_futures=True)
+                         break
+     pbar.close()
+     msg = f"Batch upload complete: {upload_count} session(s) uploaded"
+     logger.info(msg)
+     print(msg)
+     listener.stop()
+
+
+ def parse_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--task-source",
+         type=pathlib.Path,
+         default=HDF5_REPO,
+         help="Path to a single DynamicRouting1*.hdf5 file or a directory containing them (rglob will be used to find files in all subfolder levels)",
+     )
+     parser.add_argument("--test", action="store_true")
+     parser.add_argument("--force-cloud-sync", action="store_true")
+     parser.add_argument("--debug", action="store_true")
+     parser.add_argument("--dry-run", action="store_true")
+     parser.add_argument(
+         "--email",
+         type=str,
+         help=f"[optional] specify email address for hpc upload job updates. Default is {np_codeocean.utils.HPC_UPLOAD_JOB_EMAIL}",
+     )
+     parser.add_argument(
+         "--delay",
+         type=int,
+         help=f"wait time (sec) between job submissions in batch mode, to avoid overloadig upload service. Default is {DEFAULT_DELAY_BETWEEN_UPLOADS}",
+         default=DEFAULT_DELAY_BETWEEN_UPLOADS,
+     )
+     parser.add_argument(
+         "--chronological",
+         action="store_true",
+         help="[batch mode only] Upload files in chronological order (oldest first) - default is newest first",
+     )
+     parser.add_argument(
+         "--batch-limit",
+         type=int,
+         help="[batch mode only] Limit the number of files to upload in batch mode",
+     )
+     parser.add_argument(
+         "--fail-fast",
+         dest="ignore_errors",
+         action="store_false",
+         help="[batch mode only] If a session fails to upload, raise the error - default is to log error and continue with other sessions",
+     )
+     return parser.parse_args()
+
+
+ def main() -> None:
+     reset_log_file()
+     args = parse_args()
+     logger.info(f"Parsed args: {args!r}")
+     if not args.task_source.is_dir():
+         logger.info(f"Uploading in single file mode: {args.task_source}")
+         upload(
+             args.task_source,
+             test=args.test,
+             force_cloud_sync=args.force_cloud_sync,
+             debug=args.debug,
+             dry_run=args.dry_run,
+             hpc_upload_job_email=args.email,
+         )
+     else:
+         logger.info(f"Uploading in batch mode: {args.task_source}")
+         upload_batch(
+             batch_dir=args.task_source,
+             test=args.test,
+             force_cloud_sync=args.force_cloud_sync,
+             debug=args.debug,
+             dry_run=args.dry_run,
+             hpc_upload_job_email=args.email,
+             delay=args.delay,
+             chronological_order=args.chronological,
+             batch_limit=args.batch_limit,
+             ignore_errors=args.ignore_errors,
+         )
+
+
+ if __name__ == "__main__":
+     main()
+     # upload(
+     #     task_source=pathlib.Path("//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/714753/DynamicRouting1_714753_20240703_114241.hdf5"),
+     #     test=True,
+     # )
+     # upload(
+     #     task_source=Path("//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/659250/DynamicRouting1_659250_20230322_151236.hdf5"),
+     #     test=True,
+     #     force_cloud_sync=True,
+     #     debug=True,
+     #     dry_run=False,
+     # )