np_codeocean 0.1.8__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -0,0 +1,415 @@
+ from __future__ import annotations
+
+ import argparse
+ import concurrent.futures
+ import contextlib
+ import datetime
+ import logging
+ import logging.config
+ import logging.handlers
+ import multiprocessing.synchronize
+ import pathlib
+ import multiprocessing
+ import multiprocessing.managers
+ import threading
+ import time
+ import warnings
+ from pathlib import Path
+
+ import h5py
+ import tqdm
+ import np_codeocean
+ import np_codeocean.utils
+ import np_config
+ import np_session
+ import np_tools
+ import npc_lims
+ import npc_session
+ import npc_sessions # this is heavy, but has the logic for hdf5 -> session.json
+ from aind_data_schema.core.rig import Rig
+ from np_aind_metadata.integrations import dynamic_routing_task
+ from npc_lims.exceptions import NoSessionInfo
+
+ # Disable divide by zero or NaN warnings
+ warnings.filterwarnings("ignore", category=RuntimeWarning)
+
+ def reset_log_file() -> None:
+     log = get_log_file()
+     log.parent.mkdir(exist_ok=True)
+     with contextlib.suppress(OSError):
+         log.unlink(missing_ok=True)
+
+ def get_log_file() -> pathlib.Path:
+     folder = pathlib.Path(".").resolve() / "logs"
+     return folder / f"{pathlib.Path(__file__).stem}_{datetime.datetime.now().strftime('%Y-%m-%d')}.log"
+
+ logging.basicConfig(
+     filename=get_log_file().as_posix(),
+     level=logging.INFO,
+     format="%(asctime)s | %(name)s | %(levelname)s | PID: %(process)d | TID: %(thread)d | %(message)s",
+     datefmt="%Y-%m-%d %H:%M:%S",
+ )
+ logger = logging.getLogger(__name__)
+
+ RIG_ROOM_MAPPING = np_config.fetch('/rigs/room_numbers')
+ HDF5_REPO = pathlib.Path('//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data')
+ SESSION_FOLDER_DIRS = (
+     pathlib.Path('//allen/programs/mindscope/workgroups/dynamicrouting/PilotEphys/Task 2 pilot'),
+     pathlib.Path('//allen/programs/mindscope/workgroups/templeton/TTOC/pilot recordings'),
+ )
+
+ EXCLUDED_SUBJECT_IDS = (0, 366122, 555555, 000000, 598796, 603810, 599657)
+ TASK_HDF5_GLOB = "DynamicRouting1*.hdf5"
+ RIG_IGNORE_PREFIXES = ("NP", "OG")
+
+ DEFAULT_HPC_UPLOAD_JOB_EMAIL = "ben.hardcastle@alleninstitute.org"
+
+ DEFAULT_DELAY_BETWEEN_UPLOADS = 40
+
+
+ class SessionNotUploadedError(ValueError):
+     pass
+
+ class UploadLimitReachedError(RuntimeError):
+     pass
+
+ def reformat_rig_model_rig_id(rig_id: str, modification_date: datetime.date) -> str:
+     rig_record = npc_session.RigRecord(rig_id)
+     if not rig_record.is_behavior_cluster_rig:
+         raise ValueError(
+             f"Only behavior boxes are supported: {rig_id=}")
+     room_number = RIG_ROOM_MAPPING.get(rig_record.behavior_cluster_id, "UNKNOWN")
+     return rig_record.as_aind_data_schema_rig_id(str(room_number), modification_date)
+
+
+ def extract_modification_date(rig: Rig) -> datetime.date:
+     _, _, date_str = rig.rig_id.split("_")
+     if len(date_str) == 6:
+         return datetime.datetime.strptime(date_str, "%y%m%d").date()
+     elif len(date_str) == 8:
+         return datetime.datetime.strptime(date_str, "%Y%m%d").date()
+     else:
+         raise ValueError(f"Unsupported date format: {date_str}")
+
+ def add_metadata(
+     task_source: pathlib.Path,
+     dest: pathlib.Path,
+     rig_storage_directory: pathlib.Path,
+ ):
+     """Adds `aind-data-schema` rig and session metadata to a session directory.
+     """
+     # we need to patch due to this bug not getting addressed: https://github.com/AllenInstitute/npc_sessions/pull/103
+     # npc_sessions.Session._aind_rig_id = property(aind_rig_id_patch)
+     npc_sessions.Session(task_source) \
+         ._aind_session_metadata.write_standard_file(dest)
+
+     session_metadata_path = dest / "session.json"
+     rig_metadata_path = dynamic_routing_task.copy_task_rig(
+         task_source,
+         dest / "rig.json",
+         rig_storage_directory,
+     )
+     if not rig_metadata_path:
+         raise FileNotFoundError("Failed to copy task rig.")
+
+     rig_metadata = Rig.model_validate_json(rig_metadata_path.read_text())
+     modification_date = datetime.date(2024, 4, 1) # keep cluster rigs static for now
+     rig_metadata.modification_date = modification_date
+     rig_metadata.rig_id = reformat_rig_model_rig_id(rig_metadata.rig_id, modification_date)
+     rig_metadata.write_standard_file(dest) # assumes this will work out to dest/rig.json
+
+     dynamic_routing_task.update_session_from_rig(
+         session_metadata_path,
+         rig_metadata_path,
+         session_metadata_path,
+     )
+
+
+ def upload(
+     task_source: Path,
+     test: bool = False,
+     force_cloud_sync: bool = False,
+     debug: bool = False,
+     dry_run: bool = False,
+     hpc_upload_job_email: str = DEFAULT_HPC_UPLOAD_JOB_EMAIL,
+     delay: int = DEFAULT_DELAY_BETWEEN_UPLOADS,
+     lock: threading.Lock | None = None,
+     stop_event: threading.Event | None = None,
+ ) -> None:
+     """
+     Notes
+     -----
+     - task_source Path is expected to have the following naming convention:
+       //allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/<SUBJECT_ID>/<SESSION_ID>.hdf5
+     """
+     if debug:
+         logger.setLevel(logging.DEBUG)
+
+     if stop_event and stop_event.is_set():
+         logger.debug("Stopping due to stop event")
+         return
+
+     extracted_subject_id = npc_session.extract_subject(task_source.stem)
+     if extracted_subject_id is None:
+         raise SessionNotUploadedError(f"Failed to extract subject ID from {task_source}")
+     logger.debug(f"Extracted subject id: {extracted_subject_id}")
+     # we don't want to upload files from folders that don't correspond to labtracks IDs, like `sound`, or `*_test`
+     if not task_source.parent.name.isdigit():
+         raise SessionNotUploadedError(
+             f"{task_source.parent.name=} is not a labtracks MID"
+         )
+
+     if extracted_subject_id in EXCLUDED_SUBJECT_IDS:
+         raise SessionNotUploadedError(
+             f"{extracted_subject_id=} is in {EXCLUDED_SUBJECT_IDS=}"
+         )
+
+     upload_root = np_session.NPEXP_ROOT / ("codeocean-dev" if test else "codeocean")
+     session_dir = upload_root / f"{extracted_subject_id}_{npc_session.extract_isoformat_date(task_source.stem)}"
+
+     np_codeocean.utils.set_npc_lims_credentials()
+     try:
+         session_info = npc_lims.get_session_info(task_source.stem)
+     except NoSessionInfo:
+         raise SessionNotUploadedError(f"{task_source.name} not in Sam's spreadsheets (yet) - cannot deduce project etc.") from None
+
+     # if the session has already been uploaded, skip it
+     if not (force_cloud_sync or test) and session_info.is_uploaded: # note: session_info.is_uploaded doesn't work for uploads to the dev service
+         raise SessionNotUploadedError(
+             f" {task_source.name} is already uploaded. Use --force-cloud-sync to re-upload."
+         )
+
+     # in the transfer-service airflow dag, jobs have failed after creating a folder
+     # on S3, but before a data asset is created in codeocean (likely due to codeocean
+     # being down):
+     # in that case, our `is_uploaded` check would return False, but in airflow,
+     # there's a `check_s3_folder_exists` task, which will fail since the folder
+     # already exists.
+     # To avoid this second failure, we can force a re-upload, regardless of
+     # whether the folder exists on S3 or not
+     force_cloud_sync = True
+
+     rig_name = ""
+     rig_name = session_info.training_info.get("rig_name", "")
+     if not rig_name:
+         with h5py.File(task_source, 'r') as file, contextlib.suppress(KeyError):
+             rig_name = file['rigName'][()].decode('utf-8')
+
+     if any(rig_name.startswith(i) for i in RIG_IGNORE_PREFIXES):
+         raise SessionNotUploadedError(
+             f"Not uploading {task_source} because rig_id starts with one of {RIG_IGNORE_PREFIXES!r}"
+         )
+
+     if stop_event and stop_event.is_set():
+         logger.debug("Stopping due to stop event")
+         return
+
+     logger.debug(f"Session upload directory: {session_dir}")
+
+     # external systems start getting modified here.
+     session_dir.mkdir(exist_ok=True)
+     metadata_dir = session_dir / 'aind_metadata'
+     metadata_dir.mkdir(exist_ok=True)
+     behavior_modality_dir = session_dir / "behavior"
+     behavior_modality_dir.mkdir(exist_ok=True)
+
+     rig_storage_directory = np_codeocean.get_project_config()["rig_metadata_dir"]
+     logger.debug(f"Rig storage directory: {rig_storage_directory}")
+     add_metadata(
+         task_source,
+         metadata_dir,
+         rig_storage_directory=rig_storage_directory,
+     )
+
+     np_tools.symlink(
+         np_codeocean.utils.ensure_posix(task_source),
+         behavior_modality_dir / task_source.name,
+     )
+
+     upload_job_contents = {
+         'subject-id': extracted_subject_id,
+         'acq-datetime': npc_session.extract_isoformat_datetime(task_source.stem),
+         'project_name': 'Dynamic Routing',
+         'platform': 'behavior',
+         'modality0': 'behavior',
+         'metadata_dir': np_config.normalize_path(metadata_dir).as_posix(),
+         'modality0.source': np_config.normalize_path(
+             behavior_modality_dir).as_posix(),
+         'force_cloud_sync': force_cloud_sync,
+     }
+
+     upload_job_path = np_codeocean.write_upload_csv(
+         upload_job_contents,
+         np_config.normalize_path(session_dir / 'upload.csv'),
+     )
+
+     upload_service_url = np_codeocean.utils.DEV_SERVICE \
+         if test else np_codeocean.utils.AIND_DATA_TRANSFER_SERVICE
+
+     if stop_event and stop_event.is_set():
+         logger.debug("Stopping due to stop event")
+         return
+
+     if lock is not None:
+         with lock:
+             if stop_event and stop_event.is_set():
+                 logger.debug("Stopping due to stop event")
+                 return
+             if delay > 0:
+                 logger.info(f"Pausing {delay} seconds before creating upload request")
+                 time.sleep(delay)
+
+     logger.info(f"Submitting {session_dir.name} to {upload_service_url}")
+
+     np_codeocean.utils.put_jobs_for_hpc_upload(
+         upload_jobs=np_codeocean.utils.get_job_models_from_csv(
+             upload_job_path,
+             user_email=hpc_upload_job_email,
+         ),
+         upload_service_url=upload_service_url,
+         user_email=hpc_upload_job_email,
+         dry_run=dry_run,
+         save_path=upload_job_path.with_suffix('.json'),
+     )
+
+ def upload_batch(
+     batch_dir: pathlib.Path,
+     test: bool = False,
+     force_cloud_sync: bool = False,
+     debug: bool = False,
+     dry_run: bool = False,
+     hpc_upload_job_email: str = DEFAULT_HPC_UPLOAD_JOB_EMAIL,
+     delay: int = DEFAULT_DELAY_BETWEEN_UPLOADS,
+     chronological_order: bool = False,
+     batch_limit: int | None = None, # number of sessions to process, not upload
+     ignore_errors: bool = True,
+ ) -> None:
+     if test:
+         batch_limit = 3
+
+     logger.addHandler(qh := logging.handlers.QueueHandler(queue := multiprocessing.Queue()))
+     listener = logging.handlers.QueueListener(queue, qh)
+     listener.start()
+     sorted_files = tuple(
+         sorted(
+             batch_dir.rglob(TASK_HDF5_GLOB),
+             key=lambda p: npc_session.extract_isoformat_date(p.name), # type: ignore[return-value]
+             reverse=not chronological_order,
+         )
+     ) # to fix tqdm we need the length of files: len(futures_dict) doesn't work for some reason
+     upload_count = 0
+     batch_count = 0
+     future_to_task_source: dict[concurrent.futures.Future, pathlib.Path] = {}
+     with (
+         multiprocessing.Manager() as manager,
+         concurrent.futures.ProcessPoolExecutor(max_workers=None) as executor,
+     ):
+         sessions_remaining = manager.Value('i', batch_limit or -1)
+         """Counts down and stops at zero. Set to -1 for no limit"""
+         lock = manager.Lock()
+         stop_event = manager.Event()
+         for task_source in sorted_files:
+             future = executor.submit(
+                 upload,
+                 task_source=task_source,
+                 test=test,
+                 force_cloud_sync=force_cloud_sync,
+                 debug=debug,
+                 dry_run=dry_run,
+                 hpc_upload_job_email=hpc_upload_job_email,
+                 delay=delay,
+                 lock=lock,
+                 stop_event=stop_event,
+             )
+             future_to_task_source[future] = task_source
+         with tqdm.tqdm(total=len(sorted_files), desc="Checking status and uploading new sessions") as pbar:
+             for future in concurrent.futures.as_completed(future_to_task_source):
+                 try:
+                     _ = future.result()
+                 except SessionNotUploadedError as exc: # any other errors will be raised: prefer to fail fast when we have 12k files to process
+                     logger.debug('Skipping upload of %s due to %r' % (future_to_task_source[future], exc))
+                 except Exception as e:
+                     logger.exception(e)
+                     if not ignore_errors:
+                         pbar.close()
+                         raise e
+                 else:
+                     upload_count += 1
+                 finally:
+                     pbar.update(1) # as_completed will iterate out of order, so update tqdm progress manually
+                     batch_count += 1
+                     if batch_limit is not None and batch_count >= batch_limit:
+                         pbar.close()
+                         msg = f"Reached {batch_limit = }: stopping pending and ongoing tasks"
+                         logger.info(msg)
+                         print(msg)
+                         stop_event.set()
+                         executor.shutdown(wait=True, cancel_futures=True)
+                         break
+             pbar.close()
+     msg = f"Batch upload complete: {upload_count} session(s) uploaded"
+     logger.info(msg)
+     print(msg)
+     listener.stop()
+
+ def parse_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--task-source', type=pathlib.Path, default=HDF5_REPO, help="Path to a single DynamicRouting1*.hdf5 file or a directory containing them (rglob will be used to find files in all subfolder levels)")
+     parser.add_argument('--test', action="store_true")
+     parser.add_argument('--force-cloud-sync', action="store_true")
+     parser.add_argument('--debug', action="store_true")
+     parser.add_argument('--dry-run', action="store_true")
+     parser.add_argument('--email', type=str, help=f"[optional] specify email address for hpc upload job updates. Default is {np_codeocean.utils.HPC_UPLOAD_JOB_EMAIL}")
+     parser.add_argument('--delay', type=int, help=f"wait time (sec) between job submissions in batch mode, to avoid overloading the upload service. Default is {DEFAULT_DELAY_BETWEEN_UPLOADS}", default=DEFAULT_DELAY_BETWEEN_UPLOADS)
+     parser.add_argument('--chronological', action="store_true", help="[batch mode only] Upload files in chronological order (oldest first) - default is newest first")
+     parser.add_argument('--batch-limit', type=int, help="[batch mode only] Limit the number of files to upload in batch mode")
+     parser.add_argument('--fail-fast', dest="ignore_errors", action="store_false", help="[batch mode only] If a session fails to upload, raise the error - default is to log error and continue with other sessions")
+     return parser.parse_args()
+
+
+ def main() -> None:
+     reset_log_file()
+     args = parse_args()
+     logger.info(f"Parsed args: {args!r}")
+     if not args.task_source.is_dir():
+         logger.info(f"Uploading in single file mode: {args.task_source}")
+         upload(
+             args.task_source,
+             test=args.test,
+             force_cloud_sync=args.force_cloud_sync,
+             debug=args.debug,
+             dry_run=args.dry_run,
+             hpc_upload_job_email=args.email,
+         )
+     else:
+         logger.info(f"Uploading in batch mode: {args.task_source}")
+         upload_batch(
+             batch_dir=args.task_source,
+             test=args.test,
+             force_cloud_sync=args.force_cloud_sync,
+             debug=args.debug,
+             dry_run=args.dry_run,
+             hpc_upload_job_email=args.email,
+             delay=args.delay,
+             chronological_order=args.chronological,
+             batch_limit=args.batch_limit,
+             ignore_errors=args.ignore_errors,
+         )
+
+
+ if __name__ == '__main__':
+     main()
+     # upload(
+     #     task_source=pathlib.Path("//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/714753/DynamicRouting1_714753_20240703_114241.hdf5"),
+     #     test=True,
+     # )
+     # upload(
+     #     task_source=Path("//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/659250/DynamicRouting1_659250_20230322_151236.hdf5"),
+     #     test=True,
+     #     force_cloud_sync=True,
+     #     debug=True,
+     #     dry_run=False,
+     # )
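For orientation, a minimal sketch of driving the batch uploader above from Python rather than the command line. The module path in the import is a guess (the diff does not name its files); `upload_batch` and its keyword arguments are taken from the code above, and the directory mirrors `HDF5_REPO`.

import pathlib

# Hypothetical module path - the diff above does not show file names.
from np_codeocean.scripts import upload_dr_behavior

upload_dr_behavior.upload_batch(
    batch_dir=pathlib.Path(
        "//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data"
    ),
    dry_run=True,               # build upload.csv/.json without submitting to the hpc upload queue
    chronological_order=False,  # newest sessions first (the default)
    batch_limit=10,             # stop after 10 sessions have been processed (not necessarily uploaded)
)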
@@ -0,0 +1,185 @@
+ import argparse
+ import datetime
+ import logging
+ import pathlib
+ import typing
+ import warnings
+
+ import np_config
+ import npc_session
+ import npc_sessions
+ from aind_data_schema.core.rig import Rig
+ from np_aind_metadata.integrations import dynamic_routing_task
+
+ import np_codeocean
+
+ # Disable divide by zero or NaN warnings
+ warnings.filterwarnings("ignore", category=RuntimeWarning)
+
+ logging.basicConfig(
+     filename=f"logs/{pathlib.Path(__file__).stem}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log",
+     level=logging.DEBUG,
+     format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
+     datefmt="%Y-%m-%d %H:%M:%S",
+ )
+ logger = logging.getLogger(__name__)
+
+ CONFIG = np_config.fetch('/rigs/room_numbers')
+
+
+ def reformat_rig_model_rig_id(rig_id: str, modification_date: datetime.date) -> str:
+     rig_record = npc_session.RigRecord(rig_id)
+     if not rig_record.is_neuro_pixels_rig:
+         raise Exception(
+             f"Only neuropixels rigs are supported: rig_id={rig_id}")
+     room_number = CONFIG.get(rig_record, "UNKNOWN")
+     return rig_record.as_aind_data_schema_rig_id(str(room_number), modification_date)
+
+
+ def extract_modification_date(rig: Rig) -> datetime.date:
+     _, _, date_str = rig.rig_id.split("_")
+     if len(date_str) == 6:
+         return datetime.datetime.strptime(date_str, "%y%m%d").date()
+     elif len(date_str) == 8:
+         return datetime.datetime.strptime(date_str, "%Y%m%d").date()
+     else:
+         raise Exception(f"Unsupported date format: {date_str}")
+
+
+ def add_metadata(
+     session_directory: str | pathlib.Path,
+     session_datetime: datetime.datetime,
+     rig_storage_directory: pathlib.Path,
+     ignore_errors: bool = True,
+     skip_existing: bool = True,
+ ) -> None:
+     """Adds rig and session metadata to a session directory.
+     """
+     normalized_session_dir = np_config.normalize_path(session_directory)
+     logger.debug(f"{normalized_session_dir = }")
+     logger.debug(f"{rig_storage_directory = }")
+     session_json = normalized_session_dir / "session.json"
+     if not skip_existing or not (session_json.is_symlink() or session_json.exists()):
+         logger.debug("Attempting to create session.json")
+         try:
+             npc_sessions.DynamicRoutingSession(normalized_session_dir)._aind_session_metadata.write_standard_file(normalized_session_dir)
+         except Exception as e:
+             if not ignore_errors:
+                 raise e from None
+             else:
+                 logger.exception(e)
+         else:
+             if session_json.exists():
+                 logger.debug("Created session.json")
+             else:
+                 logger.warning("Failed to find created session.json, but no error occurred during creation: may be in unexpected location")
+
+     rig_model_path = normalized_session_dir / "rig.json"
+     if not skip_existing or not (rig_model_path.is_symlink() or rig_model_path.exists()):
+         if not (session_json.is_symlink() or session_json.exists()):
+             logger.warning("session.json is currently required for the rig.json to be created, so we can't continue with metadata creation")
+             return None
+         try:
+             dynamic_routing_task.add_np_rig_to_session_dir(
+                 normalized_session_dir,
+                 session_datetime,
+                 rig_storage_directory,
+             )
+         except Exception as e:
+             if not ignore_errors:
+                 raise e from None
+             else:
+                 logger.exception(e)
+         else:
+             if rig_model_path.exists():
+                 logger.debug("Created rig.json")
+             else:
+                 logger.warning("Failed to find created rig.json, but no error occurred during creation: may be in unexpected location")
+     if not (rig_model_path.is_symlink() or rig_model_path.exists()):
+         return None
+
+     rig_metadata = Rig.model_validate_json(rig_model_path.read_text())
+     modification_date = extract_modification_date(rig_metadata)
+     rig_metadata.rig_id = reformat_rig_model_rig_id(rig_metadata.rig_id, modification_date)
+     rig_metadata.write_standard_file(normalized_session_dir) # assumes this will work out to dest/rig.json
+     session_model_path = dynamic_routing_task.scrape_session_model_path(
+         normalized_session_dir,
+     )
+     dynamic_routing_task.update_session_from_rig(
+         session_model_path,
+         rig_model_path,
+         session_model_path,
+     )
+
+     return None
+
+
+ def write_metadata_and_upload(
+     session_path_or_folder_name: str,
+     recording_dirs: typing.Iterable[str] | None = None,
+     force: bool = False,
+     dry_run: bool = False,
+     test: bool = False,
+     hpc_upload_job_email: str = np_codeocean.HPC_UPLOAD_JOB_EMAIL,
+     regenerate_metadata: bool = False,
+     regenerate_symlinks: bool = True,
+ ) -> None:
+     """Writes and updates aind-data-schema metadata in the session directory
+     associated with the `session`. The aind-data-schema session model is
+     updated to reflect the `rig_id` of the rig model added to the session
+     directory.
+
+     Only handles ecephys platform uploads (i.e. sessions with a folder of data, not
+     behavior-box sessions, which consist of a single hdf5 file).
+     """
+     # session = np_session.Session(session) #! this doesn't work for surface_channels
+     session = np_codeocean.get_np_session(session_path_or_folder_name)
+     add_metadata(
+         session_directory=session.npexp_path,
+         session_datetime=(
+             session.start
+             if not np_codeocean.is_surface_channel_recording(session.npexp_path.name)
+             else np_codeocean.get_surface_channel_start_time(session)
+         ),
+         rig_storage_directory=pathlib.Path(np_codeocean.get_project_config()["rig_metadata_dir"]),
+         ignore_errors=True,
+         skip_existing=not regenerate_metadata,
+     )
+     return np_codeocean.upload_session(
+         session_path_or_folder_name,
+         recording_dirs=recording_dirs,
+         force=force,
+         dry_run=dry_run,
+         test=test,
+         hpc_upload_job_email=hpc_upload_job_email,
+         regenerate_symlinks=regenerate_symlinks,
+     )
+
+ def parse_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser(description="Upload a session to CodeOcean")
+     parser.add_argument('session_path_or_folder_name', help="session ID (lims or np-exp foldername) or path to session folder")
+     parser.add_argument('recording_dirs', nargs='*', help="[optional] specific names of recording directories to upload - for use with split recordings only.")
+     parser.add_argument('--email', dest='hpc_upload_job_email', type=str, help=f"[optional] specify email address for hpc upload job updates. Default is {np_codeocean.HPC_UPLOAD_JOB_EMAIL}")
+     parser.add_argument('--force', action='store_true', help="enable `force_cloud_sync` option, re-uploading and re-making raw asset even if data exists on S3")
+     parser.add_argument('--test', action='store_true', help="use the test-upload service, uploading to the test CodeOcean server instead of the production server")
+     parser.add_argument('--dry-run', action='store_true', help="Create upload job but do not submit to hpc upload queue.")
+     parser.add_argument('--preserve-symlinks', dest='regenerate_symlinks', action='store_false', help="Existing symlink folders will not be deleted and regenerated - may result in additional data being uploaded")
+     parser.add_argument('--regenerate-metadata', action='store_true', help="Regenerate metadata files (session.json and rig.json) even if they already exist")
+     return parser.parse_args()
+
+ def main() -> None:
+     args = parse_args()
+     write_metadata_and_upload(**vars(args))
+
+
+ if __name__ == '__main__':
+     main()
+     # write_metadata_and_upload(
+     #     'DRpilot_708016_20240429_surface_channels',
+     #     force=True,
+     #     regenerate_metadata=False,
+     #     regenerate_symlinks=False,
+     # )
+     # upload_dr_ecephys DRpilot_712141_20240606 --regenerate-metadata
+     # upload_dr_ecephys DRpilot_712141_20240611 recording1 recording2 --regenerate-metadata --force
+     # upload_dr_ecephys DRpilot_712141_20240605 --regenerate-metadata
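For reference, a sketch of the programmatic equivalent of the `upload_dr_ecephys` command lines in the comments above. The import path is a guess (the diff does not name its files); the session folder name is copied from those comments, and the keyword arguments come from `write_metadata_and_upload` as defined above.

# Hypothetical import path for the script above.
from np_codeocean.scripts.upload_dr_ecephys import write_metadata_and_upload

write_metadata_and_upload(
    "DRpilot_712141_20240606",     # session folder name from the commented examples
    force=False,                   # don't force_cloud_sync / re-upload data already on S3
    dry_run=True,                  # create the upload job but don't submit it to the hpc queue
    regenerate_metadata=True,      # rewrite session.json / rig.json even if they already exist
)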
@@ -0,0 +1,22 @@
+ import upath
+ import concurrent.futures
+ local_to_s3_mapping = {
+     "//allen/programs/mindscope/workgroups/dynamicrouting/Ethan/new_annotations/single unit metrics": "s3://aind-scratch-data/dynamic-routing/ethan/single-unit-metrics",
+     "//allen/programs/mindscope/workgroups/templeton/TTOC/decoding results/": "s3://aind-scratch-data/dynamic-routing/ethan/decoding-results",
+ }
+
+ def helper(local_root, s3_root, file):
+     s3_path = upath.UPath(s3_root) / file.relative_to(local_root)
+     if not file.is_file():
+         return
+     if s3_path.exists():
+         print(file.relative_to(local_root), " - already uploaded")
+         return
+     print(file.relative_to(local_root))
+     s3_path.write_bytes(file.read_bytes())
+
+ if __name__ == "__main__":
+     with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
+         for local_root, s3_root in local_to_s3_mapping.items():
+             for file in upath.UPath(local_root).rglob("*"):
+                 executor.submit(helper, local_root, s3_root, file)
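The mirroring script above is I/O-bound, so a thread pool (rather than a process pool) is the appropriate executor, and `write_bytes(read_bytes())` keeps each copy simple at the cost of holding a whole file in memory. A small follow-up sketch for spot-checking the result, reusing one of the S3 prefixes from `local_to_s3_mapping` above:

import upath

# One of the destination prefixes from local_to_s3_mapping above.
s3_root = upath.UPath("s3://aind-scratch-data/dynamic-routing/ethan/single-unit-metrics")

# Count the files that made it to S3; rglob/is_file work on S3 paths via universal_pathlib.
uploaded = sum(1 for p in s3_root.rglob("*") if p.is_file())
print(f"{uploaded} files under {s3_root}")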