np_codeocean 0.1.8__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- np_codeocean/__init__.py +1 -1
- np_codeocean/np_session_utils.py +347 -0
- np_codeocean/scripts/fix_ephys_data_on_s3.py +20 -0
- np_codeocean/scripts/upload_dynamic_routing_behavior.py +415 -0
- np_codeocean/scripts/upload_dynamic_routing_ecephys.py +185 -0
- np_codeocean/scripts/upload_ethan_analysis_files.py +22 -0
- np_codeocean/utils.py +406 -94
- {np_codeocean-0.1.8.dist-info → np_codeocean-0.2.1.dist-info}/METADATA +12 -6
- np_codeocean-0.2.1.dist-info/RECORD +12 -0
- {np_codeocean-0.1.8.dist-info → np_codeocean-0.2.1.dist-info}/WHEEL +1 -1
- np_codeocean-0.2.1.dist-info/entry_points.txt +5 -0
- np_codeocean/upload.py +0 -386
- np_codeocean/upload_one.py +0 -183
- np_codeocean-0.1.8.dist-info/RECORD +0 -9
- np_codeocean-0.1.8.dist-info/entry_points.txt +0 -4
- /np_codeocean/scripts/{upload_sessions.py → upload_split_recordings_example.py} +0 -0

np_codeocean/scripts/upload_dynamic_routing_behavior.py
@@ -0,0 +1,415 @@
+from __future__ import annotations
+
+import argparse
+import concurrent.futures
+import contextlib
+import datetime
+import logging
+import logging.config
+import logging.handlers
+import multiprocessing.synchronize
+import pathlib
+import multiprocessing
+import multiprocessing.managers
+import threading
+import time
+import warnings
+from pathlib import Path
+
+import h5py
+import tqdm
+import np_codeocean
+import np_codeocean.utils
+import np_config
+import np_session
+import np_tools
+import npc_lims
+import npc_session
+import npc_sessions # this is heavy, but has the logic for hdf5 -> session.json
+from aind_data_schema.core.rig import Rig
+from np_aind_metadata.integrations import dynamic_routing_task
+from npc_lims.exceptions import NoSessionInfo
+
+import np_codeocean
+import np_codeocean.utils
+
+# Disable divide by zero or NaN warnings
+warnings.filterwarnings("ignore", category=RuntimeWarning)
+
+def reset_log_file() -> None:
+    log = get_log_file()
+    log.parent.mkdir(exist_ok=True)
+    with contextlib.suppress(OSError):
+        log.unlink(missing_ok=True)
+
+def get_log_file() -> pathlib.Path:
+    folder = pathlib.Path(".").resolve() / "logs"
+    return folder / f"{pathlib.Path(__file__).stem}_{datetime.datetime.now().strftime('%Y-%m-%d')}.log"
+
+logging.basicConfig(
+    filename=get_log_file().as_posix(),
+    level=logging.INFO,
+    format="%(asctime)s | %(name)s | %(levelname)s | PID: %(process)d | TID: %(thread)d | %(message)s",
+    datefmt="%Y-%d-%m %H:%M:%S",
+)
+logger = logging.getLogger(__name__)
+
+RIG_ROOM_MAPPING = np_config.fetch('/rigs/room_numbers')
+HDF5_REPO = pathlib.Path('//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data')
+SESSION_FOLDER_DIRS = (
+    pathlib.Path('//allen/programs/mindscope/workgroups/dynamicrouting/PilotEphys/Task 2 pilot'),
+    pathlib.Path('//allen/programs/mindscope/workgroups/templeton/TTOC/pilot recordings'),
+)
+
+EXCLUDED_SUBJECT_IDS = (0, 366122, 555555, 000000, 598796, 603810, 599657)
+TASK_HDF5_GLOB = "DynamicRouting1*.hdf5"
+RIG_IGNORE_PREFIXES = ("NP", "OG")
+
+DEFAULT_HPC_UPLOAD_JOB_EMAIL = "ben.hardcastle@alleninstitute.org"
+
+DEFAULT_DELAY_BETWEEN_UPLOADS = 40
+
+
+class SessionNotUploadedError(ValueError):
+    pass
+
+class UploadLimitReachedError(RuntimeError):
+    pass
+
+def reformat_rig_model_rig_id(rig_id: str, modification_date: datetime.date) -> str:
+    rig_record = npc_session.RigRecord(rig_id)
+    if not rig_record.is_behavior_cluster_rig:
+        raise ValueError(
+            f"Only behavior boxes are supported: {rig_id=}")
+    room_number = RIG_ROOM_MAPPING.get(rig_record.behavior_cluster_id, "UNKNOWN")
+    return rig_record.as_aind_data_schema_rig_id(str(room_number), modification_date)
+
+
+def extract_modification_date(rig: Rig) -> datetime.date:
+    _, _, date_str = rig.rig_id.split("_")
+    if len(date_str) == 6:
+        return datetime.datetime.strptime(date_str, "%y%m%d").date()
+    elif len(date_str) == 8:
+        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
+    else:
+        raise ValueError(f"Unsupported date format: {date_str}")
+
+def add_metadata(
+    task_source: pathlib.Path,
+    dest: pathlib.Path,
+    rig_storage_directory: pathlib.Path,
+):
+    """Adds `aind-data-schema` rig and session metadata to a session directory.
+    """
+    # we need to patch due to this bug not getting addressed: https://github.com/AllenInstitute/npc_sessions/pull/103
+    # npc_sessions.Session._aind_rig_id = property(aind_rig_id_patch)
+    npc_sessions.Session(task_source) \
+        ._aind_session_metadata.write_standard_file(dest)
+
+    session_metadata_path = dest / "session.json"
+    rig_metadata_path = dynamic_routing_task.copy_task_rig(
+        task_source,
+        dest / "rig.json",
+        rig_storage_directory,
+    )
+    if not rig_metadata_path:
+        raise FileNotFoundError("Failed to copy task rig.")
+
+    rig_metadata = Rig.model_validate_json(rig_metadata_path.read_text())
+    modification_date = datetime.date(2024, 4, 1) # keep cluster rigs static for now
+    rig_metadata.modification_date = modification_date
+    rig_metadata.rig_id = reformat_rig_model_rig_id(rig_metadata.rig_id, modification_date)
+    rig_metadata.write_standard_file(dest) # assumes this will work out to dest/rig.json
+
+    dynamic_routing_task.update_session_from_rig(
+        session_metadata_path,
+        rig_metadata_path,
+        session_metadata_path,
+    )
+
+
+def upload(
+    task_source: Path,
+    test: bool = False,
+    force_cloud_sync: bool = False,
+    debug: bool = False,
+    dry_run: bool = False,
+    hpc_upload_job_email: str = DEFAULT_HPC_UPLOAD_JOB_EMAIL,
+    delay: int = DEFAULT_DELAY_BETWEEN_UPLOADS,
+    lock: threading.Lock | None = None,
+    stop_event: threading.Event | None = None,
+) -> None:
+    """
+    Notes
+    -----
+    - task_source Path is expected to have the following naming convention:
+      //allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/<SUBJECT_ID>/<SESSION_ID>.hdf5
+    """
+    if debug:
+        logger.setLevel(logging.DEBUG)
+
+    if stop_event and stop_event.is_set():
+        logger.debug("Stopping due to stop event")
+        return
+
+    extracted_subject_id = npc_session.extract_subject(task_source.stem)
+    if extracted_subject_id is None:
+        raise SessionNotUploadedError(f"Failed to extract subject ID from {task_source}")
+    logger.debug(f"Extracted subject id: {extracted_subject_id}")
+    # we don't want to upload files from folders that don't correspond to labtracks IDs, like `sound`, or `*_test`
+    if not task_source.parent.name.isdigit():
+        raise SessionNotUploadedError(
+            f"{task_source.parent.name=} is not a labtracks MID"
+        )
+
+    if extracted_subject_id in EXCLUDED_SUBJECT_IDS:
+        raise SessionNotUploadedError(
+            f"{extracted_subject_id=} is in {EXCLUDED_SUBJECT_IDS=}"
+        )
+
+    upload_root = np_session.NPEXP_ROOT / ("codeocean-dev" if test else "codeocean")
+    session_dir = upload_root / f"{extracted_subject_id}_{npc_session.extract_isoformat_date(task_source.stem)}"
+
+    np_codeocean.utils.set_npc_lims_credentials()
+    try:
+        session_info = npc_lims.get_session_info(task_source.stem)
+    except NoSessionInfo:
+        raise SessionNotUploadedError(f"{task_source.name} not in Sam's spreadsheets (yet) - cannot deduce project etc.") from None
+
+    # if session has been already been uploaded, skip it
+    if not (force_cloud_sync or test) and session_info.is_uploaded: # note: session_info.is_uploaded doesnt work for uploads to dev service
+        raise SessionNotUploadedError(
+            f" {task_source.name} is already uploaded. Use --force-cloud-sync to re-upload."
+        )
+
+    # in the transfer-service airflow dag, jobs have failed after creating a folder
+    # on S3, but before a data asset is created in codeocean (likely due to codeocean
+    # being down):
+    # in that case, our `is_uploaded` check would return False, but in airflow,
+    # there's a `check_s3_folder_exists` task, which will fail since the folder
+    # already exists.
+    # To avoid this second failure, we can force a re-upload, regardless of
+    # whether the folder exists on S3 or not
+    force_cloud_sync = True
+
+    rig_name = ""
+    rig_name = session_info.training_info.get("rig_name", "")
+    if not rig_name:
+        with h5py.File(task_source, 'r') as file, contextlib.suppress(KeyError):
+            rig_name = file['rigName'][()].decode('utf-8')
+
+    if any(rig_name.startswith(i) for i in RIG_IGNORE_PREFIXES):
+        raise SessionNotUploadedError(
+            f"Not uploading {task_source} because rig_id starts with one of {RIG_IGNORE_PREFIXES!r}"
+        )
+
+    if stop_event and stop_event.is_set():
+        logger.debug("Stopping due to stop event")
+        return
+
+    logger.debug(f"Session upload directory: {session_dir}")
+
+    # external systems start getting modified here.
+    session_dir.mkdir(exist_ok=True)
+    metadata_dir = session_dir / 'aind_metadata'
+    metadata_dir.mkdir(exist_ok=True)
+    behavior_modality_dir = session_dir / "behavior"
+    behavior_modality_dir.mkdir(exist_ok=True)
+
+    rig_storage_directory = np_codeocean.get_project_config()["rig_metadata_dir"]
+    logger.debug(f"Rig storage directory: {rig_storage_directory}")
+    add_metadata(
+        task_source,
+        metadata_dir,
+        rig_storage_directory=rig_storage_directory,
+    )
+
+    np_tools.symlink(
+        np_codeocean.utils.ensure_posix(task_source),
+        behavior_modality_dir / task_source.name,
+    )
+
+    upload_job_contents = {
+        'subject-id': extracted_subject_id,
+        'acq-datetime': npc_session.extract_isoformat_datetime(task_source.stem),
+        'project_name': 'Dynamic Routing',
+        'platform': 'behavior',
+        'modality0': 'behavior',
+        'metadata_dir': np_config.normalize_path(metadata_dir).as_posix(),
+        'modality0.source': np_config.normalize_path(
+            behavior_modality_dir).as_posix(),
+        'force_cloud_sync': force_cloud_sync,
+    }
+
+    upload_job_path = np_codeocean.write_upload_csv(
+        upload_job_contents,
+        np_config.normalize_path(session_dir / 'upload.csv'),
+    )
+
+    upload_service_url = np_codeocean.utils.DEV_SERVICE \
+        if test else np_codeocean.utils.AIND_DATA_TRANSFER_SERVICE
+
+    if stop_event and stop_event.is_set():
+        logger.debug("Stopping due to stop event")
+        return
+
+    if lock is not None:
+        with lock:
+            if stop_event and stop_event.is_set():
+                logger.debug("Stopping due to stop event")
+                return
+            if delay > 0:
+                logger.info(f"Pausing {delay} seconds before creating upload request")
+                time.sleep(delay)
+
+    logger.info(f"Submitting {session_dir.name} to {upload_service_url}")
+
+    np_codeocean.utils.put_jobs_for_hpc_upload(
+        upload_jobs=np_codeocean.utils.get_job_models_from_csv(
+            upload_job_path,
+            user_email=hpc_upload_job_email,
+        ),
+        upload_service_url=upload_service_url,
+        user_email=hpc_upload_job_email,
+        dry_run=dry_run,
+        save_path=upload_job_path.with_suffix('.json'),
+    )
+
+def upload_batch(
+    batch_dir: pathlib.Path,
+    test: bool = False,
+    force_cloud_sync: bool = False,
+    debug: bool = False,
+    dry_run: bool = False,
+    hpc_upload_job_email: str = DEFAULT_HPC_UPLOAD_JOB_EMAIL,
+    delay: int = DEFAULT_DELAY_BETWEEN_UPLOADS,
+    chronological_order: bool = False,
+    batch_limit: int | None = None, # number of sessions to process, not upload
+    ignore_errors: bool = True,
+) -> None:
+    if test:
+        batch_limit = 3
+
+    logger.addHandler(qh := logging.handlers.QueueHandler(queue := multiprocessing.Queue()))
+    listener = logging.handlers.QueueListener(queue, qh)
+    listener.start()
+    sorted_files = tuple(
+        sorted(
+            batch_dir.rglob(TASK_HDF5_GLOB),
+            key=lambda p: npc_session.extract_isoformat_date(p.name), # type: ignore[return-value]
+            reverse=not chronological_order,
+        )
+    ) # to fix tqdm we need the length of files: len(futures_dict) doesn't work for some reason
+    upload_count = 0
+    batch_count = 0
+    future_to_task_source: dict[concurrent.futures.Future, pathlib.Path] = {}
+    with (
+        multiprocessing.Manager() as manager,
+        concurrent.futures.ProcessPoolExecutor(max_workers=None) as executor,
+    ):
+        sessions_remaining = manager.Value('i', batch_limit or -1)
+        """Counts down and stops at zero. Set to -1 for no limit"""
+        lock = manager.Lock()
+        stop_event = manager.Event()
+        for task_source in sorted_files:
+            future = executor.submit(
+                upload,
+                task_source=task_source,
+                test=test,
+                force_cloud_sync=force_cloud_sync,
+                debug=debug,
+                dry_run=dry_run,
+                hpc_upload_job_email=hpc_upload_job_email,
+                delay=delay,
+                lock=lock,
+                stop_event=stop_event,
+            )
+            future_to_task_source[future] = task_source
+        with tqdm.tqdm(total=len(sorted_files), desc="Checking status and uploading new sessions") as pbar:
+            for future in concurrent.futures.as_completed(future_to_task_source):
+                try:
+                    _ = future.result()
+                except SessionNotUploadedError as exc: # any other errors will be raised: prefer to fail fast when we have 12k files to process
+                    logger.debug('Skipping upload of %s due to %r' % (future_to_task_source[future], exc))
+                except Exception as e:
+                    logger.exception(e)
+                    if not ignore_errors:
+                        pbar.close()
+                        raise e
+                else:
+                    upload_count += 1
+                finally:
+                    pbar.update(1) # as_completed will iterate out of order, so update tqdm progress manually
+                    batch_count += 1
+                    if batch_limit is not None and batch_count >= batch_limit:
+                        pbar.close()
+                        msg = f"Reached {batch_limit = }: stopping pending and ongoing tasks"
+                        logger.info(msg)
+                        print(msg)
+                        stop_event.set()
+                        executor.shutdown(wait=True, cancel_futures=True)
+                        break
+    pbar.close()
+    msg = f"Batch upload complete: {upload_count} session(s) uploaded"
+    logger.info(msg)
+    print(msg)
+    listener.stop()
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--task-source', type=pathlib.Path, default=HDF5_REPO, help="Path to a single DynamicRouting1*.hdf5 file or a directory containing them (rglob will be used to find files in all subfolder levels)")
+    parser.add_argument('--test', action="store_true")
+    parser.add_argument('--force-cloud-sync', action="store_true")
+    parser.add_argument('--debug', action="store_true")
+    parser.add_argument('--dry-run', action="store_true")
+    parser.add_argument('--email', type=str, help=f"[optional] specify email address for hpc upload job updates. Default is {np_codeocean.utils.HPC_UPLOAD_JOB_EMAIL}")
+    parser.add_argument('--delay', type=int, help=f"wait time (sec) between job submissions in batch mode, to avoid overloadig upload service. Default is {DEFAULT_DELAY_BETWEEN_UPLOADS}", default=DEFAULT_DELAY_BETWEEN_UPLOADS)
+    parser.add_argument('--chronological', action="store_true", help="[batch mode only] Upload files in chronological order (oldest first) - default is newest first")
+    parser.add_argument('--batch-limit', type=int, help="[batch mode only] Limit the number of files to upload in batch mode")
+    parser.add_argument('--fail-fast', dest="ignore_errors", action="store_false", help="[batch mode only] If a session fails to upload, raise the error - default is to log error and continue with other sessions")
+    return parser.parse_args()


+def main() -> None:
+    reset_log_file()
+    args = parse_args()
+    logger.info(f"Parsed args: {args!r}")
+    if not args.task_source.is_dir():
+        logger.info(f"Uploading in single file mode: {args.task_source}")
+        upload(
+            args.task_source,
+            test=args.test,
+            force_cloud_sync=args.force_cloud_sync,
+            debug=args.debug,
+            dry_run=args.dry_run,
+            hpc_upload_job_email=args.email,
+        )
+    else:
+        logger.info(f"Uploading in batch mode: {args.task_source}")
+        upload_batch(
+            batch_dir=args.task_source,
+            test=args.test,
+            force_cloud_sync=args.force_cloud_sync,
+            debug=args.debug,
+            dry_run=args.dry_run,
+            hpc_upload_job_email=args.email,
+            delay=args.delay,
+            chronological_order=args.chronological,
+            batch_limit=args.batch_limit,
+            ignore_errors=args.ignore_errors,
+        )


+if __name__ == '__main__':
+    main()
+    # upload(
+    #     task_source=pathlib.Path("//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/714753/DynamicRouting1_714753_20240703_114241.hdf5"),
+    #     test=True,
+    # )
+    # upload(
+    #     task_source=Path("//allen/programs/mindscope/workgroups/dynamicrouting/DynamicRoutingTask/Data/659250/DynamicRouting1_659250_20230322_151236.hdf5"),
+    #     test=True,
+    #     force_cloud_sync=True,
+    #     debug=True,
+    #     dry_run=False,
+    # )
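
The batch entry point above fans `upload()` out over a `ProcessPoolExecutor` while sharing a `multiprocessing.Manager` `Lock` and `Event` with the workers: each worker checks the event so a `--batch-limit` cutoff can cancel work that is already queued, and holds the lock around its sleep so submissions to the transfer service stay spaced apart. Below is a minimal, self-contained sketch of that coordination pattern; it is not part of the package, and `do_work` and `ITEMS` are placeholder names.

# Illustrative sketch (not part of the package): the cancellation / rate-limiting
# pattern used by upload_batch, reduced to a standalone example.
import concurrent.futures
import multiprocessing
import time

ITEMS = list(range(10))  # placeholder work items

def do_work(item, lock, stop_event, delay: float = 0.1) -> int:
    if stop_event.is_set():       # bail out early if the batch was cancelled
        return -1
    with lock:                    # serialize the rate-limited section
        if stop_event.is_set():
            return -1
        time.sleep(delay)         # spacing between submissions
    return item * 2

if __name__ == "__main__":
    with multiprocessing.Manager() as manager, \
            concurrent.futures.ProcessPoolExecutor() as executor:
        lock = manager.Lock()
        stop_event = manager.Event()
        futures = {executor.submit(do_work, i, lock, stop_event): i for i in ITEMS}
        completed = 0
        for future in concurrent.futures.as_completed(futures):
            print(futures[future], future.result())
            completed += 1
            if completed >= 5:    # analogous to --batch-limit
                stop_event.set()
                executor.shutdown(wait=True, cancel_futures=True)
                break

Manager proxies are picklable, which is what allows the same lock and event objects to be handed to every pool worker.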

np_codeocean/scripts/upload_dynamic_routing_ecephys.py
@@ -0,0 +1,185 @@
+import argparse
+import datetime
+import logging
+import pathlib
+import typing
+import warnings
+
+import np_config
+import npc_session
+import npc_sessions
+from aind_data_schema.core.rig import Rig
+from np_aind_metadata.integrations import dynamic_routing_task
+
+import np_codeocean
+
+# Disable divide by zero or NaN warnings
+warnings.filterwarnings("ignore", category=RuntimeWarning)
+
+logging.basicConfig(
+    filename=f"logs/{pathlib.Path(__file__).stem}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log",
+    level=logging.DEBUG,
+    format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
+    datefmt="%Y-%d-%m %H:%M:%S",
+)
+logger = logging.getLogger(__name__)
+
+CONFIG = np_config.fetch('/rigs/room_numbers')
+
+
+def reformat_rig_model_rig_id(rig_id: str, modification_date: datetime.date) -> str:
+    rig_record = npc_session.RigRecord(rig_id)
+    if not rig_record.is_neuro_pixels_rig:
+        raise Exception(
+            f"Rig is not a neuropixels rig. Only behavior cluster rigs are supported. rig_id={rig_id}")
+    room_number = CONFIG.get(rig_record, "UNKNOWN")
+    return rig_record.as_aind_data_schema_rig_id(str(room_number), modification_date)
+
+
+def extract_modification_date(rig: Rig) -> datetime.date:
+    _, _, date_str = rig.rig_id.split("_")
+    if len(date_str) == 6:
+        return datetime.datetime.strptime(date_str, "%y%m%d").date()
+    elif len(date_str) == 8:
+        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
+    else:
+        raise Exception(f"Unsupported date format: {date_str}")
+
+
+def add_metadata(
+    session_directory: str | pathlib.Path,
+    session_datetime: datetime.datetime,
+    rig_storage_directory: pathlib.Path,
+    ignore_errors: bool = True,
+    skip_existing: bool = True,
+) -> None:
+    """Adds rig and sessions metadata to a session directory.
+    """
+    normalized_session_dir = np_config.normalize_path(session_directory)
+    logger.debug(f"{normalized_session_dir = }")
+    logger.debug(f"{rig_storage_directory = }")
+    session_json = normalized_session_dir / "session.json"
+    if not skip_existing or not (session_json.is_symlink() or session_json.exists()):
+        logger.debug("Attempting to create session.json")
+        try:
+            npc_sessions.DynamicRoutingSession(normalized_session_dir)._aind_session_metadata.write_standard_file(normalized_session_dir)
+        except Exception as e:
+            if not ignore_errors:
+                raise e from None
+            else:
+                logger.exception(e)
+        else:
+            if session_json.exists():
+                logger.debug("Created session.json")
+            else:
+                logger.warning("Failed to find created session.json, but no error occurred during creation: may be in unexpected location")
+
+    rig_model_path = normalized_session_dir / "rig.json"
+    if not skip_existing or not (rig_model_path.is_symlink() or rig_model_path.exists()):
+        if not (session_json.is_symlink() or session_json.exists()):
+            logger.warning("session.json is currently required for the rig.json to be created, so we can't continue with metadata creation")
+            return None
+        try:
+            dynamic_routing_task.add_np_rig_to_session_dir(
+                normalized_session_dir,
+                session_datetime,
+                rig_storage_directory,
+            )
+        except Exception as e:
+            if not ignore_errors:
+                raise e from None
+            else:
+                logger.exception(e)
+        else:
+            if rig_model_path.exists():
+                logger.debug("Created rig.json")
+            else:
+                logger.warning("Failed to find created rig.json, but no error occurred during creation: may be in unexpected location")
+    if not (rig_model_path.is_symlink() or rig_model_path.exists()):
+        return None
+
+    rig_metadata = Rig.model_validate_json(rig_model_path.read_text())
+    modification_date = extract_modification_date(rig_metadata)
+    rig_metadata.rig_id = reformat_rig_model_rig_id(rig_metadata.rig_id, modification_date)
+    rig_metadata.write_standard_file(normalized_session_dir) # assumes this will work out to dest/rig.json
+    session_model_path = dynamic_routing_task.scrape_session_model_path(
+        normalized_session_dir,
+    )
+    dynamic_routing_task.update_session_from_rig(
+        session_model_path,
+        rig_model_path,
+        session_model_path,
+    )
+
+    return None
+
+
+def write_metadata_and_upload(
+    session_path_or_folder_name: str,
+    recording_dirs: typing.Iterable[str] | None = None,
+    force: bool = False,
+    dry_run: bool = False,
+    test: bool = False,
+    hpc_upload_job_email: str = np_codeocean.HPC_UPLOAD_JOB_EMAIL,
+    regenerate_metadata: bool = False,
+    regenerate_symlinks: bool = True,
+) -> None:
+    """Writes and updates aind-data-schema to the session directory
+    associated with the `session`. The aind-data-schema session model is
+    updated to reflect the `rig_id` of the rig model added to the session
+    directory.
+
+    Only handles ecephys platform uploads (ie sessions with a folder of data; not
+    behavior box sessions, which have a single hdf5 file only)
+    """
+    # session = np_session.Session(session) #! this doesn't work for surface_channels
+    session = np_codeocean.get_np_session(session_path_or_folder_name)
+    add_metadata(
+        session_directory=session.npexp_path,
+        session_datetime=(
+            session.start
+            if not np_codeocean.is_surface_channel_recording(session.npexp_path.name)
+            else np_codeocean.get_surface_channel_start_time(session)
+        ),
+        rig_storage_directory=pathlib.Path(np_codeocean.get_project_config()["rig_metadata_dir"]),
+        ignore_errors=True,
+        skip_existing=not regenerate_metadata,
+    )
+    return np_codeocean.upload_session(
+        session_path_or_folder_name,
+        recording_dirs=recording_dirs,
+        force=force,
+        dry_run=dry_run,
+        test=test,
+        hpc_upload_job_email=hpc_upload_job_email,
+        regenerate_symlinks=regenerate_symlinks,
+    )
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Upload a session to CodeOcean")
+    parser.add_argument('session_path_or_folder_name', help="session ID (lims or np-exp foldername) or path to session folder")
+    parser.add_argument('recording_dirs', nargs='*', help="[optional] specific names of recording directories to upload - for use with split recordings only.")
+    parser.add_argument('--email', dest='hpc_upload_job_email', type=str, help=f"[optional] specify email address for hpc upload job updates. Default is {np_codeocean.HPC_UPLOAD_JOB_EMAIL}")
+    parser.add_argument('--force', action='store_true', help="enable `force_cloud_sync` option, re-uploading and re-making raw asset even if data exists on S3")
+    parser.add_argument('--test', action='store_true', help="use the test-upload service, uploading to the test CodeOcean server instead of the production server")
+    parser.add_argument('--dry-run', action='store_true', help="Create upload job but do not submit to hpc upload queue.")
+    parser.add_argument('--preserve-symlinks', dest='regenerate_symlinks', action='store_false', help="Existing symlink folders will not be deleted and regenerated - may result in additional data being uploaded")
+    parser.add_argument('--regenerate-metadata', action='store_true', help="Regenerate metadata files (session.json and rig.json) even if they already exist")
+    return parser.parse_args()
+
+def main() -> None:
+    args = parse_args()
+    write_metadata_and_upload(**vars(args))


+if __name__ == '__main__':
+    main()
+    # write_metadata_and_upload(
+    #     'DRpilot_708016_20240429_surface_channels',
+    #     force=True,
+    #     regenerate_metadata=False,
+    #     regenerate_symlinks=False,
+    # )
+    # upload_dr_ecephys DRpilot_712141_20240606 --regenerate-metadata
+    # upload_dr_ecephys DRpilot_712141_20240611 recording1 recording2 --regenerate-metadata --force
+    # upload_dr_ecephys DRpilot_712141_20240605 --regenerate-metadata
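
`add_metadata()` above wraps both `session.json` and `rig.json` creation in the same guard: skip the step when the file (or a symlink to it) already exists unless regeneration was requested, and either log or re-raise failures depending on `ignore_errors`. A minimal sketch of that guard in isolation follows; it is not part of the package, and `ensure_file`, `make_file`, `write_session_json` and `session_dir` are hypothetical names.

# Illustrative sketch (not part of the package): the "skip existing / ignore errors"
# guard that add_metadata applies to session.json and rig.json.
import logging
import pathlib
from typing import Callable

logger = logging.getLogger(__name__)

def ensure_file(
    path: pathlib.Path,
    make_file: Callable[[pathlib.Path], None],  # writer expected to create `path`
    skip_existing: bool = True,
    ignore_errors: bool = True,
) -> bool:
    """Return True if `path` (or a symlink to it) exists afterwards."""
    if skip_existing and (path.is_symlink() or path.exists()):
        return True
    try:
        make_file(path)
    except Exception as exc:
        if not ignore_errors:
            raise
        logger.exception(exc)
    else:
        if not (path.is_symlink() or path.exists()):
            logger.warning("No error was raised, but %s was not found", path)
    return path.is_symlink() or path.exists()

# e.g. ensure_file(session_dir / "session.json", write_session_json)  # hypothetical usage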

np_codeocean/scripts/upload_ethan_analysis_files.py
@@ -0,0 +1,22 @@
+import upath
+import concurrent.futures
+local_to_s3_mapping = {
+    "//allen/programs/mindscope/workgroups/dynamicrouting/Ethan/new_annotations/single unit metrics": "s3://aind-scratch-data/dynamic-routing/ethan/single-unit-metrics",
+    "//allen/programs/mindscope/workgroups/templeton/TTOC/decoding results/": "s3://aind-scratch-data/dynamic-routing/ethan/decoding-results",
+}
+
+def helper(local_root, s3_root, file):
+    s3_path = upath.UPath(s3_root) / file.relative_to(local_root)
+    if not file.is_file():
+        return
+    if s3_path.exists():
+        print(file.relative_to(local_root), " - already uploaded")
+        return
+    print(file.relative_to(local_root))
+    s3_path.write_bytes(file.read_bytes())
+
+if __name__ == "__main__":
+    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
+        for local_root, s3_root in local_to_s3_mapping.items():
+            for file in upath.UPath(local_root).rglob("*"):
+                executor.submit(helper, local_root, s3_root, file)