pysodafair 0.1.62__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. pysoda/__init__.py +0 -0
  2. pysoda/constants.py +3 -0
  3. pysoda/core/__init__.py +10 -0
  4. pysoda/core/dataset_generation/__init__.py +11 -0
  5. pysoda/core/dataset_generation/manifestSession/__init__.py +1 -0
  6. pysoda/core/dataset_generation/manifestSession/manifest_session.py +146 -0
  7. pysoda/core/dataset_generation/upload.py +3951 -0
  8. pysoda/core/dataset_importing/__init__.py +1 -0
  9. pysoda/core/dataset_importing/import_dataset.py +662 -0
  10. pysoda/core/metadata/__init__.py +20 -0
  11. pysoda/core/metadata/code_description.py +109 -0
  12. pysoda/core/metadata/constants.py +32 -0
  13. pysoda/core/metadata/dataset_description.py +188 -0
  14. pysoda/core/metadata/excel_utils.py +41 -0
  15. pysoda/core/metadata/helpers.py +250 -0
  16. pysoda/core/metadata/manifest.py +112 -0
  17. pysoda/core/metadata/manifest_package/__init__.py +2 -0
  18. pysoda/core/metadata/manifest_package/manifest.py +0 -0
  19. pysoda/core/metadata/manifest_package/manifest_import.py +29 -0
  20. pysoda/core/metadata/manifest_package/manifest_writer.py +666 -0
  21. pysoda/core/metadata/performances.py +46 -0
  22. pysoda/core/metadata/resources.py +53 -0
  23. pysoda/core/metadata/samples.py +184 -0
  24. pysoda/core/metadata/sites.py +51 -0
  25. pysoda/core/metadata/subjects.py +172 -0
  26. pysoda/core/metadata/submission.py +91 -0
  27. pysoda/core/metadata/text_metadata.py +47 -0
  28. pysoda/core/metadata_templates/CHANGES +1 -0
  29. pysoda/core/metadata_templates/LICENSE +1 -0
  30. pysoda/core/metadata_templates/README.md +4 -0
  31. pysoda/core/metadata_templates/__init__.py +0 -0
  32. pysoda/core/metadata_templates/code_description.xlsx +0 -0
  33. pysoda/core/metadata_templates/code_parameters.xlsx +0 -0
  34. pysoda/core/metadata_templates/dataset_description.xlsx +0 -0
  35. pysoda/core/metadata_templates/manifest.xlsx +0 -0
  36. pysoda/core/metadata_templates/performances.xlsx +0 -0
  37. pysoda/core/metadata_templates/resources.xlsx +0 -0
  38. pysoda/core/metadata_templates/samples.xlsx +0 -0
  39. pysoda/core/metadata_templates/sites.xlsx +0 -0
  40. pysoda/core/metadata_templates/subjects.xlsx +0 -0
  41. pysoda/core/metadata_templates/subjects_pools_samples_structure.xlsx +0 -0
  42. pysoda/core/metadata_templates/subjects_pools_samples_structure_example.xlsx +0 -0
  43. pysoda/core/metadata_templates/submission.xlsx +0 -0
  44. pysoda/core/permissions/__init__.py +1 -0
  45. pysoda/core/permissions/permissions.py +31 -0
  46. pysoda/core/pysoda/__init__.py +2 -0
  47. pysoda/core/pysoda/soda.py +34 -0
  48. pysoda/core/pysoda/soda_object.py +55 -0
  49. pysoda/core/upload_manifests/__init__.py +1 -0
  50. pysoda/core/upload_manifests/upload_manifests.py +37 -0
  51. pysoda/schema/__init__.py +0 -0
  52. pysoda/schema/code_description.json +629 -0
  53. pysoda/schema/dataset_description.json +295 -0
  54. pysoda/schema/manifest.json +60 -0
  55. pysoda/schema/performances.json +44 -0
  56. pysoda/schema/resources.json +39 -0
  57. pysoda/schema/samples.json +97 -0
  58. pysoda/schema/sites.json +38 -0
  59. pysoda/schema/soda_schema.json +664 -0
  60. pysoda/schema/subjects.json +131 -0
  61. pysoda/schema/submission_schema.json +28 -0
  62. pysoda/utils/__init__.py +9 -0
  63. pysoda/utils/authentication.py +381 -0
  64. pysoda/utils/config.py +68 -0
  65. pysoda/utils/exceptions.py +156 -0
  66. pysoda/utils/logger.py +6 -0
  67. pysoda/utils/metadata_utils.py +74 -0
  68. pysoda/utils/pennsieveAgentUtils.py +11 -0
  69. pysoda/utils/pennsieveUtils.py +118 -0
  70. pysoda/utils/profile.py +28 -0
  71. pysoda/utils/schema_validation.py +133 -0
  72. pysoda/utils/time_utils.py +5 -0
  73. pysoda/utils/upload_utils.py +108 -0
  74. pysodafair-0.1.62.dist-info/METADATA +190 -0
  75. pysodafair-0.1.62.dist-info/RECORD +77 -0
  76. pysodafair-0.1.62.dist-info/WHEEL +4 -0
  77. pysodafair-0.1.62.dist-info/licenses/LICENSE +21 -0
pysoda/core/dataset_generation/upload.py
@@ -0,0 +1,3951 @@
1
+
2
+ from ...utils import (
3
+ generate_options_set, generating_locally, generating_on_ps,
4
+ uploading_with_ps_account, uploading_to_existing_ps_dataset,
5
+ can_resume_prior_upload, virtual_dataset_empty, PropertyNotSetError,
6
+ connect_pennsieve_client, get_dataset_id, get_access_token,
7
+ PennsieveActionNoPermission, PennsieveDatasetCannotBeFound,
8
+ EmptyDatasetError, LocalDatasetMissingSpecifiedFiles,
9
+ PennsieveUploadException, create_request_headers, check_forbidden_characters_ps, get_users_dataset_list,
10
+ PennsieveDatasetNameInvalid, PennsieveDatasetNameTaken, PennsieveAccountInvalid, TZLOCAL, GenerateOptionsNotSet,
11
+ PennsieveDatasetFilesInvalid
12
+ )
13
+ from ..permissions import pennsieve_get_current_user_permissions
14
+ from os.path import isdir, isfile, getsize
15
+ from ..metadata import create_high_level_manifest_files, get_auto_generated_manifest_files, manifest, subjects, samples, code_description, dataset_description, performances, resources, sites, submission, text_metadata, METADATA_UPLOAD_PS_PATH, create_high_lvl_manifest_files_existing_ps_starting_point
16
+ from ..upload_manifests import get_upload_manifests
17
+ from .. import logger
18
+
19
+ main_curate_progress_message = ""
20
+ main_curate_status = ""
21
+
22
+ # -*- coding: utf-8 -*-
23
+
24
+ ### Import required python modules
25
+ import platform
26
+ import os
27
+ from os import listdir, makedirs, mkdir, walk, rename
28
+ from os.path import (
29
+ isdir,
30
+ isfile,
31
+ join,
32
+ splitext,
33
+ basename,
34
+ exists,
35
+ expanduser,
36
+ dirname,
37
+ getsize,
38
+ abspath,
39
+ )
40
+ import pandas as pd
41
+ import time
42
+ from timeit import default_timer as timer
43
+ from datetime import timedelta
44
+ import shutil
45
+ import subprocess
46
+ import gevent
47
+ import pathlib
48
+ import requests
49
+ from datetime import datetime
50
+ from openpyxl import load_workbook
51
+ from openpyxl.styles import PatternFill
52
+ # from utils import connect_pennsieve_client, get_dataset_id, create_request_headers, TZLOCAL, get_users_dataset_list
53
+ # from manifest import create_high_lvl_manifest_files_existing_ps_starting_point, create_high_level_manifest_files, get_auto_generated_manifest_files
54
+ # from errors import PennsieveUploadException
55
+ from .manifestSession import UploadManifestSession
56
+ from ...constants import PENNSIEVE_URL
57
+ from ..dataset_importing import import_pennsieve_dataset
58
+
59
+ # from pysodaUtils import (
60
+ # check_forbidden_characters_ps
61
+ # )
62
+
63
+ # from organizeDatasets import import_pennsieve_dataset
64
+
65
+
66
+ ### Global variables
67
+ curateprogress = " "
68
+ curatestatus = " "
69
+ curateprintstatus = " "
70
+ total_dataset_size = 1
71
+ curated_dataset_size = 0
72
+ start_time = 0
73
+ uploaded_folder_counter = 0
74
+ current_size_of_uploaded_files = 0
75
+ generated_dataset_id = None
76
+ # the pennsieve python client used for uploading dataset files
77
+ client = None
78
+
79
+ userpath = expanduser("~")
80
+ configpath = join(userpath, ".pennsieve", "config.ini")
81
+ submitdataprogress = " "
82
+ submitdatastatus = " "
83
+ submitprintstatus = " "
84
+ total_file_size = 1
85
+ uploaded_file_size = 0
86
+ start_time_bf_upload = 0
87
+ start_submit = 0
88
+ metadatapath = join(userpath, "SODA", "SODA_metadata")
89
+ ps_recognized_file_extensions = [
90
+ ".cram",
91
+ ".jp2",
92
+ ".jpx",
93
+ ".lsm",
94
+ ".ndpi",
95
+ ".nifti",
96
+ ".oib",
97
+ ".oif",
98
+ ".roi",
99
+ ".rtf",
100
+ ".swc",
101
+ ".abf",
102
+ ".acq",
103
+ ".adicht",
104
+ ".adidat",
105
+ ".aedt",
106
+ ".afni",
107
+ ".ai",
108
+ ".avi",
109
+ ".bam",
110
+ ".bash",
111
+ ".bcl",
112
+ ".bcl.gz",
113
+ ".bin",
114
+ ".brik",
115
+ ".brukertiff.gz",
116
+ ".continuous",
117
+ ".cpp",
118
+ ".csv",
119
+ ".curv",
120
+ ".cxls",
121
+ ".czi",
122
+ ".data",
123
+ ".dcm",
124
+ ".df",
125
+ ".dicom",
126
+ ".doc",
127
+ ".docx",
128
+ ".e",
129
+ ".edf",
130
+ ".eps",
131
+ ".events",
132
+ ".fasta",
133
+ ".fastq",
134
+ ".fcs",
135
+ ".feather",
136
+ ".fig",
137
+ ".gif",
138
+ ".h4",
139
+ ".h5",
140
+ ".hdf4",
141
+ ".hdf5",
142
+ ".hdr",
143
+ ".he2",
144
+ ".he5",
145
+ ".head",
146
+ ".hoc",
147
+ ".htm",
148
+ ".html",
149
+ ".ibw",
150
+ ".img",
151
+ ".ims",
152
+ ".ipynb",
153
+ ".jpeg",
154
+ ".jpg",
155
+ ".js",
156
+ ".json",
157
+ ".lay",
158
+ ".lh",
159
+ ".lif",
160
+ ".m",
161
+ ".mat",
162
+ ".md",
163
+ ".mef",
164
+ ".mefd.gz",
165
+ ".mex",
166
+ ".mgf",
167
+ ".mgh",
168
+ ".mgh.gz",
169
+ ".mgz",
170
+ ".mnc",
171
+ ".moberg.gz",
172
+ ".mod",
173
+ ".mov",
174
+ ".mp4",
175
+ ".mph",
176
+ ".mpj",
177
+ ".mtw",
178
+ ".ncs",
179
+ ".nd2",
180
+ ".nev",
181
+ ".nex",
182
+ ".nex5",
183
+ ".nf3",
184
+ ".nii",
185
+ ".nii.gz",
186
+ ".ns1",
187
+ ".ns2",
188
+ ".ns3",
189
+ ".ns4",
190
+ ".ns5",
191
+ ".ns6",
192
+ ".nwb",
193
+ ".ogg",
194
+ ".ogv",
195
+ ".ome.btf",
196
+ ".ome.tif",
197
+ ".ome.tif2",
198
+ ".ome.tif8",
199
+ ".ome.tiff",
200
+ ".ome.xml",
201
+ ".openephys",
202
+ ".pdf",
203
+ ".pgf",
204
+ ".png",
205
+ ".ppt",
206
+ ".pptx",
207
+ ".ps",
208
+ ".pul",
209
+ ".py",
210
+ ".r",
211
+ ".raw",
212
+ ".rdata",
213
+ ".rh",
214
+ ".rhd",
215
+ ".sh",
216
+ ".sldasm",
217
+ ".slddrw",
218
+ ".smr",
219
+ ".spikes",
220
+ ".svg",
221
+ ".svs",
222
+ ".tab",
223
+ ".tar",
224
+ ".tar.gz",
225
+ ".tcsh",
226
+ ".tdm",
227
+ ".tdms",
228
+ ".text",
229
+ ".tif",
230
+ ".tiff",
231
+ ".tsv",
232
+ ".txt",
233
+ ".vcf",
234
+ ".webm",
235
+ ".xlsx",
236
+ ".xml",
237
+ ".yaml",
238
+ ".yml",
239
+ ".zip",
240
+ ".zsh",
241
+ ]
242
+
243
+ myds = ""
244
+ initial_bfdataset_size = 0
245
+ upload_directly_to_bf = 0
246
+ initial_bfdataset_size_submit = 0
247
+ renaming_files_flow = False
248
+
249
+ total_files = 0 # the total number of files in a given dataset that need to be uploaded to Pennsieve
250
+ total_bytes_uploaded = 0 # current number of bytes uploaded to Pennsieve in the upload session
251
+ total_upload_size = 0 # total number of bytes to upload to Pennsieve in the upload session
252
+
253
+ forbidden_characters = '<>:"/\|?*'
254
+ forbidden_characters_bf = '\/:*?"<>'
255
+
256
+ # a global that tracks the number of files that have been uploaded in an upload session;
257
+ # it is reset once the session ends in success or failure (and implicitly reset, after a Pennsieve Agent freeze, when the user closes SODA)
258
+ main_curation_uploaded_files = 0
259
+
260
+ DEV_TEMPLATE_PATH = join(dirname(__file__), "..", "file_templates")
261
+
262
+ # once pysoda has been packaged with pyinstaller
263
+ # it becomes nested into the pysodadist/api directory
264
+ PROD_TEMPLATE_PATH = join(dirname(__file__), "..", "..", "file_templates")
265
+ TEMPLATE_PATH = DEV_TEMPLATE_PATH if exists(DEV_TEMPLATE_PATH) else PROD_TEMPLATE_PATH
266
+
267
+
268
+
269
+
270
+ ums = UploadManifestSession()
271
+
272
+
273
+
274
+
275
+
276
+ def open_file(file_path):
277
+ """
278
+ Opening folder on all platforms
279
+ https://stackoverflow.com/questions/6631299/python-opening-a-folder-in-explorer-nautilus-mac-thingie
280
+
281
+ Args:
282
+ file_path: path of the folder (string)
283
+ Action:
284
+ Opens file explorer window to the given path
285
+ """
286
+
287
+ if platform.system() == "Windows":
288
+ subprocess.Popen(f"explorer /select,{str(file_path)}")
289
+ elif platform.system() == "Darwin":
290
+ subprocess.Popen(["open", file_path])
291
+ else:
292
+ subprocess.Popen(["xdg-open", file_path])
293
+
294
+
295
+
296
+ def folder_size(path):
297
+ """
298
+ Provides the size of the folder indicated by path
299
+
300
+ Args:
301
+ path: path of the folder (string)
302
+ Returns:
303
+ total_size: total size of the folder in bytes (integer)
304
+ """
305
+ total_size = 0
306
+
307
+ for path, dirs, files in walk(path):
308
+ for f in files:
309
+ fp = join(path, f)
310
+ total_size += getsize(fp)
311
+ return total_size
312
+
313
+
314
+ def path_size(path):
315
+ """
316
+ Returns size of the path, after checking if it's a folder or a file
317
+ Args:
318
+ path: path of the file/folder (string)
319
+ Returns:
320
+ total_size: total size of the file/folder in bytes (integer)
321
+ """
322
+ return folder_size(path) if isdir(path) else getsize(path)
323
+
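A minimal usage sketch for the two size helpers above (illustrative only, not part of the package source; the path is hypothetical):

dataset_path = "/tmp/my_dataset"      # hypothetical local path
size_bytes = path_size(dataset_path)  # folder_size() for a directory, getsize() for a single file
print(f"{dataset_path}: {size_bytes / 1e6:.1f} MB")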
324
+
325
+ def create_folder_level_manifest(jsonpath, jsondescription):
326
+ """
327
+ Function to create manifest files for each SPARC folder.
328
+ Files are created in a temporary folder
329
+
330
+ Args:
332
+ jsonpath: all paths in json format with key being SPARC folder names (dictionary)
333
+ jsondescription: description associated with each path (dictionary)
334
+ Action:
335
+ Creates manifest files in xlsx format for each SPARC folder
336
+ """
337
+ global total_dataset_size
338
+ local_timezone = TZLOCAL()
339
+
340
+ try:
341
+ shutil.rmtree(metadatapath) if isdir(metadatapath) else 0
342
+ makedirs(metadatapath)
343
+ folders = list(jsonpath.keys())
344
+
345
+ if "main" in folders:
346
+ folders.remove("main")
347
+ # In each SPARC folder, generate a manifest file
348
+ for folder in folders:
349
+ if jsonpath[folder] != []:
350
+ # Initialize dataframe where manifest info will be stored
351
+ df = pd.DataFrame(
352
+ columns=[
353
+ "filename",
354
+ "timestamp",
355
+ "description",
356
+ "file type",
357
+ "Additional Metadata",
358
+ ]
359
+ )
360
+ # Get list of files/folders in the folder
361
+ # Remove manifest file from the list if already exists
362
+ folderpath = join(metadatapath, folder)
363
+ allfiles = jsonpath[folder]
364
+ alldescription = jsondescription[folder + "_description"]
365
+
366
+ countpath = -1
367
+ for pathname in allfiles:
368
+ countpath += 1
369
+ if basename(pathname) in ["manifest.csv", "manifest.xlsx"]:
370
+ allfiles.pop(countpath)
371
+ alldescription.pop(countpath)
372
+
373
+ # Populate manifest dataframe
374
+ filename, timestamp, filetype, filedescription = [], [], [], []
375
+ countpath = -1
376
+ for paths in allfiles:
377
+ if isdir(paths):
378
+ key = basename(paths)
379
+ alldescription.pop(0)
380
+ for subdir, dirs, files in os.walk(paths):
381
+ for file in files:
382
+ gevent.sleep(0)
383
+ filepath = pathlib.Path(paths) / subdir / file
384
+ mtime = filepath.stat().st_mtime
385
+ lastmodtime = datetime.fromtimestamp(mtime).astimezone(
386
+ local_timezone
387
+ )
388
+ timestamp.append(
389
+ lastmodtime.isoformat()
390
+ .replace(".", ",")
391
+ .replace("+00:00", "Z")
392
+ )
393
+ full_filename = filepath.name
394
+
395
+ if folder == "main": # if file in main folder
396
+ filename.append(
397
+ full_filename
398
+ ) if folder == "" else filename.append(
399
+ join(folder, full_filename)
400
+ )
401
+ else:
402
+ subdirname = os.path.relpath(
403
+ subdir, paths
404
+ ) # gives relative path of the directory of the file w.r.t paths
405
+ if subdirname == ".":
406
+ filename.append(join(key, full_filename))
407
+ else:
408
+ filename.append(
409
+ join(key, subdirname, full_filename)
410
+ )
411
+
412
+ fileextension = splitext(full_filename)[1]
413
+ if (
414
+ not fileextension
415
+ ): # if empty (happens e.g. with Readme files)
416
+ fileextension = "None"
417
+ filetype.append(fileextension)
418
+ filedescription.append("")
419
+ else:
420
+ gevent.sleep(0)
421
+ countpath += 1
422
+ filepath = pathlib.Path(paths)
423
+ file = filepath.name
424
+ filename.append(file)
425
+ mtime = filepath.stat().st_mtime
426
+ lastmodtime = datetime.fromtimestamp(mtime).astimezone(
427
+ local_timezone
428
+ )
429
+ timestamp.append(
430
+ lastmodtime.isoformat()
431
+ .replace(".", ",")
432
+ .replace("+00:00", "Z")
433
+ )
434
+ filedescription.append(alldescription[countpath])
435
+ if isdir(paths):
436
+ filetype.append("folder")
437
+ else:
438
+ fileextension = splitext(file)[1]
439
+ if (
440
+ not fileextension
441
+ ): # if empty (happens e.g. with Readme files)
442
+ fileextension = "None"
443
+ filetype.append(fileextension)
444
+
445
+ df["filename"] = filename
446
+ df["timestamp"] = timestamp
447
+ df["file type"] = filetype
448
+ df["description"] = filedescription
449
+
450
+ makedirs(folderpath)
451
+ # Save manifest as Excel sheet
452
+ manifestfile = join(folderpath, "manifest.xlsx")
453
+ df.to_excel(manifestfile, index=None, header=True)
454
+ wb = load_workbook(manifestfile)
455
+ ws = wb.active
456
+
457
+ blueFill = PatternFill(
458
+ start_color="9DC3E6", fill_type="solid"
459
+ )
460
+ greenFill = PatternFill(
461
+ start_color="A8D08D", fill_type="solid"
462
+ )
463
+ yellowFill = PatternFill(
464
+ start_color="FFD965", fill_type="solid"
465
+ )
466
+ ws['A1'].fill = blueFill
467
+ ws['B1'].fill = greenFill
468
+ ws['C1'].fill = greenFill
469
+ ws['D1'].fill = greenFill
470
+ ws['E1'].fill = yellowFill
471
+
472
+ wb.save(manifestfile)
473
+ total_dataset_size += path_size(manifestfile)
474
+ jsonpath[folder].append(manifestfile)
475
+
476
+ return jsonpath
477
+
478
+ except Exception as e:
479
+ raise e
480
+
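For reference, a sketch of the input shapes create_folder_level_manifest expects, inferred from the loop above (the concrete paths and descriptions are hypothetical):

jsonpath = {
    "primary": ["/data/sub-1/recording.csv", "/data/sub-1/imaging"],
    "code": ["/repo/analysis.py"],
}
jsondescription = {
    "primary_description": ["tabular recording", "imaging folder"],
    "code_description": ["analysis script"],
}
updated = create_folder_level_manifest(jsonpath, jsondescription)
# each non-empty SPARC folder now also lists its freshly generated manifest.xlsx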
481
+
482
+ def return_new_path(topath):
483
+ """
484
+ This function checks if a folder already exists and in such cases,
485
+ appends (1) or (2) etc. to the folder name
486
+
487
+ Args:
488
+ topath: path where the folder is supposed to be created (string)
489
+ Returns:
490
+ topath: new folder name based on the availability in destination folder (string)
491
+ """
492
+
493
+ if not exists(topath):
494
+ return topath
495
+
496
+ i = 1
497
+ while True:
498
+ if not exists(topath + " (" + str(i) + ")"):
499
+ return topath + " (" + str(i) + ")"
500
+ i += 1
501
+
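A quick sketch of the renaming behaviour (the path is hypothetical):

target = return_new_path("/tmp/output/dataset")
# "/tmp/output/dataset" if it is free, otherwise "/tmp/output/dataset (1)", "(2)", ...
makedirs(target)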
502
+
503
+ def return_new_path_replace(topath):
504
+ """
505
+ This function checks if a folder already exists and in such cases,
506
+ replace the existing folder (this is the opposite situation to the function return_new_path)
507
+
508
+ Args:
509
+ topath: path where the folder is supposed to be created (string)
510
+ Returns:
511
+ topath: new folder name based on the availability in destination folder (string)
512
+ """
513
+
514
+ if not exists(topath):
515
+ return topath
516
+ i = 1
517
+ while True:
518
+ if not exists(topath + " (" + str(i) + ")"):
519
+ return topath + " (" + str(i) + ")"
520
+ i += 1
521
+
522
+
523
+ def time_format(elapsed_time):
524
+ mins, secs = divmod(elapsed_time, 60)
525
+ hours, mins = divmod(mins, 60)
526
+ return "%dh:%02dmin:%02ds" % (hours, mins, secs)
527
+
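For reference, a small illustration of the elapsed-time formatting (illustrative only):

print(time_format(155))                    # -> "0h:02min:35s"
print(time_format(3725))                   # -> "1h:02min:05s"
print(time_format(timer() - start_time))   # typical use with the timeit default_timer imported above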
528
+
529
+ def mycopyfileobj(fsrc, fdst, length=16 * 1024 * 16):
530
+ """
531
+ Helper function to copy file
532
+
533
+ Args:
534
+ fsrc: source file opened in python (file-like object)
535
+ fdst: destination file accessed in python (file-like object)
536
+ length: copied buffer size in bytes (integer)
537
+ """
538
+ global curateprogress
539
+ global total_dataset_size
540
+ global curated_dataset_size
541
+ global main_generated_dataset_size
542
+
543
+ while True:
544
+ buf = fsrc.read(length)
545
+ if not buf:
546
+ break
547
+ gevent.sleep(0)
548
+ fdst.write(buf)
549
+ curated_dataset_size += len(buf)
550
+ main_generated_dataset_size += len(buf)
551
+
552
+
553
+ def mycopyfile_with_metadata(src, dst, *, follow_symlinks=True):
554
+ """
555
+ Copy file src to dst with metadata (timestamp, permission, etc.) conserved
556
+
557
+ Args:
558
+ src: source file (string)
559
+ dst: destination file (string)
560
+ Returns:
561
+ dst
562
+ """
563
+ if not follow_symlinks and os.path.islink(src):
564
+ os.symlink(os.readlink(src), dst)
565
+ else:
566
+ with open(src, "rb") as fsrc:
567
+ with open(dst, "wb") as fdst:
568
+ mycopyfileobj(fsrc, fdst)
569
+ shutil.copystat(src, dst)
570
+ return dst
571
+
572
+
573
+ def check_empty_files_folders(soda):
574
+ """
575
+ Function to check for empty files and folders
576
+
577
+ Args:
578
+ soda: soda dict with information about all specified files and folders
579
+ Output:
580
+ dict with the list of empty files, the list of empty folders, and the updated soda dict
581
+ """
582
+ try:
583
+ def recursive_empty_files_check(my_folder, my_relative_path, error_files):
584
+ for folder_key, folder in my_folder["folders"].items():
585
+ relative_path = my_relative_path + "/" + folder_key
586
+ error_files = recursive_empty_files_check(
587
+ folder, relative_path, error_files
588
+ )
589
+
590
+ for file_key in list(my_folder["files"].keys()):
591
+ file = my_folder["files"][file_key]
592
+ file_type = file.get("location")
593
+ if file_type == "local":
594
+ file_path = file["path"]
595
+ if isfile(file_path):
596
+ file_size = getsize(file_path)
597
+ if file_size == 0:
598
+ del my_folder["files"][file_key]
599
+ relative_path = my_relative_path + "/" + file_key
600
+ error_message = relative_path + " (path: " + file_path + ")"
601
+ error_files.append(error_message)
602
+
603
+ return error_files
604
+
605
+ def recursive_empty_local_folders_check(
606
+ my_folder,
607
+ my_folder_key,
608
+ my_folders_content,
609
+ my_relative_path,
610
+ error_folders,
611
+ ):
612
+ folders_content = my_folder["folders"]
613
+ for folder_key in list(my_folder["folders"].keys()):
614
+ folder = my_folder["folders"][folder_key]
615
+ relative_path = my_relative_path + "/" + folder_key
616
+ error_folders = recursive_empty_local_folders_check(
617
+ folder, folder_key, folders_content, relative_path, error_folders
618
+ )
619
+
620
+ if not my_folder["folders"] and not my_folder["files"]:
621
+ ignore = False
622
+ if "location" in my_folder and my_folder.get("location") == "ps":
623
+ ignore = True
624
+ if not ignore:
625
+ error_message = my_relative_path
626
+ error_folders.append(error_message)
627
+ del my_folders_content[my_folder_key]
628
+ return error_folders
629
+
630
+ error_files = []
631
+ error_folders = []
632
+ if "dataset-structure" in soda.keys():
633
+ dataset_structure = soda["dataset-structure"]
634
+ if "folders" in dataset_structure:
635
+ for folder_key, folder in dataset_structure["folders"].items():
636
+ relative_path = folder_key
637
+ error_files = recursive_empty_files_check(
638
+ folder, relative_path, error_files
639
+ )
640
+
641
+ folders_content = dataset_structure["folders"]
642
+ for folder_key in list(dataset_structure["folders"].keys()):
643
+ folder = dataset_structure["folders"][folder_key]
644
+ relative_path = folder_key
645
+ error_folders = recursive_empty_local_folders_check(
646
+ folder,
647
+ folder_key,
648
+ folders_content,
649
+ relative_path,
650
+ error_folders,
651
+ )
652
+
653
+ if "metadata-files" in soda.keys():
654
+ metadata_files = soda["metadata-files"]
655
+ for file_key in list(metadata_files.keys()):
656
+ file = metadata_files[file_key]
657
+ file_type = file.get("location")
658
+ if file_type == "local":
659
+ file_path = file["path"]
660
+ if isfile(file_path):
661
+ file_size = getsize(file_path)
662
+ if file_size == 0:
663
+ del metadata_files[file_key]
664
+ error_message = file_key + " (path: " + file_path + ")"
665
+ error_files.append(error_message)
666
+ if not metadata_files:
667
+ del soda["metadata-files"]
668
+
669
+ if len(error_files) > 0:
670
+ error_message = [
671
+ "The following local file(s) is/are empty (0 kb) and will be ignored."
672
+ ]
673
+ error_files = error_message + error_files
674
+
675
+ if len(error_folders) > 0:
676
+ error_message = [
677
+ "The SPARC dataset structure does not allow empty folders. The following empty folders will be removed from your dataset:"
678
+ ]
679
+ error_folders = error_message + error_folders
680
+
681
+ return {
682
+ "empty_files": error_files,
683
+ "empty_folders": error_folders,
684
+ "soda": soda
685
+ }
686
+
687
+ except Exception as e:
688
+ raise e
689
+
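A minimal sketch of the soda structure this check walks, and of what it returns (the file path is hypothetical):

soda = {
    "dataset-structure": {
        "folders": {
            "primary": {
                "folders": {},
                "files": {"notes.txt": {"location": "local", "path": "/data/notes.txt"}},
            }
        }
    }
}
result = check_empty_files_folders(soda)
# result["empty_files"] and result["empty_folders"] hold human-readable warnings,
# while result["soda"] is the same structure with the offending entries removed.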
690
+
691
+ def check_local_dataset_files_validity(soda):
692
+ """
693
+ Function to check that the local data files and folders specified in the dataset are valid
694
+
695
+ Args:
696
+ soda: soda dict with information about all specified files and folders
697
+ Output:
698
+ error: list of error messages for invalid local data files, if any
699
+ """
700
+
701
+ def recursive_local_file_check(my_folder, my_relative_path, error):
702
+ for folder_key, folder in my_folder["folders"].items():
703
+ relative_path = my_relative_path + "/" + folder_key
704
+ error = recursive_local_file_check(folder, relative_path, error)
705
+
706
+ for file_key in list(my_folder["files"].keys()):
707
+ file = my_folder["files"][file_key]
708
+ if file_key in ["manifest.xlsx", "manifest.csv"]:
709
+ continue
710
+ file_type = file.get("location")
711
+ if file_type == "local":
712
+ file_path = file["path"]
713
+ if file.get("location") == "ps":
714
+ continue
715
+ if not isfile(file_path):
716
+ relative_path = my_relative_path + "/" + file_key
717
+ error_message = relative_path + " (path: " + file_path + ")"
718
+ error.append(error_message)
719
+ else:
720
+ file_size = getsize(file_path)
721
+ if file_size == 0:
722
+ del my_folder["files"][file_key]
723
+
724
+ return error
725
+
726
+ def recursive_empty_local_folder_remove(
727
+ my_folder, my_folder_key, my_folders_content
728
+ ):
729
+
730
+ folders_content = my_folder["folders"]
731
+ for folder_key in list(my_folder["folders"].keys()):
732
+ folder = my_folder["folders"][folder_key]
733
+ recursive_empty_local_folder_remove(folder, folder_key, folders_content)
734
+
735
+ if not my_folder.get("folders") and not my_folder.get("files") and my_folder.get("location") != "ps":
736
+ del my_folders_content[my_folder_key]
737
+
738
+ error = []
739
+ if "dataset-structure" in soda.keys():
740
+ dataset_structure = soda["dataset-structure"]
741
+ # Remove 0kb files, files that can't be found, and any empty folders from the dataset data files
742
+ if "folders" in dataset_structure:
743
+ for folder_key, folder in dataset_structure["folders"].items():
744
+ relative_path = folder_key
745
+ error = recursive_local_file_check(folder, relative_path, error)
746
+
747
+ folders_content = dataset_structure["folders"]
748
+ for folder_key in list(dataset_structure["folders"].keys()):
749
+ folder = dataset_structure["folders"][folder_key]
750
+ recursive_empty_local_folder_remove(folder, folder_key, folders_content)
751
+
752
+ # Return list of all the files that were not found.
753
+ if len(error) > 0:
754
+ error_message = [
755
+ "Error: The following local files were not found. Specify them again or remove them."
756
+ ]
757
+ error = error_message + error
758
+
759
+ return error
760
+
761
+
762
+ # path to local SODA folder for saving manifest files
763
+ manifest_sparc = ["manifest.xlsx", "manifest.csv"]
764
+ manifest_folder_path = join(userpath, ".pysoda", "manifest_file")
765
+
766
+
767
+
768
+ def check_json_size(jsonStructure):
769
+ """
770
+ This function is called to check size of files that will be created locally on a user's device.
771
+ """
772
+ global total_dataset_size
773
+ total_dataset_size = 0
774
+
775
+ try:
776
+ def recursive_dataset_scan(folder):
777
+ global total_dataset_size
778
+
779
+ if "files" in folder.keys():
780
+ for file_key, file in folder["files"].items():
781
+ if "deleted" not in file["action"]:
782
+ file_type = file.get("location")
783
+ if file_type == "local":
784
+ file_path = file["path"]
785
+ if isfile(file_path):
786
+ total_dataset_size += getsize(file_path)
787
+
788
+ if "folders" in folder.keys():
789
+ for folder_key, folder in folder["folders"].items():
790
+ recursive_dataset_scan(folder)
791
+
792
+ # scan dataset structure
793
+ dataset_structure = jsonStructure["dataset-structure"]
794
+ folderSection = dataset_structure["folders"]
795
+ # gets keys like code, primary, source and their content...
796
+ for keys, contents in folderSection.items():
797
+ recursive_dataset_scan(contents)
798
+
799
+ if "metadata-files" in jsonStructure.keys():
800
+ metadata_files = jsonStructure["metadata-files"]
801
+ for file_key, file in metadata_files.items():
802
+ if file.get("location") == "local":
803
+ metadata_path = file["path"]
804
+ if isfile(metadata_path) and "new" in file["action"]:
805
+ total_dataset_size += getsize(metadata_path)
806
+
807
+ if "manifest-files" in jsonStructure.keys():
808
+ manifest_files_structure = create_high_level_manifest_files(jsonStructure, manifest_folder_path)
809
+ for key in manifest_files_structure.keys():
810
+ manifestpath = manifest_files_structure[key]
811
+ if isfile(manifestpath):
812
+ total_dataset_size += getsize(manifestpath)
813
+
814
+ # returns in bytes
815
+ return {"dataset_size": total_dataset_size}
816
+ except Exception as e:
817
+ raise e
818
+
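Illustrative call with a minimal, hypothetical structure (real soda objects carry many more keys):

size_info = check_json_size({
    "dataset-structure": {
        "folders": {
            "primary": {
                "folders": {},
                "files": {"notes.txt": {"location": "local", "path": "/data/notes.txt", "action": ["new"]}},
            }
        }
    }
})
print(size_info["dataset_size"], "bytes")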
819
+
820
+ def generate_dataset_locally(soda):
821
+ global logger
822
+ logger.info("starting generate_dataset_locally")
823
+
824
+ # Vars used for tracking progress on the frontend
825
+ global main_curate_progress_message
826
+ global progress_percentage
827
+ global main_total_generate_dataset_size
828
+ global start_generate
829
+ global main_curation_uploaded_files
830
+
831
+ main_curation_uploaded_files = 0
832
+
833
+ def recursive_dataset_scan(
834
+ my_folder, my_folderpath, list_copy_files, list_move_files
835
+ ):
836
+ global main_total_generate_dataset_size
837
+
838
+ if "folders" in my_folder.keys():
839
+ for folder_key, folder in my_folder["folders"].items():
840
+ folderpath = join(my_folderpath, folder_key)
841
+ if not isdir(folderpath):
842
+ mkdir(folderpath)
843
+ list_copy_files, list_move_files = recursive_dataset_scan(
844
+ folder, folderpath, list_copy_files, list_move_files
845
+ )
846
+
847
+ if "files" in my_folder.keys():
848
+ for file_key, file in my_folder["files"].items():
849
+ if "deleted" not in file["action"]:
850
+ file_type = file.get("location")
851
+ if file_type == "local":
852
+ file_path = file["path"]
853
+ if isfile(file_path):
854
+ destination_path = abspath(
855
+ join(my_folderpath, file_key)
856
+ )
857
+ if not isfile(destination_path):
858
+ if (
859
+ "existing" in file["action"]
860
+ and soda["generate-dataset"][
861
+ "if-existing"
862
+ ]
863
+ == "merge"
864
+ ):
865
+ list_move_files.append(
866
+ [file_path, destination_path]
867
+ )
868
+ else:
869
+ main_total_generate_dataset_size += getsize(
870
+ file_path
871
+ )
872
+ list_copy_files.append(
873
+ [file_path, destination_path]
874
+ )
875
+ else:
876
+ logger.info(f"file_path {file_path} does not exist. Skipping.")
877
+ return list_copy_files, list_move_files
878
+
879
+
880
+ logger.info("generate_dataset_locally step 1")
881
+ # 1. Create a new folder for the dataset (if "if-existing" is "merge", it replaces the original folder in step 7)
882
+ main_curate_progress_message = "Generating folder structure and list of files to be included in the dataset"
883
+ dataset_absolute_path = soda["generate-dataset"]["path"]
884
+ if_existing = soda["generate-dataset"]["if-existing"]
885
+ dataset_name = soda["generate-dataset"]["dataset-name"]
886
+ datasetpath = join(dataset_absolute_path, dataset_name)
887
+ datasetpath = return_new_path(datasetpath)
888
+ mkdir(datasetpath)
889
+
890
+ logger.info("generate_dataset_locally step 2")
891
+ # 2. Scan the dataset structure and:
892
+ # 2.1. Create all folders (with new name if renamed)
893
+ # 2.2. Compile a list of files to be copied and a list of files to be moved (with new name recorded if renamed)
894
+ list_copy_files = []
895
+ list_move_files = []
896
+ dataset_structure = soda["dataset-structure"]
897
+
898
+ for folder_key, folder in dataset_structure["folders"].items():
899
+ folderpath = join(datasetpath, folder_key)
900
+ mkdir(folderpath)
901
+ list_copy_files, list_move_files = recursive_dataset_scan(
902
+ folder, folderpath, list_copy_files, list_move_files
903
+ )
904
+
905
+ # 3. Create the high-level metadata files directly in the dataset folder
906
+ if "dataset_metadata" in soda.keys():
907
+ logger.info("generate_dataset_locally (optional) step 3 handling metadata-files")
908
+ metadata_files = soda["dataset_metadata"]
909
+ # create each metadata file specified in the soda object
910
+ for file_key, _ in metadata_files.items():
911
+ if file_key == "subjects":
912
+ subjects.create_excel(soda, False, join(datasetpath, "subjects.xlsx"))
913
+ elif file_key == "samples":
914
+ samples.create_excel(soda, False, join(datasetpath, "samples.xlsx"))
915
+ elif file_key == "code_description":
916
+ code_description.create_excel(soda, False, join(datasetpath, "code_description.xlsx"))
917
+ elif file_key == "dataset_description":
918
+ dataset_description.create_excel(soda, False, join(datasetpath, "dataset_description.xlsx"))
919
+ elif file_key == "performances":
920
+ performances.create_excel(soda, False, join(datasetpath, "performances.xlsx"))
921
+ elif file_key == "resources":
922
+ resources.create_excel(soda, False, join(datasetpath, "resources.xlsx"))
923
+ elif file_key == "sites":
924
+ sites.create_excel(soda, False, join(datasetpath, "sites.xlsx"))
925
+ elif file_key == "submission":
926
+ submission.create_excel(soda, False, join(datasetpath, "submission.xlsx"))
927
+ elif file_key == "README.md":
928
+ text_metadata.create_text_file(soda, False, join(datasetpath, "README.md"), "README.md")
929
+ elif file_key == "CHANGES":
930
+ text_metadata.create_text_file(soda, False, join(datasetpath, "CHANGES"), "CHANGES")
931
+ elif file_key == "LICENSE":
932
+ text_metadata.create_text_file(soda, False, join(datasetpath, "LICENSE"), "LICENSE")
933
+
934
+ # 4. Create the manifest file in the dataset folder
935
+ if "manifest_file" in soda["dataset_metadata"].keys():
936
+ logger.info("generate_dataset_locally (optional) step 4 handling manifest-files")
937
+ main_curate_progress_message = "Preparing manifest files"
938
+ manifest.create_excel(soda, False, join(datasetpath, "manifest.xlsx"))
939
+
940
+
941
+ logger.info("generate_dataset_locally step 5 moving files to new location")
942
+ # 5. Move files to new location
943
+ main_curate_progress_message = "Moving files to new location"
944
+ for fileinfo in list_move_files:
945
+ srcfile = fileinfo[0]
946
+ distfile = fileinfo[1]
947
+ main_curate_progress_message = f"Moving file {str(srcfile)} to {str(distfile)}"
948
+ shutil.move(srcfile, distfile)
949
+
950
+ logger.info("generate_dataset_locally step 6 copying files to new location")
951
+ # 6. Copy files to new location
952
+ main_curate_progress_message = "Copying files to new location"
953
+ start_generate = 1
954
+ for fileinfo in list_copy_files:
955
+ srcfile = fileinfo[0]
956
+ distfile = fileinfo[1]
957
+ main_curate_progress_message = f"Copying file {str(srcfile)} to {str(distfile)}"
958
+ # track the number of copied files for logging purposes
959
+ mycopyfile_with_metadata(srcfile, distfile)
960
+ main_curation_uploaded_files += 1
961
+
962
+ logger.info("generate_dataset_locally step 7")
963
+ # 7. Delete manifest folder and original folder if merge requested and rename new folder
964
+ shutil.rmtree(manifest_folder_path) if isdir(manifest_folder_path) else 0
965
+ if if_existing == "merge":
966
+ logger.info("generate_dataset_locally (optional) step 7.1 delete manifest folder if merge requested")
967
+ main_curate_progress_message = "Finalizing dataset"
968
+ original_dataset_path = join(dataset_absolute_path, dataset_name)
969
+ shutil.rmtree(original_dataset_path)
970
+ rename(datasetpath, original_dataset_path)
971
+ open_file(join(dataset_absolute_path, original_dataset_path))
972
+ else:
973
+ open_file(join(dataset_absolute_path, datasetpath))
974
+ return datasetpath, main_total_generate_dataset_size
975
+
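A condensed sketch of the keys generate_dataset_locally reads (values are hypothetical; it also assumes the module's global progress counters, e.g. main_total_generate_dataset_size, have been initialized by the caller):

soda = {
    "generate-dataset": {
        "path": "/home/user/datasets",       # destination parent folder
        "dataset-name": "my-sparc-dataset",
        "if-existing": "new",                # or "merge" to fold into an existing folder
    },
    "dataset-structure": {"folders": {"primary": {"folders": {}, "files": {}}}},
    "dataset_metadata": {},                  # optional metadata files to generate
}
dataset_path, generated_size = generate_dataset_locally(soda)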
976
+
977
+
978
+
979
+
980
+ def ps_create_new_dataset(datasetname, ps):
981
+ """
982
+ Args:
983
+ datasetname: name of the dataset to be created (string)
984
+ ps: authenticated Pennsieve account/client object
985
+ Action:
986
+ Creates dataset for the account specified
987
+ """
988
+ try:
989
+ error, count = "", 0
990
+ datasetname = datasetname.strip()
991
+
992
+ if check_forbidden_characters_ps(datasetname):
993
+ error = (
994
+ f"{error}Error: A Pennsieve dataset name cannot contain any of the following characters: "
995
+ + forbidden_characters_bf
996
+ + "<br>"
997
+ )
998
+ count += 1
999
+
1000
+ if not datasetname:
1001
+ error = f"{error}Error: Please enter valid dataset name<br>"
1002
+ count += 1
1003
+
1004
+ if datasetname.isspace():
1005
+ error = error + "Error: Please enter valid dataset name" + "<br>"
1006
+ count += 1
1007
+
1008
+ if count > 0:
1009
+ raise PennsieveDatasetNameInvalid(datasetname)
1010
+
1011
+ try:
1012
+ dataset_list = get_users_dataset_list()
1013
+ except Exception as e:
1014
+ raise Exception("Failed to retrieve datasets from Pennsieve. Please try again later.")
1015
+
1016
+ for dataset in dataset_list:
1017
+ if datasetname == dataset["content"]["name"]:
1018
+ raise PennsieveDatasetNameTaken("Dataset name already exists")
1019
+
1020
+
1021
+ # Create the dataset on Pennsieve
1022
+ r = requests.post(f"{PENNSIEVE_URL}/datasets", headers=create_request_headers(ps), json={"name": datasetname})
1023
+ r.raise_for_status()
1024
+
1025
+
1026
+ return r.json()
1027
+
1028
+ # TODO: Remove unnecessary raise
1029
+ except Exception as e:
1030
+ raise e
1031
+
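Hypothetical call, assuming ps is the authenticated Pennsieve client object produced elsewhere in this module by connect_pennsieve_client():

new_ds = ps_create_new_dataset("My SPARC dataset", ps)
# the response is expected to mirror the dataset objects used elsewhere in this file (content -> id, name, ...)
print(new_ds["content"]["id"])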
1032
+ double_extensions = [
1033
+ ".ome.tiff",
1034
+ ".ome.tif",
1035
+ ".ome.tf2,",
1036
+ ".ome.tf8",
1037
+ ".ome.btf",
1038
+ ".ome.xml",
1039
+ ".brukertiff.gz",
1040
+ ".mefd.gz",
1041
+ ".moberg.gz",
1042
+ ".nii.gz",
1043
+ ".mgh.gz",
1044
+ ".tar.gz",
1045
+ ".bcl.gz",
1046
+ ]
1047
+
1048
+
1049
+ def create_high_lvl_manifest_files_existing_ps(
1050
+ soda, ps, my_tracking_folder
1051
+ ):
1052
+ """
1053
+ Function to create manifest files for each high-level SPARC folder.
1054
+
1055
+ Args:
1056
+ soda: soda dict with information about the dataset to be generated/modified
1057
+ Returns:
1058
+ manifest_files_structure: dict including the local path of the manifest files
1059
+ """
1060
+ def get_name_extension(file_name):
1061
+ double_ext = False
1062
+ for ext in double_extensions:
1063
+ if file_name.find(ext) != -1:
1064
+ double_ext = True
1065
+ break
1066
+ ext = ""
1067
+ name = ""
1068
+ if double_ext == False:
1069
+ name = os.path.splitext(file_name)[0]
1070
+ ext = os.path.splitext(file_name)[1]
1071
+ else:
1072
+ ext = (
1073
+ os.path.splitext(os.path.splitext(file_name)[0])[1]
1074
+ + os.path.splitext(file_name)[1]
1075
+ )
1076
+ name = os.path.splitext(os.path.splitext(file_name)[0])[0]
1077
+ return name, ext
1078
+
1079
+ def recursive_import_ps_manifest_info(
1080
+ folder, my_relative_path, dict_folder_manifest, manifest_df
1081
+ ):
1082
+ """
1083
+ Import manifest information from the Pennsieve dataset for the given folder and its children.
1084
+ """
1085
+
1086
+ if len(folder['children']) == 0:
1087
+ limit = 100
1088
+ offset = 0
1089
+ ps_folder = {"children": []}
1090
+ while True:
1091
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{folder['content']['id']}?limit={limit}&offset={offset}", headers=create_request_headers(ps), json={"include": "files"})
1092
+ r.raise_for_status()
1093
+ page = r.json()
1094
+ normalize_tracking_folder(page)
1095
+ ps_folder["children"].extend(page)
1096
+
1097
+ if len(page) < limit:
1098
+ break
1099
+ offset += limit
1100
+
1101
+ folder['children'] = ps_folder['children']
1102
+
1103
+ for _, folder_item in folder["children"]["folders"].items():
1104
+ folder_name = folder_item['content']['name']
1105
+ relative_path = generate_relative_path(
1106
+ my_relative_path, folder_name
1107
+ )
1108
+ dict_folder_manifest = recursive_import_ps_manifest_info(
1109
+ folder_item, relative_path, dict_folder_manifest, manifest_df
1110
+ )
1111
+ for _, file in folder["children"]["files"].items():
1112
+ if file['content']['name'] != "manifest":
1113
+ file_id = file['content']['id']
1114
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{file_id}/view", headers=create_request_headers(ps))
1115
+ r.raise_for_status()
1116
+ file_details = r.json()
1117
+ file_name = file_details[0]["content"]["name"]
1118
+ file_extension = splitext(file_name)[1]
1119
+ file_name_with_extension = (
1120
+ splitext(file['content']['name'])[0] + file_extension
1121
+ )
1122
+ relative_path = generate_relative_path(
1123
+ my_relative_path, file_name_with_extension
1124
+ )
1125
+ dict_folder_manifest["filename"].append(relative_path)
1126
+ # file type
1127
+ _, file_extension = get_name_extension(file_name)
1128
+ if file_extension == "":
1129
+ file_extension = "None"
1130
+ dict_folder_manifest["file type"].append(file_extension)
1131
+ # timestamp, description, Additional Metadata
1132
+ if not manifest_df.empty:
1133
+ if relative_path in manifest_df["filename"].values:
1134
+ timestamp = manifest_df[
1135
+ manifest_df["filename"] == relative_path
1136
+ ]["timestamp"].iloc[0]
1137
+ description = manifest_df[
1138
+ manifest_df["filename"] == relative_path
1139
+ ]["description"].iloc[0]
1140
+ additional_metadata = manifest_df[
1141
+ manifest_df["filename"] == relative_path
1142
+ ]["Additional Metadata"].iloc[0]
1143
+ else:
1144
+ timestamp = ""
1145
+ description = ""
1146
+ additional_metadata = ""
1147
+ dict_folder_manifest["timestamp"].append(timestamp)
1148
+ dict_folder_manifest["description"].append(description)
1149
+ dict_folder_manifest["Additional Metadata"].append(
1150
+ additional_metadata
1151
+ )
1152
+ else:
1153
+ dict_folder_manifest["timestamp"].append("")
1154
+ dict_folder_manifest["description"].append("")
1155
+ dict_folder_manifest["Additional Metadata"].append("")
1156
+ return dict_folder_manifest
1157
+
1158
+ # Merge existing folders
1159
+ def recursive_manifest_builder_existing_ps(
1160
+ my_folder,
1161
+ my_bf_folder,
1162
+ my_bf_folder_exists,
1163
+ my_relative_path,
1164
+ dict_folder_manifest,
1165
+ ):
1166
+ if "folders" in my_folder.keys():
1167
+ if my_bf_folder_exists:
1168
+ (
1169
+ _, my_bf_existing_folders_name,
1170
+ ) = ps_get_existing_folders_details(my_bf_folder['children']['folders'])
1171
+ else:
1172
+ my_bf_existing_folders_name = []
1173
+ for folder_key, folder in my_folder["folders"].items():
1174
+ relative_path = generate_relative_path(my_relative_path, folder_key)
1175
+ if folder_key in my_bf_existing_folders_name:
1176
+ bf_folder = my_bf_folder["children"]["folders"][folder_key]
1177
+ bf_folder_exists = True
1178
+ else:
1179
+ bf_folder = ""
1180
+ bf_folder_exists = False
1181
+ dict_folder_manifest = recursive_manifest_builder_existing_ps(
1182
+ folder,
1183
+ bf_folder,
1184
+ bf_folder_exists,
1185
+ relative_path,
1186
+ dict_folder_manifest,
1187
+ )
1188
+ if "files" in my_folder.keys():
1189
+ if my_bf_folder_exists:
1190
+ (
1191
+ my_bf_existing_files_name,
1192
+ my_bf_existing_files_name_with_extension,
1193
+ ) = ps_get_existing_files_details(my_bf_folder)
1194
+ else:
1195
+ my_bf_existing_files = []
1196
+ my_bf_existing_files_name = []
1197
+ my_bf_existing_files_name_with_extension = []
1198
+ for file_key, file in my_folder["files"].items():
1199
+ if file.get("location") == "local":
1200
+ file_path = file["path"]
1201
+ if isfile(file_path):
1202
+ desired_name = splitext(file_key)[0]
1203
+ file_extension = splitext(file_key)[1]
1204
+ # manage existing file request
1205
+ if existing_file_option == "skip" and file_key in my_bf_existing_files_name_with_extension:
1206
+ continue
1207
+ if existing_file_option == "replace" and file_key in my_bf_existing_files_name_with_extension:
1208
+ # remove existing from manifest
1209
+ filename = generate_relative_path(
1210
+ my_relative_path, file_key
1211
+ )
1212
+ filename_list = dict_folder_manifest["filename"]
1213
+ index_file = filename_list.index(filename)
1214
+ del dict_folder_manifest["filename"][index_file]
1215
+ del dict_folder_manifest["timestamp"][index_file]
1216
+ del dict_folder_manifest["description"][index_file]
1217
+ del dict_folder_manifest["file type"][index_file]
1218
+ del dict_folder_manifest["Additional Metadata"][
1219
+ index_file
1220
+ ]
1221
+ index_name = (
1222
+ my_bf_existing_files_name_with_extension.index(
1223
+ file_key
1224
+ )
1225
+ )
1227
+ del my_bf_existing_files_name[index_name]
1228
+ del my_bf_existing_files_name_with_extension[
1229
+ index_name
1230
+ ]
1231
+ if desired_name not in my_bf_existing_files_name:
1232
+ final_name = file_key
1233
+ else:
1234
+ # expected final name
1235
+ count_done = 0
1236
+ final_name = desired_name
1237
+ output = get_base_file_name(desired_name)
1238
+ if output:
1239
+ base_name = output[0]
1240
+ count_exist = output[1]
1241
+ while count_done == 0:
1242
+ if final_name in my_bf_existing_files_name:
1243
+ count_exist += 1
1244
+ final_name = (
1245
+ base_name + "(" + str(count_exist) + ")"
1246
+ )
1247
+ else:
1248
+ count_done = 1
1249
+ else:
1250
+ count_exist = 0
1251
+ while count_done == 0:
1252
+ if final_name in my_bf_existing_files_name:
1253
+ count_exist += 1
1254
+ final_name = (
1255
+ desired_name
1256
+ + " ("
1257
+ + str(count_exist)
1258
+ + ")"
1259
+ )
1260
+ else:
1261
+ count_done = 1
1262
+ final_name = final_name + file_extension
1263
+ my_bf_existing_files_name.append(
1264
+ splitext(final_name)[0]
1265
+ )
1266
+ # filename
1267
+ filename = generate_relative_path(
1268
+ my_relative_path, final_name
1269
+ )
1270
+ dict_folder_manifest["filename"].append(filename)
1271
+ # timestamp
1272
+ file_path = file["path"]
1273
+ filepath = pathlib.Path(file_path)
1274
+ mtime = filepath.stat().st_mtime
1275
+ lastmodtime = datetime.fromtimestamp(mtime).astimezone(
1276
+ local_timezone
1277
+ )
1278
+ dict_folder_manifest["timestamp"].append(
1279
+ lastmodtime.isoformat()
1280
+ .replace(".", ",")
1281
+ .replace("+00:00", "Z")
1282
+ )
1283
+ # description
1284
+ if "description" in file.keys():
1285
+ dict_folder_manifest["description"].append(
1286
+ file["description"]
1287
+ )
1288
+ else:
1289
+ dict_folder_manifest["description"].append("")
1290
+ # file type
1291
+ if file_extension == "":
1292
+ file_extension = "None"
1293
+ dict_folder_manifest["file type"].append(file_extension)
1294
+ # additional metadata
1295
+ if "additional-metadata" in file.keys():
1296
+ dict_folder_manifest["Additional Metadata"].append(
1297
+ file["additional-metadata"]
1298
+ )
1299
+ else:
1300
+ dict_folder_manifest["Additional Metadata"].append("")
1301
+ return dict_folder_manifest
1302
+
1303
+ double_extensions = [
1304
+ ".ome.tiff",
1305
+ ".ome.tif",
1306
+ ".ome.tf2,",
1307
+ ".ome.tf8",
1308
+ ".ome.btf",
1309
+ ".ome.xml",
1310
+ ".brukertiff.gz",
1311
+ ".mefd.gz",
1312
+ ".moberg.gz",
1313
+ ".nii.gz",
1314
+ ".mgh.gz",
1315
+ ".tar.gz",
1316
+ ".bcl.gz",
1317
+ ]
1318
+
1319
+ try:
1320
+ # create a local folder to save manifest files temporarily (delete any existing one first)
1321
+ shutil.rmtree(manifest_folder_path) if isdir(manifest_folder_path) else 0
1322
+ makedirs(manifest_folder_path)
1323
+
1324
+ # import info about files already on ps
1325
+ dataset_structure = soda["dataset-structure"]
1326
+ manifest_dict_save = {}
1327
+ for high_level_folder_key, high_level_folder in my_tracking_folder["children"]["folders"].items():
1328
+ if (
1329
+ high_level_folder_key in dataset_structure["folders"].keys()
1330
+ ):
1331
+
1332
+ relative_path = ""
1333
+ dict_folder_manifest = {}
1334
+ # Initialize dict where manifest info will be stored
1335
+ dict_folder_manifest["filename"] = []
1336
+ dict_folder_manifest["timestamp"] = []
1337
+ dict_folder_manifest["description"] = []
1338
+ dict_folder_manifest["file type"] = []
1339
+ dict_folder_manifest["Additional Metadata"] = []
1340
+
1341
+ # pull manifest file info if one already exists
1342
+ manifest_df = pd.DataFrame()
1343
+ for file_key, file in high_level_folder['children']['files'].items():
1344
+ file_id = file['content']['id']
1345
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{file_id}/view", headers=create_request_headers(ps))
1346
+ r.raise_for_status()
1347
+ file_details = r.json()
1348
+ file_name_with_extension = file_details[0]["content"]["name"]
1349
+ if file_name_with_extension in manifest_sparc:
1350
+ file_id_2 = file_details[0]["content"]["id"]
1351
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{file_id}/files/{file_id_2}", headers=create_request_headers(ps))
1352
+ r.raise_for_status()
1353
+ file_url_info = r.json()
1354
+ file_url = file_url_info["url"]
1355
+ manifest_df = pd.read_excel(file_url, engine="openpyxl")
1356
+ manifest_df = manifest_df.fillna("")
1357
+ if (
1358
+ "filename" not in manifest_df.columns
1359
+ or "description" not in manifest_df.columns
1360
+ or "Additional Metadata" not in manifest_df.columns
1361
+ ):
1362
+ manifest_df = pd.DataFrame()
1363
+ break
1364
+
1365
+ # store the data frame pulled from Pennsieve into a dictionary
1366
+ dict_folder_manifest = recursive_import_ps_manifest_info(
1367
+ high_level_folder, relative_path, dict_folder_manifest, manifest_df
1368
+ )
1369
+
1370
+ manifest_dict_save[high_level_folder_key] = {
1371
+ "manifest": dict_folder_manifest,
1372
+ "bf_folder": high_level_folder,
1373
+ }
1374
+
1375
+ # import info from local files to be uploaded
1376
+ local_timezone = TZLOCAL()
1377
+ manifest_files_structure = {}
1378
+ existing_folder_option = soda["generate-dataset"]["if-existing"]
1379
+ existing_file_option = soda["generate-dataset"][
1380
+ "if-existing-files"
1381
+ ]
1382
+ for folder_key, folder in dataset_structure["folders"].items():
1383
+ relative_path = ""
1384
+
1385
+ if (
1386
+ folder_key in manifest_dict_save
1387
+ and existing_folder_option == "merge"
1388
+ ):
1389
+ bf_folder = manifest_dict_save[folder_key]["bf_folder"]
1390
+ bf_folder_exists = True
1391
+ dict_folder_manifest = manifest_dict_save[folder_key]["manifest"]
1392
+
1393
+ elif (
1394
+ folder_key in manifest_dict_save
1395
+ and folder_key
1396
+ not in my_tracking_folder["children"]["folders"].keys()
1397
+ and existing_folder_option == "skip"
1398
+ ):
1399
+ continue
1400
+
1401
+ else:
1402
+ bf_folder = ""
1403
+ bf_folder_exists = False
1404
+ dict_folder_manifest = {}
1405
+ dict_folder_manifest["filename"] = []
1406
+ dict_folder_manifest["timestamp"] = []
1407
+ dict_folder_manifest["description"] = []
1408
+ dict_folder_manifest["file type"] = []
1409
+ dict_folder_manifest["Additional Metadata"] = []
1410
+
1411
+ dict_folder_manifest = recursive_manifest_builder_existing_ps(
1412
+ folder, bf_folder, bf_folder_exists, relative_path, dict_folder_manifest
1413
+ )
1414
+
1415
+ # create high-level folder at the temporary location
1416
+ folderpath = join(manifest_folder_path, folder_key)
1417
+ makedirs(folderpath)
1418
+
1419
+ # save manifest file
1420
+ manifestfilepath = join(folderpath, "manifest.xlsx")
1421
+ df = pd.DataFrame.from_dict(dict_folder_manifest)
1422
+ df.to_excel(manifestfilepath, index=None, header=True)
1423
+ wb = load_workbook(manifestfilepath)
1424
+ ws = wb.active
1425
+
1426
+ blueFill = PatternFill(
1427
+ start_color="9DC3E6", fill_type="solid"
1428
+ )
1429
+ greenFill = PatternFill(
1430
+ start_color="A8D08D", fill_type="solid"
1431
+ )
1432
+ yellowFill = PatternFill(
1433
+ start_color="FFD965", fill_type="solid"
1434
+ )
1435
+ ws['A1'].fill = blueFill
1436
+ ws['B1'].fill = greenFill
1437
+ ws['C1'].fill = greenFill
1438
+ ws['D1'].fill = greenFill
1439
+ ws['E1'].fill = yellowFill
1440
+ wb.save(manifestfilepath)
1441
+
1442
+ manifest_files_structure[folder_key] = manifestfilepath
1443
+
1444
+ return manifest_files_structure
1445
+
1446
+ except Exception as e:
1447
+ raise e
1448
+
1449
+
1450
+
1451
+
1452
+
1453
+ def generate_relative_path(x, y):
1454
+ return x + "/" + y if x else y
1455
+
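Behaviour of the small path helper above, for reference (illustrative only):

assert generate_relative_path("", "primary") == "primary"
assert generate_relative_path("primary", "sub-1.csv") == "primary/sub-1.csv"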
1456
+
1457
+ def ps_get_existing_folders_details(ps_folders):
1458
+ ps_existing_folders = [ps_folders[folder] for folder in ps_folders if ps_folders[folder]["content"]["packageType"] == "Collection"]
1459
+ ps_existing_folders_name = [folder['content']["name"] for folder in ps_existing_folders]
1460
+
1461
+ return ps_existing_folders, ps_existing_folders_name
1462
+
1463
+
1464
+ def ps_get_existing_files_details(ps_folder):
1465
+ # TODO: Dorian -> ["extensions doesn't seem to be returned anymore by the endpoint"]
1466
+ def verify_file_name(file_name, extension):
1467
+ if extension == "":
1468
+ return file_name
1469
+
1470
+ double_ext = False
1471
+ for ext in double_extensions:
1472
+ if file_name.find(ext) != -1:
1473
+ double_ext = True
1474
+ break
1475
+
1476
+ extension_from_name = ""
1477
+
1478
+ if double_ext == False:
1479
+ extension_from_name = os.path.splitext(file_name)[1]
1480
+ else:
1481
+ extension_from_name = (
1482
+ os.path.splitext(os.path.splitext(file_name)[0])[1]
1483
+ + os.path.splitext(file_name)[1]
1484
+ )
1485
+
1486
+ if extension_from_name == ("." + extension):
1487
+ return file_name
1488
+ else:
1489
+ return file_name + ("." + extension)
1490
+
1491
+ files = ps_folder["children"]["files"]
1492
+ double_extensions = [
1493
+ ".ome.tiff",
1494
+ ".ome.tif",
1495
+ ".ome.tf2,",
1496
+ ".ome.tf8",
1497
+ ".ome.btf",
1498
+ ".ome.xml",
1499
+ ".brukertiff.gz",
1500
+ ".mefd.gz",
1501
+ ".moberg.gz",
1502
+ ".nii.gz",
1503
+ ".mgh.gz",
1504
+ ".tar.gz",
1505
+ ".bcl.gz",
1506
+ ]
1507
+
1508
+
1509
+ bf_existing_files_name = [splitext(files[file]['content']["name"])[0] for file in files]
1510
+ bf_existing_files_name_with_extension = []
1511
+
1512
+ # determine if we are at the root of the dataset
1513
+ content = ps_folder["content"]
1514
+ if (str(content['id'])[2:9]) == "dataset":
1515
+ r = requests.get(f"{PENNSIEVE_URL}/datasets/{content['id']}", headers=create_request_headers(get_access_token()))
1516
+ r.raise_for_status()
1517
+ root_folder = r.json()
1518
+ root_children = root_folder["children"]
1519
+ for item in root_children:
1520
+ file_name_with_extension = ""
1521
+ item_id = item["content"]["id"]
1522
+ item_name = item["content"]["name"]
1523
+ if item_id[2:9] == "package":
1524
+ if("extension" not in root_children):
1525
+ file_name_with_extension = verify_file_name(item_name,"")
1526
+ else:
1527
+ file_name_with_extension = verify_file_name(item_name, item["extension"])
1528
+
1529
+ if file_name_with_extension == "":
1530
+ continue
1531
+ bf_existing_files_name_with_extension.append(file_name_with_extension)
1532
+ else:
1533
+ # is a collection, i.e. a folder in the dataset
1534
+ for file_key, file in files.items():
1535
+ file_name_with_extension = ""
1536
+ file_name = file["content"]["name"]
1537
+ file_id = file["content"]["id"]
1538
+ if file_id[2:9] == "package":
1539
+ if "extension" not in file:
1540
+ file_name_with_extension = verify_file_name(file_name,"")
1541
+ else:
1542
+ file_name_with_extension = verify_file_name(file_name, file["extension"])
1543
+ if file_name_with_extension == "":
1544
+ continue
1545
+ bf_existing_files_name_with_extension.append(file_name_with_extension)
1546
+
1547
+
1548
+ return (
1549
+ bf_existing_files_name,
1550
+ bf_existing_files_name_with_extension,
1551
+ )
1552
+
1553
+
1554
+ def check_if_int(s):
1555
+ try:
1556
+ int(s)
1557
+ return True
1558
+ except ValueError:
1559
+ return False
1560
+
1561
+
1562
+ def get_base_file_name(file_name):
1563
+ output = []
1564
+ if file_name[-1] == ")":
1565
+ string_length = len(file_name)
1566
+ count_start = string_length
1567
+ character = file_name[count_start - 1]
1568
+ while character != "(" and count_start >= 0:
1569
+ count_start -= 1
1570
+ character = file_name[count_start - 1]
1571
+ if character == "(":
1572
+ base_name = file_name[:count_start - 1]
1573
+ num = file_name[count_start : string_length - 1]
1574
+ if check_if_int(num):
1575
+ output = [base_name, int(num)]
1576
+ return output
1577
+
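Worked examples for the duplicate-name helpers above (illustrative only):

assert check_if_int("3") and not check_if_int("3a")
assert get_base_file_name("scan(3)") == ["scan", 3]
assert get_base_file_name("scan") == []   # no "(n)" suffix, so callers fall back to appending one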
1578
+
1579
+ def ps_update_existing_dataset(soda, ds, ps, resume):
1580
+ global logger
1581
+
1582
+ logger.info("Starting ps_update_existing_dataset")
1583
+
1584
+ global main_curate_progress_message
1585
+ global main_total_generate_dataset_size
1586
+ global start_generate
1587
+ global main_initial_bfdataset_size
1588
+
1589
+ # Delete any files on Pennsieve that have been marked as deleted
1590
+ def recursive_file_delete(folder):
1591
+ if "files" in folder.keys():
1592
+ for item in list(folder["files"]):
1593
+ if "deleted" in folder["files"][item]["action"]:
1594
+ file_path = folder["files"][item]["path"]
1595
+ # remove the file from the dataset
1596
+ r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [file_path]})
1597
+ r.raise_for_status()
1598
+ # remove the file from the soda json structure
1599
+ del folder["files"][item]
1600
+
1601
+ for item in list(folder["folders"]):
1602
+ recursive_file_delete(folder["folders"][item])
1603
+
1604
+ # Delete any files on Pennsieve that have been marked as deleted
1605
+ def metadata_file_delete(soda):
1606
+ if "dataset_metadata" in soda.keys():
1607
+ folder = soda["dataset_metadata"]
1608
+ for item in list(folder):
1609
+ if "deleted" in folder[item]["action"]:
1610
+ r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [folder[item]["path"]]})
1611
+ r.raise_for_status()
1612
+ del folder[item]
1613
+
1614
+
1615
+ def recursive_item_path_create(folder, path):
1616
+ """
1617
+ Recursively add a "folderpath" key, containing the full folder path, to every file and folder in the
1618
+ local data structure.
1619
+ This allows us to check whether the folder path of a specific file already
1620
+ exists on Pennsieve.
1621
+ """
1622
+
1623
+ if "files" in folder.keys():
1624
+ for item in list(folder["files"]):
1625
+ if item in ["manifest.xlsx", "manifest.csv"]:
1626
+ continue
1627
+ if "folderpath" not in folder["files"][item]:
1628
+ folder["files"][item]["folderpath"] = path[:]
1629
+
1630
+ if "folders" in folder.keys():
1631
+ for item in list(folder["folders"]):
1632
+ if "folderpath" not in folder["folders"][item]:
1633
+ folder["folders"][item]["folderpath"] = path[:]
1634
+ folder["folders"][item]["folderpath"].append(item)
1635
+ recursive_item_path_create(
1636
+ folder["folders"][item], folder["folders"][item]["folderpath"][:]
1637
+ )
1638
+
1639
+ return
1640
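+ # Example of the effect (illustrative only): starting from an empty path, a structure like
+ #   {"folders": {"primary": {"folders": {"sub-1": {"folders": {}, "files": {"a.csv": {...}}}}, "files": {}}}, "files": {}}
+ # ends up with folder["folders"]["primary"]["folderpath"] == ["primary"], and the nested
+ # "a.csv" entry is tagged with folderpath == ["primary", "sub-1"].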
+
1641
+ # Check for and create any folders that do not yet exist for the file move process (used in the recursive_check_moved_files function)
1642
+ def recursive_check_and_create_ps_file_path(
1643
+ folderpath, index, current_folder_structure
1644
+ ):
1645
+ folder = folderpath[index]
1646
+
1647
+ if folder not in current_folder_structure["folders"]:
1648
+ # the same create-folder request applies whether or not this is the first folder in the path
1649
+ r = requests.post(f"{PENNSIEVE_URL}/packages", json={"name": folder, "parent": f"{current_folder_structure['path']}", "packageType": "collection", "dataset": ds['content']['id']}, headers=create_request_headers(ps))
1650
+ r.raise_for_status()
1651
+ new_folder = r.json()
1656
+
1657
+ current_folder_structure["folders"][folder] = {
1658
+ "location": "ps",
1659
+ "action": ["existing"],
1660
+ "path": new_folder['content']['id'],
1661
+ "folders": {},
1662
+ "files": {},
1663
+ }
1664
+
1665
+ index += 1
1666
+ # check if path exists for folder, if not then folder has not been created on Pennsieve yet, so create it and add it to the path key
1667
+ if "path" not in current_folder_structure["folders"][folder].keys() or current_folder_structure["folders"][folder]["location"] != "ps":
1668
+ r = requests.post(f"{PENNSIEVE_URL}/packages", headers=create_request_headers(ps), json=build_create_folder_request(folder, current_folder_structure["path"], ds['content']['id']))
1669
+ r.raise_for_status()
1670
+ new_folder_id = r.json()["content"]["id"]
1671
+ current_folder_structure["folders"][folder]["path"] = new_folder_id
1672
+
1673
+ if index < len(folderpath):
1674
+ return recursive_check_and_create_ps_file_path(
1675
+ folderpath, index, current_folder_structure["folders"][folder]
1676
+ )
1677
+ else:
1678
+ return current_folder_structure["folders"][folder]["path"]
1679
+
1680
+ # Check for any files that have been moved and verify paths before moving
1681
+ def recursive_check_moved_files(folder):
1682
+ if "files" in folder.keys():
1683
+ for item in list(folder["files"]):
1684
+ if (
1685
+ "moved" in folder["files"][item]["action"]
1686
+ and folder["files"][item]["location"] == "ps"
1687
+ ):
1688
+ # create the folders if they do not exist
1689
+ new_folder_id = ""
1690
+ new_folder_id = recursive_check_and_create_ps_file_path(
1691
+ folder["files"][item]["folderpath"].copy(), 0, dataset_structure
1692
+ )
1693
+ # move the file into the target folder on Pennsieve
1694
+ r = requests.post(f"{PENNSIEVE_URL}/data/move", json={"things": [folder["files"][item]["path"]], "destination": new_folder_id}, headers=create_request_headers(ps))
1695
+ r.raise_for_status()
1696
+
1697
+ for item in list(folder["folders"]):
1698
+ recursive_check_moved_files(folder["folders"][item])
1699
+
1700
+
1701
+ # Rename any files that exist on Pennsieve
1702
+ def recursive_file_rename(folder):
1703
+ if "files" in folder.keys():
1704
+ for item in list(folder["files"]):
1705
+ if (
1706
+ "renamed" in folder["files"][item]["action"]
1707
+ and folder["files"][item]["location"] == "ps"
1708
+ ):
1709
+ # rename the file on Pennsieve
1710
+ r = requests.put(f"{PENNSIEVE_URL}/packages/{folder['files'][item]['path']}?updateStorage=true", json={"name": item}, headers=create_request_headers(ps))
1711
+ r.raise_for_status()
1712
+
1713
+ for item in list(folder["folders"]):
1714
+ recursive_file_rename(folder["folders"][item])
1715
+
1716
+
1717
+ def recursive_folder_delete(folder):
1718
+ """
1719
+ Delete any stray folders that exist on Pennsieve
1720
+ Only top level folders are deleted since the API deletes any
1721
+ files and folders that exist inside them.
1722
+ """
1723
+ for item in list(folder["folders"]):
1724
+ if folder["folders"][item]["location"] == "ps":
1725
+ if "moved" in folder["folders"][item]["action"]:
1726
+ file_path = folder["folders"][item]["path"]
1727
+ # remove the file from the dataset
1728
+ r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [file_path]})
1729
+ r.raise_for_status()
1730
+ if "deleted" in folder["folders"][item]["action"]:
1731
+ file_path = folder["folders"][item]["path"]
1732
+ # remove the file from the dataset
1733
+ r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [file_path]})
1734
+ r.raise_for_status()
1735
+ del folder["folders"][item]
1736
+ else:
1737
+ recursive_folder_delete(folder["folders"][item])
1738
+ else:
1739
+ recursive_folder_delete(folder["folders"][item])
1740
+
1741
+
1742
+ # Rename any folders that still exist.
1743
+ def recursive_folder_rename(folder, mode):
1744
+ for item in list(folder["folders"]):
1745
+ if (
1746
+ folder["folders"][item]["location"] == "ps"
1747
+ and "action" in folder["folders"][item].keys()
1748
+ and mode in folder["folders"][item]["action"]
1749
+ ):
1750
+ folder_id = folder["folders"][item]["path"]
1751
+ r = requests.put(f"{PENNSIEVE_URL}/packages/{folder_id}?updateStorage=true", headers=create_request_headers(ps), json={"name": item})
1752
+ r.raise_for_status()
1753
+ recursive_folder_rename(folder["folders"][item], mode)
1754
+
1755
+
1756
+ ps_dataset = ""
1757
+ start = timer()
1758
+ # 1. Remove all existing files on Pennsieve, that the user deleted.
1759
+ logger.info("ps_update_existing_dataset step 1 remove existing files on Pennsieve the user deleted")
1760
+ main_curate_progress_message = "Checking Pennsieve for deleted files"
1761
+ dataset_structure = soda["dataset-structure"]
1762
+ recursive_file_delete(dataset_structure)
1763
+ main_curate_progress_message = (
1764
+ "Files on Pennsieve marked for deletion have been deleted"
1765
+ )
1766
+
1767
+ # 2. Rename any deleted folders on Pennsieve to allow for replacements.
1768
+ logger.info("ps_update_existing_dataset step 2 rename deleted folders on Pennsieve to allow for replacements")
1769
+ main_curate_progress_message = "Checking Pennsieve for deleted folders"
1770
+ dataset_structure = soda["dataset-structure"]
1771
+ recursive_folder_rename(dataset_structure, "deleted")
1772
+ main_curate_progress_message = "Folders on Pennsieve have been marked for deletion"
1773
+
1774
+ # 2.5 Rename folders that need to be in the final destination.
1775
+ logger.info("ps_update_existing_dataset step 2.5 rename folders that need to be in the final destination")
1776
+ main_curate_progress_message = "Renaming any folders requested by the user"
1777
+ recursive_folder_rename(dataset_structure, "renamed")
1778
+ main_curate_progress_message = "Renamed all folders requested by the user"
1779
+
1780
+ # 3. Get the status of all files currently on Pennsieve and create
1781
+ # the folderpath for all items in both dataset structures.
1782
+ logger.info("ps_update_existing_dataset step 3 get the status of all files currently on Pennsieve and create the folderpath for all items in both dataset structures")
1783
+ main_curate_progress_message = "Fetching files and folders from Pennsieve"
1784
+ current_bf_dataset_files_folders = import_pennsieve_dataset(
1785
+ soda.copy()
1786
+ )["soda_object"]
1787
+ ps_dataset = current_bf_dataset_files_folders["dataset-structure"]
1788
+ main_curate_progress_message = "Creating file paths for all files on Pennsieve"
1789
+ recursive_item_path_create(dataset_structure, [])
1790
+ recursive_item_path_create(ps_dataset, [])
1791
+ main_curate_progress_message = "File paths created"
1792
+
1793
+ # 4. Move any files that are marked as moved on Pennsieve.
1794
+ # Create any additional folders if required
1795
+ logger.info("ps_update_existing_dataset step 4 move any files that are marked as moved on Pennsieve")
1796
+ main_curate_progress_message = "Moving any files requested by the user"
1797
+ recursive_check_moved_files(dataset_structure)
1798
+ main_curate_progress_message = "Moved all files requested by the user"
1799
+
1800
+ # 5. Rename any Pennsieve files that are marked as renamed.
1801
+ logger.info("ps_update_existing_dataset step 5 rename any Pennsieve files that are marked as renamed")
1802
+ main_curate_progress_message = "Renaming any files requested by the user"
1803
+ recursive_file_rename(dataset_structure)
1804
+ main_curate_progress_message = "Renamed all files requested by the user"
1805
+
1806
+ # 6. Delete any Pennsieve folders that are marked as deleted.
1807
+ logger.info("ps_update_existing_dataset step 6 delete any Pennsieve folders that are marked as deleted")
1808
+ main_curate_progress_message = (
1809
+ "Deleting any additional folders present on Pennsieve"
1810
+ )
1811
+ recursive_folder_delete(dataset_structure)
1812
+ main_curate_progress_message = "Deletion of additional folders complete"
1813
+
1814
+ # 7. Delete any metadata files that are marked as deleted.
1815
+ logger.info("ps_update_existing_dataset step 7 delete any metadata files that are marked as deleted")
1816
+ main_curate_progress_message = "Removing any metadata files marked for deletion"
1817
+ metadata_file_delete(soda)
1818
+ main_curate_progress_message = "Removed metadata files marked for deletion"
1819
+
1820
+ # 8. Run the original code to upload any new files added to the dataset.
1821
+ logger.info("ps_update_existing_dataset step 8 run the ps_create_new_dataset code to upload any new files added to the dataset")
1822
+ if "dataset_metadata" in soda.keys() and "manifest_files" in soda["dataset_metadata"].keys():
1823
+ if "auto-generated" in soda["dataset_metadata"]["manifest_files"]:
1824
+ soda["manifest-files"] = {"destination": "ps", "auto-generated": True}
1825
+ else:
1826
+ soda["manifest-files"] = {"destination": "ps"}
1827
+
1828
+ soda["generate-dataset"] = {
1829
+ "destination": "ps",
1830
+ "if-existing": "merge",
1831
+ "if-existing-files": "replace",
1832
+ "generate-option": "existing-ps"
1833
+ }
1834
+
1835
+ end = timer()
1836
+ logger.info(f"Time for ps_update_existing_dataset function: {timedelta(seconds=end - start)}")
1837
+ ps_upload_to_dataset(soda, ps, ds, resume)
1838
+
1839
+
1840
+ def get_origin_manifest_id(dataset_id):
1841
+ global logger
1842
+ max_attempts = 3
1843
+ for _ in range(max_attempts):
1844
+ manifests = get_upload_manifests(dataset_id)
1845
+ if manifests and "manifests" in manifests and manifests["manifests"]:
1846
+ # sort the manifests list by date_created timestamp field in descending order
1847
+ manifests["manifests"].sort(key=lambda x: x["date_created"], reverse=True)
1848
+ return manifests["manifests"][0]["id"]
1849
+ time.sleep(5) # Wait for 5 seconds before the next attempt
1850
+
1851
+ raise Exception("Did not get the origin manifest id in the expected amount of time.")
1852
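+ # Sketch of the selection logic above (assumed response shape, hypothetical values): with
+ #   manifests = {"manifests": [{"id": 2, "date_created": 100}, {"id": 7, "date_created": 250}]}
+ # sorting by "date_created" in descending order puts id 7 first, so it is returned as the most
+ # recently created (origin) manifest id.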
+
1853
+
1854
+
1855
+ def normalize_tracking_folder(tracking_folder):
1856
+ """
1857
+ Normalize the tracking folder object to be a dictionary with the shape: {files: {}, folders: {}}.
1858
+ This shape matches our dataset structure object. Recall that the tracking folder receives information about what folders and
1859
+ files are stored on Pennsieve. We update this as we update Pennsieve's state.
1860
+ """
1861
+ if tracking_folder == "":
1862
+ return {"folders": {}, "files": {} }
1863
+
1864
+ temp_children = {"folders": {}, "files": {}}
1865
+
1866
+
1867
+ # add the files and folders to the temp_children structure
1868
+ for child in tracking_folder["children"]:
1869
+ if child["content"]["packageType"] == "Collection":
1870
+ # add the folders ( designated collection on Pennsieve ) to the temp_children structure under folders
1871
+ temp_children["folders"][child["content"]["name"]] = child
1872
+ else:
1873
+ # add the files (anything not designated a collection) to the temp_children structure under files
1874
+ temp_children["files"][child["content"]["name"]] = child
1875
+
1876
+ # replace the non-normalized children structure with the normalized children structure
1877
+ tracking_folder["children"] = temp_children
1878
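+ # Before/after sketch (hypothetical payload): a Pennsieve "children" list such as
+ #   [{"content": {"name": "primary", "packageType": "Collection", ...}},
+ #    {"content": {"name": "a.csv", "packageType": "CSV", ...}}]
+ # is reshaped into
+ #   {"folders": {"primary": {...}}, "files": {"a.csv": {...}}}
+ # so it can be compared key-by-key against the local dataset structure.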
+
1879
+
1880
+ def build_create_folder_request(folder_name, folder_parent_id, dataset_id):
1881
+ """
1882
+ Create a folder on Pennsieve.
1883
+ """
1884
+ body = {}
1885
+
1886
+ # if creating a folder at the root of the dataset the api does not require a parent key
1887
+ if folder_parent_id.find("N:dataset") == -1:
1888
+ body["parent"] = folder_parent_id
1889
+
1890
+ body["name"] = folder_name
1891
+ body["dataset"] = dataset_id
1892
+ body["packageType"] = "collection"
1893
+
1894
+ return body
1895
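+ # Example request bodies (illustrative ids): for a folder at the dataset root the "parent" key is
+ # omitted, otherwise it points at the parent collection:
+ #   build_create_folder_request("primary", "N:dataset:abc", "N:dataset:abc")
+ #   # -> {"name": "primary", "dataset": "N:dataset:abc", "packageType": "collection"}
+ #   build_create_folder_request("sub-1", "N:collection:def", "N:dataset:abc")
+ #   # -> {"parent": "N:collection:def", "name": "sub-1", "dataset": "N:dataset:abc", "packageType": "collection"}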
+
1896
+
1897
+ bytes_uploaded_per_file = {}
1898
+ total_bytes_uploaded = {"value": 0}
1899
+ current_files_in_subscriber_session = 0
1900
+
1901
+
1902
+
1903
+ bytes_file_path_dict = {}
1904
+
1905
+ # retry variables instantiated outside function
1906
+ list_of_files_to_rename = {}
1907
+ renamed_files_counter = 0
1908
+
1909
+
1910
+ def ps_upload_to_dataset(soda, ps, ds, resume=False):
1911
+ global logger
1912
+
1913
+ # Progress tracking variables that are used for the frontend progress bar.
1914
+ global main_curate_progress_message
1915
+ global main_total_generate_dataset_size
1916
+ global main_generated_dataset_size
1917
+ global start_generate
1918
+ global main_initial_bfdataset_size
1919
+ global main_curation_uploaded_files
1920
+ global uploaded_folder_counter
1921
+ global current_size_of_uploaded_files
1922
+ global total_files
1923
+ global total_bytes_uploaded # current number of bytes uploaded to Pennsieve in the current session
1924
+ global client
1925
+ global files_uploaded
1926
+ global total_dataset_files
1927
+ global current_files_in_subscriber_session
1928
+ global renaming_files_flow
1929
+ global bytes_uploaded_per_file
1930
+ global total_bytes_uploaded_per_file
1931
+ global bytes_file_path_dict
1932
+ global elapsed_time
1933
+ global manifest_id
1934
+ global origin_manifest_id
1935
+ global main_curate_status
1936
+ global list_of_files_to_rename
1937
+ global renamed_files_counter
1938
+
1939
+
1940
+
1941
+ total_files = 0
1942
+ total_dataset_files = 0
1943
+ total_metadata_files = 0
1944
+ total_manifest_files = 0
1945
+ main_curation_uploaded_files = 0
1946
+ total_bytes_uploaded = {"value": 0}
1947
+ total_bytes_uploaded_per_file = {}
1948
+ files_uploaded = 0
1949
+ renamed_files_counter = 0
1950
+
1951
+
1952
+ uploaded_folder_counter = 0
1953
+ current_size_of_uploaded_files = 0
1954
+ start = timer()
1955
+ try:
1956
+
1957
+
1958
+ def recursive_dataset_scan_for_new_upload(dataset_structure, list_upload_files, my_relative_path):
1959
+ """
1960
+ This function recursively gathers the files and folders in the dataset that will be uploaded to Pennsieve.
1961
+ It assumes the dataset is new based on the generate_option value and will spend less time comparing what is on Pennsieve.
1962
+ It will gather all the relative paths for the files and folders to pass along to the Pennsieve agent.
1963
+ Input:
1964
+ dataset_structure,
1965
+ my_relative_path
1966
+
1967
+ Output:
1968
+ a list of upload entries; each entry contains the local file paths to upload to Pennsieve,
1969
+ the projected, desired, and final names for those files, and their relative path within the dataset structure.
1970
+ If a folder does not exist yet on Pennsieve, the agent will create it.
1971
+ """
1972
+ global main_total_generate_dataset_size
1973
+ global bytes_file_path_dict
1974
+ # First loop will take place in the root of the dataset
1975
+ if "folders" in dataset_structure.keys():
1976
+ for folder_key, folder in dataset_structure["folders"].items():
1977
+ relative_path = generate_relative_path(my_relative_path, folder_key)
1978
+ list_upload_files = recursive_dataset_scan_for_new_upload(folder, list_upload_files, relative_path)
1979
+ if "files" in dataset_structure.keys():
1980
+ list_local_files = []
1981
+ list_projected_names = []
1982
+ list_desired_names = []
1983
+ list_final_names = []
1984
+
1985
+ list_initial_names = []
1986
+ for file_key, file in dataset_structure["files"].items():
1987
+ # relative_path = generate_relative_path(my_relative_path, file_key)
1988
+ file_path = file["path"]
1989
+ if isfile(file_path) and file.get("location") == "local":
1990
+ projected_name = splitext(basename(file_path))[0]
1991
+ projected_name_w_extension = basename(file_path)
1992
+ desired_name = splitext(file_key)[0]
1993
+ desired_name_with_extension = file_key
1994
+
1995
+
1996
+ if projected_name != desired_name:
1997
+ list_initial_names.append(projected_name)
1998
+ list_local_files.append(file_path)
1999
+ list_projected_names.append(projected_name_w_extension)
2000
+ list_desired_names.append(desired_name_with_extension)
2001
+ list_final_names.append(desired_name)
2002
+ else:
2003
+ list_local_files.append(file_path)
2004
+ list_projected_names.append(projected_name_w_extension)
2005
+ list_desired_names.append(desired_name_with_extension)
2006
+ list_final_names.append(desired_name)
2007
+ list_initial_names.append(projected_name)
2008
+
2009
+ file_size = getsize(file_path)
2010
+ main_total_generate_dataset_size += file_size
2011
+ bytes_file_path_dict[file_path] = file_size
2012
+
2013
+ if list_local_files:
2014
+ list_upload_files.append([
2015
+ list_local_files,
2016
+ list_projected_names,
2017
+ list_desired_names,
2018
+ list_final_names,
2019
+ "/" if my_relative_path == soda["generate-dataset"]["dataset-name"] else my_relative_path,
2020
+ ])
2021
+
2022
+
2023
+ return list_upload_files
2024
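+ # Shape of each list_upload_files entry built above (illustrative values):
+ #   [["/local/path/a.csv"],   # local file paths to upload
+ #    ["a.csv"],               # projected names (file names as they exist on disk)
+ #    ["a.csv"],               # desired names (as named in the dataset structure, with extension)
+ #    ["a"],                   # final names without extension
+ #    "MyDataset/primary"]     # relative path, or "/" for the dataset root
+ # The brand-new-dataset branch of the manifest-creation code later indexes into this same layout.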
+
2025
+ # See how to create folders with the Pennsieve agent
2026
+ def recursive_create_folder_for_ps(
2027
+ my_folder, my_tracking_folder, existing_folder_option
2028
+ ):
2029
+ """
2030
+ Creates a folder on Pennsieve for each folder in the dataset structure if they aren't already present in the dataset.
2031
+ Input:
2032
+ my_folder: The dataset structure to be created on Pennsieve. Pass in the soda json object to start.
2033
+ my_tracking_folder: Tracks what folders have been created on Pennsieve thus far. Starts as an empty dictionary.
2034
+ existing_folder_option: Dictates whether to merge, duplicate, replace, or skip existing folders.
2035
+ """
2036
+ # Check if the current folder has any subfolders that already exist on Pennsieve. Important step to appropriately handle replacing and merging folders.
2037
+ if len(my_tracking_folder["children"]["folders"]) == 0 and my_tracking_folder["content"]["id"].find("N:dataset") == -1:
2038
+ limit = 100
2039
+ offset = 0
2040
+ ps_folder = {}
2041
+ ps_folder_children = []
2042
+ while True:
2043
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{my_tracking_folder['content']['id']}?limit={limit}&offset={offset}", headers=create_request_headers(ps), json={"include": "files"})
2044
+ r.raise_for_status()
2045
+ ps_folder = r.json()
2046
+ page = ps_folder["children"]
2047
+ ps_folder_children.extend(page)
2048
+ if len(page) < limit:
2049
+ break
2050
+ offset += limit
2051
+ time.sleep(1)
2052
+
2053
+ ps_folder["children"] = ps_folder_children
2054
+ normalize_tracking_folder(ps_folder)
2055
+ my_tracking_folder["children"] = ps_folder["children"]
2056
+
2057
+ # create/replace/skip folder
2058
+ if "folders" in my_folder.keys():
2059
+ for folder_key, folder in my_folder["folders"].items():
2060
+ if existing_folder_option == "merge":
2061
+ if folder_key in my_tracking_folder["children"]["folders"]:
2062
+ ps_folder = my_tracking_folder["children"]["folders"][folder_key]
2063
+ normalize_tracking_folder(ps_folder)
2064
+ else:
2065
+ # We are merging but this is a new folder - not one that already exists in the current dataset - so we create it.
2066
+ r = requests.post(f"{PENNSIEVE_URL}/packages", headers=create_request_headers(ps), json=build_create_folder_request(folder_key, my_tracking_folder['content']['id'], ds['content']['id']))
2067
+ r.raise_for_status()
2068
+ ps_folder = r.json()
2069
+ normalize_tracking_folder(ps_folder)
2070
+
2071
+ elif existing_folder_option == "replace":
2072
+ # if the folder exists on Pennsieve remove it
2073
+ if folder_key in my_tracking_folder["children"]["folders"]:
2074
+ ps_folder = my_tracking_folder["children"]["folders"][folder_key]
2075
+
2076
+ r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [ps_folder["content"]["id"]]})
2077
+ r.raise_for_status()
2078
+
2079
+ # remove from ps_folder
2080
+ del my_tracking_folder["children"]["folders"][folder_key]
2081
+
2082
+ r = requests.post(f"{PENNSIEVE_URL}/packages", headers=create_request_headers(ps), json=build_create_folder_request(folder_key, my_tracking_folder['content']['id'], ds['content']['id']))
2083
+ r.raise_for_status()
2084
+ ps_folder = r.json()
2085
+ normalize_tracking_folder(ps_folder)
2086
+
2087
+ my_tracking_folder["children"]["folders"][folder_key] = ps_folder
2088
+ tracking_folder = my_tracking_folder["children"]["folders"][folder_key] # get the folder we just added to the tracking folder
2089
+ recursive_create_folder_for_ps(
2090
+ folder, tracking_folder, existing_folder_option
2091
+ )
2092
+
2093
+ def recursive_dataset_scan_for_ps(
2094
+ my_folder,
2095
+ my_tracking_folder,
2096
+ existing_file_option,
2097
+ list_upload_files,
2098
+ my_relative_path,
2099
+ ):
2100
+ """
2101
+ Delete files that are marked to be replaced in the dataset. Create a list of files to upload to Pennsieve.
2102
+ """
2103
+
2104
+ global main_total_generate_dataset_size
2105
+ global logger
2106
+
2107
+
2108
+ # folder children are packages such as collections and files stored on the Pennsieve dataset
2109
+ ps_folder_children = my_tracking_folder["children"] #ds (dataset)
2110
+
2111
+
2112
+
2113
+ if "folders" in my_folder.keys():
2114
+ for folder_key, folder in my_folder["folders"].items():
2115
+ relative_path = generate_relative_path(my_relative_path, folder_key)
2116
+ tracking_folder = ps_folder_children["folders"][folder_key]
2117
+ list_upload_files = recursive_dataset_scan_for_ps(
2118
+ folder,
2119
+ tracking_folder,
2120
+ existing_file_option,
2121
+ list_upload_files,
2122
+ relative_path,
2123
+ )
2124
+
2125
+ if "files" in my_folder.keys():
2126
+
2127
+ # delete files to be deleted
2128
+ (
2129
+ my_bf_existing_files_name,
2130
+ my_bf_existing_files_name_with_extension,
2131
+ ) = ps_get_existing_files_details(my_tracking_folder)
2132
+
2133
+ for file_key, file in my_folder["files"].items():
2134
+ # if local then we are either adding a new file to an existing/new dataset or replacing a file in an existing dataset
2135
+ if file.get("location") == "local":
2136
+ file_path = file["path"]
2137
+ if isfile(file_path) and existing_file_option == "replace" and file_key in ps_folder_children["files"]:
2138
+ my_file = ps_folder_children["files"][file_key]
2139
+ # delete the package ( aka file ) from the dataset
2140
+ r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [f"{my_file['content']['id']}"]})
2141
+ r.raise_for_status()
2142
+ del ps_folder_children["files"][file_key]
2143
+
2144
+
2145
+ # create list of files to be uploaded with projected and desired names saved
2146
+ (
2147
+ my_bf_existing_files_name,
2148
+ my_bf_existing_files_name_with_extension,
2149
+ ) = ps_get_existing_files_details(my_tracking_folder)
2150
+
2151
+ logger.info(f"Existing files in Pennsieve: {my_bf_existing_files_name_with_extension}")
2152
+
2153
+ list_local_files = []
2154
+ list_projected_names = []
2155
+ list_desired_names = []
2156
+ list_final_names = []
2157
+ additional_upload_lists = []
2158
+
2159
+ list_initial_names = []
2160
+
2161
+ # add the files that are set to be uploaded to Pennsieve to a list
2162
+ # handle renaming files and creating duplicates
2163
+ for file_key, file in my_folder["files"].items():
2164
+ if file.get("location") == "local":
2165
+ file_path = file["path"]
2166
+ if isfile(file_path):
2167
+ initial_name = splitext(basename(file_path))[0]
2168
+ initial_extension = splitext(basename(file_path))[1]
2169
+ initial_name_with_extension = basename(file_path)
2170
+ desired_name = splitext(file_key)[0]
2171
+ desired_name_extension = splitext(file_key)[1]
2172
+ desired_name_with_extension = file_key
2173
+ if existing_file_option == "skip" and desired_name_with_extension in my_bf_existing_files_name_with_extension:
2174
+ continue
2175
+
2176
+ # check if initial filename exists on Pennsieve dataset and get the projected name of the file after upload
2177
+ # used when a local file has a name that matches an existing name on Pennsieve
2178
+ count_done = 0
2179
+ count_exist = 0
2180
+ projected_name = initial_name_with_extension
2181
+ while count_done == 0:
2182
+ if (
2183
+ projected_name
2184
+ in my_bf_existing_files_name_with_extension
2185
+ ):
2186
+ count_exist += 1
2187
+ projected_name = (
2188
+ initial_name
2189
+ + " ("
2190
+ + str(count_exist)
2191
+ + ")"
2192
+ + initial_extension
2193
+ )
2194
+ else:
2195
+ count_done = 1
2196
+
2197
+ # expected final name
2198
+ count_done = 0
2199
+ final_name = desired_name_with_extension
2200
+ if output := get_base_file_name(desired_name):
2201
+ base_name = output[0]
2202
+ count_exist = output[1]
2203
+ while count_done == 0:
2204
+ if final_name in my_bf_existing_files_name:
2205
+ count_exist += 1
2206
+ final_name = (
2207
+ base_name
2208
+ + "("
2209
+ + str(count_exist)
2210
+ + ")"
2211
+ + desired_name_extension
2212
+ )
2213
+ else:
2214
+ count_done = 1
2215
+ else:
2216
+ count_exist = 0
2217
+ while count_done == 0:
2218
+ if final_name in my_bf_existing_files_name:
2219
+ count_exist += 1
2220
+ final_name = (
2221
+ desired_name
2222
+ + " ("
2223
+ + str(count_exist)
2224
+ + ")"
2225
+ + desired_name_extension
2226
+ )
2227
+ else:
2228
+ count_done = 1
2229
+
2230
+ # save in list accordingly
2231
+ if (
2232
+ initial_name in list_initial_names
2233
+ or initial_name in list_final_names
2234
+ or projected_name in list_final_names
2235
+ or final_name in list_projected_names
2236
+ ):
2237
+ additional_upload_lists.append(
2238
+ [
2239
+ [file_path],
2240
+ ps_folder_children,
2241
+ [projected_name],
2242
+ [desired_name],
2243
+ [final_name],
2244
+ my_tracking_folder,
2245
+ my_relative_path,
2246
+ ]
2247
+ )
2248
+ else:
2249
+ list_local_files.append(file_path)
2250
+ list_projected_names.append(projected_name)
2251
+ list_desired_names.append(desired_name_with_extension)
2252
+ list_final_names.append(final_name)
2253
+ list_initial_names.append(initial_name)
2254
+
2255
+ my_bf_existing_files_name.append(final_name)
2256
+ if initial_extension in ps_recognized_file_extensions:
2257
+ my_bf_existing_files_name_with_extension.append(
2258
+ final_name
2259
+ )
2260
+ else:
2261
+ my_bf_existing_files_name_with_extension.append(
2262
+ final_name + initial_extension
2263
+ )
2264
+
2265
+ # add to projected dataset size to be generated
2266
+ main_total_generate_dataset_size += getsize(file_path)
2267
+
2268
+ if list_local_files:
2269
+ ds_name = soda["ps-dataset-selected"]["dataset-name"]
2270
+ list_upload_files.append(
2271
+ [
2272
+ list_local_files,
2273
+ ps_folder_children,
2274
+ list_projected_names,
2275
+ list_desired_names,
2276
+ list_final_names,
2277
+ my_tracking_folder,
2278
+ "/" if my_relative_path == ds_name else my_relative_path,
2279
+ ]
2280
+ )
2281
+
2282
+ for item in additional_upload_lists:
2283
+ list_upload_files.append(item)
2284
+
2285
+ return list_upload_files
2286
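+ # Illustrative example of the duplicate-name handling above: if "data.csv" already exists in the
+ # tracked Pennsieve folder, a new local "data.csv" gets the projected name "data (1).csv"; if that
+ # is also taken, the counter keeps incrementing until a free name is found. The final name is
+ # resolved with the same counter scheme against the list of existing names.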
+
2287
+ def monitor_subscriber_progress(events_dict):
2288
+ """
2289
+ Monitors the progress of a subscriber and unsubscribes once the upload finishes.
2290
+ """
2291
+ global files_uploaded
2292
+ global total_bytes_uploaded
2293
+ global bytes_uploaded_per_file
2294
+ global main_curation_uploaded_files
2295
+ global main_total_generate_dataset_size
2296
+
2297
+
2298
+ if events_dict["type"] == 1: # upload status: file_id, total, current, worker_id
2299
+ file_id = events_dict["upload_status"].file_id
2300
+ total_bytes_to_upload = events_dict["upload_status"].total
2301
+ current_bytes_uploaded = events_dict["upload_status"].current
2302
+
2303
+ status = events_dict["upload_status"].status
2304
+ if status == "2" or status == 2:
2305
+ ps.unsubscribe(10)
2306
+ logger.info("[UPLOAD COMPLETE EVENT RECEIVED]")
2307
+ logger.info(f"Amount of bytes uploaded via sum: {sum(bytes_uploaded_per_file.values())} vs total bytes uploaded via difference: {total_bytes_uploaded['value']}")
2308
+ logger.info(f"Amount of bytes Pennsieve Agent says via sum: {sum(bytes_uploaded_per_file.values())} vs amount of bytes we calculated before hand: {main_total_generate_dataset_size}")
2309
+
2310
+
2311
+ # update the file id's current total bytes uploaded value; the session total is the sum across all files
2314
+ bytes_uploaded_per_file[file_id] = current_bytes_uploaded
2315
+ total_bytes_uploaded["value"] = sum(bytes_uploaded_per_file.values())
2316
+
2317
+ # check if the given file has finished uploading
2318
+ if current_bytes_uploaded == total_bytes_to_upload and file_id != "":
2319
+ files_uploaded += 1
2320
+ main_curation_uploaded_files += 1
2321
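+ # Byte-accounting sketch (hypothetical numbers): each progress event overwrites that file's running
+ # total, and the session total is always the sum over files, so repeated events do not double count:
+ #   bytes_uploaded_per_file = {"file-1": 10, "file-2": 5}
+ #   # event: file-1 now at 25 bytes -> bytes_uploaded_per_file["file-1"] = 25
+ #   # total_bytes_uploaded["value"] = sum(bytes_uploaded_per_file.values())  # 30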
+
2322
+
2323
+
2324
+ # Set the Pennsieve Python Client's dataset to the Pennsieve dataset that will be uploaded to.
2325
+ selected_id = ds["content"]["id"]
2326
+ ps.use_dataset(selected_id)
2327
+
2328
+ # Set variables needed throughout generation flow
2329
+ list_upload_files = []
2330
+ list_upload_metadata_files = []
2331
+ list_upload_manifest_files = []
2332
+ list_of_files_to_rename = {}
2333
+ brand_new_dataset = False
2334
+ dataset_structure = soda["dataset-structure"]
2335
+ generate_option = soda["generate-dataset"]["generate-option"]
2336
+ starting_point = soda["starting-point"]["origin"]
2337
+ relative_path = ds["content"]["name"]
2338
+
2339
+
2340
+ # 1. Scan the dataset structure and create a list of files/folders to be uploaded with the desired renaming
2341
+ if generate_option == "new" and starting_point == "new":
2342
+ vs = ums.df_mid_has_progress()
2343
+ if not resume or (resume and not vs):
2344
+ logger.info("NO progress found so we will start from scratch and construct the manifest")
2345
+ main_curate_progress_message = "Preparing a list of files to upload"
2346
+ # we can assume no files/folders exist in the dataset since the generate option is new and starting point is also new
2347
+ # therefore, we can assume the dataset structure is the same as the tracking structure
2348
+ brand_new_dataset = True
2349
+ list_upload_files = recursive_dataset_scan_for_new_upload(dataset_structure, list_upload_files, relative_path)
2350
+
2351
+
2352
+
2353
+
2354
+
2355
+ if "dataset_metadata" in soda.keys():
2356
+ for key, _ in soda["dataset_metadata"].items():
2357
+ if key == "submission":
2358
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "submission.xlsx")
2359
+ submission.create_excel(soda, False, metadata_path)
2360
+ list_upload_metadata_files.append(metadata_path)
2361
+ main_total_generate_dataset_size += getsize(metadata_path)
2362
+ total_files += 1
2363
+ total_metadata_files += 1
2364
+ if key == "subjects":
2365
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "subjects.xlsx")
2366
+ subjects.create_excel(soda, False, metadata_path)
2367
+ list_upload_metadata_files.append(metadata_path)
2368
+ main_total_generate_dataset_size += getsize(metadata_path)
2369
+ total_files += 1
2370
+ total_metadata_files += 1
2371
+ if key == "samples":
2372
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "samples.xlsx")
2373
+ samples.create_excel(soda, False, metadata_path)
2374
+ list_upload_metadata_files.append(metadata_path)
2375
+ main_total_generate_dataset_size += getsize(metadata_path)
2376
+ total_files += 1
2377
+ total_metadata_files += 1
2378
+ if key == "performances":
2379
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "performances.xlsx")
2380
+ performances.create_excel(soda, False, metadata_path)
2381
+ list_upload_metadata_files.append(metadata_path)
2382
+ main_total_generate_dataset_size += getsize(metadata_path)
2383
+ total_files += 1
2384
+ total_metadata_files += 1
2385
+ if key == "resources":
2386
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "resources.xlsx")
2387
+ resources.create_excel(soda, False, metadata_path)
2388
+ list_upload_metadata_files.append(metadata_path)
2389
+ main_total_generate_dataset_size += getsize(metadata_path)
2390
+ total_files += 1
2391
+ total_metadata_files += 1
2392
+ if key == "sites":
2393
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "sites.xlsx")
2394
+ sites.create_excel(soda, False, metadata_path)
2395
+ list_upload_metadata_files.append(metadata_path)
2396
+ main_total_generate_dataset_size += getsize(metadata_path)
2397
+ total_files += 1
2398
+ total_metadata_files += 1
2399
+ if key == "dataset_description":
2400
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "dataset_description.xlsx")
2401
+ dataset_description.create_excel(soda, False, metadata_path)
2402
+ list_upload_metadata_files.append(metadata_path)
2403
+ main_total_generate_dataset_size += getsize(metadata_path)
2404
+ total_files += 1
2405
+ total_metadata_files += 1
2406
+ if key == "code_description":
2407
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "code_description.xlsx")
2408
+ code_description.create_excel(soda, False, metadata_path)
2409
+ list_upload_metadata_files.append(metadata_path)
2410
+ main_total_generate_dataset_size += getsize(metadata_path)
2411
+ total_files += 1
2412
+ total_metadata_files += 1
2413
+ if key == "manifest_file":
2414
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "manifest.xlsx")
2415
+ manifest.create_excel(soda, False, metadata_path)
2416
+ list_upload_metadata_files.append(metadata_path)
2417
+ main_total_generate_dataset_size += getsize(metadata_path)
2418
+ total_files += 1
2419
+ total_metadata_files += 1
2420
+
2421
+ if key == "README.md":
2422
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "README.md")
2423
+ text_metadata.create_text_file(soda, False, metadata_path, "README.md")
2424
+ list_upload_metadata_files.append(metadata_path)
2425
+ main_total_generate_dataset_size += getsize(metadata_path)
2426
+ total_files += 1
2427
+ total_metadata_files += 1
2428
+ if key == "CHANGES":
2429
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "CHANGES")
2430
+ text_metadata.create_text_file(soda, False, metadata_path, "CHANGES")
2431
+ list_upload_metadata_files.append(metadata_path)
2432
+ main_total_generate_dataset_size += getsize(metadata_path)
2433
+ total_files += 1
2434
+ total_metadata_files += 1
2435
+ if key == "LICENSE":
2436
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "LICENSE")
2437
+ text_metadata.create_text_file(soda, False, metadata_path, "LICENSE")
2438
+ list_upload_metadata_files.append(metadata_path)
2439
+ main_total_generate_dataset_size += getsize(metadata_path)
2440
+ total_files += 1
2441
+ total_metadata_files += 1
2442
+
2443
+
2444
+
2445
+ else:
2446
+
2447
+ vs = ums.df_mid_has_progress()
2448
+
2449
+ if not resume or (resume and not vs):
2450
+ main_curate_progress_message = "Preparing a list of files to upload"
2451
+
2452
+ existing_folder_option = soda["generate-dataset"]["if-existing"]
2453
+ existing_file_option = soda["generate-dataset"][
2454
+ "if-existing-files"
2455
+ ]
2456
+
2457
+ # we will need a tracking structure to compare against
2458
+ tracking_json_structure = ds
2459
+ normalize_tracking_folder(tracking_json_structure)
2460
+ recursive_create_folder_for_ps(dataset_structure, tracking_json_structure, existing_folder_option)
2461
+ list_upload_files = recursive_dataset_scan_for_ps(
2462
+ dataset_structure,
2463
+ tracking_json_structure,
2464
+ existing_file_option,
2465
+ list_upload_files,
2466
+ relative_path,
2467
+ )
2468
+
2469
+ logger.info(f"List of files to upload: {list_upload_files}")
2470
+
2471
+
2472
+ # return and mark upload as completed if nothing is added to the manifest
2473
+ if len(list_upload_files) < 1:
2474
+ logger.warning("No files found to upload.")
2475
+ main_curate_progress_message = "No files were uploaded in this session"
2476
+ main_curate_status = "Done"
2477
+ return
2478
+
2479
+ # 3. Add high-level metadata files to a list
2480
+ if "dataset_metadata" in soda.keys():
2481
+ logger.info("ps_create_new_dataset (optional) step 3 create high level metadata list")
2482
+ # TODO: Add enhanced merge support post SDS3 launch
2483
+ # (
2484
+ # my_bf_existing_files_name,
2485
+ # _,
2486
+ # ) = ps_get_existing_files_details(ds)
2487
+ for key, _ in soda["dataset_metadata"].items():
2488
+ if key == "submission":
2489
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "submission.xlsx")
2490
+ submission.create_excel(soda, False, metadata_path)
2491
+ list_upload_metadata_files.append(metadata_path)
2492
+ main_total_generate_dataset_size += getsize(metadata_path)
2493
+ total_files += 1
2494
+ total_metadata_files += 1
2495
+ if key == "subjects":
2496
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "subjects.xlsx")
2497
+ subjects.create_excel(soda, False, metadata_path)
2498
+ list_upload_metadata_files.append(metadata_path)
2499
+ main_total_generate_dataset_size += getsize(metadata_path)
2500
+ total_files += 1
2501
+ total_metadata_files += 1
2502
+ if key == "samples":
2503
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "samples.xlsx")
2504
+ samples.create_excel(soda, False, metadata_path)
2505
+ list_upload_metadata_files.append(metadata_path)
2506
+ main_total_generate_dataset_size += getsize(metadata_path)
2507
+ total_files += 1
2508
+ total_metadata_files += 1
2509
+ if key == "performances":
2510
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "performances.xlsx")
2511
+ performances.create_excel(soda, False, metadata_path)
2512
+ list_upload_metadata_files.append(metadata_path)
2513
+ main_total_generate_dataset_size += getsize(metadata_path)
2514
+ total_files += 1
2515
+ total_metadata_files += 1
2516
+ if key == "resources":
2517
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "resources.xlsx")
2518
+ resources.create_excel(soda, False, metadata_path)
2519
+ list_upload_metadata_files.append(metadata_path)
2520
+ main_total_generate_dataset_size += getsize(metadata_path)
2521
+ total_files += 1
2522
+ total_metadata_files += 1
2523
+ if key == "sites":
2524
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "sites.xlsx")
2525
+ sites.create_excel(soda, False, metadata_path)
2526
+ list_upload_metadata_files.append(metadata_path)
2527
+ main_total_generate_dataset_size += getsize(metadata_path)
2528
+ total_files += 1
2529
+ total_metadata_files += 1
2530
+ if key == "dataset_description":
2531
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "dataset_description.xlsx")
2532
+ dataset_description.create_excel(soda, False, metadata_path)
2533
+ list_upload_metadata_files.append(metadata_path)
2534
+ main_total_generate_dataset_size += getsize(metadata_path)
2535
+ total_files += 1
2536
+ total_metadata_files += 1
2537
+ if key == "code_description":
2538
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "code_description.xlsx")
2539
+ code_description.create_excel(soda, False, metadata_path)
2540
+ list_upload_metadata_files.append(metadata_path)
2541
+ main_total_generate_dataset_size += getsize(metadata_path)
2542
+ total_files += 1
2543
+ total_metadata_files += 1
2544
+ if key == "manifest_file":
2545
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "manifest.xlsx")
2546
+ manifest.create_excel(soda, False, metadata_path)
2547
+ list_upload_metadata_files.append(metadata_path)
2548
+ main_total_generate_dataset_size += getsize(metadata_path)
2549
+ total_files += 1
2550
+ total_metadata_files += 1
2551
+ if key == "README.md":
2552
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "README.md")
2553
+ text_metadata.create_text_file(soda, False, metadata_path, "README.md")
2554
+ list_upload_metadata_files.append(metadata_path)
2555
+ main_total_generate_dataset_size += getsize(metadata_path)
2556
+ total_files += 1
2557
+ total_metadata_files += 1
2558
+ if key == "CHANGES":
2559
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "CHANGES")
2560
+ text_metadata.create_text_file(soda, False, metadata_path, "CHANGES")
2561
+ list_upload_metadata_files.append(metadata_path)
2562
+ main_total_generate_dataset_size += getsize(metadata_path)
2563
+ total_files += 1
2564
+ total_metadata_files += 1
2565
+ if key == "LICENSE":
2566
+ metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "LICENSE")
2567
+ text_metadata.create_text_file(soda, False, metadata_path, "LICENSE")
2568
+ list_upload_metadata_files.append(metadata_path)
2569
+ main_total_generate_dataset_size += getsize(metadata_path)
2570
+ total_files += 1
2571
+ total_metadata_files += 1
2572
+
2573
+
2574
+
2575
+
2576
+ # 4. Prepare and add manifest files to a list
2577
+ if "dataset_metadata" in soda.keys() and "manifest_files" in soda["dataset_metadata"].keys():
2578
+ logger.info("ps_create_new_dataset (optional) step 4 create manifest list")
2579
+ # create local folder to save manifest files temporarily (delete any existing one first)
2580
+ # TODO: SDS 3 create manifests if not skipping and delete file on Pennsieve if it exists
2581
+ if "auto-generated" in soda["manifest-files"]:
2582
+ if soda["manifest-files"]["auto-generated"] == True:
2583
+ manifest_files_structure = (
2584
+ get_auto_generated_manifest_files(soda)
2585
+ )
2586
+
2587
+ # add manifest files to list after deleting existing ones
2588
+ for key in manifest_files_structure.keys():
2589
+ manifestpath = manifest_files_structure[key]
2590
+ folder = tracking_json_structure["children"]["folders"][key]
2591
+
2592
+ # delete existing manifest files
2593
+ for child_key in folder["children"]["files"]:
2594
+ file_name_no_ext = os.path.splitext(folder['children']['files'][child_key]['content']['name'])[0]
2595
+ if file_name_no_ext.lower() == "manifest":
2596
+ # delete the manifest file from the given folder
2597
+ r = requests.post(f"{PENNSIEVE_URL}/data/delete", json={"things": [folder['children']['files'][child_key]['content']['id']]}, headers=create_request_headers(get_access_token()))
2598
+ r.raise_for_status()
2599
+
2600
+ # upload new manifest files
2601
+ # the total file count also determines when the upload subscriber should stop listening to the dataset upload progress (stop once files uploaded == total files)
2602
+ list_upload_manifest_files.append([manifestpath, key])
2603
+ total_files += 1
2604
+ total_manifest_files += 1
2605
+ main_total_generate_dataset_size += getsize(manifestpath)
2606
+
2607
+
2608
+ # 2. Count how many files will be uploaded to inform frontend - do not count if we are resuming a previous upload that has made progress
2609
+ if not resume or (resume and not ums.df_mid_has_progress()):
2610
+ for folderInformation in list_upload_files:
2611
+ file_paths_count = len(folderInformation[0])
2612
+ total_files += file_paths_count
2613
+ total_dataset_files += file_paths_count
2614
+
2615
+
2616
+ # 3. Upload files and add to tracking list
2617
+ start_generate = 1
2618
+
2619
+
2620
+ # resuming a dataset that had no files to rename or that failed before renaming any files
2621
+ if resume and ums.df_mid_has_progress() and not ums.get_renaming_files_flow():
2622
+ main_curate_progress_message = ("Preparing to retry upload. Progress on partially uploaded files will be reset.")
2623
+ # reset necessary variables that were used in the failed upload session and cannot be reliably cached
2624
+ bytes_uploaded_per_file = {}
2625
+
2626
+ # get the current manifest id for data files
2627
+ manifest_id = ums.get_df_mid()
2628
+ # get the cached values of the previous upload session
2629
+ main_total_generate_dataset_size = ums.get_main_total_generate_dataset_size()
2630
+
2631
+ total_files = ums.get_total_files_to_upload()
2632
+ total_dataset_files = total_files
2633
+ current_files_in_subscriber_session = total_dataset_files
2634
+
2635
+ main_curation_uploaded_files = total_files - ums.get_remaining_file_count(manifest_id, total_files)
2636
+ files_uploaded = main_curation_uploaded_files
2637
+ total_bytes_uploaded["value"] = ums.calculate_completed_upload_size(manifest_id, bytes_file_path_dict, total_files )
2638
+
2639
+ # rename file information
2640
+ list_of_files_to_rename = ums.get_list_of_files_to_rename()
2641
+ renamed_files_counter = ums.get_rename_total_files()
2642
+
2643
+ time.sleep(5)
2644
+
2645
+
2646
+ # upload the manifest files
2647
+ try:
2648
+ ps.manifest.upload(manifest_id)
2649
+ main_curate_progress_message = ("Uploading data files...")
2650
+ # subscribe to the manifest upload so we wait until it has finished uploading before moving on
2651
+ ps.subscribe(10, False, monitor_subscriber_progress)
2652
+ except Exception as e:
2653
+ logger.error("Error uploading dataset files")
2654
+ logger.error(e)
2655
+ raise PennsieveUploadException("The Pennsieve Agent has encountered an issue while uploading. Please retry the upload. If this issue persists please follow this <a target='_blank' rel='noopener noreferrer' href='https://docs.sodaforsparc.io/docs/how-to/how-to-reinstall-the-pennsieve-agent'> guide</a> on performing a full reinstallation of the Pennsieve Agent then click the retry button.")
2656
+ elif resume and ums.df_mid_has_progress() and ums.get_renaming_files_flow():
2657
+ # setup for rename files flow
2658
+ list_of_files_to_rename = ums.get_list_of_files_to_rename()
2659
+ renamed_files_counter = ums.get_rename_total_files()
2660
+ # create a manifest for files - IMP: We use a single file to start with since creating a manifest requires a file path. We need to remove this at the end.
2661
+ elif len(list_upload_files) > 0:
2662
+ main_curate_progress_message = ("Queuing dataset files for upload with the Pennsieve Agent..." + "<br>" + "This may take some time.")
2663
+
2664
+ first_file_local_path = list_upload_files[0][0][0]
2665
+
2666
+ if brand_new_dataset:
2667
+ first_relative_path = list_upload_files[0][4]
2668
+ first_final_name = list_upload_files[0][2][0]
2669
+ else:
2670
+ first_relative_path = list_upload_files[0][6]
2671
+ first_final_name = list_upload_files[0][4][0]
2672
+
2673
+ folder_name = first_relative_path[first_relative_path.index("/")+1:]
2674
+
2675
+ if first_final_name != basename(first_file_local_path):
2676
+ # if file name is not the same as local path, then it has been renamed in SODA
2677
+ if folder_name not in list_of_files_to_rename:
2678
+ list_of_files_to_rename[folder_name] = {}
2679
+ if basename(first_file_local_path) not in list_of_files_to_rename[folder_name]:
2680
+ list_of_files_to_rename[folder_name][basename(first_file_local_path)] = {
2681
+ "final_file_name": first_final_name,
2682
+ "id": "",
2683
+ }
2684
+ renamed_files_counter += 1
2685
+
2686
+ manifest_data = ps.manifest.create(first_file_local_path, folder_name)
2687
+ manifest_id = manifest_data.manifest_id
2688
+
2689
+
2690
+ ums.set_df_mid(manifest_id)
2691
+
2692
+ # remove the item just added to the manifest
2693
+ list_upload_files[0][0].pop(0)
2694
+
2695
+ # reset global variables used in the subscriber monitoring function
2696
+ bytes_uploaded_per_file = {}
2697
+ total_bytes_uploaded = {"value": 0}
2698
+ current_files_in_subscriber_session = total_dataset_files
2699
+
2700
+ # there are files to add to the manifest if there are more than one file in the first folder or more than one folder
2701
+ if len(list_upload_files[0][0]) > 1 or len(list_upload_files) > 1:
2702
+ index_skip = True
2703
+ for folderInformation in list_upload_files:
2704
+ list_file_paths = folderInformation[0]
2705
+ if brand_new_dataset:
2706
+ relative_path = folderInformation[4]
2707
+ final_file_name_list = folderInformation[2]
2708
+ else:
2709
+ relative_path = folderInformation[6]
2710
+ final_file_name_list = folderInformation[4]
2711
+ # get the substring from the string relative_path that starts at the index of the / and contains the rest of the string
2712
+ try:
2713
+ folder_name = relative_path[relative_path.index("/")+1:]
2714
+ except ValueError as e:
2715
+ folder_name = relative_path
2716
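+ # e.g. (illustrative): "MyDataset/primary/sub-1" -> "primary/sub-1"; a path with no "/" is kept as-is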
+
2717
+ # Add files to manifest
2718
+ final_files_index = 1 if index_skip else 0
2719
+ index_skip = False
2720
+ for file_path in list_file_paths:
2721
+ file_file_name = final_file_name_list[final_files_index]
2722
+ if file_file_name != basename(file_path):
2723
+ # save the relative path, final name and local path of the file to be renamed
2724
+ if folder_name not in list_of_files_to_rename:
2725
+ list_of_files_to_rename[folder_name] = {}
2726
+ if basename(file_path) not in list_of_files_to_rename[folder_name]:
2727
+ renamed_files_counter += 1
2728
+ list_of_files_to_rename[folder_name][basename(file_path)] = {
2729
+ "final_file_name": file_file_name,
2730
+ "id": "",
2731
+ }
2732
+ ps.manifest.add(file_path, folder_name, manifest_id)
2733
+ final_files_index += 1
2734
+
2735
+
2736
+ # add metadata files to the manifest
2737
+ if list_upload_metadata_files:
2738
+ current_files_in_subscriber_session += total_metadata_files
2739
+ # add the files to the manifest
2740
+ for manifest_path in list_upload_metadata_files:
2741
+ # subprocess call to the pennsieve agent to add the files to the manifest
2742
+ ps.manifest.add(manifest_path, target_base_path="", manifest_id=manifest_id)
2743
+
2744
+
2745
+ # add manifest files to the upload manifest
2746
+ if list_upload_manifest_files:
2747
+ current_files_in_subscriber_session += total_manifest_files
2748
+ for manifest_file_path in list_upload_manifest_files:
2749
+ # add the file to the manifest
2750
+ ps.manifest.add(manifest_file_path, "/", manifest_id)
2751
+
2752
+
2753
+ # set rename files to ums for upload resuming if this upload fails
2754
+ if renamed_files_counter > 0:
2755
+ ums.set_list_of_files_to_rename(list_of_files_to_rename)
2756
+ ums.set_rename_total_files(renamed_files_counter)
2757
+
2758
+ # upload the manifest files
2759
+ try:
2760
+ ps.manifest.upload(manifest_id)
2761
+
2762
+ main_curate_progress_message = ("Uploading data files...")
2763
+
2764
+ # subscribe to the manifest upload so we wait until it has finished uploading before moving on
2765
+ ps.subscribe(10, False, monitor_subscriber_progress)
2766
+
2767
+ except Exception as e:
2768
+ logger.error(e)
2769
+ raise PennsieveUploadException("The Pennsieve Agent has encountered an issue while uploading. Please retry the upload. If this issue persists please follow this <a target='_blank' rel='noopener noreferrer' href='https://docs.sodaforsparc.io/docs/how-to/how-to-reinstall-the-pennsieve-agent'> guide</a> on performing a full reinstallation of the Pennsieve Agent then click the retry button.")
2770
+
2771
+
2772
+ # wait for all of the Agent's processes to finish to avoid errors when deleting files on Windows
2773
+ time.sleep(1)
2774
+
2775
+ # 6. Rename files
2776
+ if list_of_files_to_rename:
2777
+ renaming_files_flow = True
2778
+ logger.info("ps_create_new_dataset (optional) step 8 rename files")
2779
+ main_curate_progress_message = ("Preparing files to be renamed...")
2780
+ dataset_id = ds["content"]["id"]
2781
+ collection_ids = {}
2782
+ # gets the high level folders in the dataset
2783
+ r = requests.get(f"{PENNSIEVE_URL}/datasets/{dataset_id}", headers=create_request_headers(ps))
2784
+ r.raise_for_status()
2785
+ dataset_content = r.json()["children"]
2786
+
2787
+ if dataset_content == []:
2788
+ while dataset_content == []:
2789
+ time.sleep(3)
2790
+ r = requests.get(f"{PENNSIEVE_URL}/datasets/{dataset_id}", headers=create_request_headers(ps))
2791
+ r.raise_for_status()
2792
+ dataset_content = r.json()["children"]
2793
+
2794
+ collections_found = False
2795
+ while not collections_found:
2796
+ for item in dataset_content:
2797
+ # high level folders' ids are stored to be used to find the file IDs
2798
+ if item["content"]["packageType"] == "Collection":
2799
+ collections_found = True
2800
+ collection_ids[item["content"]["name"]] = {"id": item["content"]["nodeId"]}
2801
+
2802
+
2803
+ if not collections_found:
2804
+ # No collections were found, metadata files were processed but not the high level folders
2805
+ time.sleep(3)
2806
+ r = requests.get(f"{PENNSIEVE_URL}/datasets/{dataset_id}", headers=create_request_headers(ps))
2807
+ r.raise_for_status()
2808
+ dataset_content = r.json()["children"]
2809
+
2810
+ for key in list_of_files_to_rename:
2811
+ # split the key up if there are multiple folders in the relative path
2812
+ relative_path = key.split("/")
2813
+ high_lvl_folder_name = relative_path[0]
2814
+ subfolder_level = 0
2815
+ subfolder_amount = len(relative_path) - 1
2816
+
2817
+ if high_lvl_folder_name in collection_ids:
2818
+ # subfolder_amount is the number of subfolders we need to traverse before we can get the file ID to rename
2819
+
2820
+ high_lvl_folder_id = collection_ids[high_lvl_folder_name]["id"]
2821
+ limit = 100
2822
+ offset = 0
2823
+ dataset_content = []
2824
+ while True:
2825
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{high_lvl_folder_id}?limit={limit}&offset={offset}", headers=create_request_headers(ps))
2826
+ r.raise_for_status()
2827
+ page = r.json()["children"]
2828
+ dataset_content.extend(page)
2829
+
2830
+ if len(page) < limit:
2831
+ break
2832
+ offset += limit
2833
+
2834
+ if dataset_content == []:
2835
+ # keep requesting until children content appears (the folder is empty because its files have not been processed yet)
2836
+ while dataset_content == []:
2837
+ time.sleep(3)
2838
+ limit = 100
2839
+ offset = 0
2840
+
2841
+ while True:
2842
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{high_lvl_folder_id}?limit={limit}&offset={offset}", headers=create_request_headers(ps))
2843
+ r.raise_for_status()
2844
+ page = r.json()["children"]
2845
+ dataset_content.extend(page)
2846
+ if len(page) < limit:
2847
+ break
2848
+ offset += limit
2849
+
2850
+
2851
+ if subfolder_amount == 0:
2852
+ # the file is in the high level folder
2853
+ if "id" not in list_of_files_to_rename[key]:
2854
+ # store the id of the folder to be used again in case the file id is not found (happens when not all files have been processed yet)
2855
+ list_of_files_to_rename[key]["id"] = high_lvl_folder_id
2856
+
2857
+
2858
+ for item in dataset_content:
2859
+ if item["content"]["packageType"] != "Collection":
2860
+ file_name = item["content"]["name"]
2861
+ file_id = item["content"]["nodeId"]
2862
+
2863
+ if file_name in list_of_files_to_rename[key]:
2864
+ # name
2865
+ # store the package id for now
2866
+ list_of_files_to_rename[key][file_name]["id"] = file_id
2867
+ else:
2868
+ # file is within a subfolder and we recursively iterate until we get to the last subfolder needed
2869
+ subfolder_id = collection_ids[high_lvl_folder_name]["id"]
2870
+ while subfolder_level != subfolder_amount:
2871
+ if dataset_content == []:
2872
+ # subfolder has no content so request again
2873
+ while dataset_content == []:
2874
+ time.sleep(3)
2875
+ limit = 100
2876
+ offset = 0
2877
+ while True:
2878
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{subfolder_id}", headers=create_request_headers(ps))
2879
+ r.raise_for_status()
2880
+ page = r.json()["children"]
2881
+ dataset_content.extend(page)
2882
+ if len(page) < limit:
2883
+ break
2884
+ offset += limit
2885
+
2886
+
2887
+ for item in dataset_content:
2888
+ if item["content"]["packageType"] == "Collection":
2889
+ folder_name = item["content"]["name"]
2890
+ folder_id = item["content"]["nodeId"]
2891
+
2892
+ if folder_name in relative_path:
2893
+ # we have found the folder we need to iterate through
2894
+ subfolder_level += 1
2895
+
2896
+ limit = 100
2897
+ offset = 0
2898
+ children = []
2899
+ while True:
2900
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{folder_id}?limit={limit}&offset={offset}", headers=create_request_headers(ps))
2901
+ r.raise_for_status()
2902
+ page = r.json()["children"]
2903
+ children.extend(page)
2904
+ if len(page) < limit:
2905
+ break
2906
+ offset += limit
2907
+
2908
+ if subfolder_level != subfolder_amount:
2909
+ dataset_content = children
2910
+ if dataset_content == []:
2911
+ while dataset_content == []:
2912
+ # subfolder has no content so request again
2913
+ time.sleep(3)
2914
+ limit = 100
2915
+ offset = 0
2916
+ while True:
2917
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{folder_id}", headers=create_request_headers(ps))
2918
+ r.raise_for_status()
2919
+ page = r.json()["children"]
2920
+ dataset_content.extend(page)
2921
+ if len(page) < limit:
2922
+ break
2923
+ offset += limit
2924
+
2925
+ subfolder_id = folder_id
2926
+ break
2927
+ else:
2928
+ # we are at the last folder in the relative path, we can get the file id
2929
+ if "id" not in list_of_files_to_rename[key]:
2930
+ # store the id of the last folder to directly call later in case not all files get an id
2931
+ list_of_files_to_rename[key]["id"] = folder_id
2932
+ for item in children:
2933
+ if item["content"]["packageType"] != "Collection":
2934
+ file_name = item["content"]["name"]
2935
+ file_id = item["content"]["nodeId"]
2936
+
2937
+ if file_name in list_of_files_to_rename[key]:
2938
+ # store the package id for renaming
2939
+ list_of_files_to_rename[key][file_name]["id"] = file_id
2940
+ else:
2941
+ continue
2942
+
2943
+ # 8.5 Rename files - all (or most) IDs have been fetched; rename the files now, or gather the IDs again for any files that had not been processed yet
2944
+ main_curate_progress_message = "Renaming files..."
2945
+ main_generated_dataset_size = 0
2946
+ main_total_generate_dataset_size = renamed_files_counter
2947
+ for relative_path in list_of_files_to_rename:
2948
+ for file in list_of_files_to_rename[relative_path].keys():
2949
+ collection_id = list_of_files_to_rename[relative_path]["id"]
2950
+ if file == "id":
2951
+ continue
2952
+ new_name = list_of_files_to_rename[relative_path][file]["final_file_name"]
2953
+ file_id = list_of_files_to_rename[relative_path][file]["id"]
2954
+
2955
+ if file_id != "":
2956
+ # id was found so make api call to rename with final file name
2957
+ try:
2958
+ r = requests.put(f"{PENNSIEVE_URL}/packages/{file_id}?updateStorage=true", json={"name": new_name}, headers=create_request_headers(ps))
2959
+ r.raise_for_status()
2960
+ except Exception as e:
2961
+ if r.status_code == 500:
2962
+ continue
2963
+ main_generated_dataset_size += 1
2964
+ else:
2965
+ # id was not found so keep trying to get the id until it is found
2966
+ all_ids_found = False
2967
+ while not all_ids_found:
2968
+ collection_id = list_of_files_to_rename[relative_path]["id"]
2969
+ if file == "id":
2970
+ continue
2971
+
2972
+
2973
+ limit = 100
2974
+ offset = 0
2975
+ dataset_content = []
2976
+
2977
+ while True:
2978
+ r = requests.put(f"{PENNSIEVE_URL}/packages/{collection_id}?updateStorage=true&limit={limit}&offset={offset}", headers=create_request_headers(ps))
2979
+ r.raise_for_status()
2980
+ page = r.json()["children"]
2981
+ dataset_content.extend(page)
2982
+ if len(page) < limit:
2983
+ break
2984
+ offset += limit
2985
+
2986
+ for item in dataset_content:
2987
+ if item["content"]["packageType"] != "Collection":
2988
+ file_name = item["content"]["name"]
2989
+ file_id = item["content"]["nodeId"]
2990
+
2991
+ if file_name == file:
2992
+ # id was found so make the api call to rename the file with its final file name
2993
+ try:
2994
+ r = requests.put(f"{PENNSIEVE_URL}/packages/{file_id}", json={"name": new_name}, headers=create_request_headers(ps))
2995
+ r.raise_for_status()
2996
+ except Exception as e:
2997
+ if r.status_code == 500:
2998
+ continue
2999
+ main_generated_dataset_size += 1
3000
+ all_ids_found = True
3001
+ break
3002
+
3003
+
3004
+
3005
+
3006
+
3007
+ # get the manifest id of the Pennsieve upload manifest created when uploading
3008
+
3009
+
3010
+ origin_manifest_id = get_origin_manifest_id(selected_id)
3011
+
3012
+ # if files were uploaded but later receive the 'Failed' status in the Pennsieve manifest, we allow users to retry the upload;
+ # set the prerequisite information for that retry here
3014
+ # NOTE: We do not need to store the rename information here. Rationale: If the upload for a file failed the rename could not succeed and we would not reach this point.
3015
+ # What would happen instead is as follows (in the optimistic case where the upload doesn't keep being marked as Failed):
3016
+ # 1. The upload for a file fails
3017
+ # 2. The upload information (including the rename information) gets stored in the catch-all error handling block
3018
+ # 3. The user retries the upload
3019
+ # 4. The manifest counts the Failed file as a file to be retried
3020
+ # 5. The manifest is uploaded again and the file is uploaded again
3021
+ # 6. The file is renamed successfully this time
3022
+ ums.set_main_total_generate_dataset_size(main_total_generate_dataset_size)
3023
+ ums.set_total_files_to_upload(total_files)
3024
+ ums.set_elapsed_time(elapsed_time)
3025
+
3026
+ # at end of successful session reset tracking for folders created
3027
+ main_curate_progress_message = "Success: COMPLETED!"
3028
+ main_curate_status = "Done"
3029
+
3030
+
3031
+ if isdir(manifest_folder_path): shutil.rmtree(manifest_folder_path)
3032
+ end = timer()
3033
+ logger.info(f"Time for ps_upload_to_dataset function: {timedelta(seconds=end - start)}")
3034
+ except Exception as e:
3035
+ # reset the total bytes uploaded for any file that has not been fully uploaded
3036
+ ums.set_main_total_generate_dataset_size(main_total_generate_dataset_size)
3037
+ ums.set_total_files_to_upload(total_files)
3038
+ ums.set_elapsed_time(elapsed_time)
3039
+ # store the renaming files information in case the upload fails and we need to rename files during the retry
3040
+ ums.set_renaming_files_flow(renaming_files_flow) # this determines if we failed while renaming files after the upload is complete
3041
+ ums.set_rename_total_files(renamed_files_counter)
3042
+ ums.set_list_of_files_to_rename(list_of_files_to_rename)
3043
+ raise e
3044
+
3045
+ main_curate_status = ""
3046
+ main_curate_print_status = ""
3047
+ main_curate_progress_message = ""
3048
+ main_total_generate_dataset_size = 1
3049
+ main_generated_dataset_size = 0
3050
+ start_generate = 0
3051
+ generate_start_time = 0
3052
+ main_generate_destination = ""
3053
+ main_initial_bfdataset_size = 0
3054
+ myds = ""
3055
+ renaming_files_flow = False
3056
+ elapsed_time = None
3057
+ manifest_id = None
3058
+ origin_manifest_id = None
3059
+
3060
+
3061
+
3062
+ def ps_check_dataset_files_validity(soda):
3063
+ """
3064
+ Function to check that the Pennsieve data files and folders specified in the dataset are valid
3065
+
3066
+ Args:
3067
+ soda: soda dict with information about all specified files and folders
3068
+ Output:
3069
+ error: error message with a list of invalid Pennsieve data files/folders, if any
3070
+ """
3071
+ def check_folder_validity(folder_id, folder_dict, folder_path, error):
3072
+ """
3073
+ Function to verify that the subfolders and files specified in the dataset are valid
3074
+
3075
+ Args:
3076
+ folder_id: id of the folder in the dataset
3077
+ folder_dict: dict with information about the folder
3078
+ folder_path: path of the folder in the dataset
3079
+ error: running list of invalid files/folders accumulated so far
3080
+ Output:
3081
+ error: updated list of invalid files/folders, if any
3082
+ """
3083
+ # get the folder content through Pennsieve api
3084
+ limit = 100
3085
+ offset = 0
3086
+ folder_content = []
3087
+ while True:
3088
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{folder_id}?offset={offset}&limit={limit}", headers=create_request_headers(get_access_token()))
3089
+ r.raise_for_status()
3090
+ page = r.json()["children"]
3091
+ folder_content.extend(page)
3092
+ if len(page) < limit:
3093
+ break
3094
+ offset += limit
3095
+
3096
+ # check that the subfolders and files specified in the dataset are valid
3097
+ if "files" in folder_dict.keys():
3098
+ for file_key, file in folder_dict["files"].items():
3099
+ file_type = file.get("location")
3100
+ relative_path = (f"{folder_path}/{file_key}")
3101
+ # If file is from Pennsieve we verify if file exists on Pennsieve
3102
+ if file_type == "ps":
3103
+ file_actions = file["action"]
3104
+ file_id = file["path"]
3105
+ if "moved" in file_actions:
3106
+ try:
3107
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{file_id}/view", headers=create_request_headers(get_access_token()))
3108
+ r.raise_for_status()
3109
+ except Exception as e:
3110
+ error.append(f"{relative_path} id: {file_id}")
3111
+ continue
3112
+ if next((item for item in folder_content if item["content"]["id"] == file_id), None) is None:
3113
+ error.append(f"{relative_path} id: {file_id}")
3114
+
3115
+ if "folders" in folder_dict.keys():
3116
+ for folder_key, folder in folder_dict["folders"].items():
3117
+ folder_type = folder.get("location")
3118
+ relative_path = (f"{folder_path}/{folder_key}")
3119
+ if folder_type == "ps":
3120
+ folder_id = folder["path"]
3121
+ folder_action = folder["action"]
3122
+ if "moved" in folder_action:
3123
+ try:
3124
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{folder_id}", headers=create_request_headers(get_access_token()))
3125
+ r.raise_for_status()
3126
+ except Exception as e:
3127
+ error.append(f"{relative_path} id: {folder_id}")
3128
+ continue
3129
+ if next((item for item in folder_content if item["content"]["id"] == folder_id), None) is None:
3130
+ error.append(f"{relative_path} id: {folder_id}")
3131
+ else:
3132
+ check_folder_validity(folder_id, folder, relative_path, error)
3133
+
3134
+ return error
3135
+
3136
+ error = []
3137
+ # check that the files and folders specified in the dataset are valid
3138
+ dataset_name = soda["ps-dataset-selected"]["dataset-name"]
3139
+ dataset_id = get_dataset_id(dataset_name)
3140
+ r = requests.get(f"{PENNSIEVE_URL}/datasets/{dataset_id}", headers=create_request_headers(get_access_token()))
3141
+ r.raise_for_status()
3142
+ root_folder = r.json()["children"]
3143
+
3144
+ if len(root_folder) == 0:
3145
+ return error
3146
+
3147
+ if "dataset-structure" in soda.keys():
3148
+ dataset_structure = soda["dataset-structure"]
3149
+ if "folders" in dataset_structure:
3150
+ for folder_key, folder in dataset_structure["folders"].items():
3151
+ folder_type = folder.get("location")
3152
+ relative_path = folder_key
3153
+ if folder_type == "ps":
3154
+ collection_id = folder["path"]
3155
+ collection_actions = folder["action"]
3156
+ if "moved" in collection_actions:
3157
+ try:
3158
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{collection_id}/view", headers=create_request_headers(get_access_token()))
3159
+ r.raise_for_status()
3160
+ except Exception:
3161
+ error.append(f"{relative_path} id: {collection_id}")
3162
+ continue
3163
+ if next((item for item in root_folder if item["content"]["id"] == collection_id), None) is None:
3164
+ error.append(f"{relative_path} id: {collection_id}")
3165
+ else:
3166
+ # recursively check all files + subfolders of collection_id
3167
+ error = check_folder_validity(collection_id, folder, relative_path, error)
3168
+
3169
+ # if there are invalid items, prepend an explanatory message to the error list
3170
+ if len(error) > 0:
3171
+ error_message = [
3172
+ "Error: The following Pennsieve files/folders are invalid. Specify them again or remove them."
3173
+ ]
3174
+ error = error_message + error
3175
+
3176
+ return error
3177
+
3178
+
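+ # A minimal usage sketch for the validity check above; the dataset name and
+ # structure below are placeholders.
+ def _example_check_ps_files():
+     soda = {
+         "ps-dataset-selected": {"dataset-name": "my-sparc-dataset"},
+         "dataset-structure": {"folders": {}},
+     }
+     errors = ps_check_dataset_files_validity(soda)
+     if errors:
+         # the first entry is the explanatory message, the rest look like
+         # "<relative path> id: <package id>"
+         print("\n".join(errors))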
3179
+ def check_server_access_to_files(file_list):
3180
+ # Return two lists: one of paths the server can open, and one of paths it cannot.
3181
+ # This is to avoid the server trying to open files that it does not have access to.
3182
+ accessible_files = []
3183
+ inaccessible_files = []
3184
+ for file in file_list:
3185
+ if os.path.isfile(file) or os.path.isdir(file):
3186
+ accessible_files.append(file)
3187
+ else:
3188
+ inaccessible_files.append(file)
3189
+
3190
+ return {"accessible_files": accessible_files, "inaccessible_files": inaccessible_files}
3191
+
3192
+
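+ # Example of the shape returned by check_server_access_to_files; the two paths
+ # below are placeholders.
+ def _example_check_server_access():
+     result = check_server_access_to_files(["/data/sub-01/readings.csv", "/missing/file.csv"])
+     # result == {"accessible_files": [...], "inaccessible_files": [...]}
+     return result["inaccessible_files"]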
3193
+ # TODO: Update for SDS 3.0
3194
+ def clean_json_structure(soda):
3195
+ global logger
3196
+ # Delete any files on Pennsieve that have been marked as deleted
3197
+ def recursive_file_delete(folder):
3198
+ if "files" in folder.keys():
3199
+ for item in list(folder["files"]):
3200
+ if item in ["manifest.xlsx", "manifest.csv"]:
3201
+ continue
3202
+ if "deleted" in folder["files"][item]["action"]:
3203
+ # remove the file from the soda json structure
3204
+ del folder["files"][item]
3205
+
3206
+ for item in list(folder["folders"]):
3207
+ recursive_file_delete(folder["folders"][item])
3208
+
3209
+
3210
+ # Rename any files that exist on Pennsieve
3211
+ def recursive_file_rename(folder):
3212
+ if "files" in folder.keys():
3213
+ for item in list(folder["files"]):
3214
+ if (
3215
+ "renamed" in folder["files"][item]["action"]
3216
+ and folder["files"][item]["location"] == "ps"
3217
+ ):
3218
+ continue
3219
+
3220
+ for item in list(folder["folders"]):
3221
+ recursive_file_rename(folder["folders"][item])
3222
+
3223
+
3224
+ def recursive_folder_delete(folder):
3225
+ """
3226
+ Delete any stray folders that exist on Pennsieve
3227
+ Only the top level folders are deleted since the api deletes any
3228
+ files and folders that exist inside.
3229
+ """
3230
+
3231
+ for folder_item in list(folder["folders"]):
3232
+ if folder["folders"][folder_item]["location"] == "ps":
3233
+ if "deleted" in folder["folders"][folder_item]["action"]:
3234
+ del folder["folders"][folder_item]
3235
+ else:
3236
+ recursive_folder_delete(folder["folders"][folder_item])
3237
+ else:
3238
+ recursive_folder_delete(folder["folders"][folder_item])
3239
+ return
3240
+
3241
+ main_keys = soda.keys()
3242
+ dataset_structure = soda["dataset-structure"]
3243
+
3244
+ if ("dataset-structure" not in main_keys and "dataset_metadata" not in main_keys):
3245
+ if "ps-dataset-selected" in main_keys:
3246
+ dataset_name = soda["ps-dataset-selected"]["dataset-name"]
3247
+ elif "generate-dataset" in main_keys:
3248
+ dataset_name = soda["generate-dataset"]["dataset-name"]
3249
+ else:
3250
+ dataset_name = "Unset Name"
3251
+ raise EmptyDatasetError(dataset_name)
3252
+
3253
+ if "generate-dataset" in main_keys:
3254
+ # Check that local files/folders exist
3255
+ try:
3256
+ if error := check_local_dataset_files_validity(soda):
3257
+ raise LocalDatasetMissingSpecifiedFiles(error)
3258
+ # check that dataset is not empty after removing all the empty files and folders
3259
+ if not soda["dataset-structure"]["folders"] and "dataset_metadata" not in soda:
3260
+ if "ps-dataset-selected" in main_keys:
3261
+ dataset_name = soda["ps-dataset-selected"]["dataset-name"]
3262
+ elif "generate-dataset" in main_keys:
3263
+ dataset_name = soda["generate-dataset"]["dataset-name"]
3264
+ else:
3265
+ dataset_name = "Unset Name"
3266
+ raise EmptyDatasetError(dataset_name)
3267
+ except Exception as e:
3268
+ raise e
3269
+
3270
+ if "starting-point" in main_keys and soda["starting-point"][
3271
+ "origin"
3272
+ ] in ["ps", "local"]:
3273
+ recursive_file_delete(dataset_structure)
3274
+ recursive_folder_delete(dataset_structure)
3275
+ soda["dataset-structure"] = dataset_structure
3276
+
3277
+
3278
+ # clean up the soda json object before creating the manifest file cards
3279
+ return {"soda": soda}
3280
+
3281
+
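+ # A minimal sketch of the cleanup above: a Pennsieve file whose action list
+ # contains "deleted" is pruned from the structure. All names, and action values
+ # other than "deleted", are placeholders.
+ def _example_clean_json_structure():
+     soda = {
+         "starting-point": {"origin": "ps"},
+         "dataset-structure": {
+             "files": {},
+             "folders": {
+                 "primary": {
+                     "location": "ps",
+                     "action": ["existing"],
+                     "folders": {},
+                     "files": {
+                         "old.csv": {"location": "ps", "action": ["existing", "deleted"]},
+                     },
+                 },
+             },
+         },
+     }
+     cleaned = clean_json_structure(soda)["soda"]
+     # "old.csv" is gone; the "primary" folder itself is kept
+     return cleaned["dataset-structure"]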
3282
+
3283
+ def validate_local_dataset_generate_path(soda):
3284
+ generate_dataset = soda["generate-dataset"]
3285
+ local_dataset_path = generate_dataset["path"]
3286
+ if not isdir(local_dataset_path):
3287
+ error_message = (
3288
+ "Error: The Path "
3289
+ + local_dataset_path
3290
+ + " is not found. Please select a valid destination folder for the new dataset"
3291
+ )
3292
+ raise FileNotFoundError(error_message)
3293
+
3294
+
3295
+
3296
+
3297
+ def generating_on_ps(soda):
3298
+ return soda["generate-dataset"]["destination"] == "ps"
3299
+
3300
+ def uploading_with_ps_account(soda):
3301
+ return "ps-account-selected" in soda
3302
+
3303
+ def uploading_to_existing_ps_dataset(soda):
3304
+ return "ps-dataset-selected" in soda
3305
+
3306
+ def can_resume_prior_upload(resume_status):
3307
+ global ums
3308
+ return resume_status and ums.df_mid_has_progress()
3309
+
3310
+ def virtual_dataset_empty(soda):
3311
+ return (
3312
+ "dataset-structure" not in soda
3313
+ and "metadata-files" not in soda
3314
+ )
3315
+
3316
+ def generate_options_set(soda):
3317
+ return "generate-dataset" in soda.keys()
3318
+
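+ # Illustrative values for the predicate helpers above; the account and dataset
+ # names are placeholders.
+ def _example_upload_predicates():
+     soda = {
+         "generate-dataset": {"destination": "ps"},
+         "ps-account-selected": {"account-name": "SODA-Pennsieve"},
+         "ps-dataset-selected": {"dataset-name": "my-sparc-dataset"},
+     }
+     assert generating_on_ps(soda)
+     assert uploading_with_ps_account(soda)
+     assert uploading_to_existing_ps_dataset(soda)
+     assert generate_options_set(soda)
+     # no "dataset-structure" or "metadata-files" key, so this minimal dict counts as empty
+     assert virtual_dataset_empty(soda)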
3319
+
3320
+ def get_dataset_with_backoff(selected_dataset_id):
3321
+ # check that dataset was created with a limited retry (for some users the dataset isn't automatically accessible)
3322
+ attempts = 0
3323
+ while(attempts < 3):
3324
+ try:
3325
+ # whether we are generating a new dataset or merging, we want the dataset information for later steps
3326
+ r = requests.get(f"{PENNSIEVE_URL}/datasets/{selected_dataset_id}", headers=create_request_headers(get_access_token()))
3327
+ r.raise_for_status()
3328
+ return r.json()
3329
+ except Exception as e:
3330
+ attempts += 1
3331
+ # check if final attempt
3332
+ if attempts >= 3:
3333
+ # raise the error to the user
3334
+ raise e
3335
+ time.sleep(10)
3336
+
3337
+
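+ # Usage sketch for the backoff helper above; the dataset id is a placeholder.
+ def _example_get_dataset():
+     myds = get_dataset_with_backoff("N:dataset:00000000-0000-0000-0000-000000000000")
+     return myds["content"]["id"]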
3338
+ def generate_new_ds_ps_resume(soda, dataset_name, ps):
3339
+ # get the dataset id by the name
3340
+ try:
3341
+ selected_dataset_id = get_dataset_id(dataset_name)
3342
+ except Exception as e:
3343
+ if isinstance(e, PennsieveDatasetCannotBeFound):
+ # dataset does not exist - create it
+ ds = ps_create_new_dataset(dataset_name, ps)
+ selected_dataset_id = ds["content"]["id"]
+ else:
+ raise e
3347
+
3348
+ myds = get_dataset_with_backoff(selected_dataset_id)
3349
+ ps_upload_to_dataset(soda, ps, myds, True)
3350
+
3351
+ def generate_new_ds_ps(soda, dataset_name, ps):
3352
+ ds = ps_create_new_dataset(dataset_name, ps)
3353
+ selected_dataset_id = ds["content"]["id"]
3354
+ myds = get_dataset_with_backoff(selected_dataset_id)
3355
+ ps_upload_to_dataset(soda, ps, myds, False)
3356
+
3357
+
3358
+ def generate_dataset(soda, resume, ps):
3359
+ global main_generate_destination
3360
+ global main_total_generate_dataset_size
3361
+
3362
+
3363
+ # Generate dataset locally
3364
+ if generating_locally(soda):
3365
+ logger.info("generate_dataset generating_locally")
3366
+ main_generate_destination = soda["generate-dataset"][
3367
+ "destination"
3368
+ ]
3369
+ _, main_total_generate_dataset_size = generate_dataset_locally(
3370
+ soda
3371
+ )
3372
+
3373
+ # Generate dataset to Pennsieve
3374
+ if generating_on_ps(soda):
3375
+ main_generate_destination = soda["generate-dataset"][
3376
+ "destination"
3377
+ ]
3378
+ generate_option = soda["generate-dataset"]["generate-option"]
3379
+
3380
+ logger.info("generate_dataset generating_on_ps")
3381
+
3382
+ if uploading_to_existing_ps_dataset(soda) and soda["starting-point"]["origin"] != "new":
3383
+
3384
+ selected_dataset_id = get_dataset_id(
3385
+ soda["ps-dataset-selected"]["dataset-name"]
3386
+ )
3387
+ # make an api request to pennsieve to get the dataset details
3388
+ r = requests.get(f"{PENNSIEVE_URL}/datasets/{selected_dataset_id}", headers=create_request_headers(get_access_token()))
3389
+ r.raise_for_status()
3390
+ myds = r.json()
3391
+
3392
+ if can_resume_prior_upload(resume):
3393
+ ps_upload_to_dataset(soda, ps, myds, resume)
3394
+ else:
3395
+ ps_update_existing_dataset(soda, myds, ps, resume)
3396
+
3397
+ elif generate_option == "new" or generate_option == "existing-ps" and soda["starting-point"]["origin"] == "new":
3398
+ # if dataset name is in the generate-dataset section, we are generating a new dataset
3399
+ if "dataset-name" in soda["generate-dataset"]:
3400
+ dataset_name = soda["generate-dataset"][
3401
+ "dataset-name"
3402
+ ]
3403
+ elif "digital-metadata" in soda and "name" in soda["digital-metadata"]:
3404
+ dataset_name = soda["digital-metadata"]["name"]
3405
+ elif "ps-dataset-selected" in soda and "dataset-name" in soda["ps-dataset-selected"]:
3406
+ dataset_name = soda["ps-dataset-selected"]["dataset-name"]
3407
+
3408
+ if resume:
3409
+ generate_new_ds_ps_resume(soda, dataset_name, ps)
3410
+ else:
3411
+ try:
3412
+ selected_dataset_id = get_dataset_id(dataset_name)
3413
+ except Exception as e:
3414
+ if isinstance(e, PennsieveDatasetCannotBeFound):
3415
+ generate_new_ds_ps(soda, dataset_name, ps)
3416
+ return
3417
+ else:
3418
+ raise Exception(f"{e.status_code}, {e.message}")
3419
+ myds = get_dataset_with_backoff(selected_dataset_id)
3420
+
3421
+ ps_upload_to_dataset(soda, ps, myds, resume)
3422
+
3423
+
3424
+
3425
+
3426
+
3427
+ def validate_dataset_structure(soda, resume):
3428
+
3429
+ global main_curate_status
3430
+ global main_curate_progress_message
3431
+ global logger
3432
+
3433
+ # 1] Check for potential errors
3434
+ logger.info("main_curate_function step 1")
3435
+
3436
+ if not generate_options_set(soda):
3437
+ main_curate_status = "Done"
3438
+ raise GenerateOptionsNotSet()
3439
+
3440
+ # 1.1. If the dataset is being generated locally then check that the local destination is valid
3441
+ if generating_locally(soda):
3442
+ main_curate_progress_message = "Checking that the local destination selected for generating your dataset is valid"
3443
+ try:
3444
+ validate_local_dataset_generate_path(soda)
3445
+ except Exception as e:
3446
+ main_curate_status = "Done"
3447
+ raise e
3448
+
3449
+
3450
+ logger.info("main_curate_function step 1.2")
3451
+
3452
+ # 1.2. If generating dataset to Pennsieve or any other Pennsieve actions are requested check that the destination is valid
3453
+ if uploading_with_ps_account(soda):
3454
+ # check that the Pennsieve account is valid
3455
+ try:
3456
+ main_curate_progress_message = (
3457
+ "Checking that the selected Pennsieve account is valid"
3458
+ )
3459
+ accountname = soda["ps-account-selected"]["account-name"]
3460
+ connect_pennsieve_client(accountname)
3461
+ except Exception as e:
3462
+ main_curate_status = "Done"
3463
+ if isinstance(e, AttributeError):
3464
+ raise Exception("The Pennsieve Agent cannot access datasets but needs to in order to work. Please try again. If the issue persists, please contact the SODA team. The SODA team will contact Pennsieve to help resolve this issue.")
3465
+ else:
3466
+ raise PennsieveAccountInvalid("Please select a valid Pennsieve account.")
3467
+
3468
+ if uploading_to_existing_ps_dataset(soda):
3469
+ # check that the Pennsieve dataset is valid
3470
+ try:
3471
+ main_curate_progress_message = (
3472
+ "Checking that the selected Pennsieve dataset is valid"
3473
+ )
3474
+ bfdataset = soda["ps-dataset-selected"]["dataset-name"]
3475
+ selected_dataset_id = get_dataset_id(bfdataset)
3476
+
3477
+ except Exception as e:
3478
+ main_curate_status = "Done"
3479
+ bfdataset = soda["ps-dataset-selected"]["dataset-name"]
3480
+ raise PennsieveDatasetCannotBeFound(bfdataset)
3481
+
3482
+ # check that the user has permissions for uploading and modifying the dataset
3483
+ main_curate_progress_message = "Checking that you have required permissions for modifying the selected dataset"
3484
+ role = pennsieve_get_current_user_permissions(selected_dataset_id, get_access_token())["role"]
3485
+ if role not in ["owner", "manager", "editor"]:
3486
+ main_curate_status = "Done"
3487
+ raise PennsieveActionNoPermission("uploading to Pennsieve dataset")
3488
+
3489
+ logger.info("main_curate_function step 1.3")
3490
+
3491
+
3492
+ # 1.3. Check that specified dataset files and folders are valid (existing path) if generate dataset is requested
3493
+ # Note: Empty folders and 0 kb files will be removed without warning (a warning will be provided on the front end before starting the curate process)
3494
+ # Check that at least one file or folder has been added to the dataset
3495
+ main_curate_progress_message = "Checking that the dataset is not empty"
3496
+ if virtual_dataset_empty(soda):
3497
+ main_curate_status = "Done"
3498
+ if "generate-options" in soda.keys():
3499
+ dataset_name = soda["generate-options"]["dataset-name"]
3500
+ elif "ps-dataset-selected" in soda.keys():
3501
+ dataset_name = soda["ps-dataset-selected"]["dataset-name"]
3502
+ else:
3503
+ dataset_name = "Name not set"
3504
+ raise EmptyDatasetError(dataset_name)
3505
+
3506
+
3507
+ logger.info("main_curate_function step 1.3.1")
3508
+
3509
+ # Check that local files/folders exist
3510
+ if error := check_local_dataset_files_validity(soda):
3511
+ main_curate_status = "Done"
3512
+ raise LocalDatasetMissingSpecifiedFiles(error)
3513
+
3514
+
3515
+ # check that dataset is not empty after removing all the empty files and folders
3516
+ if virtual_dataset_empty(soda):
3517
+ main_curate_status = "Done"
3518
+ if "generate-options" in soda.keys():
3519
+ dataset_name = soda["generate-options"]["dataset-name"]
3520
+ elif "ps-dataset-selected" in soda.keys():
3521
+ dataset_name = soda["ps-dataset-selected"]["dataset-name"]
3522
+ else:
3523
+ dataset_name = "Name not set"
3524
+ raise EmptyDatasetError(dataset_name, "The dataset is empty after removing all the empty files and folders.")
3525
+
3526
+
3527
+ logger.info("main_curate_function step 1.3.2")
3528
+ # Check that Pennsieve files/folders exist (only used when generating from an existing Pennsieve dataset)
3529
+ if uploading_to_existing_ps_dataset(soda) and not can_resume_prior_upload(resume):
3530
+ try:
3531
+ main_curate_progress_message = (
3532
+ "Checking that the Pennsieve files and folders are valid"
3533
+ )
3534
+ if soda["generate-dataset"]["destination"] == "ps":
3535
+ if error := ps_check_dataset_files_validity(soda):
3536
+ logger.info("Failed to validate dataset files")
3537
+ logger.info(error)
3538
+ main_curate_status = "Done"
3539
+ raise PennsieveDatasetFilesInvalid(error)
3540
+ except Exception as e:
3541
+ main_curate_status = "Done"
3542
+ raise e
3543
+
3544
+
3545
+
3546
+ def reset_upload_session_environment(resume):
3547
+ global main_curate_status
3548
+ global main_curate_progress_message
3549
+ global main_total_generate_dataset_size
3550
+ global main_generated_dataset_size
3551
+ global start_generate
3552
+ global generate_start_time
3553
+ global main_generate_destination
3554
+ global main_initial_bfdataset_size
3555
+ global main_curation_uploaded_files
3556
+ global uploaded_folder_counter
3557
+ global ums
3558
+
3559
+ global myds
3560
+ global generated_dataset_id
3561
+ global bytes_file_path_dict
3562
+ global renaming_files_flow
3563
+
3564
+ start_generate = 0
3565
+ myds = ""
3566
+
3567
+ generate_start_time = time.time()
3568
+
3569
+ # variables for tracking the progress of the curate process on the frontend
3570
+ main_curate_status = ""
3571
+ main_curate_progress_message = "Starting..."
3572
+ main_total_generate_dataset_size = 0
3573
+ main_generated_dataset_size = 0
3574
+ main_curation_uploaded_files = 0
3575
+ uploaded_folder_counter = 0
3576
+ generated_dataset_id = None
3577
+
3578
+ main_curate_status = "Curating"
3579
+ main_curate_progress_message = "Starting dataset curation"
3580
+ main_generate_destination = ""
3581
+ main_initial_bfdataset_size = 0
3582
+
3583
+ if not resume:
3584
+ ums.set_df_mid(None)
3585
+ ums.set_elapsed_time(None)
3586
+ ums.set_total_files_to_upload(0)
3587
+ ums.set_main_total_generate_dataset_size(0)
3588
+ # reset the rename information back to default
3589
+ ums.set_renaming_files_flow(False) # this determines if we failed while renaming files after the upload is complete
3590
+ ums.set_rename_total_files(None)
3591
+ ums.set_list_of_files_to_rename(None)
3592
+ renaming_files_flow = False
3593
+ # reset the calculated values for the upload session
3594
+ bytes_file_path_dict = {}
3595
+
3596
+
3597
+
3598
+
3599
+ def main_curate_function(soda, resume):
3600
+ global logger
3601
+ global main_curate_status
3602
+ global manifest_id
3603
+ global origin_manifest_id
3604
+ global total_files
3605
+
3606
+ logger.info("Starting generating selected dataset")
3607
+ logger.info(f"Generating dataset metadata generate-options={soda['generate-dataset']}")
3608
+
3609
+
3610
+ reset_upload_session_environment(resume)
3611
+
3612
+
3613
+ validate_dataset_structure(soda, resume)
3614
+
3615
+ logger.info("Generating dataset step 3")
3616
+
3617
+
3618
+ # 2] Generate
3619
+ main_curate_progress_message = "Generating dataset"
3620
+ try:
3621
+ if (soda["generate-dataset"]["destination"] == "local"):
3622
+ logger.info("main_curate_function generating locally")
3623
+ generate_dataset(soda, resume, ps=None)
3624
+ else:
3625
+ logger.info("main_curate_function generating on Pennsieve")
3626
+ accountname = soda["ps-account-selected"]["account-name"]
3627
+ ps = connect_pennsieve_client(accountname)
3628
+ generate_dataset(soda, resume, ps)
3629
+ except Exception as e:
3630
+ main_curate_status = "Done"
3631
+ raise e
3632
+
3633
+ main_curate_status = "Done"
3634
+ main_curate_progress_message = "Success: COMPLETED!"
3635
+
3636
+
3637
+ logger.info(f"Finished generating dataset")
3638
+ return {
3639
+ "main_curate_progress_message": main_curate_progress_message,
3640
+ "main_total_generate_dataset_size": main_total_generate_dataset_size,
3641
+ "main_curation_uploaded_files": main_curation_uploaded_files,
3642
+ "local_manifest_id": manifest_id,
3643
+ "origin_manifest_id": origin_manifest_id,
3644
+ "main_curation_total_files": total_files,
3645
+ }
3646
+
3647
+
3648
+
3649
+ def main_curate_function_progress():
3650
+ """
3651
+ Function frequently called by front end to help keep track of the dataset generation progress
3652
+ """
3653
+
3654
+ global main_curate_status # empty if curate on going, "Done" when main curate function stopped (error or completed)
3655
+ global main_curate_progress_message
3656
+ global main_total_generate_dataset_size
3657
+ global main_generated_dataset_size
3658
+ global start_generate
3659
+ global generate_start_time
3660
+ global main_generate_destination
3661
+ global main_initial_bfdataset_size
3662
+ global main_curation_uploaded_files
3663
+ global total_bytes_uploaded # current number of bytes uploaded to Pennsieve in the upload session
3664
+ global myds
3665
+ global renaming_files_flow
3666
+ global ums
3667
+ global elapsed_time
3668
+
3669
+
3670
+ prior_elapsed_time = ums.get_elapsed_time()
3671
+ if prior_elapsed_time is not None:
3672
+ elapsed_time = ( time.time() - generate_start_time ) + prior_elapsed_time
3673
+ else:
3674
+ elapsed_time = time.time() - generate_start_time
3675
+
3676
+ elapsed_time_formatted = time_format(elapsed_time)
3677
+
3678
+
3679
+ if renaming_files_flow:
3680
+ progress_value = main_generated_dataset_size  # during the rename step, progress is a count of renamed files
3681
+ else:
3682
+ testing_variable = total_bytes_uploaded["value"]
3683
+
3684
+ return {
3685
+ "main_curate_status": main_curate_status,
3686
+ "start_generate": start_generate,
3687
+ "main_curate_progress_message": main_curate_progress_message,
3688
+ "main_total_generate_dataset_size": main_total_generate_dataset_size,
3689
+ "main_generated_dataset_size": testing_variable,
3690
+ "elapsed_time_formatted": elapsed_time_formatted,
3691
+ "total_files_uploaded": main_curation_uploaded_files,
3692
+ "generated_dataset_id": myds["content"]["id"] if myds != "" else None, # when a new dataset gets generated log its id to our analytics
3693
+ "generated_dataset_int_id": myds["content"]["intId"] if myds != "" else None,
3694
+ }
3695
+
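+ # A minimal sketch of how a caller might drive the curate flow and the progress
+ # endpoint above together; the threading approach and the soda argument are
+ # illustrative only.
+ def _example_curate_and_poll(soda):
+     import threading
+     worker = threading.Thread(target=main_curate_function, args=(soda, False), daemon=True)
+     worker.start()
+     while True:
+         progress = main_curate_function_progress()
+         print(progress["main_curate_progress_message"], progress["elapsed_time_formatted"])
+         if progress["main_curate_status"] == "Done":
+             break
+         time.sleep(1)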
3696
+
3697
+ def preview_dataset(soda):
3698
+ """
3699
+ Associated with 'Preview' button in the SODA interface
3700
+ Creates a folder for preview and adds mock files based on the files specified in the UI by the user (same name as origin but 0 kb in size)
3701
+ Opens the dialog box to showcase the files / folders added
3702
+
3703
+ Args:
3704
+ soda: soda dict with information about all specified files and folders
3705
+ Action:
3706
+ Opens the dialog box at preview_path
3707
+ Returns:
3708
+ preview_path: path of the folder where the preview files are located
3709
+ """
3710
+
3711
+ preview_path = join(userpath, "SODA", "Preview_dataset")
3712
+
3713
+ # remove empty files and folders from dataset
3714
+ try:
3715
+ check_empty_files_folders(soda)
3716
+ except Exception as e:
3717
+ raise e
3718
+
3719
+ # create Preview_dataset folder
3720
+ try:
3721
+ if isdir(preview_path):
3722
+ shutil.rmtree(preview_path, ignore_errors=True)
3723
+ makedirs(preview_path)
3724
+ except Exception as e:
3725
+ raise e
3726
+
3727
+ try:
3728
+
3729
+ if "dataset-structure" in soda.keys():
3730
+ # create folder structure
3731
+ def recursive_create_mock_folder_structure(my_folder, my_folderpath):
3732
+ if "folders" in my_folder.keys():
3733
+ for folder_key, folder in my_folder["folders"].items():
3734
+ folderpath = join(my_folderpath, folder_key)
3735
+ if not isdir(folderpath):
3736
+ mkdir(folderpath)
3737
+ recursive_create_mock_folder_structure(folder, folderpath)
3738
+
3739
+ if "files" in my_folder.keys():
3740
+ for file_key, file in my_folder["files"].items():
3741
+ if "deleted" not in file["action"]:
3742
+ open(join(my_folderpath, file_key), "a").close()
3743
+
3744
+ dataset_structure = soda["dataset-structure"]
3745
+ folderpath = preview_path
3746
+ recursive_create_mock_folder_structure(dataset_structure, folderpath)
3747
+
3748
+ if "manifest-files" in soda.keys() and "folders" in dataset_structure.keys():
3749
+ for folder_key, folder in dataset_structure["folders"].items():
3750
+ manifest_path = join(preview_path, folder_key, "manifest.xlsx")
3751
+ if not isfile(manifest_path):
3752
+ open(manifest_path, "a").close()
3753
+
3754
+ if "metadata-files" in soda.keys():
3755
+ for metadata_key in soda["metadata-files"].keys():
3756
+ open(join(preview_path, metadata_key), "a").close()
3757
+
3758
+ if len(listdir(preview_path)) > 0:
3759
+ folder_in_preview = listdir(preview_path)[0]
3760
+ open_file(join(preview_path, folder_in_preview))
3761
+ else:
3762
+ open_file(preview_path)
3763
+
3764
+ return preview_path
3765
+
3766
+ except Exception as e:
3767
+ raise e
3768
+
3769
+
3770
+ def generate_manifest_file_locally(generate_purpose, soda):
3771
+ """
3772
+ Function to generate manifest files locally
3773
+ """
3774
+
3775
+
3776
+ global manifest_folder_path
3777
+
3778
+ def recursive_item_path_create(folder, path):
3779
+ if "files" in folder.keys():
3780
+ for item in list(folder["files"]):
3781
+ if "folderpath" not in folder["files"][item]:
3782
+ folder["files"][item]["folderpath"] = path[:]
3783
+
3784
+ if "folders" in folder.keys():
3785
+ for item in list(folder["folders"]):
3786
+ if "folderpath" not in folder["folders"][item]:
3787
+ folder["folders"][item]["folderpath"] = path[:]
3788
+ folder["folders"][item]["folderpath"].append(item)
3789
+ recursive_item_path_create(
3790
+ folder["folders"][item], folder["folders"][item]["folderpath"][:]
3791
+ )
3792
+
3793
+ return
3794
+
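+ # After recursive_item_path_create runs, each entry carries a "folderpath"
+ # list naming its parent folders; a small illustrative check (names are
+ # placeholders):
+ def _example_folderpaths():
+     structure = {
+         "files": {},
+         "folders": {"primary": {"folders": {}, "files": {"readings.csv": {}}}},
+     }
+     recursive_item_path_create(structure, [])
+     assert structure["folders"]["primary"]["folderpath"] == ["primary"]
+     assert structure["folders"]["primary"]["files"]["readings.csv"]["folderpath"] == ["primary"]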
3795
+ def copytree(src, dst, symlinks=False, ignore=None):
3796
+ for item in os.listdir(src):
3797
+ s = os.path.join(src, item)
3798
+ d = os.path.join(dst, item)
3799
+ if os.path.isdir(s):
3800
+ if os.path.exists(d):
3801
+ shutil.rmtree(d)
3802
+ shutil.copytree(s, d, symlinks, ignore)
3803
+ else:
3804
+ shutil.copy2(s, d)
3805
+
3806
+ dataset_structure = soda["dataset-structure"]
3807
+ manifest_destination = soda["manifest-files"]["local-destination"]
3808
+
3809
+ recursive_item_path_create(dataset_structure, [])
3810
+ create_high_lvl_manifest_files_existing_ps_starting_point(soda, manifest_folder_path)
3811
+
3812
+ if generate_purpose == "edit-manifest":
3813
+ manifest_destination = os.path.join(manifest_destination, "manifest_file")
3814
+
3815
+ else:
3816
+ manifest_destination = return_new_path(
3817
+ os.path.join(manifest_destination, "manifest_file")
3818
+ )
3819
+
3820
+ copytree(manifest_folder_path, manifest_destination)
3821
+
3822
+
3823
+
3824
+ if generate_purpose == "edit-manifest":
3825
+ return {"success_message_or_manifest_destination": manifest_destination}
3826
+
3827
+ open_file(manifest_destination)
3828
+ return {"success_message_or_manifest_destination": "success"}
3829
+
3830
+
3831
+
3832
+ def generate_manifest_file_data(dataset_structure):
3833
+ # Define common file extensions with special handling
3834
+ double_extensions = {
3835
+ ".ome.tiff", ".ome.tif", ".ome.tf2", ".ome.tf8", ".ome.btf", ".ome.xml",
3836
+ ".brukertiff.gz", ".mefd.gz", ".moberg.gz", ".nii.gz", ".mgh.gz", ".tar.gz", ".bcl.gz"
3837
+ }
3838
+
3839
+ # Helper function: Get the complete file extension
3840
+ def get_file_extension(filename):
3841
+ for ext in double_extensions:
3842
+ if filename.endswith(ext):
3843
+ # the entry already holds the complete multi-part extension (e.g. ".ome.tiff")
+ return ext
3845
+ return os.path.splitext(filename)[1]
3846
+
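+ # Expected behaviour of get_file_extension for a few illustrative names
+ # (given the double-extension entries defined above):
+ def _example_file_extensions():
+     assert get_file_extension("scan.ome.tiff") == ".ome.tiff"
+     assert get_file_extension("volume.nii.gz") == ".nii.gz"
+     assert get_file_extension("readings.csv") == ".csv"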
3847
+ def create_folder_entry(folder_name, path_parts):
3848
+ full_path = "/".join(path_parts + [folder_name]) + "/"
3849
+ entry = [
3850
+ full_path.lstrip("/"), # Remove leading slash for consistency
3851
+ "", # Timestamp
3852
+ "", # Description
3853
+ "folder", # File type
3854
+ "", # Entity (empty)
3855
+ "", # Data modality (empty)
3856
+ "", # Also in dataset (empty)
3857
+ "", # Data dictionary path (empty)
3858
+ "", # Entity is transitive (empty)
3859
+ "", # Additional Metadata
3860
+ ]
3861
+ return entry
3862
+
3863
+
3864
+
3865
+ # Helper function: Build a single manifest entry
3866
+ def create_file_entry(item, folder, path_parts, timestamp, filename):
3867
+ full_path = "/".join(path_parts + [filename])
3868
+ file_info = folder["files"][item]
3869
+
3870
+ entry = [
3871
+ full_path.lstrip("/"), # Remove leading slash for consistency
3872
+ timestamp, # Timestamp
3873
+ file_info["description"], # Description
3874
+ get_file_extension(filename), # File type
3875
+ "", # Entity (empty)
3876
+ "", # Data modality (empty)
3877
+ "", # Also in dataset (empty)
3878
+ "", # Data dictionary path (empty)
3879
+ "", # Entity is transitive (empty)
3880
+ file_info.get("additional-metadata", "") # Additional Metadata
3881
+ ]
3882
+
3883
+ # Add any extra columns dynamically
3884
+ if "extra_columns" in file_info:
3885
+ for key, value in file_info["extra_columns"].items():
3886
+ entry.append(value)
3887
+ if key not in header_row:
3888
+ header_row.append(key)
3889
+
3890
+ return entry
3891
+
3892
+ # Recursive function: Traverse dataset and collect file data
3893
+ def traverse_folders(folder, path_parts):
3894
+ # Add header row if processing files for the first time
3895
+ if not manifest_data:
3896
+ manifest_data.append(header_row)
3897
+
3898
+ if "files" in folder:
3899
+ for item, file_info in folder["files"].items():
3900
+
3901
+ if "path" in file_info:
3902
+ file_path = file_info["path"]
3903
+ elif "pspath" in file_info:
3904
+ file_path = file_info["pspath"]
3905
+ else:
3906
+ continue
3907
+
3908
+ # If the file is a manifest file, skip it
3909
+ if item in {"manifest.xlsx", "manifest.csv"}:
3910
+ continue
3911
+
3912
+ # Determine timestamp
3913
+ filename = os.path.basename(file_path.replace("\\", "/"))
3914
+ if file_info["location"] == "ps":
3915
+ timestamp = file_info["timestamp"]
3916
+ else:
3917
+ local_path = pathlib.Path(file_info["path"])
3918
+ timestamp = datetime.fromtimestamp(
3919
+ local_path.stat().st_mtime, tz=local_timezone
3920
+ ).isoformat().replace(".", ",").replace("+00:00", "Z")
3921
+
3922
+ # Add file entry
3923
+ manifest_data.append(create_file_entry(item, folder, path_parts, timestamp, filename))
3924
+
3925
+ if "folders" in folder:
3926
+ for subfolder_name, subfolder in folder["folders"].items():
3927
+ # Add folder entry
3928
+ manifest_data.append(create_folder_entry(subfolder_name, path_parts))
3929
+ traverse_folders(subfolder, path_parts + [subfolder_name])
3930
+
3931
+ # Initialize variables
3932
+ manifest_data = [] # Collects all rows for the manifest
3933
+ # TODO: Update to SDS 3.0
3934
+ header_row = [
3935
+ "filename", "timestamp", "description", "file type", "entity",
3936
+ "data modality", "also in dataset", "data dictionary path",
3937
+ "entity is transitive", "Additional Metadata"
3938
+ ]
3939
+ local_timezone = TZLOCAL()
3940
+
3941
3942
+
3943
+ # Start recursive traversal from the root
3944
+ traverse_folders(dataset_structure, [])
3945
+
3946
+ return manifest_data
3947
+
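+ # A minimal end-to-end sketch of the manifest builder above; every value in
+ # the structure below is a placeholder.
+ def _example_manifest_rows():
+     dataset_structure = {
+         "files": {},
+         "folders": {
+             "primary": {
+                 "folders": {},
+                 "files": {
+                     "readings.csv": {
+                         "path": "primary/readings.csv",
+                         "location": "ps",
+                         "timestamp": "2024-01-01T00:00:00,000Z",
+                         "description": "example readings",
+                     },
+                 },
+             },
+         },
+     }
+     rows = generate_manifest_file_data(dataset_structure)
+     # rows[0] is the header row, rows[1] the "primary/" folder entry,
+     # rows[2] the file entry for "primary/readings.csv"
+     return rows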
3948
+
3949
+
3950
+
3951
+