pysodafair 0.1.62__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysoda/__init__.py +0 -0
- pysoda/constants.py +3 -0
- pysoda/core/__init__.py +10 -0
- pysoda/core/dataset_generation/__init__.py +11 -0
- pysoda/core/dataset_generation/manifestSession/__init__.py +1 -0
- pysoda/core/dataset_generation/manifestSession/manifest_session.py +146 -0
- pysoda/core/dataset_generation/upload.py +3951 -0
- pysoda/core/dataset_importing/__init__.py +1 -0
- pysoda/core/dataset_importing/import_dataset.py +662 -0
- pysoda/core/metadata/__init__.py +20 -0
- pysoda/core/metadata/code_description.py +109 -0
- pysoda/core/metadata/constants.py +32 -0
- pysoda/core/metadata/dataset_description.py +188 -0
- pysoda/core/metadata/excel_utils.py +41 -0
- pysoda/core/metadata/helpers.py +250 -0
- pysoda/core/metadata/manifest.py +112 -0
- pysoda/core/metadata/manifest_package/__init__.py +2 -0
- pysoda/core/metadata/manifest_package/manifest.py +0 -0
- pysoda/core/metadata/manifest_package/manifest_import.py +29 -0
- pysoda/core/metadata/manifest_package/manifest_writer.py +666 -0
- pysoda/core/metadata/performances.py +46 -0
- pysoda/core/metadata/resources.py +53 -0
- pysoda/core/metadata/samples.py +184 -0
- pysoda/core/metadata/sites.py +51 -0
- pysoda/core/metadata/subjects.py +172 -0
- pysoda/core/metadata/submission.py +91 -0
- pysoda/core/metadata/text_metadata.py +47 -0
- pysoda/core/metadata_templates/CHANGES +1 -0
- pysoda/core/metadata_templates/LICENSE +1 -0
- pysoda/core/metadata_templates/README.md +4 -0
- pysoda/core/metadata_templates/__init__.py +0 -0
- pysoda/core/metadata_templates/code_description.xlsx +0 -0
- pysoda/core/metadata_templates/code_parameters.xlsx +0 -0
- pysoda/core/metadata_templates/dataset_description.xlsx +0 -0
- pysoda/core/metadata_templates/manifest.xlsx +0 -0
- pysoda/core/metadata_templates/performances.xlsx +0 -0
- pysoda/core/metadata_templates/resources.xlsx +0 -0
- pysoda/core/metadata_templates/samples.xlsx +0 -0
- pysoda/core/metadata_templates/sites.xlsx +0 -0
- pysoda/core/metadata_templates/subjects.xlsx +0 -0
- pysoda/core/metadata_templates/subjects_pools_samples_structure.xlsx +0 -0
- pysoda/core/metadata_templates/subjects_pools_samples_structure_example.xlsx +0 -0
- pysoda/core/metadata_templates/submission.xlsx +0 -0
- pysoda/core/permissions/__init__.py +1 -0
- pysoda/core/permissions/permissions.py +31 -0
- pysoda/core/pysoda/__init__.py +2 -0
- pysoda/core/pysoda/soda.py +34 -0
- pysoda/core/pysoda/soda_object.py +55 -0
- pysoda/core/upload_manifests/__init__.py +1 -0
- pysoda/core/upload_manifests/upload_manifests.py +37 -0
- pysoda/schema/__init__.py +0 -0
- pysoda/schema/code_description.json +629 -0
- pysoda/schema/dataset_description.json +295 -0
- pysoda/schema/manifest.json +60 -0
- pysoda/schema/performances.json +44 -0
- pysoda/schema/resources.json +39 -0
- pysoda/schema/samples.json +97 -0
- pysoda/schema/sites.json +38 -0
- pysoda/schema/soda_schema.json +664 -0
- pysoda/schema/subjects.json +131 -0
- pysoda/schema/submission_schema.json +28 -0
- pysoda/utils/__init__.py +9 -0
- pysoda/utils/authentication.py +381 -0
- pysoda/utils/config.py +68 -0
- pysoda/utils/exceptions.py +156 -0
- pysoda/utils/logger.py +6 -0
- pysoda/utils/metadata_utils.py +74 -0
- pysoda/utils/pennsieveAgentUtils.py +11 -0
- pysoda/utils/pennsieveUtils.py +118 -0
- pysoda/utils/profile.py +28 -0
- pysoda/utils/schema_validation.py +133 -0
- pysoda/utils/time_utils.py +5 -0
- pysoda/utils/upload_utils.py +108 -0
- pysodafair-0.1.62.dist-info/METADATA +190 -0
- pysodafair-0.1.62.dist-info/RECORD +77 -0
- pysodafair-0.1.62.dist-info/WHEEL +4 -0
- pysodafair-0.1.62.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,3951 @@
from ...utils import (
    generate_options_set, generating_locally, generating_on_ps,
    uploading_with_ps_account, uploading_to_existing_ps_dataset,
    can_resume_prior_upload, virtual_dataset_empty, PropertyNotSetError,
    connect_pennsieve_client, get_dataset_id, get_access_token,
    PennsieveActionNoPermission, PennsieveDatasetCannotBeFound,
    EmptyDatasetError, LocalDatasetMissingSpecifiedFiles,
    PennsieveUploadException, create_request_headers, check_forbidden_characters_ps, get_users_dataset_list,
    PennsieveDatasetNameInvalid, PennsieveDatasetNameTaken, PennsieveAccountInvalid, TZLOCAL, GenerateOptionsNotSet,
    PennsieveDatasetFilesInvalid
)
from ..permissions import pennsieve_get_current_user_permissions
from os.path import isdir, isfile, getsize
from ..metadata import create_high_level_manifest_files, get_auto_generated_manifest_files, manifest, subjects, samples, code_description, dataset_description, performances, resources, sites, submission, text_metadata, METADATA_UPLOAD_PS_PATH, create_high_lvl_manifest_files_existing_ps_starting_point
from ..upload_manifests import get_upload_manifests
from .. import logger

main_curate_progress_message = ""
main_curate_status = ""

# -*- coding: utf-8 -*-

### Import required python modules
import platform
import os
from os import listdir, makedirs, mkdir, walk, rename
from os.path import (
    isdir,
    isfile,
    join,
    splitext,
    basename,
    exists,
    expanduser,
    dirname,
    getsize,
    abspath,
)
import pandas as pd
import time
from timeit import default_timer as timer
from datetime import timedelta
import shutil
import subprocess
import gevent
import pathlib
import requests
from datetime import datetime
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
# from utils import connect_pennsieve_client, get_dataset_id, create_request_headers, TZLOCAL, get_users_dataset_list
# from manifest import create_high_lvl_manifest_files_existing_ps_starting_point, create_high_level_manifest_files, get_auto_generated_manifest_files
# from errors import PennsieveUploadException
from .manifestSession import UploadManifestSession
from ...constants import PENNSIEVE_URL
from ..dataset_importing import import_pennsieve_dataset

# from pysodaUtils import (
#     check_forbidden_characters_ps
# )

# from organizeDatasets import import_pennsieve_dataset


### Global variables
curateprogress = " "
curatestatus = " "
curateprintstatus = " "
total_dataset_size = 1
curated_dataset_size = 0
start_time = 0
uploaded_folder_counter = 0
current_size_of_uploaded_files = 0
generated_dataset_id = None
# the pennsieve python client used for uploading dataset files
client = None

userpath = expanduser("~")
configpath = join(userpath, ".pennsieve", "config.ini")
submitdataprogress = " "
submitdatastatus = " "
submitprintstatus = " "
total_file_size = 1
uploaded_file_size = 0
start_time_bf_upload = 0
start_submit = 0
metadatapath = join(userpath, "SODA", "SODA_metadata")
ps_recognized_file_extensions = [
    ".cram",
    ".jp2",
    ".jpx",
    ".lsm",
    ".ndpi",
    ".nifti",
    ".oib",
    ".oif",
    ".roi",
    ".rtf",
    ".swc",
    ".abf",
    ".acq",
    ".adicht",
    ".adidat",
    ".aedt",
    ".afni",
    ".ai",
    ".avi",
    ".bam",
    ".bash",
    ".bcl",
    ".bcl.gz",
    ".bin",
    ".brik",
    ".brukertiff.gz",
    ".continuous",
    ".cpp",
    ".csv",
    ".curv",
    ".cxls",
    ".czi",
    ".data",
    ".dcm",
    ".df",
    ".dicom",
    ".doc",
    ".docx",
    ".e",
    ".edf",
    ".eps",
    ".events",
    ".fasta",
    ".fastq",
    ".fcs",
    ".feather",
    ".fig",
    ".gif",
    ".h4",
    ".h5",
    ".hdf4",
    ".hdf5",
    ".hdr",
    ".he2",
    ".he5",
    ".head",
    ".hoc",
    ".htm",
    ".html",
    ".ibw",
    ".img",
    ".ims",
    ".ipynb",
    ".jpeg",
    ".jpg",
    ".js",
    ".json",
    ".lay",
    ".lh",
    ".lif",
    ".m",
    ".mat",
    ".md",
    ".mef",
    ".mefd.gz",
    ".mex",
    ".mgf",
    ".mgh",
    ".mgh.gz",
    ".mgz",
    ".mnc",
    ".moberg.gz",
    ".mod",
    ".mov",
    ".mp4",
    ".mph",
    ".mpj",
    ".mtw",
    ".ncs",
    ".nd2",
    ".nev",
    ".nex",
    ".nex5",
    ".nf3",
    ".nii",
    ".nii.gz",
    ".ns1",
    ".ns2",
    ".ns3",
    ".ns4",
    ".ns5",
    ".ns6",
    ".nwb",
    ".ogg",
    ".ogv",
    ".ome.btf",
    ".ome.tif",
    ".ome.tif2",
    ".ome.tif8",
    ".ome.tiff",
    ".ome.xml",
    ".openephys",
    ".pdf",
    ".pgf",
    ".png",
    ".ppt",
    ".pptx",
    ".ps",
    ".pul",
    ".py",
    ".r",
    ".raw",
    ".rdata",
    ".rh",
    ".rhd",
    ".sh",
    ".sldasm",
    ".slddrw",
    ".smr",
    ".spikes",
    ".svg",
    ".svs",
    ".tab",
    ".tar",
    ".tar.gz",
    ".tcsh",
    ".tdm",
    ".tdms",
    ".text",
    ".tif",
    ".tiff",
    ".tsv",
    ".txt",
    ".vcf",
    ".webm",
    ".xlsx",
    ".xml",
    ".yaml",
    ".yml",
    ".zip",
    ".zsh",
]

myds = ""
initial_bfdataset_size = 0
upload_directly_to_bf = 0
initial_bfdataset_size_submit = 0
renaming_files_flow = False

total_files = 0  # the total number of files in a given dataset that need to be uploaded to Pennsieve
total_bytes_uploaded = 0  # current number of bytes uploaded to Pennsieve in the upload session
total_upload_size = 0  # total number of bytes to upload to Pennsieve in the upload session

forbidden_characters = '<>:"/\|?*'
forbidden_characters_bf = '\/:*?"<>'

# a global that tracks the amount of files that have been uploaded in an upload session;
# is reset once the session ends by success, or failure (is implicitly reset in case of Pennsieve Agent freeze by the user closing SODA)
main_curation_uploaded_files = 0

DEV_TEMPLATE_PATH = join(dirname(__file__), "..", "file_templates")

# once pysoda has been packaged with pyinstaller
# it becomes nested into the pysodadist/api directory
PROD_TEMPLATE_PATH = join(dirname(__file__), "..", "..", "file_templates")
TEMPLATE_PATH = DEV_TEMPLATE_PATH if exists(DEV_TEMPLATE_PATH) else PROD_TEMPLATE_PATH


ums = UploadManifestSession()

def open_file(file_path):
    """
    Opening folder on all platforms
    https://stackoverflow.com/questions/6631299/python-opening-a-folder-in-explorer-nautilus-mac-thingie

    Args:
        file_path: path of the folder (string)
    Action:
        Opens file explorer window to the given path
    """

    if platform.system() == "Windows":
        subprocess.Popen(f"explorer /select,{str(file_path)}")
    elif platform.system() == "Darwin":
        subprocess.Popen(["open", file_path])
    else:
        subprocess.Popen(["xdg-open", file_path])


def folder_size(path):
    """
    Provides the size of the folder indicated by path

    Args:
        path: path of the folder (string)
    Returns:
        total_size: total size of the folder in bytes (integer)
    """
    total_size = 0

    for path, dirs, files in walk(path):
        for f in files:
            fp = join(path, f)
            total_size += getsize(fp)
    return total_size


def path_size(path):
    """
    Returns size of the path, after checking if it's a folder or a file
    Args:
        path: path of the file/folder (string)
    Returns:
        total_size: total size of the file/folder in bytes (integer)
    """
    return folder_size(path) if isdir(path) else getsize(path)

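# Illustrative note (editor's sketch; the paths below are hypothetical, not from the package):
#   path_size("/data/dataset/primary")        # folder -> walks it via folder_size()
#   path_size("/data/dataset/primary/a.csv")  # file   -> getsize()
# Both calls return a byte count.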
def create_folder_level_manifest(jsonpath, jsondescription):
    """
    Function to create manifest files for each SPARC folder.
    Files are created in a temporary folder

    Args:
        datasetpath: path of the dataset (string)
        jsonpath: all paths in json format with key being SPARC folder names (dictionary)
        jsondescription: description associated with each path (dictionary)
    Action:
        Creates manifest files in xslx format for each SPARC folder
    """
    global total_dataset_size
    local_timezone = TZLOCAL()

    try:
        shutil.rmtree(metadatapath) if isdir(metadatapath) else 0
        makedirs(metadatapath)
        folders = list(jsonpath.keys())

        if "main" in folders:
            folders.remove("main")
        # In each SPARC folder, generate a manifest file
        for folder in folders:
            if jsonpath[folder] != []:
                # Initialize dataframe where manifest info will be stored
                df = pd.DataFrame(
                    columns=[
                        "filename",
                        "timestamp",
                        "description",
                        "file type",
                        "Additional Metadata",
                    ]
                )
                # Get list of files/folders in the the folder
                # Remove manifest file from the list if already exists
                folderpath = join(metadatapath, folder)
                allfiles = jsonpath[folder]
                alldescription = jsondescription[folder + "_description"]

                countpath = -1
                for pathname in allfiles:
                    countpath += 1
                    if basename(pathname) in ["manifest.csv", "manifest.xlsx"]:
                        allfiles.pop(countpath)
                        alldescription.pop(countpath)

                # Populate manifest dataframe
                filename, timestamp, filetype, filedescription = [], [], [], []
                countpath = -1
                for paths in allfiles:
                    if isdir(paths):
                        key = basename(paths)
                        alldescription.pop(0)
                        for subdir, dirs, files in os.walk(paths):
                            for file in files:
                                gevent.sleep(0)
                                filepath = pathlib.Path(paths) / subdir / file
                                mtime = filepath.stat().st_mtime
                                lastmodtime = datetime.fromtimestamp(mtime).astimezone(
                                    local_timezone
                                )
                                timestamp.append(
                                    lastmodtime.isoformat()
                                    .replace(".", ",")
                                    .replace("+00:00", "Z")
                                )
                                full_filename = filepath.name

                                if folder == "main":  # if file in main folder
                                    filename.append(
                                        full_filename
                                    ) if folder == "" else filename.append(
                                        join(folder, full_filename)
                                    )
                                else:
                                    subdirname = os.path.relpath(
                                        subdir, paths
                                    )  # gives relative path of the directory of the file w.r.t paths
                                    if subdirname == ".":
                                        filename.append(join(key, full_filename))
                                    else:
                                        filename.append(
                                            join(key, subdirname, full_filename)
                                        )

                                fileextension = splitext(full_filename)[1]
                                if (
                                    not fileextension
                                ):  # if empty (happens e.g. with Readme files)
                                    fileextension = "None"
                                filetype.append(fileextension)
                                filedescription.append("")
                    else:
                        gevent.sleep(0)
                        countpath += 1
                        filepath = pathlib.Path(paths)
                        file = filepath.name
                        filename.append(file)
                        mtime = filepath.stat().st_mtime
                        lastmodtime = datetime.fromtimestamp(mtime).astimezone(
                            local_timezone
                        )
                        timestamp.append(
                            lastmodtime.isoformat()
                            .replace(".", ",")
                            .replace("+00:00", "Z")
                        )
                        filedescription.append(alldescription[countpath])
                        if isdir(paths):
                            filetype.append("folder")
                        else:
                            fileextension = splitext(file)[1]
                            if (
                                not fileextension
                            ):  # if empty (happens e.g. with Readme files)
                                fileextension = "None"
                            filetype.append(fileextension)

                df["filename"] = filename
                df["timestamp"] = timestamp
                df["file type"] = filetype
                df["description"] = filedescription

                makedirs(folderpath)
                # Save manifest as Excel sheet
                manifestfile = join(folderpath, "manifest.xlsx")
                df.to_excel(manifestfile, index=None, header=True)
                wb = load_workbook(manifestfile)
                ws = wb.active

                blueFill = PatternFill(
                    start_color="9DC3E6", fill_type="solid"
                )
                greenFill = PatternFill(
                    start_color="A8D08D", fill_type="solid"
                )
                yellowFill = PatternFill(
                    start_color="FFD965", fill_type="solid"
                )
                ws['A1'].fill = blueFill
                ws['B1'].fill = greenFill
                ws['C1'].fill = greenFill
                ws['D1'].fill = greenFill
                ws['E1'].fill = yellowFill

                wb.save(manifestfile)
                total_dataset_size += path_size(manifestfile)
                jsonpath[folder].append(manifestfile)

        return jsonpath

    except Exception as e:
        raise e

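# Illustrative input shape for create_folder_level_manifest() (a hedged sketch;
# the folder keys and paths below are hypothetical examples, not from the package):
#   jsonpath        = {"primary": ["/local/sub-1", "/local/readme.txt"], "main": []}
#   jsondescription = {"primary_description": ["", "study notes"], "main_description": []}
# Each SPARC folder key gets a manifest.xlsx written under metadatapath/<folder>.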
def return_new_path(topath):
    """
    This function checks if a folder already exists and in such cases,
    appends (1) or (2) etc. to the folder name

    Args:
        topath: path where the folder is supposed to be created (string)
    Returns:
        topath: new folder name based on the availability in destination folder (string)
    """

    if not exists(topath):
        return topath

    i = 1
    while True:
        if not exists(topath + " (" + str(i) + ")"):
            return topath + " (" + str(i) + ")"
        i += 1


def return_new_path_replace(topath):
    """
    This function checks if a folder already exists and in such cases,
    replace the existing folder (this is the opposite situation to the function return_new_path)

    Args:
        topath: path where the folder is supposed to be created (string)
    Returns:
        topath: new folder name based on the availability in destination folder (string)
    """

    if not exists(topath):
        return topath
    i = 1
    while True:
        if not exists(topath + " (" + str(i) + ")"):
            return topath + " (" + str(i) + ")"
        i += 1


def time_format(elapsed_time):
    mins, secs = divmod(elapsed_time, 60)
    hours, mins = divmod(mins, 60)
    return "%dh:%02dmin:%02ds" % (hours, mins, secs)

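# Illustrative behavior of return_new_path() (assumed example paths):
#   return_new_path("/data/out")  ->  "/data/out"      if the folder does not exist yet
#   return_new_path("/data/out")  ->  "/data/out (1)"  if "/data/out" already exists
# Note that return_new_path_replace() currently has the same body, so it also
# appends a counter rather than replacing the existing folder.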
def mycopyfileobj(fsrc, fdst, length=16 * 1024 * 16):
    """
    Helper function to copy file

    Args:
        fsrc: source file opened in python (file-like object)
        fdst: destination file accessed in python (file-like object)
        length: copied buffer size in bytes (integer)
    """
    global curateprogress
    global total_dataset_size
    global curated_dataset_size
    global main_generated_dataset_size

    while True:
        buf = fsrc.read(length)
        if not buf:
            break
        gevent.sleep(0)
        fdst.write(buf)
        curated_dataset_size += len(buf)
        main_generated_dataset_size += len(buf)


def mycopyfile_with_metadata(src, dst, *, follow_symlinks=True):
    """
    Copy file src to dst with metadata (timestamp, permission, etc.) conserved

    Args:
        src: source file (string)
        dst: destination file (string)
    Returns:
        dst
    """
    if not follow_symlinks and os.path.islink(src):
        os.symlink(os.readlink(src), dst)
    else:
        with open(src, "rb") as fsrc:
            with open(dst, "wb") as fdst:
                mycopyfileobj(fsrc, fdst)
    shutil.copystat(src, dst)
    return dst

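# Illustrative usage of the copy helpers above (hypothetical paths):
#   mycopyfile_with_metadata("/src/a.nii.gz", "/dst/a.nii.gz")
# copies the bytes in 256 KiB chunks via mycopyfileobj(), yields to gevent
# between chunks, then mirrors the source file's metadata with shutil.copystat().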
def check_empty_files_folders(soda):
    """
    Function to check for empty files and folders

    Args:
        soda: soda dict with information about all specified files and folders
    Output:
        error: error message with list of non valid local data files, if any
    """
    try:
        def recursive_empty_files_check(my_folder, my_relative_path, error_files):
            for folder_key, folder in my_folder["folders"].items():
                relative_path = my_relative_path + "/" + folder_key
                error_files = recursive_empty_files_check(
                    folder, relative_path, error_files
                )

            for file_key in list(my_folder["files"].keys()):
                file = my_folder["files"][file_key]
                file_type = file.get("location")
                if file_type == "local":
                    file_path = file["path"]
                    if isfile(file_path):
                        file_size = getsize(file_path)
                        if file_size == 0:
                            del my_folder["files"][file_key]
                            relative_path = my_relative_path + "/" + file_key
                            error_message = relative_path + " (path: " + file_path + ")"
                            error_files.append(error_message)

            return error_files

        def recursive_empty_local_folders_check(
            my_folder,
            my_folder_key,
            my_folders_content,
            my_relative_path,
            error_folders,
        ):
            folders_content = my_folder["folders"]
            for folder_key in list(my_folder["folders"].keys()):
                folder = my_folder["folders"][folder_key]
                relative_path = my_relative_path + "/" + folder_key
                error_folders = recursive_empty_local_folders_check(
                    folder, folder_key, folders_content, relative_path, error_folders
                )

            if not my_folder["folders"] and not my_folder["files"]:
                ignore = False
                if "location" in my_folder and my_folder.get("location") == "ps":
                    ignore = True
                if not ignore:
                    error_message = my_relative_path
                    error_folders.append(error_message)
                    del my_folders_content[my_folder_key]
            return error_folders

        error_files = []
        error_folders = []
        if "dataset-structure" in soda.keys():
            dataset_structure = soda["dataset-structure"]
            if "folders" in dataset_structure:
                for folder_key, folder in dataset_structure["folders"].items():
                    relative_path = folder_key
                    error_files = recursive_empty_files_check(
                        folder, relative_path, error_files
                    )

                folders_content = dataset_structure["folders"]
                for folder_key in list(dataset_structure["folders"].keys()):
                    folder = dataset_structure["folders"][folder_key]
                    relative_path = folder_key
                    error_folders = recursive_empty_local_folders_check(
                        folder,
                        folder_key,
                        folders_content,
                        relative_path,
                        error_folders,
                    )

        if "metadata-files" in soda.keys():
            metadata_files = soda["metadata-files"]
            for file_key in list(metadata_files.keys()):
                file = metadata_files[file_key]
                file_type = file.get("location")
                if file_type == "local":
                    file_path = file["path"]
                    if isfile(file_path):
                        file_size = getsize(file_path)
                        if file_size == 0:
                            del metadata_files[file_key]
                            error_message = file_key + " (path: " + file_path + ")"
                            error_files.append(error_message)
            if not metadata_files:
                del soda["metadata-files"]

        if len(error_files) > 0:
            error_message = [
                "The following local file(s) is/are empty (0 kb) and will be ignored."
            ]
            error_files = error_message + [] + error_files

        if len(error_folders) > 0:
            error_message = [
                "The SPARC dataset structure does not allow empty folders. The following empty folders will be removed from your dataset:"
            ]
            error_folders = error_message + [] + error_folders

        return {
            "empty_files": error_files,
            "empty_folders": error_folders,
            "soda": soda
        }

    except Exception as e:
        raise e

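# Illustrative return value of check_empty_files_folders() (a sketch; the listed
# paths are hypothetical):
#   {
#       "empty_files":   ["The following local file(s) is/are empty (0 kb) and will be ignored.",
#                         "primary/empty.csv (path: /local/empty.csv)"],
#       "empty_folders": ["The SPARC dataset structure does not allow empty folders. ...",
#                         "primary/empty_dir"],
#       "soda": {...}  # the pruned soda dict
#   }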
def check_local_dataset_files_validity(soda):
    """
    Function to check that the local data files and folders specified in the dataset are valid

    Args:
        soda: soda dict with information about all specified files and folders
    Output:
        error: error message with list of non valid local data files, if any
    """

    def recursive_local_file_check(my_folder, my_relative_path, error):
        for folder_key, folder in my_folder["folders"].items():
            relative_path = my_relative_path + "/" + folder_key
            error = recursive_local_file_check(folder, relative_path, error)

        for file_key in list(my_folder["files"].keys()):
            file = my_folder["files"][file_key]
            if file_key in ["manifest.xlsx", "manifest.csv"]:
                continue
            file_type = file.get("location")
            if file_type == "local":
                file_path = file["path"]
                if file.get("location") == "ps":
                    continue
                if not isfile(file_path):
                    relative_path = my_relative_path + "/" + file_key
                    error_message = relative_path + " (path: " + file_path + ")"
                    error.append(error_message)
                else:
                    file_size = getsize(file_path)
                    if file_size == 0:
                        del my_folder["files"][file_key]

        return error

    def recursive_empty_local_folder_remove(
        my_folder, my_folder_key, my_folders_content
    ):

        folders_content = my_folder["folders"]
        for folder_key in list(my_folder["folders"].keys()):
            folder = my_folder["folders"][folder_key]
            recursive_empty_local_folder_remove(folder, folder_key, folders_content)

        if not my_folder.get("folders") and not my_folder.get("files") and my_folder.get("location") != "ps":
            del my_folders_content[my_folder_key]

    error = []
    if "dataset-structure" in soda.keys():
        dataset_structure = soda["dataset-structure"]
        # Remove 0kb files, files that can't be found, and any empty folders from the dataset data files
        if "folders" in dataset_structure:
            for folder_key, folder in dataset_structure["folders"].items():
                relative_path = folder_key
                error = recursive_local_file_check(folder, relative_path, error)

            folders_content = dataset_structure["folders"]
            for folder_key in list(dataset_structure["folders"].keys()):
                folder = dataset_structure["folders"][folder_key]
                recursive_empty_local_folder_remove(folder, folder_key, folders_content)

    # Return list of all the files that were not found.
    if len(error) > 0:
        error_message = [
            "Error: The following local files were not found. Specify them again or remove them."
        ]
        error = error_message + error

    return error


# path to local SODA folder for saving manifest files
manifest_sparc = ["manifest.xlsx", "manifest.csv"]
manifest_folder_path = join(userpath, ".pysoda", "manifest_file")

def check_json_size(jsonStructure):
    """
    This function is called to check size of files that will be created locally on a user's device.
    """
    global total_dataset_size
    total_dataset_size = 0

    try:
        def recursive_dataset_scan(folder):
            global total_dataset_size

            if "files" in folder.keys():
                for file_key, file in folder["files"].items():
                    if "deleted" not in file["action"]:
                        file_type = file.get("location")
                        if file_type == "local":
                            file_path = file["path"]
                            if isfile(file_path):
                                total_dataset_size += getsize(file_path)

            if "folders" in folder.keys():
                for folder_key, folder in folder["folders"].items():
                    recursive_dataset_scan(folder)

        # scan dataset structure
        dataset_structure = jsonStructure["dataset-structure"]
        folderSection = dataset_structure["folders"]
        # gets keys like code, primary, source and their content...
        for keys, contents in folderSection.items():
            recursive_dataset_scan(contents)

        if "metadata-files" in jsonStructure.keys():
            metadata_files = jsonStructure["metadata-files"]
            for file_key, file in metadata_files.items():
                if file.get("location") == "local":
                    metadata_path = file["path"]
                    if isfile(metadata_path) and "new" in file["action"]:
                        total_dataset_size += getsize(metadata_path)

        if "manifest-files" in jsonStructure.keys():
            manifest_files_structure = create_high_level_manifest_files(jsonStructure, manifest_folder_path)
            for key in manifest_files_structure.keys():
                manifestpath = manifest_files_structure[key]
                if isfile(manifestpath):
                    total_dataset_size += getsize(manifestpath)

        # returns in bytes
        return {"dataset_size": total_dataset_size}
    except Exception as e:
        raise e

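# Illustrative result of check_json_size() (hypothetical number): it returns the
# summed byte count of local files, new metadata files, and generated manifests,
# e.g. {"dataset_size": 10485760} for roughly 10 MiB of content.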
def generate_dataset_locally(soda):
    global logger
    logger.info("starting generate_dataset_locally")

    # Vars used for tracking progress on the frontend
    global main_curate_progress_message
    global progress_percentage
    global main_total_generate_dataset_size
    global start_generate
    global main_curation_uploaded_files

    main_curation_uploaded_files = 0

    def recursive_dataset_scan(
        my_folder, my_folderpath, list_copy_files, list_move_files
    ):
        global main_total_generate_dataset_size

        if "folders" in my_folder.keys():
            for folder_key, folder in my_folder["folders"].items():
                folderpath = join(my_folderpath, folder_key)
                if not isdir(folderpath):
                    mkdir(folderpath)
                list_copy_files, list_move_files = recursive_dataset_scan(
                    folder, folderpath, list_copy_files, list_move_files
                )

        if "files" in my_folder.keys():
            for file_key, file in my_folder["files"].items():
                if "deleted" not in file["action"]:
                    file_type = file.get("location")
                    if file_type == "local":
                        file_path = file["path"]
                        if isfile(file_path):
                            destination_path = abspath(
                                join(my_folderpath, file_key)
                            )
                            if not isfile(destination_path):
                                if (
                                    "existing" in file["action"]
                                    and soda["generate-dataset"][
                                        "if-existing"
                                    ]
                                    == "merge"
                                ):
                                    list_move_files.append(
                                        [file_path, destination_path]
                                    )
                                else:
                                    main_total_generate_dataset_size += getsize(
                                        file_path
                                    )
                                    list_copy_files.append(
                                        [file_path, destination_path]
                                    )
                        else:
                            logger.info(f"file_path {file_path} does not exist. Skipping.")
        return list_copy_files, list_move_files

    logger.info("generate_dataset_locally step 1")
    # 1. Create new folder for dataset or use existing merge with existing or create new dataset
    main_curate_progress_message = "Generating folder structure and list of files to be included in the dataset"
    dataset_absolute_path = soda["generate-dataset"]["path"]
    if_existing = soda["generate-dataset"]["if-existing"]
    dataset_name = soda["generate-dataset"]["dataset-name"]
    datasetpath = join(dataset_absolute_path, dataset_name)
    datasetpath = return_new_path(datasetpath)
    mkdir(datasetpath)

    logger.info("generate_dataset_locally step 2")
    # 2. Scan the dataset structure and:
    # 2.1. Create all folders (with new name if renamed)
    # 2.2. Compile a list of files to be copied and a list of files to be moved (with new name recorded if renamed)
    list_copy_files = []
    list_move_files = []
    dataset_structure = soda["dataset-structure"]

    for folder_key, folder in dataset_structure["folders"].items():
        folderpath = join(datasetpath, folder_key)
        mkdir(folderpath)
        list_copy_files, list_move_files = recursive_dataset_scan(
            folder, folderpath, list_copy_files, list_move_files
        )

    # 3. Add high-level metadata files in the list
    if "dataset_metadata" in soda.keys():
        logger.info("generate_dataset_locally (optional) step 3 handling metadata-files")
        metadata_files = soda["dataset_metadata"]
        # log the metadata files that will be created
        for file_key, _ in metadata_files.items():
            if file_key == "subjects":
                subjects.create_excel(soda, False, join(datasetpath, "subjects.xlsx"))
            elif file_key == "samples":
                samples.create_excel(soda, False, join(datasetpath, "samples.xlsx"))
            elif file_key == "code_description":
                code_description.create_excel(soda, False, join(datasetpath, "code_description.xlsx"))
            elif file_key == "dataset_description":
                dataset_description.create_excel(soda, False, join(datasetpath, "dataset_description.xlsx"))
            elif file_key == "performances":
                performances.create_excel(soda, False, join(datasetpath, "performances.xlsx"))
            elif file_key == "resources":
                resources.create_excel(soda, False, join(datasetpath, "resources.xlsx"))
            elif file_key == "sites":
                sites.create_excel(soda, False, join(datasetpath, "sites.xlsx"))
            elif file_key == "submission":
                submission.create_excel(soda, False, join(datasetpath, "submission.xlsx"))
            elif file_key == "README.md":
                text_metadata.create_text_file(soda, False, join(datasetpath, "README.md"), "README.md")
            elif file_key == "CHANGES":
                text_metadata.create_text_file(soda, False, join(datasetpath, "CHANGES"), "CHANGES")
            elif file_key == "LICENSE":
                text_metadata.create_text_file(soda, False, join(datasetpath, "LICENSE"), "LICENSE")

        # 4. Add manifest files in the list
        if "manifest_file" in soda["dataset_metadata"].keys():
            logger.info("generate_dataset_locally (optional) step 4 handling manifest-files")
            main_curate_progress_message = "Preparing manifest files"
            manifest.create_excel(soda, False, join(datasetpath, "manifest.xlsx"))

    logger.info("generate_dataset_locally step 5 moving files to new location")
    # 5. Move files to new location
    main_curate_progress_message = "Moving files to new location"
    for fileinfo in list_move_files:
        srcfile = fileinfo[0]
        distfile = fileinfo[1]
        main_curate_progress_message = f"Moving file {str(srcfile)} to {str(distfile)}"
        shutil.move(srcfile, distfile)

    logger.info("generate_dataset_locally step 6 copying files to new location")
    # 6. Copy files to new location
    main_curate_progress_message = "Copying files to new location"
    start_generate = 1
    for fileinfo in list_copy_files:
        srcfile = fileinfo[0]
        distfile = fileinfo[1]
        main_curate_progress_message = f"Copying file {str(srcfile)} to {str(distfile)}"
        # track amount of copied files for loggin purposes
        mycopyfile_with_metadata(srcfile, distfile)
        main_curation_uploaded_files += 1

    logger.info("generate_dataset_locally step 7")
    # 7. Delete manifest folder and original folder if merge requested and rename new folder
    shutil.rmtree(manifest_folder_path) if isdir(manifest_folder_path) else 0
    if if_existing == "merge":
        logger.info("generate_dataset_locally (optional) step 7.1 delete manifest folder if merge requested")
        main_curate_progress_message = "Finalizing dataset"
        original_dataset_path = join(dataset_absolute_path, dataset_name)
        shutil.rmtree(original_dataset_path)
        rename(datasetpath, original_dataset_path)
        open_file(join(dataset_absolute_path, original_dataset_path))
    else:
        open_file(join(dataset_absolute_path, datasetpath))
    return datasetpath, main_total_generate_dataset_size

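# Illustrative "generate-dataset" options consumed above and by the Pennsieve
# manifest builder below (a minimal sketch; the values are hypothetical):
#   soda["generate-dataset"] = {
#       "path": "/home/user/Documents",
#       "dataset-name": "my-sparc-dataset",
#       "if-existing": "merge",        # any other value creates a fresh folder
#       "if-existing-files": "skip",   # or "replace"
#   }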
def ps_create_new_dataset(datasetname, ps):
    """
    Args:
        datasetname: name of the dataset to be created (string)
        bf: Pennsieve account object
    Action:
        Creates dataset for the account specified
    """
    try:
        error, count = "", 0
        datasetname = datasetname.strip()

        if check_forbidden_characters_ps(datasetname):
            error = (
                f"{error}Error: A Pennsieve dataset name cannot contain any of the following characters: "
                + forbidden_characters_bf
                + "<br>"
            )
            count += 1

        if not datasetname:
            error = f"{error}Error: Please enter valid dataset name<br>"
            count += 1

        if datasetname.isspace():
            error = error + "Error: Please enter valid dataset name" + "<br>"
            count += 1

        if count > 0:
            raise PennsieveDatasetNameInvalid(datasetname)

        try:
            dataset_list = get_users_dataset_list()
        except Exception as e:
            raise Exception("Failed to retrieve datasets from Pennsieve. Please try again later.")

        for dataset in dataset_list:
            if datasetname == dataset["content"]["name"]:
                raise PennsieveDatasetNameTaken("Dataset name already exists")

        # Create the dataset on Pennsieve
        r = requests.post(f"{PENNSIEVE_URL}/datasets", headers=create_request_headers(ps), json={"name": datasetname})
        r.raise_for_status()

        return r.json()

    # TODO: Remove unnecessary raise
    except Exception as e:
        raise e


double_extensions = [
    ".ome.tiff",
    ".ome.tif",
    ".ome.tf2,",
    ".ome.tf8",
    ".ome.btf",
    ".ome.xml",
    ".brukertiff.gz",
    ".mefd.gz",
    ".moberg.gz",
    ".nii.gz",
    ".mgh.gz",
    ".tar.gz",
    ".bcl.gz",
]

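# Illustrative effect of the double-extension handling used below (assumed file names):
#   get_name_extension("scan.ome.tiff") -> ("scan", ".ome.tiff")
#   get_name_extension("notes.txt")     -> ("notes", ".txt")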
def create_high_lvl_manifest_files_existing_ps(
    soda, ps, my_tracking_folder
):
    """
    Function to create manifest files for each high-level SPARC folder.

    Args:
        soda: soda dict with information about the dataset to be generated/modified
    Action:
        manifest_files_structure: dict including the local path of the manifest files
    """
    def get_name_extension(file_name):
        double_ext = False
        for ext in double_extensions:
            if file_name.find(ext) != -1:
                double_ext = True
                break
        ext = ""
        name = ""
        if double_ext == False:
            name = os.path.splitext(file_name)[0]
            ext = os.path.splitext(file_name)[1]
        else:
            ext = (
                os.path.splitext(os.path.splitext(file_name)[0])[1]
                + os.path.splitext(file_name)[1]
            )
            name = os.path.splitext(os.path.splitext(file_name)[0])[0]
        return name, ext

    def recursive_import_ps_manifest_info(
        folder, my_relative_path, dict_folder_manifest, manifest_df
    ):
        """
        Import manifest information from the Pennsieve dataset for the given folder and its children.
        """

        if len(folder['children']) == 0:
            limit = 100
            offset = 0
            ps_folder = {"children": []}
            while True:
                r = requests.get(f"{PENNSIEVE_URL}/packages/{folder['content']['id']}?limit={limit}&offset={offset}", headers=create_request_headers(ps), json={"include": "files"})
                r.raise_for_status()
                page = r.json()
                normalize_tracking_folder(page)
                ps_folder["children"].extend(page)

                if len(page) < limit:
                    break
                offset += limit

            folder['children'] = ps_folder['children']

        for _, folder_item in folder["children"]["folders"].items():
            folder_name = folder_item['content']['name']
            relative_path = generate_relative_path(
                my_relative_path, folder_name
            )
            dict_folder_manifest = recursive_import_ps_manifest_info(
                folder_item, relative_path, dict_folder_manifest, manifest_df
            )
        for _, file in folder["children"]["files"].items():
            if file['content']['name'] != "manifest":
                file_id = file['content']['id']
                r = requests.get(f"{PENNSIEVE_URL}/packages/{file_id}/view", headers=create_request_headers(ps))
                r.raise_for_status()
                file_details = r.json()
                file_name = file_details[0]["content"]["name"]
                file_extension = splitext(file_name)[1]
                file_name_with_extension = (
                    splitext(file['content']['name'])[0] + file_extension
                )
                relative_path = generate_relative_path(
                    my_relative_path, file_name_with_extension
                )
                dict_folder_manifest["filename"].append(relative_path)
                # file type
                file_extension = get_name_extension(file_name)
                if file_extension == "":
                    file_extension = "None"
                dict_folder_manifest["file type"].append(file_extension)
                # timestamp, description, Additional Metadata
                if not manifest_df.empty:
                    if relative_path in manifest_df["filename"].values:
                        timestamp = manifest_df[
                            manifest_df["filename"] == relative_path
                        ]["timestamp"].iloc[0]
                        description = manifest_df[
                            manifest_df["filename"] == relative_path
                        ]["description"].iloc[0]
                        additional_metadata = manifest_df[
                            manifest_df["filename"] == relative_path
                        ]["Additional Metadata"].iloc[0]
                    else:
                        timestamp = ""
                        description = ""
                        additional_metadata = ""
                    dict_folder_manifest["timestamp"].append(timestamp)
                    dict_folder_manifest["description"].append(description)
                    dict_folder_manifest["Additional Metadata"].append(
                        additional_metadata
                    )
                else:
                    dict_folder_manifest["timestamp"].append("")
                    dict_folder_manifest["description"].append("")
                    dict_folder_manifest["Additional Metadata"].append("")
        return dict_folder_manifest

    # Merge existing folders
    def recursive_manifest_builder_existing_ps(
        my_folder,
        my_bf_folder,
        my_bf_folder_exists,
        my_relative_path,
        dict_folder_manifest,
    ):
        if "folders" in my_folder.keys():
            if my_bf_folder_exists:
                (
                    my_bf_existing_folders_name,
                ) = ps_get_existing_folders_details(my_bf_folder['children']['folders'])
            else:
                my_bf_existing_folders_name = []
            for folder_key, folder in my_folder["folders"].items():
                relative_path = generate_relative_path(my_relative_path, folder_key)
                if folder_key in my_bf_existing_folders_name:
                    bf_folder = my_bf_folder["children"]["folders"][folder_key]
                    bf_folder_exists = True
                else:
                    bf_folder = ""
                    bf_folder_exists = False
                dict_folder_manifest = recursive_manifest_builder_existing_ps(
                    folder,
                    bf_folder,
                    bf_folder_exists,
                    relative_path,
                    dict_folder_manifest,
                )
        if "files" in my_folder.keys():
            if my_bf_folder_exists:
                (
                    my_bf_existing_files_name,
                    my_bf_existing_files_name_with_extension,
                ) = ps_get_existing_files_details(my_bf_folder)
            else:
                my_bf_existing_files = []
                my_bf_existing_files_name = []
                my_bf_existing_files_name_with_extension = []
            for file_key, file in my_folder["files"].items():
                if file.get("location") == "local":
                    file_path = file["path"]
                    if isfile(file_path):
                        desired_name = splitext(file_key)[0]
                        file_extension = splitext(file_key)[1]
                        # manage existing file request
                        if existing_file_option == "skip" and file_key in my_bf_existing_files_name_with_extension:
                            continue
                        if existing_file_option == "replace" and file_key in my_bf_existing_files_name_with_extension:
                            # remove existing from manifest
                            filename = generate_relative_path(
                                my_relative_path, file_key
                            )
                            filename_list = dict_folder_manifest["filename"]
                            index_file = filename_list.index(filename)
                            del dict_folder_manifest["filename"][index_file]
                            del dict_folder_manifest["timestamp"][index_file]
                            del dict_folder_manifest["description"][index_file]
                            del dict_folder_manifest["file type"][index_file]
                            del dict_folder_manifest["Additional Metadata"][
                                index_file
                            ]
                            index_name = (
                                my_bf_existing_files_name_with_extension.index(
                                    file_key
                                )
                            )
                            del my_bf_existing_files[index_name]
                            del my_bf_existing_files_name[index_name]
                            del my_bf_existing_files_name_with_extension[
                                index_name
                            ]
                        if desired_name not in my_bf_existing_files_name:
                            final_name = file_key
                        else:
                            # expected final name
                            count_done = 0
                            final_name = desired_name
                            output = get_base_file_name(desired_name)
                            if output:
                                base_name = output[0]
                                count_exist = output[1]
                                while count_done == 0:
                                    if final_name in my_bf_existing_files_name:
                                        count_exist += 1
                                        final_name = (
                                            base_name + "(" + str(count_exist) + ")"
                                        )
                                    else:
                                        count_done = 1
                            else:
                                count_exist = 0
                                while count_done == 0:
                                    if final_name in my_bf_existing_files_name:
                                        count_exist += 1
                                        final_name = (
                                            desired_name
                                            + " ("
                                            + str(count_exist)
                                            + ")"
                                        )
                                    else:
                                        count_done = 1
                            final_name = final_name + file_extension
                            my_bf_existing_files_name.append(
                                splitext(final_name)[0]
                            )
                        # filename
                        filename = generate_relative_path(
                            my_relative_path, final_name
                        )
                        dict_folder_manifest["filename"].append(filename)
                        # timestamp
                        file_path = file["path"]
                        filepath = pathlib.Path(file_path)
                        mtime = filepath.stat().st_mtime
                        lastmodtime = datetime.fromtimestamp(mtime).astimezone(
                            local_timezone
                        )
                        dict_folder_manifest["timestamp"].append(
                            lastmodtime.isoformat()
                            .replace(".", ",")
                            .replace("+00:00", "Z")
                        )
                        # description
                        if "description" in file.keys():
                            dict_folder_manifest["description"].append(
                                file["description"]
                            )
                        else:
                            dict_folder_manifest["description"].append("")
                        # file type
                        if file_extension == "":
                            file_extension = "None"
                        dict_folder_manifest["file type"].append(file_extension)
                        # addtional metadata
                        if "additional-metadata" in file.keys():
                            dict_folder_manifest["Additional Metadata"].append(
                                file["additional-metadata"]
                            )
                        else:
                            dict_folder_manifest["Additional Metadata"].append("")
        return dict_folder_manifest

    double_extensions = [
        ".ome.tiff",
        ".ome.tif",
        ".ome.tf2,",
        ".ome.tf8",
        ".ome.btf",
        ".ome.xml",
        ".brukertiff.gz",
        ".mefd.gz",
        ".moberg.gz",
        ".nii.gz",
        ".mgh.gz",
        ".tar.gz",
        ".bcl.gz",
    ]

try:
|
|
1320
|
+
# create local folder to save manifest files temporarly (delete any existing one first)
|
|
1321
|
+
shutil.rmtree(manifest_folder_path) if isdir(manifest_folder_path) else 0
|
|
1322
|
+
makedirs(manifest_folder_path)
|
|
1323
|
+
|
|
1324
|
+
# import info about files already on ps
|
|
1325
|
+
dataset_structure = soda["dataset-structure"]
|
|
1326
|
+
manifest_dict_save = {}
|
|
1327
|
+
for high_level_folder_key, high_level_folder in my_tracking_folder["children"]["folders"].items():
|
|
1328
|
+
if (
|
|
1329
|
+
high_level_folder_key in dataset_structure["folders"].keys()
|
|
1330
|
+
):
|
|
1331
|
+
|
|
1332
|
+
relative_path = ""
|
|
1333
|
+
dict_folder_manifest = {}
|
|
1334
|
+
# Initialize dict where manifest info will be stored
|
|
1335
|
+
dict_folder_manifest["filename"] = []
|
|
1336
|
+
dict_folder_manifest["timestamp"] = []
|
|
1337
|
+
dict_folder_manifest["description"] = []
|
|
1338
|
+
dict_folder_manifest["file type"] = []
|
|
1339
|
+
dict_folder_manifest["Additional Metadata"] = []
|
|
1340
|
+
|
|
1341
|
+
# pull manifest file into if exists
|
|
1342
|
+
manifest_df = pd.DataFrame()
|
|
1343
|
+
for file_key, file in high_level_folder['children']['files'].items():
|
|
1344
|
+
file_id = file['content']['id']
|
|
1345
|
+
r = requests.get(f"{PENNSIEVE_URL}/packages/{file_id}/view", headers=create_request_headers(ps))
|
|
1346
|
+
r.raise_for_status()
|
|
1347
|
+
file_details = r.json()
|
|
1348
|
+
file_name_with_extension = file_details[0]["content"]["name"]
|
|
1349
|
+
if file_name_with_extension in manifest_sparc:
|
|
1350
|
+
file_id_2 = file_details[0]["content"]["id"]
|
|
1351
|
+
r = requests.get(f"{PENNSIEVE_URL}/packages/{file_id}/files/{file_id_2}", headers=create_request_headers(ps))
|
|
1352
|
+
r.raise_for_status()
|
|
1353
|
+
file_url_info = r.json()
|
|
1354
|
+
file_url = file_url_info["url"]
|
|
1355
|
+
manifest_df = pd.read_excel(file_url, engine="openpyxl")
|
|
1356
|
+
manifest_df = manifest_df.fillna("")
|
|
1357
|
+
if (
|
|
1358
|
+
"filename" not in manifest_df.columns
|
|
1359
|
+
or "description" not in manifest_df.columns
|
|
1360
|
+
or "Additional Metadata" not in manifest_df.columns
|
|
1361
|
+
):
|
|
1362
|
+
manifest_df = pd.DataFrame()
|
|
1363
|
+
break
|
|
1364
|
+
|
|
1365
|
+
# store the data frame pulled from Pennsieve into a dictionary
|
|
1366
|
+
dict_folder_manifest = recursive_import_ps_manifest_info(
|
|
1367
|
+
high_level_folder, relative_path, dict_folder_manifest, manifest_df
|
|
1368
|
+
)
|
|
1369
|
+
|
|
1370
|
+
manifest_dict_save[high_level_folder_key] = {
|
|
1371
|
+
"manifest": dict_folder_manifest,
|
|
1372
|
+
"bf_folder": high_level_folder,
|
|
1373
|
+
}
|
|
1374
|
+
|
|
1375
|
+
# import info from local files to be uploaded
|
|
1376
|
+
local_timezone = TZLOCAL()
|
|
1377
|
+
manifest_files_structure = {}
|
|
1378
|
+
existing_folder_option = soda["generate-dataset"]["if-existing"]
|
|
1379
|
+
existing_file_option = soda["generate-dataset"][
|
|
1380
|
+
"if-existing-files"
|
|
1381
|
+
]
|
|
1382
|
+
for folder_key, folder in dataset_structure["folders"].items():
|
|
1383
|
+
relative_path = ""
|
|
1384
|
+
|
|
1385
|
+
if (
|
|
1386
|
+
folder_key in manifest_dict_save
|
|
1387
|
+
and existing_folder_option == "merge"
|
|
1388
|
+
):
|
|
1389
|
+
bf_folder = manifest_dict_save[folder_key]["bf_folder"]
|
|
1390
|
+
bf_folder_exists = True
|
|
1391
|
+
dict_folder_manifest = manifest_dict_save[folder_key]["manifest"]
|
|
1392
|
+
|
|
1393
|
+
elif (
|
|
1394
|
+
folder_key in manifest_dict_save
|
|
1395
|
+
and folder_key
|
|
1396
|
+
not in my_tracking_folder["children"]["folders"].keys()
|
|
1397
|
+
and existing_folder_option == "skip"
|
|
1398
|
+
):
|
|
1399
|
+
continue
|
|
1400
|
+
|
|
1401
|
+
else:
|
|
1402
|
+
bf_folder = ""
|
|
1403
|
+
bf_folder_exists = False
|
|
1404
|
+
dict_folder_manifest = {}
|
|
1405
|
+
dict_folder_manifest["filename"] = []
|
|
1406
|
+
dict_folder_manifest["timestamp"] = []
|
|
1407
|
+
dict_folder_manifest["description"] = []
|
|
1408
|
+
dict_folder_manifest["file type"] = []
|
|
1409
|
+
dict_folder_manifest["Additional Metadata"] = []
|
|
1410
|
+
|
|
1411
|
+
dict_folder_manifest = recursive_manifest_builder_existing_ps(
|
|
1412
|
+
folder, bf_folder, bf_folder_exists, relative_path, dict_folder_manifest
|
|
1413
|
+
)
|
|
1414
|
+
|
|
1415
|
+
# create high-level folder at the temporary location
|
|
1416
|
+
folderpath = join(manifest_folder_path, folder_key)
|
|
1417
|
+
makedirs(folderpath)
|
|
1418
|
+
|
|
1419
|
+
# save manifest file
|
|
1420
|
+
manifestfilepath = join(folderpath, "manifest.xlsx")
|
|
1421
|
+
df = pd.DataFrame.from_dict(dict_folder_manifest)
|
|
1422
|
+
df.to_excel(manifestfilepath, index=None, header=True)
|
|
1423
|
+
wb = load_workbook(manifestfilepath)
|
|
1424
|
+
ws = wb.active
|
|
1425
|
+
|
|
1426
|
+
blueFill = PatternFill(
|
|
1427
|
+
start_color="9DC3E6", fill_type="solid"
|
|
1428
|
+
)
|
|
1429
|
+
greenFill = PatternFill(
|
|
1430
|
+
start_color="A8D08D", fill_type="solid"
|
|
1431
|
+
)
|
|
1432
|
+
yellowFill = PatternFill(
|
|
1433
|
+
start_color="FFD965", fill_type="solid"
|
|
1434
|
+
)
|
|
1435
|
+
ws['A1'].fill = blueFill
|
|
1436
|
+
ws['B1'].fill = greenFill
|
|
1437
|
+
ws['C1'].fill = greenFill
|
|
1438
|
+
ws['D1'].fill = greenFill
|
|
1439
|
+
ws['E1'].fill = yellowFill
|
|
1440
|
+
wb.save(manifestfilepath)
|
|
1441
|
+
|
|
1442
|
+
manifest_files_structure[folder_key] = manifestfilepath
|
|
1443
|
+
|
|
1444
|
+
return manifest_files_structure
|
|
1445
|
+
|
|
1446
|
+
except Exception as e:
|
|
1447
|
+
raise e
|
|
1448
|
+
|
|
1449
|
+
|
|
1450
|
+
|
|
1451
|
+
|
|
1452
|
+
|
|
1453
|
+
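The loop above writes one `manifest.xlsx` per high-level folder and colors the header row (filename in blue; timestamp, description and file type in green; Additional Metadata in yellow). A minimal standalone sketch, not part of the package, of how one of those generated files could be checked for the expected columns; the path below is hypothetical:

```python
# Hedged sketch: read back a manifest.xlsx produced by the code above and
# confirm the five expected column headers are present.
import pandas as pd

EXPECTED_COLUMNS = ["filename", "timestamp", "description", "file type", "Additional Metadata"]

def manifest_has_expected_columns(path):
    df = pd.read_excel(path, engine="openpyxl").fillna("")
    return all(col in df.columns for col in EXPECTED_COLUMNS)

print(manifest_has_expected_columns("primary/manifest.xlsx"))  # hypothetical path
```
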
def generate_relative_path(x, y):
    return x + "/" + y if x else y


def ps_get_existing_folders_details(ps_folders):
    ps_existing_folders = [ps_folders[folder] for folder in ps_folders if ps_folders[folder]["content"]["packageType"] == "Collection"]
    ps_existing_folders_name = [folder['content']["name"] for folder in ps_existing_folders]

    return ps_existing_folders, ps_existing_folders_name


def ps_get_existing_files_details(ps_folder):
    # TODO: Dorian -> ["extensions doesn't seem to be returned anymore by the endpoint"]
    def verify_file_name(file_name, extension):
        if extension == "":
            return file_name

        double_ext = False
        for ext in double_extensions:
            if file_name.find(ext) != -1:
                double_ext = True
                break

        extension_from_name = ""

        if double_ext == False:
            extension_from_name = os.path.splitext(file_name)[1]
        else:
            extension_from_name = (
                os.path.splitext(os.path.splitext(file_name)[0])[1]
                + os.path.splitext(file_name)[1]
            )

        if extension_from_name == ("." + extension):
            return file_name
        else:
            return file_name + ("." + extension)

    files = ps_folder["children"]["files"]
    double_extensions = [
        ".ome.tiff",
        ".ome.tif",
        ".ome.tf2,",
        ".ome.tf8",
        ".ome.btf",
        ".ome.xml",
        ".brukertiff.gz",
        ".mefd.gz",
        ".moberg.gz",
        ".nii.gz",
        ".mgh.gz",
        ".tar.gz",
        ".bcl.gz",
    ]

    bf_existing_files_name = [splitext(files[file]['content']["name"])[0] for file in files]
    bf_existing_files_name_with_extension = []

    # determine if we are at the root of the dataset
    content = ps_folder["content"]
    if (str(content['id'])[2:9]) == "dataset":
        r = requests.get(f"{PENNSIEVE_URL}/datasets/{content['id']}", headers=create_request_headers(get_access_token()))
        r.raise_for_status()
        root_folder = r.json()
        root_children = root_folder["children"]
        for item in root_children:
            file_name_with_extension = ""
            item_id = item["content"]["id"]
            item_name = item["content"]["name"]
            if item_id[2:9] == "package":
                if ("extension" not in root_children):
                    file_name_with_extension = verify_file_name(item_name, "")
                else:
                    file_name_with_extension = verify_file_name(item_name, root_children["extension"])

                if file_name_with_extension == "":
                    continue
                bf_existing_files_name_with_extension.append(file_name_with_extension)
    else:
        # is a collection - aka a folder in the dataset
        for file_key, file in files.items():
            file_name_with_extension = ""
            file_name = file["content"]["name"]
            file_id = file["content"]["id"]
            if file_id[2:9] == "package":
                if "extension" not in file:
                    file_name_with_extension = verify_file_name(file_name, "")
                else:
                    file_name_with_extension = verify_file_name(file_name, file["extension"])
            if file_name_with_extension == "":
                continue
            bf_existing_files_name_with_extension.append(file_name_with_extension)

    return (
        bf_existing_files_name,
        bf_existing_files_name_with_extension,
    )

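For reference, the double-extension handling in `verify_file_name` above strips two suffixes for names such as `.nii.gz` before comparing against the extension reported by Pennsieve. A standalone sketch of that splitting rule, with an abbreviated extension list (illustrative only, not package code):

```python
# Hedged sketch of the double-extension splitting used above.
import os

DOUBLE_EXTENSIONS = [".ome.tiff", ".nii.gz", ".tar.gz"]  # abbreviated list

def split_extension(file_name):
    # Two splitext passes when a known double extension is present, one otherwise.
    if any(ext in file_name for ext in DOUBLE_EXTENSIONS):
        return os.path.splitext(os.path.splitext(file_name)[0])[1] + os.path.splitext(file_name)[1]
    return os.path.splitext(file_name)[1]

print(split_extension("scan.nii.gz"))  # ".nii.gz"
print(split_extension("notes.txt"))    # ".txt"
```
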
def check_if_int(s):
    try:
        int(s)
        return True
    except ValueError:
        return False


def get_base_file_name(file_name):
    output = []
    if file_name[-1] == ")":
        string_length = len(file_name)
        count_start = string_length
        character = file_name[count_start - 1]
        while character != "(" and count_start >= 0:
            count_start -= 1
            character = file_name[count_start - 1]
        if character == "(":
            base_name = file_name[:count_start - 1]
            num = file_name[count_start : string_length - 1]
            if check_if_int(num):
                output = [base_name, int(num)]
    return output

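`get_base_file_name` is what lets the upload flow recognize a name that already carries a duplicate counter such as `data (2)`. A small hypothetical mirror of its behavior, shown only for illustration:

```python
# Hedged sketch mirroring get_base_file_name: return [base, counter] for a
# name ending in "(n)", or an empty list otherwise.
def base_and_counter(file_name):
    if file_name.endswith(")") and "(" in file_name:
        base, _, num = file_name.rpartition("(")
        num = num[:-1]  # drop the trailing ")"
        if num.isdigit():
            return [base, int(num)]
    return []

print(base_and_counter("data (2)"))  # ['data ', 2]
print(base_and_counter("data"))      # []
```
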
def ps_update_existing_dataset(soda, ds, ps, resume):
    global logger

    logger.info("Starting ps_update_existing_dataset")

    global main_curate_progress_message
    global main_total_generate_dataset_size
    global start_generate
    global main_initial_bfdataset_size

    # Delete any files on Pennsieve that have been marked as deleted
    def recursive_file_delete(folder):
        if "files" in folder.keys():
            for item in list(folder["files"]):
                if "deleted" in folder["files"][item]["action"]:
                    file_path = folder["files"][item]["path"]
                    # remove the file from the dataset
                    r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [file_path]})
                    r.raise_for_status()
                    # remove the file from the soda json structure
                    del folder["files"][item]

        for item in list(folder["folders"]):
            recursive_file_delete(folder["folders"][item])

    # Delete any metadata files on Pennsieve that have been marked as deleted
    def metadata_file_delete(soda):
        if "dataset_metadata" in soda.keys():
            folder = soda["dataset_metadata"]
            for item in list(folder):
                if "deleted" in folder[item]["action"]:
                    r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [folder[item]["path"]]})
                    r.raise_for_status()
                    del folder[item]

    def recursive_item_path_create(folder, path):
        """
        Recursively create the path for each item. Adds a new key containing the path to all the files and folders in the
        local data structure.
        Allows us to see if the folder path of a specific file already
        exists on Pennsieve.
        """

        if "files" in folder.keys():
            for item in list(folder["files"]):
                if item in ["manifest.xslx", "manifest.csv"]:
                    continue
                if "folderpath" not in folder["files"][item]:
                    folder["files"][item]["folderpath"] = path[:]

        if "folders" in folder.keys():
            for item in list(folder["folders"]):
                if "folderpath" not in folder["folders"][item]:
                    folder["folders"][item]["folderpath"] = path[:]
                    folder["folders"][item]["folderpath"].append(item)
                recursive_item_path_create(
                    folder["folders"][item], folder["folders"][item]["folderpath"][:]
                )

        return

    # Check and create any non-existing folders for the file move process (Used in the recursive_check_moved_files function)
    def recursive_check_and_create_ps_file_path(
        folderpath, index, current_folder_structure
    ):
        folder = folderpath[index]

        if folder not in current_folder_structure["folders"]:
            if index == 0:
                r = requests.post(f"{PENNSIEVE_URL}/packages", json={"name": folder, "parent": f"{current_folder_structure['path']}", "packageType": "collection", "dataset": ds['content']['id']}, headers=create_request_headers(ps))
                r.raise_for_status()
                new_folder = r.json()
            else:
                r = requests.post(f"{PENNSIEVE_URL}/packages", json={"name": folder, "parent": f"{current_folder_structure['path']}", "packageType": "collection", "dataset": ds['content']['id']}, headers=create_request_headers(ps))
                r.raise_for_status()
                new_folder = r.json()

            current_folder_structure["folders"][folder] = {
                "location": "ps",
                "action": ["existing"],
                "path": new_folder['content']['id'],
                "folders": {},
                "files": {},
            }

        index += 1
        # check if path exists for folder, if not then folder has not been created on Pennsieve yet, so create it and add it to the path key
        if "path" not in current_folder_structure["folders"][folder].keys() or current_folder_structure["folders"][folder]["location"] != "ps":
            r = requests.post(f"{PENNSIEVE_URL}/packages", headers=create_request_headers(ps), json=build_create_folder_request(folder, current_folder_structure["path"], ds['content']['id']))
            r.raise_for_status()
            new_folder_id = r.json()["content"]["id"]
            current_folder_structure["folders"][folder]["path"] = new_folder_id

        if index < len(folderpath):
            return recursive_check_and_create_ps_file_path(
                folderpath, index, current_folder_structure["folders"][folder]
            )
        else:
            return current_folder_structure["folders"][folder]["path"]

    # Check for any files that have been moved and verify paths before moving
    def recursive_check_moved_files(folder):
        if "files" in folder.keys():
            for item in list(folder["files"]):
                if (
                    "moved" in folder["files"][item]["action"]
                    and folder["files"][item]["location"] == "ps"
                ):
                    # create the folders if they do not exist
                    new_folder_id = ""
                    new_folder_id = recursive_check_and_create_ps_file_path(
                        folder["files"][item]["folderpath"].copy(), 0, dataset_structure
                    )
                    # move the file into the target folder on Pennsieve
                    r = requests.post(f"{PENNSIEVE_URL}/data/move", json={"things": [folder["files"][item]["path"]], "destination": new_folder_id}, headers=create_request_headers(ps))
                    r.raise_for_status()

        for item in list(folder["folders"]):
            recursive_check_moved_files(folder["folders"][item])

    # Rename any files that exist on Pennsieve
    def recursive_file_rename(folder):
        if "files" in folder.keys():
            for item in list(folder["files"]):
                if (
                    "renamed" in folder["files"][item]["action"]
                    and folder["files"][item]["location"] == "ps"
                ):
                    # rename the file on Pennsieve
                    r = requests.put(f"{PENNSIEVE_URL}/packages/{folder['files'][item]['path']}?updateStorage=true", json={"name": item}, headers=create_request_headers(ps))
                    r.raise_for_status()

        for item in list(folder["folders"]):
            recursive_file_rename(folder["folders"][item])

    def recursive_folder_delete(folder):
        """
        Delete any stray folders that exist on Pennsieve.
        Only top level files are deleted since the api deletes any
        files and folders that exist inside.
        """
        for item in list(folder["folders"]):
            if folder["folders"][item]["location"] == "ps":
                if "moved" in folder["folders"][item]["action"]:
                    file_path = folder["folders"][item]["path"]
                    # remove the file from the dataset
                    r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [file_path]})
                    r.raise_for_status()
                if "deleted" in folder["folders"][item]["action"]:
                    file_path = folder["folders"][item]["path"]
                    # remove the file from the dataset
                    r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [file_path]})
                    r.raise_for_status()
                    del folder["folders"][item]
                else:
                    recursive_folder_delete(folder["folders"][item])
            else:
                recursive_folder_delete(folder["folders"][item])

    # Rename any folders that still exist.
    def recursive_folder_rename(folder, mode):
        for item in list(folder["folders"]):
            if (
                folder["folders"][item]["location"] == "ps"
                and "action" in folder["folders"][item].keys()
                and mode in folder["folders"][item]["action"]
            ):
                folder_id = folder["folders"][item]["path"]
                r = requests.put(f"{PENNSIEVE_URL}/packages/{folder_id}?updateStorage=true", headers=create_request_headers(ps), json={"name": item})
                r.raise_for_status()
            recursive_folder_rename(folder["folders"][item], mode)

    ps_dataset = ""
    start = timer()
    # 1. Remove all existing files on Pennsieve that the user deleted.
    logger.info("ps_update_existing_dataset step 1 remove existing files on Pennsieve the user deleted")
    main_curate_progress_message = "Checking Pennsieve for deleted files"
    dataset_structure = soda["dataset-structure"]
    recursive_file_delete(dataset_structure)
    main_curate_progress_message = (
        "Files on Pennsieve marked for deletion have been deleted"
    )

    # 2. Rename any deleted folders on Pennsieve to allow for replacements.
    logger.info("ps_update_existing_dataset step 2 rename deleted folders on Pennsieve to allow for replacements")
    main_curate_progress_message = "Checking Pennsieve for deleted folders"
    dataset_structure = soda["dataset-structure"]
    recursive_folder_rename(dataset_structure, "deleted")
    main_curate_progress_message = "Folders on Pennsieve have been marked for deletion"

    # 2.5 Rename folders that need to be in the final destination.
    logger.info("ps_update_existing_dataset step 2.5 rename folders that need to be in the final destination")
    main_curate_progress_message = "Renaming any folders requested by the user"
    recursive_folder_rename(dataset_structure, "renamed")
    main_curate_progress_message = "Renamed all folders requested by the user"

    # 3. Get the status of all files currently on Pennsieve and create
    # the folderpath for all items in both dataset structures.
    logger.info("ps_update_existing_dataset step 3 get the status of all files currently on Pennsieve and create the folderpath for all items in both dataset structures")
    main_curate_progress_message = "Fetching files and folders from Pennsieve"
    current_bf_dataset_files_folders = import_pennsieve_dataset(
        soda.copy()
    )["soda_object"]
    ps_dataset = current_bf_dataset_files_folders["dataset-structure"]
    main_curate_progress_message = "Creating file paths for all files on Pennsieve"
    recursive_item_path_create(dataset_structure, [])
    recursive_item_path_create(ps_dataset, [])
    main_curate_progress_message = "File paths created"

    # 4. Move any files that are marked as moved on Pennsieve.
    # Create any additional folders if required
    logger.info("ps_update_existing_dataset step 4 move any files that are marked as moved on Pennsieve")
    main_curate_progress_message = "Moving any files requested by the user"
    recursive_check_moved_files(dataset_structure)
    main_curate_progress_message = "Moved all files requested by the user"

    # 5. Rename any Pennsieve files that are marked as renamed.
    logger.info("ps_update_existing_dataset step 5 rename any Pennsieve files that are marked as renamed")
    main_curate_progress_message = "Renaming any files requested by the user"
    recursive_file_rename(dataset_structure)
    main_curate_progress_message = "Renamed all files requested by the user"

    # 6. Delete any Pennsieve folders that are marked as deleted.
    logger.info("ps_update_existing_dataset step 6 delete any Pennsieve folders that are marked as deleted")
    main_curate_progress_message = (
        "Deleting any additional folders present on Pennsieve"
    )
    recursive_folder_delete(dataset_structure)
    main_curate_progress_message = "Deletion of additional folders complete"

    # 7. Delete any metadata files that are marked as deleted.
    logger.info("ps_update_existing_dataset step 7 delete any metadata files that are marked as deleted")
    main_curate_progress_message = "Removing any metadata files marked for deletion"
    metadata_file_delete(soda)
    main_curate_progress_message = "Removed metadata files marked for deletion"

    # 8. Run the original code to upload any new files added to the dataset.
    logger.info("ps_update_existing_dataset step 8 run the ps_create_new_dataset code to upload any new files added to the dataset")
    if "dataset_metadata" in soda.keys() and "manifest_files" in soda["dataset_metadata"].keys():
        if "auto-generated" in soda["manifest-files"].keys():
            soda["manifest-files"] = {"destination": "ps", "auto-generated": True}
        else:
            soda["manifest-files"] = {"destination": "ps"}

    soda["generate-dataset"] = {
        "destination": "ps",
        "if-existing": "merge",
        "if-existing-files": "replace",
        "generate-option": "existing-ps"
    }

    end = timer()
    logger.info(f"Time for ps_update_existing_dataset function: {timedelta(seconds=end - start)}")
    ps_upload_to_dataset(soda, ps, ds, resume)

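The recursive helpers above all key off per-item annotations in `soda["dataset-structure"]`. Illustrative only, with field names inferred from the code above rather than from package documentation: a file entry that the update pass acts on carries a `location` of `"ps"` plus an `action` list; `recursive_file_rename` picks up entries flagged `"renamed"`, `recursive_file_delete` those flagged `"deleted"`, and so on.

```python
# Hypothetical example entry (not package code); "path" holds a Pennsieve
# package id placeholder.
example_entry = {
    "folders": {},
    "files": {
        "renamed_name.csv": {
            "location": "ps",
            "action": ["existing", "renamed"],
            "path": "N:package:xxxx",
        }
    },
}
```
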
def get_origin_manifest_id(dataset_id):
    global logger
    max_attempts = 3
    for _ in range(max_attempts):
        manifests = get_upload_manifests(dataset_id)
        if manifests and "manifests" in manifests and manifests["manifests"]:
            # sort the manifests list by date_created timestamp field in descending order
            manifests["manifests"].sort(key=lambda x: x["date_created"], reverse=True)
            return manifests["manifests"][0]["id"]
        time.sleep(5)  # Wait for 5 seconds before the next attempt

    raise Exception("Did not get the origin manifest id in an expected amount of time.")

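`get_origin_manifest_id` polls the upload-manifest listing a few times and returns the newest entry by `date_created`. A generic retry-and-poll sketch of the same pattern, not package code; `fetch` is a placeholder callable standing in for the manifest listing call:

```python
# Hedged sketch of the poll-sort-return-or-retry pattern used above.
import time

def newest_id(fetch, attempts=3, delay=5):
    for _ in range(attempts):
        payload = fetch()
        items = (payload or {}).get("manifests", [])
        if items:
            items.sort(key=lambda x: x["date_created"], reverse=True)
            return items[0]["id"]
        time.sleep(delay)
    raise RuntimeError("no manifest id returned in time")
```
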
def normalize_tracking_folder(tracking_folder):
    """
    Normalize the tracking folder object to be a dictionary with the shape: {files: {}, folders: {}}.
    This shape matches our dataset structure object. Recall, the tracking folder receives information about what folders and
    files are stored on Pennsieve. We update this as we update Pennsieve's state.
    """
    if tracking_folder == "":
        return {"folders": {}, "files": {}}

    temp_children = {"folders": {}, "files": {}}

    # add the files and folders to the temp_children structure
    for child in tracking_folder["children"]:
        if child["content"]["packageType"] == "Collection":
            # add the folders ( designated collection on Pennsieve ) to the temp_children structure under folders
            temp_children["folders"][child["content"]["name"]] = child
        else:
            # add the files (anything not designated a collection) to the temp_children structure under files
            temp_children["files"][child["content"]["name"]] = child

    # replace the non-normalized children structure with the normalized children structure
    tracking_folder["children"] = temp_children

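`normalize_tracking_folder` reshapes the flat `children` list returned by the Pennsieve packages endpoint into the `{folders: {}, files: {}}` shape used by the dataset structure. A hypothetical before/after, with made-up package entries:

```python
# Illustrative input shape only; values are placeholders, not real API output.
raw = {
    "children": [
        {"content": {"name": "primary", "packageType": "Collection"}},
        {"content": {"name": "readings.csv", "packageType": "CSV"}},
    ]
}
# After normalize_tracking_folder(raw), raw["children"] becomes:
# {
#     "folders": {"primary": {...}},       # Collection packages
#     "files":   {"readings.csv": {...}},  # everything else
# }
```
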
def build_create_folder_request(folder_name, folder_parent_id, dataset_id):
    """
    Create a folder on Pennsieve.
    """
    body = {}

    # if creating a folder at the root of the dataset the api does not require a parent key
    if folder_parent_id.find("N:dataset") == -1:
        body["parent"] = folder_parent_id

    body["name"] = folder_name
    body["dataset"] = dataset_id
    body["packageType"] = "collection"

    return body

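Assuming `build_create_folder_request` above is in scope, the request body it produces looks like this; all identifiers below are placeholders:

```python
# Usage sketch: a parent id containing "N:dataset" (the dataset root) is
# omitted from the body, since the API infers the root from the dataset id.
print(build_create_folder_request("primary", "N:dataset:1234", "N:dataset:1234"))
# {'name': 'primary', 'dataset': 'N:dataset:1234', 'packageType': 'collection'}
print(build_create_folder_request("sub-01", "N:collection:abcd", "N:dataset:1234"))
# {'parent': 'N:collection:abcd', 'name': 'sub-01', 'dataset': 'N:dataset:1234', 'packageType': 'collection'}
```
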
bytes_uploaded_per_file = {}
total_bytes_uploaded = {"value": 0}
current_files_in_subscriber_session = 0

bytes_file_path_dict = {}

# retry variables instantiated outside function
list_of_files_to_rename = {}
renamed_files_counter = 0


def ps_upload_to_dataset(soda, ps, ds, resume=False):
    global logger

    # Progress tracking variables that are used for the frontend progress bar.
    global main_curate_progress_message
    global main_total_generate_dataset_size
    global main_generated_dataset_size
    global start_generate
    global main_initial_bfdataset_size
    global main_curation_uploaded_files
    global uploaded_folder_counter
    global current_size_of_uploaded_files
    global total_files
    global total_bytes_uploaded  # current number of bytes uploaded to Pennsieve in the current session
    global client
    global files_uploaded
    global total_dataset_files
    global current_files_in_subscriber_session
    global renaming_files_flow
    global bytes_uploaded_per_file
    global total_bytes_uploaded_per_file
    global bytes_file_path_dict
    global elapsed_time
    global manifest_id
    global origin_manifest_id
    global main_curate_status
    global list_of_files_to_rename
    global renamed_files_counter

    total_files = 0
    total_dataset_files = 0
    total_metadata_files = 0
    total_manifest_files = 0
    main_curation_uploaded_files = 0
    total_bytes_uploaded = {"value": 0}
    total_bytes_uploaded_per_file = {}
    files_uploaded = 0
    renamed_files_counter = 0

    uploaded_folder_counter = 0
    current_size_of_uploaded_files = 0
    start = timer()
    try:

        def recursive_dataset_scan_for_new_upload(dataset_structure, list_upload_files, my_relative_path):
            """
            This function recursively gathers the files and folders in the dataset that will be uploaded to Pennsieve.
            It assumes the dataset is new based on the generate_option value and will spend less time comparing what is on Pennsieve.
            It will gather all the relative paths for the files and folders to pass along to the Pennsieve agent.
            Input:
                dataset_structure,
                my_relative_path

            Output:
                Two lists in one tuple: the first list will have all the local file paths that will be uploaded to Pennsieve.
                The second list will have the relative file paths according to the dataset structure.
                If the folder does not exist yet on Pennsieve the agent will create it.
            """
            global main_total_generate_dataset_size
            global bytes_file_path_dict
            # First loop will take place in the root of the dataset
            if "folders" in dataset_structure.keys():
                for folder_key, folder in dataset_structure["folders"].items():
                    relative_path = generate_relative_path(my_relative_path, folder_key)
                    list_upload_files = recursive_dataset_scan_for_new_upload(folder, list_upload_files, relative_path)
            if "files" in dataset_structure.keys():
                list_local_files = []
                list_projected_names = []
                list_desired_names = []
                list_final_names = []

                list_initial_names = []
                for file_key, file in dataset_structure["files"].items():
                    # relative_path = generate_relative_path(my_relative_path, file_key)
                    file_path = file["path"]
                    if isfile(file_path) and file.get("location") == "local":
                        projected_name = splitext(basename(file_path))[0]
                        projected_name_w_extension = basename(file_path)
                        desired_name = splitext(file_key)[0]
                        desired_name_with_extension = file_key

                        if projected_name != desired_name:
                            list_initial_names.append(projected_name)
                            list_local_files.append(file_path)
                            list_projected_names.append(projected_name_w_extension)
                            list_desired_names.append(desired_name_with_extension)
                            list_final_names.append(desired_name)
                        else:
                            list_local_files.append(file_path)
                            list_projected_names.append(projected_name_w_extension)
                            list_desired_names.append(desired_name_with_extension)
                            list_final_names.append(desired_name)
                            list_initial_names.append(projected_name)

                        file_size = getsize(file_path)
                        main_total_generate_dataset_size += file_size
                        bytes_file_path_dict[file_path] = file_size

                if list_local_files:
                    list_upload_files.append([
                        list_local_files,
                        list_projected_names,
                        list_desired_names,
                        list_final_names,
                        "/" if my_relative_path == soda["generate-dataset"]["dataset-name"] else my_relative_path,
                    ])

            return list_upload_files

        # See how to create folders with the Pennsieve agent
        def recursive_create_folder_for_ps(
            my_folder, my_tracking_folder, existing_folder_option
        ):
            """
            Creates a folder on Pennsieve for each folder in the dataset structure if they aren't already present in the dataset.
            Input:
                my_folder: The dataset structure to be created on Pennsieve. Pass in the soda json object to start.
                my_tracking_folder: Tracks what folders have been created on Pennsieve thus far. Starts as an empty dictionary.
                existing_folder_option: Dictates whether to merge, duplicate, replace, or skip existing folders.
            """
            # Check if the current folder has any subfolders that already exist on Pennsieve. Important step to appropriately handle replacing and merging folders.
            if len(my_tracking_folder["children"]["folders"]) == 0 and my_tracking_folder["content"]["id"].find("N:dataset") == -1:
                limit = 100
                offset = 0
                ps_folder = {}
                ps_folder_children = []
                while True:
                    r = requests.get(f"{PENNSIEVE_URL}/packages/{my_tracking_folder['content']['id']}?limit={limit}&offset={offset}", headers=create_request_headers(ps), json={"include": "files"})
                    r.raise_for_status()
                    ps_folder = r.json()
                    page = ps_folder["children"]
                    ps_folder_children.extend(page)
                    if len(page) < limit:
                        break
                    offset += limit
                    time.sleep(1)

                ps_folder["children"] = ps_folder_children
                normalize_tracking_folder(ps_folder)
                my_tracking_folder["children"] = ps_folder["children"]

            # create/replace/skip folder
            if "folders" in my_folder.keys():
                for folder_key, folder in my_folder["folders"].items():
                    if existing_folder_option == "merge":
                        if folder_key in my_tracking_folder["children"]["folders"]:
                            ps_folder = my_tracking_folder["children"]["folders"][folder_key]
                            normalize_tracking_folder(ps_folder)
                        else:
                            # We are merging but this is a new folder - not one that already exists in the current dataset - so we create it.
                            r = requests.post(f"{PENNSIEVE_URL}/packages", headers=create_request_headers(ps), json=build_create_folder_request(folder_key, my_tracking_folder['content']['id'], ds['content']['id']))
                            r.raise_for_status()
                            ps_folder = r.json()
                            normalize_tracking_folder(ps_folder)

                    elif existing_folder_option == "replace":
                        # if the folder exists on Pennsieve remove it
                        if folder_key in my_tracking_folder["children"]["folders"]:
                            ps_folder = my_tracking_folder["children"]["folders"][folder_key]

                            r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [ps_folder["content"]["id"]]})
                            r.raise_for_status()

                            # remove from ps_folder
                            del my_tracking_folder["children"]["folders"][folder_key]

                        r = requests.post(f"{PENNSIEVE_URL}/packages", headers=create_request_headers(ps), json=build_create_folder_request(folder_key, my_tracking_folder['content']['id'], ds['content']['id']))
                        r.raise_for_status()
                        ps_folder = r.json()
                        normalize_tracking_folder(ps_folder)

                    my_tracking_folder["children"]["folders"][folder_key] = ps_folder
                    tracking_folder = my_tracking_folder["children"]["folders"][folder_key]  # get the folder we just added to the tracking folder
                    recursive_create_folder_for_ps(
                        folder, tracking_folder, existing_folder_option
                    )

        def recursive_dataset_scan_for_ps(
            my_folder,
            my_tracking_folder,
            existing_file_option,
            list_upload_files,
            my_relative_path,
        ):
            """
            Delete files that are marked to be replaced in the dataset. Create a list of files to upload to Pennsieve.
            """

            global main_total_generate_dataset_size
            global logger

            # folder children are packages such as collections and files stored on the Pennsieve dataset
            ps_folder_children = my_tracking_folder["children"]  # ds (dataset)

            if "folders" in my_folder.keys():
                for folder_key, folder in my_folder["folders"].items():
                    relative_path = generate_relative_path(my_relative_path, folder_key)
                    tracking_folder = ps_folder_children["folders"][folder_key]
                    list_upload_files = recursive_dataset_scan_for_ps(
                        folder,
                        tracking_folder,
                        existing_file_option,
                        list_upload_files,
                        relative_path,
                    )

            if "files" in my_folder.keys():

                # delete files to be deleted
                (
                    my_bf_existing_files_name,
                    my_bf_existing_files_name_with_extension,
                ) = ps_get_existing_files_details(my_tracking_folder)

                for file_key, file in my_folder["files"].items():
                    # if local then we are either adding a new file to an existing/new dataset or replacing a file in an existing dataset
                    if file.get("location") == "local":
                        file_path = file["path"]
                        if isfile(file_path) and existing_file_option == "replace" and file_key in ps_folder_children["files"]:
                            my_file = ps_folder_children["files"][file_key]
                            # delete the package ( aka file ) from the dataset
                            r = requests.post(f"{PENNSIEVE_URL}/data/delete", headers=create_request_headers(ps), json={"things": [f"{my_file['content']['id']}"]})
                            r.raise_for_status()
                            del ps_folder_children["files"][file_key]

                # create list of files to be uploaded with projected and desired names saved
                (
                    my_bf_existing_files_name,
                    my_bf_existing_files_name_with_extension,
                ) = ps_get_existing_files_details(my_tracking_folder)

                logger.info(f"Existing files in Pennsieve: {my_bf_existing_files_name_with_extension}")

                list_local_files = []
                list_projected_names = []
                list_desired_names = []
                list_final_names = []
                additional_upload_lists = []

                list_initial_names = []

                # add the files that are set to be uploaded to Pennsieve to a list
                # handle renaming files and creating duplicates
                for file_key, file in my_folder["files"].items():
                    if file.get("location") == "local":
                        file_path = file["path"]
                        if isfile(file_path):
                            initial_name = splitext(basename(file_path))[0]
                            initial_extension = splitext(basename(file_path))[1]
                            initial_name_with_extension = basename(file_path)
                            desired_name = splitext(file_key)[0]
                            desired_name_extension = splitext(file_key)[1]
                            desired_name_with_extension = file_key
                            if existing_file_option == "skip" and desired_name_with_extension in my_bf_existing_files_name_with_extension:
                                continue

                            # check if initial filename exists on Pennsieve dataset and get the projected name of the file after upload
                            # used when a local file has a name that matches an existing name on Pennsieve
                            count_done = 0
                            count_exist = 0
                            projected_name = initial_name_with_extension
                            while count_done == 0:
                                if (
                                    projected_name
                                    in my_bf_existing_files_name_with_extension
                                ):
                                    count_exist += 1
                                    projected_name = (
                                        initial_name
                                        + " ("
                                        + str(count_exist)
                                        + ")"
                                        + initial_extension
                                    )
                                else:
                                    count_done = 1

                            # expected final name
                            count_done = 0
                            final_name = desired_name_with_extension
                            if output := get_base_file_name(desired_name):
                                base_name = output[0]
                                count_exist = output[1]
                                while count_done == 0:
                                    if final_name in my_bf_existing_files_name:
                                        count_exist += 1
                                        final_name = (
                                            base_name
                                            + "("
                                            + str(count_exist)
                                            + ")"
                                            + desired_name_extension
                                        )
                                    else:
                                        count_done = 1
                            else:
                                count_exist = 0
                                while count_done == 0:
                                    if final_name in my_bf_existing_files_name:
                                        count_exist += 1
                                        final_name = (
                                            desired_name
                                            + " ("
                                            + str(count_exist)
                                            + ")"
                                            + desired_name_extension
                                        )
                                    else:
                                        count_done = 1

                            # save in list accordingly
                            if (
                                initial_name in list_initial_names
                                or initial_name in list_final_names
                                or projected_name in list_final_names
                                or final_name in list_projected_names
                            ):
                                additional_upload_lists.append(
                                    [
                                        [file_path],
                                        ps_folder_children,
                                        [projected_name],
                                        [desired_name],
                                        [final_name],
                                        my_tracking_folder,
                                        my_relative_path,
                                    ]
                                )
                            else:
                                list_local_files.append(file_path)
                                list_projected_names.append(projected_name)
                                list_desired_names.append(desired_name_with_extension)
                                list_final_names.append(final_name)
                                list_initial_names.append(initial_name)

                            my_bf_existing_files_name.append(final_name)
                            if initial_extension in ps_recognized_file_extensions:
                                my_bf_existing_files_name_with_extension.append(
                                    final_name
                                )
                            else:
                                my_bf_existing_files_name_with_extension.append(
                                    final_name + initial_extension
                                )

                            # add to projected dataset size to be generated
                            main_total_generate_dataset_size += getsize(file_path)

                if list_local_files:
                    ds_name = soda["ps-dataset-selected"]["dataset-name"]
                    list_upload_files.append(
                        [
                            list_local_files,
                            ps_folder_children,
                            list_projected_names,
                            list_desired_names,
                            list_final_names,
                            my_tracking_folder,
                            "/" if my_relative_path == ds_name else my_relative_path,
                        ]
                    )

                for item in additional_upload_lists:
                    list_upload_files.append(item)

            return list_upload_files

def monitor_subscriber_progress(events_dict):
|
|
2288
|
+
"""
|
|
2289
|
+
Monitors the progress of a subscriber and unsubscribes once the upload finishes.
|
|
2290
|
+
"""
|
|
2291
|
+
global files_uploaded
|
|
2292
|
+
global total_bytes_uploaded
|
|
2293
|
+
global bytes_uploaded_per_file
|
|
2294
|
+
global main_curation_uploaded_files
|
|
2295
|
+
global main_total_generate_dataset_size
|
|
2296
|
+
|
|
2297
|
+
|
|
2298
|
+
if events_dict["type"] == 1: # upload status: file_id, total, current, worker_id
|
|
2299
|
+
file_id = events_dict["upload_status"].file_id
|
|
2300
|
+
total_bytes_to_upload = events_dict["upload_status"].total
|
|
2301
|
+
current_bytes_uploaded = events_dict["upload_status"].current
|
|
2302
|
+
|
|
2303
|
+
status = events_dict["upload_status"].status
|
|
2304
|
+
if status == "2" or status == 2:
|
|
2305
|
+
ps.unsubscribe(10)
|
|
2306
|
+
logger.info("[UPLOAD COMPLETE EVENT RECEIVED]")
|
|
2307
|
+
logger.info(f"Amount of bytes uploaded via sum: {sum(bytes_uploaded_per_file.values())} vs total bytes uploaded via difference: {total_bytes_uploaded['value']}")
|
|
2308
|
+
logger.info(f"Amount of bytes Pennsieve Agent says via sum: {sum(bytes_uploaded_per_file.values())} vs amount of bytes we calculated before hand: {main_total_generate_dataset_size}")
|
|
2309
|
+
|
|
2310
|
+
|
|
2311
|
+
# only update the byte count if the current bytes uploaded is greater than the previous bytes uploaded
|
|
2312
|
+
# if current_bytes_uploaded > previous_bytes_uploaded:
|
|
2313
|
+
# update the file id's current total bytes uploaded value
|
|
2314
|
+
bytes_uploaded_per_file[file_id] = current_bytes_uploaded
|
|
2315
|
+
total_bytes_uploaded["value"] = sum(bytes_uploaded_per_file.values())
|
|
2316
|
+
|
|
2317
|
+
# check if the given file has finished uploading
|
|
2318
|
+
if current_bytes_uploaded == total_bytes_to_upload and file_id != "":
|
|
2319
|
+
files_uploaded += 1
|
|
2320
|
+
main_curation_uploaded_files += 1
|
|
2321
|
+
|
|
2322
|
+
|
|
2323
|
+
|
|
2324
|
+
# Set the Pennsieve Python Client's dataset to the Pennsieve dataset that will be uploaded to.
|
|
2325
|
+
selected_id = ds["content"]["id"]
|
|
2326
|
+
ps.use_dataset(selected_id)
|
|
2327
|
+
|
|
2328
|
+
# Set variables needed throughout generation flow
|
|
2329
|
+
list_upload_files = []
|
|
2330
|
+
list_upload_metadata_files = []
|
|
2331
|
+
list_upload_manifest_files = []
|
|
2332
|
+
list_of_files_to_rename = {}
|
|
2333
|
+
brand_new_dataset = False
|
|
2334
|
+
dataset_structure = soda["dataset-structure"]
|
|
2335
|
+
generate_option = soda["generate-dataset"]["generate-option"]
|
|
2336
|
+
starting_point = soda["starting-point"]["origin"]
|
|
2337
|
+
relative_path = ds["content"]["name"]
|
|
2338
|
+
|
|
2339
|
+
|
|
2340
|
+
# 1. Scan the dataset structure and create a list of files/folders to be uploaded with the desired renaming
|
|
2341
|
+
if generate_option == "new" and starting_point == "new":
|
|
2342
|
+
vs = ums.df_mid_has_progress()
|
|
2343
|
+
if resume == False or resume == True and not vs:
|
|
2344
|
+
logger.info("NO progress found so we will start from scratch and construct the manifest")
|
|
2345
|
+
main_curate_progress_message = "Preparing a list of files to upload"
|
|
2346
|
+
# we can assume no files/folders exist in the dataset since the generate option is new and starting point is also new
|
|
2347
|
+
# therefore, we can assume the dataset structure is the same as the tracking structure
|
|
2348
|
+
brand_new_dataset = True
|
|
2349
|
+
list_upload_files = recursive_dataset_scan_for_new_upload(dataset_structure, list_upload_files, relative_path)
|
|
2350
|
+
|
|
2351
|
+
|
|
2352
|
+
|
|
2353
|
+
|
|
2354
|
+
|
|
2355
|
+
if "dataset_metadata" in soda.keys():
|
|
2356
|
+
for key, _ in soda["dataset_metadata"].items():
|
|
2357
|
+
if key == "submission":
|
|
2358
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "submission.xlsx")
|
|
2359
|
+
submission.create_excel(soda, False, metadata_path)
|
|
2360
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2361
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2362
|
+
total_files += 1
|
|
2363
|
+
total_metadata_files += 1
|
|
2364
|
+
if key == "subjects":
|
|
2365
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "subjects.xlsx")
|
|
2366
|
+
subjects.create_excel(soda, False, metadata_path)
|
|
2367
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2368
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2369
|
+
total_files += 1
|
|
2370
|
+
total_metadata_files += 1
|
|
2371
|
+
if key == "samples":
|
|
2372
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "samples.xlsx")
|
|
2373
|
+
samples.create_excel(soda, False, metadata_path)
|
|
2374
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2375
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2376
|
+
total_files += 1
|
|
2377
|
+
total_metadata_files += 1
|
|
2378
|
+
if key == "performances":
|
|
2379
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "performances.xlsx")
|
|
2380
|
+
performances.create_excel(soda, False, metadata_path)
|
|
2381
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2382
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2383
|
+
total_files += 1
|
|
2384
|
+
total_metadata_files += 1
|
|
2385
|
+
if key == "resources":
|
|
2386
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "resources.xlsx")
|
|
2387
|
+
resources.create_excel(soda, False, metadata_path)
|
|
2388
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2389
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2390
|
+
total_files += 1
|
|
2391
|
+
total_metadata_files += 1
|
|
2392
|
+
if key == "sites":
|
|
2393
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "sites.xlsx")
|
|
2394
|
+
sites.create_excel(soda, False, metadata_path)
|
|
2395
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2396
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2397
|
+
total_files += 1
|
|
2398
|
+
total_metadata_files += 1
|
|
2399
|
+
if key == "dataset_description":
|
|
2400
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "dataset_description.xlsx")
|
|
2401
|
+
dataset_description.create_excel(soda, False, metadata_path)
|
|
2402
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2403
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2404
|
+
total_files += 1
|
|
2405
|
+
total_metadata_files += 1
|
|
2406
|
+
if key == "code_description":
|
|
2407
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "code_description.xlsx")
|
|
2408
|
+
code_description.create_excel(soda, False, metadata_path)
|
|
2409
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2410
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2411
|
+
total_files += 1
|
|
2412
|
+
total_metadata_files += 1
|
|
2413
|
+
if key == "manifest_file":
|
|
2414
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "manifest.xlsx")
|
|
2415
|
+
manifest.create_excel(soda, False, metadata_path)
|
|
2416
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2417
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2418
|
+
total_files += 1
|
|
2419
|
+
total_metadata_files += 1
|
|
2420
|
+
|
|
2421
|
+
if key == "README.md":
|
|
2422
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "README.md")
|
|
2423
|
+
text_metadata.create_text_file(soda, False, metadata_path, "README.md")
|
|
2424
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2425
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2426
|
+
total_files += 1
|
|
2427
|
+
total_metadata_files += 1
|
|
2428
|
+
if key == "CHANGES":
|
|
2429
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "CHANGES")
|
|
2430
|
+
text_metadata.create_text_file(soda, False, metadata_path, "CHANGES")
|
|
2431
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2432
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2433
|
+
total_files += 1
|
|
2434
|
+
total_metadata_files += 1
|
|
2435
|
+
if key == "LICENSE":
|
|
2436
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "LICENSE")
|
|
2437
|
+
text_metadata.create_text_file(soda, False, metadata_path, "LICENSE")
|
|
2438
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2439
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2440
|
+
total_files += 1
|
|
2441
|
+
total_metadata_files += 1
|
|
2442
|
+
|
|
2443
|
+
|
|
2444
|
+
|
|
2445
|
+
else:
|
|
2446
|
+
|
|
2447
|
+
vs = ums.df_mid_has_progress()
|
|
2448
|
+
|
|
2449
|
+
if resume == False or resume == True and not vs:
|
|
2450
|
+
main_curate_progress_message = "Preparing a list of files to upload"
|
|
2451
|
+
|
|
2452
|
+
existing_folder_option = soda["generate-dataset"]["if-existing"]
|
|
2453
|
+
existing_file_option = soda["generate-dataset"][
|
|
2454
|
+
"if-existing-files"
|
|
2455
|
+
]
|
|
2456
|
+
|
|
2457
|
+
# we will need a tracking structure to compare against
|
|
2458
|
+
tracking_json_structure = ds
|
|
2459
|
+
normalize_tracking_folder(tracking_json_structure)
|
|
2460
|
+
recursive_create_folder_for_ps(dataset_structure, tracking_json_structure, existing_folder_option)
|
|
2461
|
+
list_upload_files = recursive_dataset_scan_for_ps(
|
|
2462
|
+
dataset_structure,
|
|
2463
|
+
tracking_json_structure,
|
|
2464
|
+
existing_file_option,
|
|
2465
|
+
list_upload_files,
|
|
2466
|
+
relative_path,
|
|
2467
|
+
)
|
|
2468
|
+
|
|
2469
|
+
logger.info(f"List of files to upload: {list_upload_files}")
|
|
2470
|
+
|
|
2471
|
+
|
|
2472
|
+
# return and mark upload as completed if nothing is added to the manifest
|
|
2473
|
+
if len(list_upload_files) < 1:
|
|
2474
|
+
logger.warning("No files found to upload.")
|
|
2475
|
+
main_curate_progress_message = "No files were uploaded in this session"
|
|
2476
|
+
main_curate_status = "Done"
|
|
2477
|
+
return
|
|
2478
|
+
|
|
2479
|
+
# 3. Add high-level metadata files to a list
|
|
2480
|
+
if "dataset_metadata" in soda.keys():
|
|
2481
|
+
logger.info("ps_create_new_dataset (optional) step 3 create high level metadata list")
|
|
2482
|
+
# TODO: Add enahnced merge support post SDS3 launch
|
|
2483
|
+
# (
|
|
2484
|
+
# my_bf_existing_files_name,
|
|
2485
|
+
# _,
|
|
2486
|
+
# ) = ps_get_existing_files_details(ds)
|
|
2487
|
+
for key, _ in soda["dataset_metadata"].items():
|
|
2488
|
+
if key == "submission":
|
|
2489
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "submission.xlsx")
|
|
2490
|
+
submission.create_excel(soda, False, metadata_path)
|
|
2491
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2492
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2493
|
+
total_files += 1
|
|
2494
|
+
total_metadata_files += 1
|
|
2495
|
+
if key == "subjects":
|
|
2496
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "subjects.xlsx")
|
|
2497
|
+
subjects.create_excel(soda, False, metadata_path)
|
|
2498
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2499
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2500
|
+
total_files += 1
|
|
2501
|
+
total_metadata_files += 1
|
|
2502
|
+
if key == "samples":
|
|
2503
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "samples.xlsx")
|
|
2504
|
+
samples.create_excel(soda, False, metadata_path)
|
|
2505
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2506
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2507
|
+
total_files += 1
|
|
2508
|
+
total_metadata_files += 1
|
|
2509
|
+
if key == "performances":
|
|
2510
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "performances.xlsx")
|
|
2511
|
+
performances.create_excel(soda, False, metadata_path)
|
|
2512
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2513
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2514
|
+
total_files += 1
|
|
2515
|
+
total_metadata_files += 1
|
|
2516
|
+
if key == "resources":
|
|
2517
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "resources.xlsx")
|
|
2518
|
+
resources.create_excel(soda, False, metadata_path)
|
|
2519
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2520
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2521
|
+
total_files += 1
|
|
2522
|
+
total_metadata_files += 1
|
|
2523
|
+
if key == "sites":
|
|
2524
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "sites.xlsx")
|
|
2525
|
+
sites.create_excel(soda, False, metadata_path)
|
|
2526
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2527
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2528
|
+
total_files += 1
|
|
2529
|
+
total_metadata_files += 1
|
|
2530
|
+
if key == "dataset_description":
|
|
2531
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "dataset_description.xlsx")
|
|
2532
|
+
dataset_description.create_excel(soda, False, metadata_path)
|
|
2533
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2534
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2535
|
+
total_files += 1
|
|
2536
|
+
total_metadata_files += 1
|
|
2537
|
+
if key == "code_description":
|
|
2538
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "code_description.xlsx")
|
|
2539
|
+
code_description.create_excel(soda, False, metadata_path)
|
|
2540
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2541
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2542
|
+
total_files += 1
|
|
2543
|
+
total_metadata_files += 1
|
|
2544
|
+
if key == "manifest_file":
|
|
2545
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "manifest.xlsx")
|
|
2546
|
+
manifest.create_excel(soda, False, metadata_path)
|
|
2547
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2548
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2549
|
+
total_files += 1
|
|
2550
|
+
total_metadata_files += 1
|
|
2551
|
+
if key == "README.md":
|
|
2552
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "README.md")
|
|
2553
|
+
text_metadata.create_text_file(soda, False, metadata_path, "README.md")
|
|
2554
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2555
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2556
|
+
total_files += 1
|
|
2557
|
+
total_metadata_files += 1
|
|
2558
|
+
if key == "CHANGES":
|
|
2559
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "CHANGES")
|
|
2560
|
+
text_metadata.create_text_file(soda, False, metadata_path, "CHANGES")
|
|
2561
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2562
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2563
|
+
total_files += 1
|
|
2564
|
+
total_metadata_files += 1
|
|
2565
|
+
if key == "LICENSE":
|
|
2566
|
+
metadata_path = os.path.join(METADATA_UPLOAD_PS_PATH, "LICENSE")
|
|
2567
|
+
text_metadata.create_text_file(soda, False, metadata_path, "LICENSE")
|
|
2568
|
+
list_upload_metadata_files.append(metadata_path)
|
|
2569
|
+
main_total_generate_dataset_size += getsize(metadata_path)
|
|
2570
|
+
total_files += 1
|
|
2571
|
+
total_metadata_files += 1
|
|
2572
|
+
|
|
2573
|
+
|
|
2574
|
+
|
|
2575
|
+
|
|
2576
|
+
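# --- Illustrative sketch, not part of upload.py -------------------------------------
# The per-key branches above all have the same shape: build the target path, write the
# metadata file, record it for upload, and bump the size and file counters. The same
# dispatch can be read as a table from metadata key to (output filename, writer). The
# helper name `stage_metadata_files` is hypothetical and exists only for this sketch;
# the writer callables are the ones already called above (each takes (soda, False, path)).
from os.path import getsize, join

def stage_metadata_files(soda, requested_keys, upload_dir, writers):
    """Return (staged_paths, total_bytes) for every requested metadata key."""
    staged_paths, total_bytes = [], 0
    for key in requested_keys:
        if key not in writers:
            continue
        filename, writer = writers[key]
        path = join(upload_dir, filename)
        writer(soda, False, path)  # e.g. subjects.create_excel(soda, False, path)
        staged_paths.append(path)
        total_bytes += getsize(path)
    return staged_paths, total_bytes

# Example table (writers come from pysoda.core.metadata, as imported by this module):
# writers = {
#     "subjects": ("subjects.xlsx", subjects.create_excel),
#     "samples": ("samples.xlsx", samples.create_excel),
#     "manifest_file": ("manifest.xlsx", manifest.create_excel),
#     "README.md": ("README.md", lambda s, f, p: text_metadata.create_text_file(s, f, p, "README.md")),
# }
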
# 4. Prepare and add manifest files to a list
if "dataset_metadata" in soda.keys() and "manifest_files" in soda["dataset_metadata"].keys():
    logger.info("ps_create_new_dataset (optional) step 4 create manifest list")
    # create a local folder to save manifest files temporarily (delete any existing one first)
    # TODO: SDS 3 create manifests if not skipping and delete file on Pennsieve if it exists
    if "auto-generated" in soda["manifest-files"]:
        if soda["manifest-files"]["auto-generated"] == True:
            manifest_files_structure = get_auto_generated_manifest_files(soda)

            # add manifest files to the list after deleting existing ones
            for key in manifest_files_structure.keys():
                manifestpath = manifest_files_structure[key]
                folder = tracking_json_structure["children"]["folders"][key]

                # delete existing manifest files
                for child_key in folder["children"]["files"]:
                    file_name_no_ext = os.path.splitext(folder['children']['files'][child_key]['content']['name'])[0]
                    if file_name_no_ext.lower() == "manifest":
                        # delete the manifest file from the given folder
                        r = requests.post(f"{PENNSIEVE_URL}/data/delete", json={"things": [folder['children']['files'][child_key]['content']['id']]}, headers=create_request_headers(get_access_token()))
                        r.raise_for_status()

                # upload new manifest files
                # the number of files to upload and the total determine when the upload subscribers should stop listening to the dataset upload progress (when files uploaded == total files, stop listening)
                list_upload_manifest_files.append([manifestpath, key])
                total_files += 1
                total_manifest_files += 1
                main_total_generate_dataset_size += getsize(manifestpath)


# 2. Count how many files will be uploaded to inform the frontend - do not count if we are resuming a previous upload that has made progress
if not resume or resume and not ums.df_mid_has_progress():
    for folderInformation in list_upload_files:
        file_paths_count = len(folderInformation[0])
        total_files += file_paths_count
        total_dataset_files += file_paths_count


# 3. Upload files and add to tracking list
start_generate = 1


# resuming a dataset that had no files to rename or that failed before renaming any files
if resume and ums.df_mid_has_progress() and not ums.get_renaming_files_flow():
    main_curate_progress_message = "Preparing to retry upload. Progress on partially uploaded files will be reset."
    # reset necessary variables that were used in the failed upload session and cannot be reliably cached
    bytes_uploaded_per_file = {}

    # get the current manifest id for data files
    manifest_id = ums.get_df_mid()
    # get the cached values of the previous upload session
    main_total_generate_dataset_size = ums.get_main_total_generate_dataset_size()

    total_files = ums.get_total_files_to_upload()
    total_dataset_files = total_files
    current_files_in_subscriber_session = total_dataset_files

    main_curation_uploaded_files = total_files - ums.get_remaining_file_count(manifest_id, total_files)
    files_uploaded = main_curation_uploaded_files
    total_bytes_uploaded["value"] = ums.calculate_completed_upload_size(manifest_id, bytes_file_path_dict, total_files)

    # rename file information
    list_of_files_to_rename = ums.get_list_of_files_to_rename()
    renamed_files_counter = ums.get_rename_total_files()

    time.sleep(5)

    # upload the manifest files
    try:
        ps.manifest.upload(manifest_id)
        main_curate_progress_message = "Uploading data files..."
        # subscribe to the manifest upload so we wait until it has finished uploading before moving on
        ps.subscribe(10, False, monitor_subscriber_progress)
    except Exception as e:
        logger.error("Error uploading dataset files")
        logger.error(e)
        raise PennsieveUploadException("The Pennsieve Agent has encountered an issue while uploading. Please retry the upload. If this issue persists please follow this <a target='_blank' rel='noopener noreferrer' href='https://docs.sodaforsparc.io/docs/how-to/how-to-reinstall-the-pennsieve-agent'> guide</a> on performing a full reinstallation of the Pennsieve Agent then click the retry button.")
elif resume and ums.df_mid_has_progress() and ums.get_renaming_files_flow():
    # setup for the rename files flow
    list_of_files_to_rename = ums.get_list_of_files_to_rename()
    renamed_files_counter = ums.get_rename_total_files()
# create a manifest for files - IMP: We use a single file to start with since creating a manifest requires a file path. We need to remove this at the end.
elif len(list_upload_files) > 0:
|
|
2662
|
+
main_curate_progress_message = ("Queuing dataset files for upload with the Pennsieve Agent..." + "<br>" + "This may take some time.")
|
|
2663
|
+
|
|
2664
|
+
first_file_local_path = list_upload_files[0][0][0]
|
|
2665
|
+
|
|
2666
|
+
if brand_new_dataset:
|
|
2667
|
+
first_relative_path = list_upload_files[0][4]
|
|
2668
|
+
first_final_name = list_upload_files[0][2][0]
|
|
2669
|
+
else:
|
|
2670
|
+
first_relative_path = list_upload_files[0][6]
|
|
2671
|
+
first_final_name = list_upload_files[0][4][0]
|
|
2672
|
+
|
|
2673
|
+
folder_name = first_relative_path[first_relative_path.index("/")+1:]
|
|
2674
|
+
|
|
2675
|
+
if first_final_name != basename(first_file_local_path):
|
|
2676
|
+
# if file name is not the same as local path, then it has been renamed in SODA
|
|
2677
|
+
if folder_name not in list_of_files_to_rename:
|
|
2678
|
+
list_of_files_to_rename[folder_name] = {}
|
|
2679
|
+
if basename(first_file_local_path) not in list_of_files_to_rename[folder_name]:
|
|
2680
|
+
list_of_files_to_rename[folder_name][basename(first_file_local_path)] = {
|
|
2681
|
+
"final_file_name": first_final_name,
|
|
2682
|
+
"id": "",
|
|
2683
|
+
}
|
|
2684
|
+
renamed_files_counter += 1
|
|
2685
|
+
|
|
2686
|
+
manifest_data = ps.manifest.create(first_file_local_path, folder_name)
|
|
2687
|
+
manifest_id = manifest_data.manifest_id
|
|
2688
|
+
|
|
2689
|
+
|
|
2690
|
+
ums.set_df_mid(manifest_id)
|
|
2691
|
+
|
|
2692
|
+
# remove the item just added to the manifest
|
|
2693
|
+
list_upload_files[0][0].pop(0)
|
|
2694
|
+
|
|
2695
|
+
# reset global variables used in the subscriber monitoring function
|
|
2696
|
+
bytes_uploaded_per_file = {}
|
|
2697
|
+
total_bytes_uploaded = {"value": 0}
|
|
2698
|
+
current_files_in_subscriber_session = total_dataset_files
|
|
2699
|
+
|
|
2700
|
+
# there are files to add to the manifest if there is more than one file in the first folder or more than one folder
|
|
2701
|
+
if len(list_upload_files[0][0]) > 1 or len(list_upload_files) > 1:
|
|
2702
|
+
index_skip = True
|
|
2703
|
+
for folderInformation in list_upload_files:
|
|
2704
|
+
list_file_paths = folderInformation[0]
|
|
2705
|
+
if brand_new_dataset:
|
|
2706
|
+
relative_path = folderInformation[4]
|
|
2707
|
+
final_file_name_list = folderInformation[2]
|
|
2708
|
+
else:
|
|
2709
|
+
relative_path = folderInformation[6]
|
|
2710
|
+
final_file_name_list = folderInformation[4]
|
|
2711
|
+
# get the substring from the string relative_path that starts at the index of the / and contains the rest of the string
|
|
2712
|
+
try:
|
|
2713
|
+
folder_name = relative_path[relative_path.index("/")+1:]
|
|
2714
|
+
except ValueError as e:
|
|
2715
|
+
folder_name = relative_path
|
|
2716
|
+
|
|
2717
|
+
# Add files to the manifest
|
|
2718
|
+
final_files_index = 1 if index_skip else 0
|
|
2719
|
+
index_skip = False
|
|
2720
|
+
for file_path in list_file_paths:
|
|
2721
|
+
file_file_name = final_file_name_list[final_files_index]
|
|
2722
|
+
if file_file_name != basename(file_path):
|
|
2723
|
+
# save the relative path, final name and local path of the file to be renamed
|
|
2724
|
+
if folder_name not in list_of_files_to_rename:
|
|
2725
|
+
list_of_files_to_rename[folder_name] = {}
|
|
2726
|
+
if basename(file_path) not in list_of_files_to_rename[folder_name]:
|
|
2727
|
+
renamed_files_counter += 1
|
|
2728
|
+
list_of_files_to_rename[folder_name][basename(file_path)] = {
|
|
2729
|
+
"final_file_name": file_file_name,
|
|
2730
|
+
"id": "",
|
|
2731
|
+
}
|
|
2732
|
+
ps.manifest.add(file_path, folder_name, manifest_id)
|
|
2733
|
+
final_files_index += 1
|
|
2734
|
+
|
|
2735
|
+
|
|
2736
|
+
# add metadata files to the manifest
|
|
2737
|
+
if list_upload_metadata_files:
|
|
2738
|
+
current_files_in_subscriber_session += total_metadata_files
|
|
2739
|
+
# add the files to the manifest
|
|
2740
|
+
for manifest_path in list_upload_metadata_files:
|
|
2741
|
+
# subprocess call to the pennsieve agent to add the files to the manifest
|
|
2742
|
+
ps.manifest.add(manifest_path, target_base_path="", manifest_id=manifest_id)
|
|
2743
|
+
|
|
2744
|
+
|
|
2745
|
+
# add manifest files to the upload manifest
|
|
2746
|
+
if list_upload_manifest_files:
|
|
2747
|
+
current_files_in_subscriber_session += total_manifest_files
|
|
2748
|
+
for manifest_file_path in list_upload_manifest_files:
|
|
2749
|
+
# add the file to the manifest
|
|
2750
|
+
ps.manifest.add(manifest_file_path, "/", manifest_id)
|
|
2751
|
+
|
|
2752
|
+
|
|
2753
|
+
# set rename files to ums for upload resuming if this upload fails
|
|
2754
|
+
if renamed_files_counter > 0:
|
|
2755
|
+
ums.set_list_of_files_to_rename(list_of_files_to_rename)
|
|
2756
|
+
ums.set_rename_total_files(renamed_files_counter)
|
|
2757
|
+
|
|
2758
|
+
# upload the manifest files
|
|
2759
|
+
try:
|
|
2760
|
+
ps.manifest.upload(manifest_id)
|
|
2761
|
+
|
|
2762
|
+
main_curate_progress_message = ("Uploading data files...")
|
|
2763
|
+
|
|
2764
|
+
# subscribe to the manifest upload so we wait until it has finished uploading before moving on
|
|
2765
|
+
ps.subscribe(10, False, monitor_subscriber_progress)
|
|
2766
|
+
|
|
2767
|
+
except Exception as e:
|
|
2768
|
+
logger.error(e)
|
|
2769
|
+
raise PennsieveUploadException("The Pennsieve Agent has encountered an issue while uploading. Please retry the upload. If this issue persists please follow this <a target='_blank' rel='noopener noreferrer' href='https://docs.sodaforsparc.io/docs/how-to/how-to-reinstall-the-pennsieve-agent'> guide</a> on performing a full reinstallation of the Pennsieve Agent then click the retry button.")
|
|
2770
|
+
|
|
2771
|
+
|
|
2772
|
+
# wait for all of the Agent's processes to finish to avoid errors when deleting files on Windows
|
|
2773
|
+
time.sleep(1)
|
|
2774
|
+
|
|
2775
|
+
# 6. Rename files
|
|
2776
|
+
if list_of_files_to_rename:
|
|
2777
|
+
renaming_files_flow = True
|
|
2778
|
+
logger.info("ps_create_new_dataset (optional) step 8 rename files")
|
|
2779
|
+
main_curate_progress_message = ("Preparing files to be renamed...")
|
|
2780
|
+
dataset_id = ds["content"]["id"]
|
|
2781
|
+
collection_ids = {}
|
|
2782
|
+
# gets the high level folders in the dataset
|
|
2783
|
+
r = requests.get(f"{PENNSIEVE_URL}/datasets/{dataset_id}", headers=create_request_headers(ps))
|
|
2784
|
+
r.raise_for_status()
|
|
2785
|
+
dataset_content = r.json()["children"]
|
|
2786
|
+
|
|
2787
|
+
if dataset_content == []:
|
|
2788
|
+
while dataset_content == []:
|
|
2789
|
+
time.sleep(3)
|
|
2790
|
+
r = requests.get(f"{PENNSIEVE_URL}/datasets/{dataset_id}", headers=create_request_headers(ps))
|
|
2791
|
+
r.raise_for_status()
|
|
2792
|
+
dataset_content = r.json()["children"]
|
|
2793
|
+
|
|
2794
|
+
collections_found = False
|
|
2795
|
+
while not collections_found:
|
|
2796
|
+
for item in dataset_content:
|
|
2797
|
+
# high lvl folders' ids are stored to be used to find the file IDS
|
|
2798
|
+
if item["content"]["packageType"] == "Collection":
|
|
2799
|
+
collections_found = True
|
|
2800
|
+
collection_ids[item["content"]["name"]] = {"id": item["content"]["nodeId"]}
|
|
2801
|
+
|
|
2802
|
+
|
|
2803
|
+
if not collections_found:
|
|
2804
|
+
# No collections were found, metadata files were processed but not the high level folders
|
|
2805
|
+
time.sleep(3)
|
|
2806
|
+
r = requests.get(f"{PENNSIEVE_URL}/datasets/{dataset_id}", headers=create_request_headers(ps))
|
|
2807
|
+
r.raise_for_status()
|
|
2808
|
+
dataset_content = r.json()["children"]
|
|
2809
|
+
|
|
2810
|
+
for key in list_of_files_to_rename:
|
|
2811
|
+
# split the key up if there are multiple folders in the relative path
|
|
2812
|
+
relative_path = key.split("/")
|
|
2813
|
+
high_lvl_folder_name = relative_path[0]
|
|
2814
|
+
subfolder_level = 0
|
|
2815
|
+
subfolder_amount = len(relative_path) - 1
|
|
2816
|
+
|
|
2817
|
+
if high_lvl_folder_name in collection_ids:
|
|
2818
|
+
# subfolder_amount will be the amount of subfolders we need to call until we can get the file ID to rename
|
|
2819
|
+
|
|
2820
|
+
high_lvl_folder_id = collection_ids[high_lvl_folder_name]["id"]
|
|
2821
|
+
limit = 100
|
|
2822
|
+
offset = 0
|
|
2823
|
+
dataset_content = []
|
|
2824
|
+
while True:
|
|
2825
|
+
r = requests.get(f"{PENNSIEVE_URL}/packages/{high_lvl_folder_id}?limit={limit}&offset={offset}", headers=create_request_headers(ps))
|
|
2826
|
+
r.raise_for_status()
|
|
2827
|
+
page = r.json()["children"]
|
|
2828
|
+
dataset_content.extend(page)
|
|
2829
|
+
|
|
2830
|
+
if len(page) < limit:
|
|
2831
|
+
break
|
|
2832
|
+
offset += limit
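# --- Illustrative sketch, not part of upload.py -------------------------------------
# The limit/offset loop above is repeated every time this module lists the children of
# a Pennsieve package. The same pattern written once as a helper; the name
# `fetch_all_children` is hypothetical, while PENNSIEVE_URL and create_request_headers
# are the module-level helpers already used in the requests above.
import requests

def fetch_all_children(package_id, headers, base_url, limit=100):
    """Collect every child of a package by paging through ?limit=&offset=."""
    children, offset = [], 0
    while True:
        r = requests.get(f"{base_url}/packages/{package_id}?limit={limit}&offset={offset}", headers=headers)
        r.raise_for_status()
        page = r.json()["children"]
        children.extend(page)
        if len(page) < limit:  # a short page means there is nothing left to fetch
            break
        offset += limit
    return children

# Example (hypothetical id): fetch_all_children(high_lvl_folder_id, create_request_headers(ps), PENNSIEVE_URL)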
|
|
2833
|
+
|
|
2834
|
+
if dataset_content == []:
|
|
2835
|
+
# keep requesting while there is no children content (the folder is empty, so its files have not been processed yet)
|
|
2836
|
+
while dataset_content == []:
|
|
2837
|
+
time.sleep(3)
|
|
2838
|
+
limit = 100
|
|
2839
|
+
offset = 0
|
|
2840
|
+
|
|
2841
|
+
while True:
|
|
2842
|
+
r = requests.get(f"{PENNSIEVE_URL}/packages/{high_lvl_folder_id}?limit={limit}&offset={offset}", headers=create_request_headers(ps))
|
|
2843
|
+
r.raise_for_status()
|
|
2844
|
+
page = r.json()["children"]
|
|
2845
|
+
dataset_content.extend(page)
|
|
2846
|
+
if len(page) < limit:
|
|
2847
|
+
break
|
|
2848
|
+
offset += limit
|
|
2849
|
+
|
|
2850
|
+
|
|
2851
|
+
if subfolder_amount == 0:
|
|
2852
|
+
# the file is in the high level folder
|
|
2853
|
+
if "id" not in list_of_files_to_rename[key]:
|
|
2854
|
+
# store the id of the folder to be used again in case the file id is not found (happens when not all files have been processed yet)
|
|
2855
|
+
list_of_files_to_rename[key]["id"] = high_lvl_folder_id
|
|
2856
|
+
|
|
2857
|
+
|
|
2858
|
+
for item in dataset_content:
|
|
2859
|
+
if item["content"]["packageType"] != "Collection":
|
|
2860
|
+
file_name = item["content"]["name"]
|
|
2861
|
+
file_id = item["content"]["nodeId"]
|
|
2862
|
+
|
|
2863
|
+
if file_name in list_of_files_to_rename[key]:
|
|
2864
|
+
# name
|
|
2865
|
+
# store the package id for now
|
|
2866
|
+
list_of_files_to_rename[key][file_name]["id"] = file_id
|
|
2867
|
+
else:
|
|
2868
|
+
# file is within a subfolder and we recursively iterate until we get to the last subfolder needed
|
|
2869
|
+
subfolder_id = collection_ids[high_lvl_folder_name]["id"]
|
|
2870
|
+
while subfolder_level != subfolder_amount:
|
|
2871
|
+
if dataset_content == []:
|
|
2872
|
+
# subfolder has no content so request again
|
|
2873
|
+
while dataset_content == []:
|
|
2874
|
+
time.sleep(3)
|
|
2875
|
+
limit = 100
|
|
2876
|
+
offset = 0
|
|
2877
|
+
while True:
|
|
2878
|
+
r = requests.get(f"{PENNSIEVE_URL}/packages/{subfolder_id}", headers=create_request_headers(ps))
|
|
2879
|
+
r.raise_for_status()
|
|
2880
|
+
page = r.json()["children"]
|
|
2881
|
+
dataset_content.extend(page)
|
|
2882
|
+
if len(page) < limit:
|
|
2883
|
+
break
|
|
2884
|
+
offset += limit
|
|
2885
|
+
|
|
2886
|
+
|
|
2887
|
+
for item in dataset_content:
|
|
2888
|
+
if item["content"]["packageType"] == "Collection":
|
|
2889
|
+
folder_name = item["content"]["name"]
|
|
2890
|
+
folder_id = item["content"]["nodeId"]
|
|
2891
|
+
|
|
2892
|
+
if folder_name in relative_path:
|
|
2893
|
+
# we have found the folder we need to iterate through
|
|
2894
|
+
subfolder_level += 1
|
|
2895
|
+
|
|
2896
|
+
limit = 100
|
|
2897
|
+
offset = 0
|
|
2898
|
+
children = []
|
|
2899
|
+
while True:
|
|
2900
|
+
r = requests.get(f"{PENNSIEVE_URL}/packages/{folder_id}?limit={limit}&offset={offset}", headers=create_request_headers(ps))
|
|
2901
|
+
r.raise_for_status()
|
|
2902
|
+
page = r.json()["children"]
|
|
2903
|
+
children.extend(page)
|
|
2904
|
+
if len(page) < limit:
|
|
2905
|
+
break
|
|
2906
|
+
offset += limit
|
|
2907
|
+
|
|
2908
|
+
if subfolder_level != subfolder_amount:
|
|
2909
|
+
dataset_content = children
|
|
2910
|
+
if dataset_content == []:
|
|
2911
|
+
while dataset_content == []:
|
|
2912
|
+
# subfolder has no content so request again
|
|
2913
|
+
time.sleep(3)
|
|
2914
|
+
limit = 100
|
|
2915
|
+
offset = 0
|
|
2916
|
+
while True:
|
|
2917
|
+
r = requests.get(f"{PENNSIEVE_URL}/packages/{folder_id}", headers=create_request_headers(ps))
|
|
2918
|
+
r.raise_for_status()
|
|
2919
|
+
page = r.json()["children"]
|
|
2920
|
+
dataset_content.extend(page)
|
|
2921
|
+
if len(page) < limit:
|
|
2922
|
+
break
|
|
2923
|
+
offset += limit
|
|
2924
|
+
|
|
2925
|
+
subfolder_id = folder_id
|
|
2926
|
+
break
|
|
2927
|
+
else:
|
|
2928
|
+
# we are at the last folder in the relative path, we can get the file id
|
|
2929
|
+
if "id" not in list_of_files_to_rename[key]:
|
|
2930
|
+
# store the id of the last folder to directly call later in case not all files get an id
|
|
2931
|
+
list_of_files_to_rename[key]["id"] = folder_id
|
|
2932
|
+
for item in children:
|
|
2933
|
+
if item["content"]["packageType"] != "Collection":
|
|
2934
|
+
file_name = item["content"]["name"]
|
|
2935
|
+
file_id = item["content"]["nodeId"]
|
|
2936
|
+
|
|
2937
|
+
if file_name in list_of_files_to_rename[key]:
|
|
2938
|
+
# store the package id for renaming
|
|
2939
|
+
list_of_files_to_rename[key][file_name]["id"] = file_id
|
|
2940
|
+
else:
|
|
2941
|
+
continue
|
|
2942
|
+
|
|
2943
|
+
# 8.5 Rename files - All or most ids have been fetched now rename the files or gather the ids again if not all files have been processed at this time
|
|
2944
|
+
main_curate_progress_message = "Renaming files..."
|
|
2945
|
+
main_generated_dataset_size = 0
|
|
2946
|
+
main_total_generate_dataset_size = renamed_files_counter
|
|
2947
|
+
for relative_path in list_of_files_to_rename:
|
|
2948
|
+
for file in list_of_files_to_rename[relative_path].keys():
|
|
2949
|
+
collection_id = list_of_files_to_rename[relative_path]["id"]
|
|
2950
|
+
if file == "id":
|
|
2951
|
+
continue
|
|
2952
|
+
new_name = list_of_files_to_rename[relative_path][file]["final_file_name"]
|
|
2953
|
+
file_id = list_of_files_to_rename[relative_path][file]["id"]
|
|
2954
|
+
|
|
2955
|
+
if file_id != "":
|
|
2956
|
+
# id was found so make api call to rename with final file name
|
|
2957
|
+
try:
|
|
2958
|
+
r = requests.put(f"{PENNSIEVE_URL}/packages/{file_id}?updateStorage=true", json={"name": new_name}, headers=create_request_headers(ps))
|
|
2959
|
+
r.raise_for_status()
|
|
2960
|
+
except Exception as e:
|
|
2961
|
+
if r.status_code == 500:
|
|
2962
|
+
continue
|
|
2963
|
+
main_generated_dataset_size += 1
|
|
2964
|
+
else:
|
|
2965
|
+
# id was not found so keep trying to get the id until it is found
|
|
2966
|
+
all_ids_found = False
|
|
2967
|
+
while not all_ids_found:
|
|
2968
|
+
collection_id = list_of_files_to_rename[relative_path]["id"]
|
|
2969
|
+
if file == "id":
|
|
2970
|
+
continue
|
|
2971
|
+
|
|
2972
|
+
|
|
2973
|
+
limit = 100
|
|
2974
|
+
offset = 0
|
|
2975
|
+
dataset_content = []
|
|
2976
|
+
|
|
2977
|
+
while True:
|
|
2978
|
+
r = requests.put(f"{PENNSIEVE_URL}/packages/{collection_id}?updateStorage=true&limit={limit}&offset={offset}", headers=create_request_headers(ps))
|
|
2979
|
+
r.raise_for_status()
|
|
2980
|
+
page = r.json()["children"]
|
|
2981
|
+
dataset_content.extend(page)
|
|
2982
|
+
if len(dataset_content) < limit:
|
|
2983
|
+
break
|
|
2984
|
+
offset += limit
|
|
2985
|
+
|
|
2986
|
+
for item in dataset_content:
|
|
2987
|
+
if item["content"]["packageType"] != "Collection":
|
|
2988
|
+
file_name = item["content"]["name"]
|
|
2989
|
+
file_id = item["content"]["nodeId"]
|
|
2990
|
+
|
|
2991
|
+
if file_name == file:
|
|
2992
|
+
# id was found so make api call to rename with the final file name
|
|
2993
|
+
try:
|
|
2994
|
+
r = requests.put(f"{PENNSIEVE_URL}/packages/{file_id}", json={"name": new_name}, headers=create_request_headers(ps))
|
|
2995
|
+
r.raise_for_status()
|
|
2996
|
+
except Exception as e:
|
|
2997
|
+
if r.status_code == 500:
|
|
2998
|
+
continue
|
|
2999
|
+
main_generated_dataset_size += 1
|
|
3000
|
+
all_ids_found = True
|
|
3001
|
+
break
|
|
3002
|
+
|
|
3003
|
+
|
|
3004
|
+
|
|
3005
|
+
|
|
3006
|
+
|
|
3007
|
+
# get the manifest id of the Pennsieve upload manifest created when uploading
|
|
3008
|
+
|
|
3009
|
+
|
|
3010
|
+
origin_manifest_id = get_origin_manifest_id(selected_id)
|
|
3011
|
+
|
|
3012
|
+
# if files were uploaded but later receive the 'Failed' status in the Pennsieve manifest we allow users to retry the upload;
# set the prerequisite information for the upload to be retried in that case
# NOTE: We do not need to store the rename information here. Rationale: if the upload for a file failed, the rename could not succeed and we would not reach this point.
# What would happen instead is as follows (in an optimistic case where the upload doesn't keep being marked as Failed):
# 1. The upload for a file fails
# 2. The upload information (including rename information) gets stored in the catch-all error handling block
# 3. The user retries the upload
# 4. The manifest counts the Failed file as a file to be retried
# 5. The manifest is uploaded again and the file is uploaded again
# 6. The file is renamed successfully this time
|
|
3022
|
+
ums.set_main_total_generate_dataset_size(main_total_generate_dataset_size)
|
|
3023
|
+
ums.set_total_files_to_upload(total_files)
|
|
3024
|
+
ums.set_elapsed_time(elapsed_time)
|
|
3025
|
+
|
|
3026
|
+
# at end of successful session reset tracking for folders created
|
|
3027
|
+
main_curate_progress_message = "Success: COMPLETED!"
|
|
3028
|
+
main_curate_status = "Done"
|
|
3029
|
+
|
|
3030
|
+
|
|
3031
|
+
shutil.rmtree(manifest_folder_path) if isdir(manifest_folder_path) else 0
|
|
3032
|
+
end = timer()
|
|
3033
|
+
logger.info(f"Time for ps_upload_to_dataset function: {timedelta(seconds=end - start)}")
|
|
3034
|
+
except Exception as e:
|
|
3035
|
+
# reset the total bytes uploaded for any file that has not been fully uploaded
|
|
3036
|
+
ums.set_main_total_generate_dataset_size(main_total_generate_dataset_size)
|
|
3037
|
+
ums.set_total_files_to_upload(total_files)
|
|
3038
|
+
ums.set_elapsed_time(elapsed_time)
|
|
3039
|
+
# store the renaming files information in case the upload fails and we need to rename files during the retry
|
|
3040
|
+
ums.set_renaming_files_flow(renaming_files_flow) # this determines if we failed while renaming files after the upload is complete
|
|
3041
|
+
ums.set_rename_total_files(renamed_files_counter)
|
|
3042
|
+
ums.set_list_of_files_to_rename(list_of_files_to_rename)
|
|
3043
|
+
raise e
|
|
3044
|
+
|
|
3045
|
+
main_curate_status = ""
|
|
3046
|
+
main_curate_print_status = ""
|
|
3047
|
+
main_curate_progress_message = ""
|
|
3048
|
+
main_total_generate_dataset_size = 1
|
|
3049
|
+
main_generated_dataset_size = 0
|
|
3050
|
+
start_generate = 0
|
|
3051
|
+
generate_start_time = 0
|
|
3052
|
+
main_generate_destination = ""
|
|
3053
|
+
main_initial_bfdataset_size = 0
|
|
3054
|
+
myds = ""
|
|
3055
|
+
renaming_files_flow = False
|
|
3056
|
+
elapsed_time = None
|
|
3057
|
+
manifest_id = None
|
|
3058
|
+
origin_manifest_id = None
|
|
3059
|
+
|
|
3060
|
+
|
|
3061
|
+
|
|
3062
|
+
def ps_check_dataset_files_validity(soda):
|
|
3063
|
+
"""
|
|
3064
|
+
Function to check that the bf data files and folders specified in the dataset are valid
|
|
3065
|
+
|
|
3066
|
+
Args:
|
|
3067
|
+
dataset_structure: soda dict with information about all specified files and folders
|
|
3068
|
+
Output:
|
|
3069
|
+
error: error message with list of non valid local data files, if any
|
|
3070
|
+
"""
|
|
3071
|
+
def check_folder_validity(folder_id, folder_dict, folder_path, error):
|
|
3072
|
+
"""
|
|
3073
|
+
Function to verify that the subfolders and files specified in the dataset are valid
|
|
3074
|
+
|
|
3075
|
+
Args:
|
|
3076
|
+
folder_id: id of the folder in the dataset
|
|
3077
|
+
folder_dict: dict with information about the folder
|
|
3078
|
+
folder_path: path of the folder in the dataset
|
|
3079
|
+
error: error message with list of non valid files/folders, if any
|
|
3080
|
+
Output:
|
|
3081
|
+
error: error message with list of non valid files/folders, if any
|
|
3082
|
+
"""
|
|
3083
|
+
# get the folder content through Pennsieve api
|
|
3084
|
+
limit = 100
|
|
3085
|
+
offset = 0
|
|
3086
|
+
folder_content = []
|
|
3087
|
+
while True:
|
|
3088
|
+
r = requests.get(f"{PENNSIEVE_URL}/packages/{folder_id}?offset={offset}&limit={limit}", headers=create_request_headers(get_access_token()))
|
|
3089
|
+
r.raise_for_status()
|
|
3090
|
+
page = r.json()["children"]
|
|
3091
|
+
folder_content.extend(page)
|
|
3092
|
+
if len(page) < limit:
|
|
3093
|
+
break
|
|
3094
|
+
offset += limit
|
|
3095
|
+
|
|
3096
|
+
# check that the subfolders and files specified in the dataset are valid
|
|
3097
|
+
if "files" in folder_dict.keys():
|
|
3098
|
+
for file_key, file in folder_dict["files"].items():
|
|
3099
|
+
file_type = file.get("location")
|
|
3100
|
+
relative_path = (f"{folder_path}/{file_key}")
|
|
3101
|
+
# If file is from Pennsieve we verify if file exists on Pennsieve
|
|
3102
|
+
if file_type == "ps":
|
|
3103
|
+
file_actions = file["action"]
|
|
3104
|
+
file_id = file["path"]
|
|
3105
|
+
if "moved" in file_actions:
|
|
3106
|
+
try:
|
|
3107
|
+
r = requests.get(f"{PENNSIEVE_URL}/packages/{file_id}/view", headers=create_request_headers(get_access_token()))
|
|
3108
|
+
r.raise_for_status()
|
|
3109
|
+
except Exception as e:
|
|
3110
|
+
error.append(f"{relative_path} id: {file_id}")
|
|
3111
|
+
continue
|
|
3112
|
+
if next((item for item in folder_content if item["content"]["id"] == file_id), None) is None:
|
|
3113
|
+
error.append(f"{relative_path} id: {file_id}")
|
|
3114
|
+
|
|
3115
|
+
if "folders" in folder_dict.keys():
|
|
3116
|
+
for folder_key, folder in folder_dict["folders"].items():
|
|
3117
|
+
folder_type = folder.get("location")
|
|
3118
|
+
relative_path = (f"{folder_path}/{folder_key}")
|
|
3119
|
+
if folder_type == "ps":
|
|
3120
|
+
folder_id = folder["path"]
|
|
3121
|
+
folder_action = folder["action"]
|
|
3122
|
+
if "moved" in folder_action:
|
|
3123
|
+
try:
|
|
3124
|
+
r = requests.get(f"{PENNSIEVE_URL}/packages/{folder_id}", headers=create_request_headers(get_access_token()))
|
|
3125
|
+
r.raise_for_status()
|
|
3126
|
+
except Exception as e:
|
|
3127
|
+
error.append(f"{relative_path} id: {folder_id}")
|
|
3128
|
+
continue
|
|
3129
|
+
if next((item for item in folder_content if item["content"]["id"] == folder_id), None) is None:
|
|
3130
|
+
error.append(f"{relative_path} id: {folder_id}")
|
|
3131
|
+
else:
|
|
3132
|
+
check_folder_validity(folder_id, folder, relative_path, error)
|
|
3133
|
+
|
|
3134
|
+
return error
|
|
3135
|
+
|
|
3136
|
+
error = []
|
|
3137
|
+
# check that the files and folders specified in the dataset are valid
|
|
3138
|
+
dataset_name = soda["ps-dataset-selected"]["dataset-name"]
|
|
3139
|
+
dataset_id = get_dataset_id(dataset_name)
|
|
3140
|
+
r = requests.get(f"{PENNSIEVE_URL}/datasets/{dataset_id}", headers=create_request_headers(get_access_token()))
|
|
3141
|
+
r.raise_for_status()
|
|
3142
|
+
root_folder = r.json()["children"]
|
|
3143
|
+
|
|
3144
|
+
if len(root_folder) == 0:
|
|
3145
|
+
return error
|
|
3146
|
+
|
|
3147
|
+
if "dataset-structure" in soda.keys():
|
|
3148
|
+
dataset_structure = soda["dataset-structure"]
|
|
3149
|
+
if "folders" in dataset_structure:
|
|
3150
|
+
for folder_key, folder in dataset_structure["folders"].items():
|
|
3151
|
+
folder_type = folder.get("location")
|
|
3152
|
+
relative_path = folder_key
|
|
3153
|
+
if folder_type == "ps":
|
|
3154
|
+
collection_id = folder["path"]
|
|
3155
|
+
collection_actions = folder["action"]
|
|
3156
|
+
if "moved" in collection_actions:
|
|
3157
|
+
try:
|
|
3158
|
+
r = requests.get(f"{PENNSIEVE_URL}/packages/{collection_id}/view", headers=create_request_headers(get_access_token()))
|
|
3159
|
+
r.raise_for_status()
|
|
3160
|
+
except Exception:
|
|
3161
|
+
error.append(f"{relative_path} id: {collection_id}")
|
|
3162
|
+
continue
|
|
3163
|
+
if next((item for item in root_folder if item["content"]["id"] == collection_id), None) is None:
|
|
3164
|
+
error.append(f"{relative_path} id: {collection_id}")
|
|
3165
|
+
else:
|
|
3166
|
+
# recursively check all files + subfolders of collection_id
|
|
3167
|
+
error = check_folder_validity(collection_id, folder, relative_path, error)
|
|
3168
|
+
|
|
3169
|
+
# if there are items in the error list, check if they have been "moved"
|
|
3170
|
+
if len(error) > 0:
|
|
3171
|
+
error_message = [
|
|
3172
|
+
"Error: The following Pennsieve files/folders are invalid. Specify them again or remove them."
|
|
3173
|
+
]
|
|
3174
|
+
error = error_message + error
|
|
3175
|
+
|
|
3176
|
+
return error
|
|
3177
|
+
|
|
3178
|
+
|
|
3179
|
+
def check_server_access_to_files(file_list):
    # Return two lists: one with the files the server can open, and one with the files it cannot.
    # This avoids the server trying to open files that it does not have access to.
    accessible_files = []
    inaccessible_files = []
    for file in file_list:
        if os.path.isfile(file) or os.path.isdir(file):
            accessible_files.append(file)
        else:
            inaccessible_files.append(file)

    return {"accessible_files": accessible_files, "inaccessible_files": inaccessible_files}
|
|
3191
|
+
|
|
3192
|
+
|
|
3193
|
+
# TODO: Update for SDS 3.0
|
|
3194
|
+
def clean_json_structure(soda):
|
|
3195
|
+
global logger
|
|
3196
|
+
# Delete any files on Pennsieve that have been marked as deleted
|
|
3197
|
+
def recursive_file_delete(folder):
|
|
3198
|
+
if "files" in folder.keys():
|
|
3199
|
+
for item in list(folder["files"]):
|
|
3200
|
+
if item in ["manifest.xlsx", "manifest.csv"]:
|
|
3201
|
+
continue
|
|
3202
|
+
if "deleted" in folder["files"][item]["action"]:
|
|
3203
|
+
# remove the file from the soda json structure
|
|
3204
|
+
del folder["files"][item]
|
|
3205
|
+
|
|
3206
|
+
for item in list(folder["folders"]):
|
|
3207
|
+
recursive_file_delete(folder["folders"][item])
|
|
3208
|
+
|
|
3209
|
+
|
|
3210
|
+
# Rename any files that exist on Pennsieve
|
|
3211
|
+
def recursive_file_rename(folder):
|
|
3212
|
+
if "files" in folder.keys():
|
|
3213
|
+
for item in list(folder["files"]):
|
|
3214
|
+
if (
|
|
3215
|
+
"renamed" in folder["files"][item]["action"]
|
|
3216
|
+
and folder["files"][item]["location"] == "ps"
|
|
3217
|
+
):
|
|
3218
|
+
continue
|
|
3219
|
+
|
|
3220
|
+
for item in list(folder["folders"]):
|
|
3221
|
+
recursive_file_rename(folder["folders"][item])
|
|
3222
|
+
|
|
3223
|
+
|
|
3224
|
+
def recursive_folder_delete(folder):
|
|
3225
|
+
"""
|
|
3226
|
+
Delete any stray folders that exist on Pennsieve
|
|
3227
|
+
Only top level files are deleted since the api deletes any
|
|
3228
|
+
files and folders that exist inside.
|
|
3229
|
+
"""
|
|
3230
|
+
|
|
3231
|
+
for folder_item in list(folder["folders"]):
|
|
3232
|
+
if folder["folders"][folder_item]["location"] == "ps":
|
|
3233
|
+
if "deleted" in folder["folders"][folder_item]["action"]:
|
|
3234
|
+
del folder["folders"][folder_item]
|
|
3235
|
+
else:
|
|
3236
|
+
recursive_folder_delete(folder["folders"][folder_item])
|
|
3237
|
+
else:
|
|
3238
|
+
recursive_folder_delete(folder["folders"][folder_item])
|
|
3239
|
+
return
|
|
3240
|
+
|
|
3241
|
+
main_keys = soda.keys()
|
|
3242
|
+
dataset_structure = soda["dataset-structure"]
|
|
3243
|
+
|
|
3244
|
+
if ("dataset-structure" not in main_keys and "dataset_metadata" not in main_keys):
|
|
3245
|
+
if "ps-dataset-selected" in main_keys:
|
|
3246
|
+
dataset_name = soda["ps-dataset-selected"]["dataset-name"]
|
|
3247
|
+
elif "generate-dataset" in main_keys:
|
|
3248
|
+
dataset_name = soda["generate-dataset"]["dataset-name"]
|
|
3249
|
+
else:
|
|
3250
|
+
dataset_name = "Unset Name"
|
|
3251
|
+
raise EmptyDatasetError(dataset_name)
|
|
3252
|
+
|
|
3253
|
+
if "generate-dataset" in main_keys:
|
|
3254
|
+
# Check that local files/folders exist
|
|
3255
|
+
try:
|
|
3256
|
+
if error := check_local_dataset_files_validity(soda):
|
|
3257
|
+
raise LocalDatasetMissingSpecifiedFiles(error)
|
|
3258
|
+
# check that dataset is not empty after removing all the empty files and folders
|
|
3259
|
+
if not soda["dataset-structure"]["folders"] and "dataset_metadata" not in soda:
|
|
3260
|
+
if "ps-dataset-selected" in main_keys:
|
|
3261
|
+
dataset_name = soda["ps-dataset-selected"]["dataset-name"]
|
|
3262
|
+
elif "generate-dataset" in main_keys:
|
|
3263
|
+
dataset_name = soda["generate-dataset"]["dataset-name"]
|
|
3264
|
+
else:
|
|
3265
|
+
dataset_name = "Unset Name"
|
|
3266
|
+
raise EmptyDatasetError(dataset_name)
|
|
3267
|
+
except Exception as e:
|
|
3268
|
+
raise e
|
|
3269
|
+
|
|
3270
|
+
if "starting-point" in main_keys and soda["starting-point"][
|
|
3271
|
+
"origin"
|
|
3272
|
+
] in ["ps", "local"]:
|
|
3273
|
+
recursive_file_delete(dataset_structure)
|
|
3274
|
+
recursive_folder_delete(dataset_structure)
|
|
3275
|
+
soda["dataset-structure"] = dataset_structure
|
|
3276
|
+
|
|
3277
|
+
|
|
3278
|
+
# here will be clean up the soda json object before creating the manifest file cards
|
|
3279
|
+
return {"soda": soda}
|
|
3280
|
+
|
|
3281
|
+
|
|
3282
|
+
|
|
3283
|
+
def validate_local_dataset_generate_path(soda):
    generate_dataset = soda["generate-dataset"]
    local_dataset_path = generate_dataset["path"]
    if not isdir(local_dataset_path):
        error_message = (
            "Error: The Path "
            + local_dataset_path
            + " is not found. Please select a valid destination folder for the new dataset"
        )
        raise FileNotFoundError(error_message)
|
|
3293
|
+
|
|
3294
|
+
|
|
3295
|
+
|
|
3296
|
+
|
|
3297
|
+
def generating_on_ps(soda):
    return soda["generate-dataset"]["destination"] == "ps"

def uploading_with_ps_account(soda):
    return "ps-account-selected" in soda

def uploading_to_existing_ps_dataset(soda):
    return "ps-dataset-selected" in soda

def can_resume_prior_upload(resume_status):
    global ums
    return resume_status and ums.df_mid_has_progress()

def virtual_dataset_empty(soda):
    return (
        "dataset-structure" not in soda
        and "metadata-files" not in soda
    )

def generate_options_set(soda):
    return "generate-dataset" in soda.keys()

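# --- Illustrative sketch, not part of upload.py -------------------------------------
# The predicates above only inspect a handful of top-level keys in the soda dict. A
# minimal example of the shape they expect (the values are hypothetical; the key names
# are the ones used throughout this module):
example_soda = {
    "generate-dataset": {"destination": "ps", "generate-option": "new", "dataset-name": "my-dataset"},
    "ps-account-selected": {"account-name": "SODA-Pennsieve"},
    "ps-dataset-selected": {"dataset-name": "my-dataset"},
    "starting-point": {"origin": "new"},
    "dataset-structure": {"folders": {}},
}
# generating_on_ps(example_soda)                 -> True
# uploading_with_ps_account(example_soda)        -> True
# uploading_to_existing_ps_dataset(example_soda) -> True
# generate_options_set(example_soda)             -> True
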
def get_dataset_with_backoff(selected_dataset_id):
    # check that the dataset was created, with a limited number of retries (for some users the dataset isn't automatically accessible)
    attempts = 0
    while attempts < 3:
        try:
            # whether we are generating a new dataset or merging, we want the dataset information for later steps
            r = requests.get(f"{PENNSIEVE_URL}/datasets/{selected_dataset_id}", headers=create_request_headers(get_access_token()))
            r.raise_for_status()
            return r.json()
        except Exception as e:
            attempts += 1
            # check if this was the final attempt
            if attempts >= 2:
                # raise the error to the user
                raise e
            time.sleep(10)

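# --- Illustrative sketch, not part of upload.py -------------------------------------
# get_dataset_with_backoff retries the dataset lookup a fixed number of times with a
# 10 second pause, because a freshly created dataset is not always immediately visible
# to the caller. The same idea as a generic helper; `retry_request` is a hypothetical
# name, and the callable passed in is expected to raise on failure.
import time

def retry_request(make_request, attempts=3, delay_seconds=10):
    """Call make_request() until it succeeds or the attempts are exhausted."""
    for attempt in range(1, attempts + 1):
        try:
            return make_request()
        except Exception:
            if attempt == attempts:
                raise  # give the caller the final error
            time.sleep(delay_seconds)

# Example (hypothetical callable): retry_request(lambda: fetch_dataset(selected_dataset_id))
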
def generate_new_ds_ps_resume(soda, dataset_name, ps):
    # get the dataset id by the name
    try:
        selected_dataset_id = get_dataset_id(dataset_name)
    except Exception as e:
        if e.code == 404:
            # dataset does not exist - create it
            ds = ps_create_new_dataset(dataset_name, ps)
            selected_dataset_id = ds["content"]["id"]

    myds = get_dataset_with_backoff(selected_dataset_id)
    ps_upload_to_dataset(soda, ps, myds, True)

def generate_new_ds_ps(soda, dataset_name, ps):
    ds = ps_create_new_dataset(dataset_name, ps)
    selected_dataset_id = ds["content"]["id"]
    myds = get_dataset_with_backoff(selected_dataset_id)
    ps_upload_to_dataset(soda, ps, myds, False)

def generate_dataset(soda, resume, ps):
    global main_generate_destination
    global main_total_generate_dataset_size

    # Generate dataset locally
    if generating_locally(soda):
        logger.info("generate_dataset generating_locally")
        main_generate_destination = soda["generate-dataset"]["destination"]
        _, main_total_generate_dataset_size = generate_dataset_locally(soda)

    # Generate dataset to Pennsieve
    if generating_on_ps(soda):
        main_generate_destination = soda["generate-dataset"]["destination"]
        generate_option = soda["generate-dataset"]["generate-option"]

        logger.info("generate_dataset generating_on_ps")

        if uploading_to_existing_ps_dataset(soda) and soda["starting-point"]["origin"] != "new":
            selected_dataset_id = get_dataset_id(soda["ps-dataset-selected"]["dataset-name"])
            # make an api request to pennsieve to get the dataset details
            r = requests.get(f"{PENNSIEVE_URL}/datasets/{selected_dataset_id}", headers=create_request_headers(get_access_token()))
            r.raise_for_status()
            myds = r.json()

            if can_resume_prior_upload(resume):
                ps_upload_to_dataset(soda, ps, myds, resume)
            else:
                ps_update_existing_dataset(soda, myds, ps, resume)

        elif generate_option == "new" or generate_option == "existing-ps" and soda["starting-point"]["origin"] == "new":
            # if dataset name is in the generate-dataset section, we are generating a new dataset
            if "dataset-name" in soda["generate-dataset"]:
                dataset_name = soda["generate-dataset"]["dataset-name"]
            elif "digital-metadata" in soda and "name" in soda["digital-metadata"]:
                dataset_name = soda["digital-metadata"]["name"]
            elif "ps-dataset-selected" in soda and "dataset-name" in soda["ps-dataset-selected"]:
                dataset_name = soda["ps-dataset-selected"]["dataset-name"]

            if resume:
                generate_new_ds_ps_resume(soda, dataset_name, ps)
            else:
                try:
                    selected_dataset_id = get_dataset_id(dataset_name)
                except Exception as e:
                    if isinstance(e, PennsieveDatasetCannotBeFound):
                        generate_new_ds_ps(soda, dataset_name, ps)
                        return
                    else:
                        raise Exception(f"{e.status_code}, {e.message}")
                myds = get_dataset_with_backoff(selected_dataset_id)

                ps_upload_to_dataset(soda, ps, myds, resume)
|
|
3422
|
+
|
|
3423
|
+
|
|
3424
|
+
|
|
3425
|
+
|
|
3426
|
+
|
|
3427
|
+
def validate_dataset_structure(soda, resume):
|
|
3428
|
+
|
|
3429
|
+
global main_curate_status
|
|
3430
|
+
global main_curate_progress_message
|
|
3431
|
+
global logger
|
|
3432
|
+
|
|
3433
|
+
# 1] Check for potential errors
|
|
3434
|
+
logger.info("main_curate_function step 1")
|
|
3435
|
+
|
|
3436
|
+
if not generate_options_set(soda):
|
|
3437
|
+
main_curate_status = "Done"
|
|
3438
|
+
raise GenerateOptionsNotSet()
|
|
3439
|
+
|
|
3440
|
+
# 1.1. If the dataset is being generated locally then check that the local destination is valid
|
|
3441
|
+
if generating_locally(soda):
|
|
3442
|
+
main_curate_progress_message = "Checking that the local destination selected for generating your dataset is valid"
|
|
3443
|
+
try:
|
|
3444
|
+
validate_local_dataset_generate_path(soda)
|
|
3445
|
+
except Exception as e:
|
|
3446
|
+
main_curate_status = "Done"
|
|
3447
|
+
raise e
|
|
3448
|
+
|
|
3449
|
+
|
|
3450
|
+
logger.info("main_curate_function step 1.2")
|
|
3451
|
+
|
|
3452
|
+
# 1.2. If generating dataset to Pennsieve or any other Pennsieve actions are requested check that the destination is valid
|
|
3453
|
+
if uploading_with_ps_account(soda):
|
|
3454
|
+
# check that the Pennsieve account is valid
|
|
3455
|
+
try:
|
|
3456
|
+
main_curate_progress_message = (
|
|
3457
|
+
"Checking that the selected Pennsieve account is valid"
|
|
3458
|
+
)
|
|
3459
|
+
accountname = soda["ps-account-selected"]["account-name"]
|
|
3460
|
+
connect_pennsieve_client(accountname)
|
|
3461
|
+
except Exception as e:
|
|
3462
|
+
main_curate_status = "Done"
|
|
3463
|
+
if isinstance(e, AttributeError):
|
|
3464
|
+
raise Exception("The Pennsieve Agent cannot access datasets but needs to in order to work. Please try again. If the issue persists, please contact the SODA team. The SODA team will contact Pennsieve to help resolve this issue.")
|
|
3465
|
+
else:
|
|
3466
|
+
raise PennsieveAccountInvalid("Please select a valid Pennsieve account.")
|
|
3467
|
+
|
|
3468
|
+
if uploading_to_existing_ps_dataset(soda):
|
|
3469
|
+
# check that the Pennsieve dataset is valid
|
|
3470
|
+
try:
|
|
3471
|
+
main_curate_progress_message = (
|
|
3472
|
+
"Checking that the selected Pennsieve dataset is valid"
|
|
3473
|
+
)
|
|
3474
|
+
bfdataset = soda["ps-dataset-selected"]["dataset-name"]
|
|
3475
|
+
selected_dataset_id = get_dataset_id(bfdataset)
|
|
3476
|
+
|
|
3477
|
+
except Exception as e:
|
|
3478
|
+
main_curate_status = "Done"
|
|
3479
|
+
bfdataset = soda["ps-dataset-selected"]["dataset-name"]
|
|
3480
|
+
raise PennsieveDatasetCannotBeFound(bfdataset)
|
|
3481
|
+
|
|
3482
|
+
# check that the user has permissions for uploading and modifying the dataset
|
|
3483
|
+
main_curate_progress_message = "Checking that you have required permissions for modifying the selected dataset"
|
|
3484
|
+
role = pennsieve_get_current_user_permissions(selected_dataset_id, get_access_token())["role"]
|
|
3485
|
+
if role not in ["owner", "manager", "editor"]:
|
|
3486
|
+
main_curate_status = "Done"
|
|
3487
|
+
raise PennsieveActionNoPermission("uploading to Pennsieve dataset")
|
|
3488
|
+
|
|
3489
|
+
logger.info("main_curate_function step 1.3")
|
|
3490
|
+
|
|
3491
|
+
|
|
3492
|
+
# 1.3. Check that specified dataset files and folders are valid (existing path) if generate dataset is requested
|
|
3493
|
+
# Note: Empty folders and 0 kb files will be removed without warning (a warning will be provided on the front end before starting the curate process)
|
|
3494
|
+
# Check at least one file or folder are added to the dataset
|
|
3495
|
+
main_curate_progress_message = "Checking that the dataset is not empty"
|
|
3496
|
+
if virtual_dataset_empty(soda):
|
|
3497
|
+
main_curate_status = "Done"
|
|
3498
|
+
if "generate-options" in soda.keys():
|
|
3499
|
+
dataset_name = soda["generate-options"]["dataset-name"]
|
|
3500
|
+
elif "ps-dataset-selected" in soda.keys():
|
|
3501
|
+
dataset_name = soda["ps-dataset-selected"]["dataset-name"]
|
|
3502
|
+
else:
|
|
3503
|
+
dataset_name = "Name not set"
|
|
3504
|
+
raise EmptyDatasetError(dataset_name)
|
|
3505
|
+
|
|
3506
|
+
|
|
3507
|
+
logger.info("main_curate_function step 1.3.1")
|
|
3508
|
+
|
|
3509
|
+
# Check that local files/folders exist
|
|
3510
|
+
if error := check_local_dataset_files_validity(soda):
|
|
3511
|
+
main_curate_status = "Done"
|
|
3512
|
+
raise LocalDatasetMissingSpecifiedFiles(error)
|
|
3513
|
+
|
|
3514
|
+
|
|
3515
|
+
# check that dataset is not empty after removing all the empty files and folders
|
|
3516
|
+
if virtual_dataset_empty(soda):
|
|
3517
|
+
main_curate_status = "Done"
|
|
3518
|
+
if "generate-options" in soda.keys():
|
|
3519
|
+
dataset_name = soda["generate-options"]["dataset-name"]
|
|
3520
|
+
elif "ps-dataset-selected" in soda.keys():
|
|
3521
|
+
dataset_name = soda["ps-dataset-selected"]["dataset-name"]
|
|
3522
|
+
else:
|
|
3523
|
+
dataset_name = "Name not set"
|
|
3524
|
+
raise EmptyDatasetError(dataset_name, "The dataset is empty after removing all the empty files and folders.")
|
|
3525
|
+
|
|
3526
|
+
|
|
3527
|
+
logger.info("main_curate_function step 1.3.2")
|
|
3528
|
+
# Check that bf files/folders exist (Only used for when generating from an existing Pennsieve dataset)
|
|
3529
|
+
if uploading_to_existing_ps_dataset(soda) and can_resume_prior_upload(resume) == False:
|
|
3530
|
+
try:
|
|
3531
|
+
main_curate_progress_message = (
|
|
3532
|
+
"Checking that the Pennsieve files and folders are valid"
|
|
3533
|
+
)
|
|
3534
|
+
if soda["generate-dataset"]["destination"] == "ps":
|
|
3535
|
+
if error := ps_check_dataset_files_validity(soda):
|
|
3536
|
+
logger.info("Failed to validate dataset files")
|
|
3537
|
+
logger.info(error)
|
|
3538
|
+
main_curate_status = "Done"
|
|
3539
|
+
raise PennsieveDatasetFilesInvalid(error)
|
|
3540
|
+
except Exception as e:
|
|
3541
|
+
main_curate_status = "Done"
|
|
3542
|
+
raise e
|
|
3543
|
+
|
|
3544
|
+
|
|
3545
|
+
|
|
3546
|
+
def reset_upload_session_environment(resume):
|
|
3547
|
+
global main_curate_status
|
|
3548
|
+
global main_curate_progress_message
|
|
3549
|
+
global main_total_generate_dataset_size
|
|
3550
|
+
global main_generated_dataset_size
|
|
3551
|
+
global start_generate
|
|
3552
|
+
global generate_start_time
|
|
3553
|
+
global main_generate_destination
|
|
3554
|
+
global main_initial_bfdataset_size
|
|
3555
|
+
global main_curation_uploaded_files
|
|
3556
|
+
global uploaded_folder_counter
|
|
3557
|
+
global ums
|
|
3558
|
+
|
|
3559
|
+
global myds
|
|
3560
|
+
global generated_dataset_id
|
|
3561
|
+
global bytes_file_path_dict
|
|
3562
|
+
global renaming_files_flow
|
|
3563
|
+
|
|
3564
|
+
start_generate = 0
|
|
3565
|
+
myds = ""
|
|
3566
|
+
|
|
3567
|
+
generate_start_time = time.time()
|
|
3568
|
+
|
|
3569
|
+
# variables for tracking the progress of the curate process on the frontend
|
|
3570
|
+
main_curate_status = ""
|
|
3571
|
+
main_curate_progress_message = "Starting..."
|
|
3572
|
+
main_total_generate_dataset_size = 0
|
|
3573
|
+
main_generated_dataset_size = 0
|
|
3574
|
+
main_curation_uploaded_files = 0
|
|
3575
|
+
uploaded_folder_counter = 0
|
|
3576
|
+
generated_dataset_id = None
|
|
3577
|
+
|
|
3578
|
+
main_curate_status = "Curating"
|
|
3579
|
+
main_curate_progress_message = "Starting dataset curation"
|
|
3580
|
+
main_generate_destination = ""
|
|
3581
|
+
main_initial_bfdataset_size = 0
|
|
3582
|
+
|
|
3583
|
+
if not resume:
|
|
3584
|
+
ums.set_df_mid(None)
|
|
3585
|
+
ums.set_elapsed_time(None)
|
|
3586
|
+
ums.set_total_files_to_upload(0)
|
|
3587
|
+
ums.set_main_total_generate_dataset_size(0)
|
|
3588
|
+
# reset the rename information back to default
|
|
3589
|
+
ums.set_renaming_files_flow(False) # this determines if we failed while renaming files after the upload is complete
|
|
3590
|
+
ums.set_rename_total_files(None)
|
|
3591
|
+
ums.set_list_of_files_to_rename(None)
|
|
3592
|
+
renaming_files_flow = False
|
|
3593
|
+
# reset the calculated values for the upload session
|
|
3594
|
+
bytes_file_path_dict = {}
|
|
3595
|
+
|
|
3596
|
+
|
|
3597
|
+
|
|
3598
|
+
|
|
3599
|
+
def main_curate_function(soda, resume):
|
|
3600
|
+
global logger
|
|
3601
|
+
global main_curate_status
|
|
3602
|
+
global manifest_id
|
|
3603
|
+
global origin_manifest_id
|
|
3604
|
+
global total_files
|
|
3605
|
+
|
|
3606
|
+
logger.info("Starting generating selected dataset")
|
|
3607
|
+
logger.info(f"Generating dataset metadata generate-options={soda['generate-dataset']}")
|
|
3608
|
+
|
|
3609
|
+
|
|
3610
|
+
reset_upload_session_environment(resume)
|
|
3611
|
+
|
|
3612
|
+
|
|
3613
|
+
validate_dataset_structure(soda, resume)
|
|
3614
|
+
|
|
3615
|
+
logger.info("Generating dataset step 3")
|
|
3616
|
+
|
|
3617
|
+
|
|
3618
|
+
# 2] Generate
|
|
3619
|
+
main_curate_progress_message = "Generating dataset"
|
|
3620
|
+
try:
|
|
3621
|
+
if (soda["generate-dataset"]["destination"] == "local"):
|
|
3622
|
+
logger.info("main_curate_function generating locally")
|
|
3623
|
+
generate_dataset(soda, resume, ps=None)
|
|
3624
|
+
else:
|
|
3625
|
+
logger.info("main_curate_function generating on Pennsieve")
|
|
3626
|
+
accountname = soda["ps-account-selected"]["account-name"]
|
|
3627
|
+
ps = connect_pennsieve_client(accountname)
|
|
3628
|
+
generate_dataset(soda, resume, ps)
|
|
3629
|
+
except Exception as e:
|
|
3630
|
+
main_curate_status = "Done"
|
|
3631
|
+
raise e
|
|
3632
|
+
|
|
3633
|
+
main_curate_status = "Done"
|
|
3634
|
+
main_curate_progress_message = "Success: COMPLETED!"
|
|
3635
|
+
|
|
3636
|
+
|
|
3637
|
+
logger.info(f"Finished generating dataset")
|
|
3638
|
+
return {
|
|
3639
|
+
"main_curate_progress_message": main_curate_progress_message,
|
|
3640
|
+
"main_total_generate_dataset_size": main_total_generate_dataset_size,
|
|
3641
|
+
"main_curation_uploaded_files": main_curation_uploaded_files,
|
|
3642
|
+
"local_manifest_id": manifest_id,
|
|
3643
|
+
"origin_manifest_id": origin_manifest_id,
|
|
3644
|
+
"main_curation_total_files": total_files,
|
|
3645
|
+
}
|
|
3646
|
+
|
|
3647
|
+
|
|
3648
|
+
|
|
3649
|
+
def main_curate_function_progress():
|
|
3650
|
+
"""
|
|
3651
|
+
Function frequently called by front end to help keep track of the dataset generation progress
|
|
3652
|
+
"""
|
|
3653
|
+
|
|
3654
|
+
global main_curate_status # empty if curate on going, "Done" when main curate function stopped (error or completed)
|
|
3655
|
+
global main_curate_progress_message
|
|
3656
|
+
global main_total_generate_dataset_size
|
|
3657
|
+
global main_generated_dataset_size
|
|
3658
|
+
global start_generate
|
|
3659
|
+
global generate_start_time
|
|
3660
|
+
global main_generate_destination
|
|
3661
|
+
global main_initial_bfdataset_size
|
|
3662
|
+
global main_curation_uploaded_files
|
|
3663
|
+
global total_bytes_uploaded # current number of bytes uploaded to Pennsieve in the upload session
|
|
3664
|
+
global myds
|
|
3665
|
+
global renaming_files_flow
|
|
3666
|
+
global ums
|
|
3667
|
+
global elapsed_time
|
|
3668
|
+
|
|
3669
|
+
|
|
3670
|
+
prior_elapsed_time = ums.get_elapsed_time()
|
|
3671
|
+
if prior_elapsed_time is not None:
|
|
3672
|
+
elapsed_time = ( time.time() - generate_start_time ) + prior_elapsed_time
|
|
3673
|
+
else:
|
|
3674
|
+
elapsed_time = time.time() - generate_start_time
|
|
3675
|
+
|
|
3676
|
+
elapsed_time_formatted = time_format(elapsed_time)
|
|
3677
|
+
|
|
3678
|
+
|
|
3679
|
+
if renaming_files_flow:
|
|
3680
|
+
testing_variable = main_generated_dataset_size
|
|
3681
|
+
else:
|
|
3682
|
+
testing_variable = total_bytes_uploaded["value"]
|
|
3683
|
+
|
|
3684
|
+
return {
|
|
3685
|
+
"main_curate_status": main_curate_status,
|
|
3686
|
+
"start_generate": start_generate,
|
|
3687
|
+
"main_curate_progress_message": main_curate_progress_message,
|
|
3688
|
+
"main_total_generate_dataset_size": main_total_generate_dataset_size,
|
|
3689
|
+
"main_generated_dataset_size": testing_variable,
|
|
3690
|
+
"elapsed_time_formatted": elapsed_time_formatted,
|
|
3691
|
+
"total_files_uploaded": main_curation_uploaded_files,
|
|
3692
|
+
"generated_dataset_id": myds["content"]["id"] if myds != "" else None, # when a new dataset gets generated log its id to our analytics
|
|
3693
|
+
"generated_dataset_int_id": myds["content"]["intId"] if myds != "" else None,
|
|
3694
|
+
}
|
|
3695
|
+
|
|
3696
|
+
+def preview_dataset(soda):
+    """
+    Associated with the 'Preview' button in the SODA interface.
+    Creates a preview folder and adds mock files based on the files specified by the user in the UI (same names as the originals, but 0 KB in size).
+    Opens a dialog box to showcase the files / folders that were added.
+
+    Args:
+        soda: soda dict with information about all specified files and folders
+    Action:
+        Opens the dialog box at preview_path
+    Returns:
+        preview_path: path of the folder where the preview files are located
+    """
+
+    preview_path = join(userpath, "SODA", "Preview_dataset")
+
+    # remove empty files and folders from dataset
+    try:
+        check_empty_files_folders(soda)
+    except Exception as e:
+        raise e
+
+    # create Preview_dataset folder
+    try:
+        if isdir(preview_path):
+            shutil.rmtree(preview_path, ignore_errors=True)
+        makedirs(preview_path)
+    except Exception as e:
+        raise e
+
+    try:
+        if "dataset-structure" in soda.keys():
+            # create folder structure
+            def recursive_create_mock_folder_structure(my_folder, my_folderpath):
+                if "folders" in my_folder.keys():
+                    for folder_key, folder in my_folder["folders"].items():
+                        folderpath = join(my_folderpath, folder_key)
+                        if not isdir(folderpath):
+                            mkdir(folderpath)
+                        recursive_create_mock_folder_structure(folder, folderpath)
+
+                if "files" in my_folder.keys():
+                    for file_key, file in my_folder["files"].items():
+                        if "deleted" not in file["action"]:
+                            open(join(my_folderpath, file_key), "a").close()
+
+            dataset_structure = soda["dataset-structure"]
+            folderpath = preview_path
+            recursive_create_mock_folder_structure(dataset_structure, folderpath)
+
+            if "manifest-files" in soda.keys() and "folders" in dataset_structure.keys():
+                for folder_key, folder in dataset_structure["folders"].items():
+                    manifest_path = join(preview_path, folder_key, "manifest.xlsx")
+                    if not isfile(manifest_path):
+                        open(manifest_path, "a").close()
+
+        if "metadata-files" in soda.keys():
+            for metadata_key in soda["metadata-files"].keys():
+                open(join(preview_path, metadata_key), "a").close()
+
+        if len(listdir(preview_path)) > 0:
+            folder_in_preview = listdir(preview_path)[0]
+            open_file(join(preview_path, folder_in_preview))
+        else:
+            open_file(preview_path)
+
+        return preview_path
+
+    except Exception as e:
+        raise e
+
+
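Below is a minimal, hypothetical `soda` dict showing the shape preview_dataset reads (dataset-structure / folders / files / action / metadata-files); the file names and paths are invented for illustration and are not part of the package.

# Illustrative input only; the keys mirror what preview_dataset reads above.
soda = {
    "dataset-structure": {
        "folders": {
            "primary": {
                "folders": {},
                "files": {
                    "readings.csv": {"action": ["new"], "path": "/tmp/readings.csv"},
                },
            },
        },
    },
    "metadata-files": {"submission.xlsx": {}},
}

preview_path = preview_dataset(soda)
# Creates SODA/Preview_dataset under the user's home folder with a "primary" folder,
# a 0 KB "readings.csv" mock file, and an empty "submission.xlsx", then opens it.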
+def generate_manifest_file_locally(generate_purpose, soda):
+    """
+    Function to generate manifest files locally
+    """
+
+    global manifest_folder_path
+
+    def recursive_item_path_create(folder, path):
+        if "files" in folder.keys():
+            for item in list(folder["files"]):
+                if "folderpath" not in folder["files"][item]:
+                    folder["files"][item]["folderpath"] = path[:]
+
+        if "folders" in folder.keys():
+            for item in list(folder["folders"]):
+                if "folderpath" not in folder["folders"][item]:
+                    folder["folders"][item]["folderpath"] = path[:]
+                    folder["folders"][item]["folderpath"].append(item)
+                recursive_item_path_create(
+                    folder["folders"][item], folder["folders"][item]["folderpath"][:]
+                )
+
+        return
+
+    def copytree(src, dst, symlinks=False, ignore=None):
+        for item in os.listdir(src):
+            s = os.path.join(src, item)
+            d = os.path.join(dst, item)
+            if os.path.isdir(s):
+                if os.path.exists(d):
+                    shutil.rmtree(d)
+                shutil.copytree(s, d, symlinks, ignore)
+            else:
+                shutil.copy2(s, d)
+
+    dataset_structure = soda["dataset-structure"]
+    manifest_destination = soda["manifest-files"]["local-destination"]
+
+    recursive_item_path_create(dataset_structure, [])
+    create_high_lvl_manifest_files_existing_ps_starting_point(soda, manifest_folder_path)
+
+    if generate_purpose == "edit-manifest":
+        manifest_destination = os.path.join(manifest_destination, "manifest_file")
+    else:
+        manifest_destination = return_new_path(
+            os.path.join(manifest_destination, "manifest_file")
+        )
+
+    copytree(manifest_folder_path, manifest_destination)
+
+    if generate_purpose == "edit-manifest":
+        return {"success_message_or_manifest_destination": manifest_destination}
+
+    open_file(manifest_destination)
+    return {"success_message_or_manifest_destination": "success"}
+
+
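A hedged usage sketch for generate_manifest_file_locally, assuming `soda` already holds a dataset structure and that pysoda has populated its working manifest folder (manifest_folder_path); the destination path below is illustrative.

# Illustrative call only; the destination directory is made up for the example.
soda["manifest-files"] = {"local-destination": "/tmp/my_dataset"}

result = generate_manifest_file_locally("edit-manifest", soda)
print(result["success_message_or_manifest_destination"])
# "edit-manifest" returns the path to <local-destination>/manifest_file; any other
# purpose copies the manifests there, opens the folder, and returns "success".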
+def generate_manifest_file_data(dataset_structure):
+    # Define common file extensions with special handling
+    double_extensions = {
+        ".ome.tiff", ".ome.tif", ".ome.tf2", ".ome.tf8", ".ome.btf", ".ome.xml",
+        ".brukertiff.gz", ".mefd.gz", ".moberg.gz", ".nii.gz", ".mgh.gz", ".tar.gz", ".bcl.gz"
+    }
+
+    # Helper function: Get the complete file extension
+    def get_file_extension(filename):
+        for ext in double_extensions:
+            if filename.endswith(ext):
+                base_ext = os.path.splitext(os.path.splitext(filename)[0])[1]
+                return base_ext + ext
+        return os.path.splitext(filename)[1]
+
+    def create_folder_entry(folder_name, path_parts):
+        full_path = "/".join(path_parts + [folder_name]) + "/"
+        entry = [
+            full_path.lstrip("/"),  # Remove leading slash for consistency
+            "",  # Timestamp
+            "",  # Description
+            "folder",  # File type
+            "",  # Entity (empty)
+            "",  # Data modality (empty)
+            "",  # Also in dataset (empty)
+            "",  # Data dictionary path (empty)
+            "",  # Entity is transitive (empty)
+            "",  # Additional Metadata
+        ]
+        return entry
+
+    # Helper function: Build a single manifest entry
+    def create_file_entry(item, folder, path_parts, timestamp, filename):
+        full_path = "/".join(path_parts + [filename])
+        file_info = folder["files"][item]
+
+        entry = [
+            full_path.lstrip("/"),  # Remove leading slash for consistency
+            timestamp,  # Timestamp
+            file_info["description"],  # Description
+            get_file_extension(filename),  # File type
+            "",  # Entity (empty)
+            "",  # Data modality (empty)
+            "",  # Also in dataset (empty)
+            "",  # Data dictionary path (empty)
+            "",  # Entity is transitive (empty)
+            file_info.get("additional-metadata", "")  # Additional Metadata
+        ]
+
+        # Add any extra columns dynamically
+        if "extra_columns" in file_info:
+            for key, value in file_info["extra_columns"].items():
+                entry.append(value)
+                if key not in header_row:
+                    header_row.append(key)
+
+        return entry
+
+    # Recursive function: Traverse dataset and collect file data
+    def traverse_folders(folder, path_parts):
+        # Add header row if processing files for the first time
+        if not manifest_data:
+            manifest_data.append(header_row)
+
+        if "files" in folder:
+            for item, file_info in folder["files"].items():
+
+                if "path" in file_info:
+                    file_path = file_info["path"]
+                elif "pspath" in file_info:
+                    file_path = file_info["pspath"]
+                else:
+                    continue
+
+                # If the file is a manifest file, skip it
+                if item in {"manifest.xlsx", "manifest.csv"}:
+                    continue
+
+                # Determine timestamp
+                filename = os.path.basename(file_path.replace("\\", "/"))
+                if file_info["location"] == "ps":
+                    timestamp = file_info["timestamp"]
+                else:
+                    local_path = pathlib.Path(file_info["path"])
+                    timestamp = datetime.fromtimestamp(
+                        local_path.stat().st_mtime, tz=local_timezone
+                    ).isoformat().replace(".", ",").replace("+00:00", "Z")
+
+                # Add file entry
+                manifest_data.append(create_file_entry(item, folder, path_parts, timestamp, filename))
+
+        if "folders" in folder:
+            for subfolder_name, subfolder in folder["folders"].items():
+                # Add folder entry
+                manifest_data.append(create_folder_entry(subfolder_name, path_parts))
+                traverse_folders(subfolder, path_parts + [subfolder_name])
+
+    # Initialize variables
+    manifest_data = []  # Collects all rows for the manifest
+    # TODO: Update to SDS 3.0
+    header_row = [
+        "filename", "timestamp", "description", "file type", "entity",
+        "data modality", "also in dataset", "data dictionary path",
+        "entity is transitive", "Additional Metadata"
+    ]
+    local_timezone = TZLOCAL()
+
+    # Start recursive traversal from the root
+    traverse_folders(dataset_structure, [])
+
+    return manifest_data
+
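To illustrate the expected input and output shape of generate_manifest_file_data, here is a small hypothetical dataset structure and the rows it would produce; only the keys the function reads are shown, and the file names and paths are invented.

# Illustrative input only. "location" values other than "ps" cause the local file at
# "path" to be stat()'ed for its modification time, so that path must exist when run.
dataset_structure = {
    "files": {},
    "folders": {
        "primary": {
            "folders": {},
            "files": {
                "readings.csv": {
                    "path": "/tmp/readings.csv",
                    "location": "local",
                    "description": "example data file",
                },
            },
        },
    },
}

rows = generate_manifest_file_data(dataset_structure)
# rows[0] is the header row; the remaining rows cover "primary/" (a folder entry) and
# "primary/readings.csv", whose file type column comes from get_file_extension (".csv").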