pysodafair-0.1.62-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysoda/__init__.py +0 -0
- pysoda/constants.py +3 -0
- pysoda/core/__init__.py +10 -0
- pysoda/core/dataset_generation/__init__.py +11 -0
- pysoda/core/dataset_generation/manifestSession/__init__.py +1 -0
- pysoda/core/dataset_generation/manifestSession/manifest_session.py +146 -0
- pysoda/core/dataset_generation/upload.py +3951 -0
- pysoda/core/dataset_importing/__init__.py +1 -0
- pysoda/core/dataset_importing/import_dataset.py +662 -0
- pysoda/core/metadata/__init__.py +20 -0
- pysoda/core/metadata/code_description.py +109 -0
- pysoda/core/metadata/constants.py +32 -0
- pysoda/core/metadata/dataset_description.py +188 -0
- pysoda/core/metadata/excel_utils.py +41 -0
- pysoda/core/metadata/helpers.py +250 -0
- pysoda/core/metadata/manifest.py +112 -0
- pysoda/core/metadata/manifest_package/__init__.py +2 -0
- pysoda/core/metadata/manifest_package/manifest.py +0 -0
- pysoda/core/metadata/manifest_package/manifest_import.py +29 -0
- pysoda/core/metadata/manifest_package/manifest_writer.py +666 -0
- pysoda/core/metadata/performances.py +46 -0
- pysoda/core/metadata/resources.py +53 -0
- pysoda/core/metadata/samples.py +184 -0
- pysoda/core/metadata/sites.py +51 -0
- pysoda/core/metadata/subjects.py +172 -0
- pysoda/core/metadata/submission.py +91 -0
- pysoda/core/metadata/text_metadata.py +47 -0
- pysoda/core/metadata_templates/CHANGES +1 -0
- pysoda/core/metadata_templates/LICENSE +1 -0
- pysoda/core/metadata_templates/README.md +4 -0
- pysoda/core/metadata_templates/__init__.py +0 -0
- pysoda/core/metadata_templates/code_description.xlsx +0 -0
- pysoda/core/metadata_templates/code_parameters.xlsx +0 -0
- pysoda/core/metadata_templates/dataset_description.xlsx +0 -0
- pysoda/core/metadata_templates/manifest.xlsx +0 -0
- pysoda/core/metadata_templates/performances.xlsx +0 -0
- pysoda/core/metadata_templates/resources.xlsx +0 -0
- pysoda/core/metadata_templates/samples.xlsx +0 -0
- pysoda/core/metadata_templates/sites.xlsx +0 -0
- pysoda/core/metadata_templates/subjects.xlsx +0 -0
- pysoda/core/metadata_templates/subjects_pools_samples_structure.xlsx +0 -0
- pysoda/core/metadata_templates/subjects_pools_samples_structure_example.xlsx +0 -0
- pysoda/core/metadata_templates/submission.xlsx +0 -0
- pysoda/core/permissions/__init__.py +1 -0
- pysoda/core/permissions/permissions.py +31 -0
- pysoda/core/pysoda/__init__.py +2 -0
- pysoda/core/pysoda/soda.py +34 -0
- pysoda/core/pysoda/soda_object.py +55 -0
- pysoda/core/upload_manifests/__init__.py +1 -0
- pysoda/core/upload_manifests/upload_manifests.py +37 -0
- pysoda/schema/__init__.py +0 -0
- pysoda/schema/code_description.json +629 -0
- pysoda/schema/dataset_description.json +295 -0
- pysoda/schema/manifest.json +60 -0
- pysoda/schema/performances.json +44 -0
- pysoda/schema/resources.json +39 -0
- pysoda/schema/samples.json +97 -0
- pysoda/schema/sites.json +38 -0
- pysoda/schema/soda_schema.json +664 -0
- pysoda/schema/subjects.json +131 -0
- pysoda/schema/submission_schema.json +28 -0
- pysoda/utils/__init__.py +9 -0
- pysoda/utils/authentication.py +381 -0
- pysoda/utils/config.py +68 -0
- pysoda/utils/exceptions.py +156 -0
- pysoda/utils/logger.py +6 -0
- pysoda/utils/metadata_utils.py +74 -0
- pysoda/utils/pennsieveAgentUtils.py +11 -0
- pysoda/utils/pennsieveUtils.py +118 -0
- pysoda/utils/profile.py +28 -0
- pysoda/utils/schema_validation.py +133 -0
- pysoda/utils/time_utils.py +5 -0
- pysoda/utils/upload_utils.py +108 -0
- pysodafair-0.1.62.dist-info/METADATA +190 -0
- pysodafair-0.1.62.dist-info/RECORD +77 -0
- pysodafair-0.1.62.dist-info/WHEEL +4 -0
- pysodafair-0.1.62.dist-info/licenses/LICENSE +21 -0

pysoda/core/dataset_importing/__init__.py
@@ -0,0 +1 @@
+from .import_dataset import import_pennsieve_dataset, create_soda_json_object_backend, monitor_local_json_progress
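
The single line above re-exports the importer's public entry points at the package level. A rough usage sketch follows (not taken from the package's documentation; it assumes a Pennsieve profile is already configured so the token helpers used internally succeed, and the dataset name is illustrative):

    # Hypothetical caller; the "ps-dataset-selected" key and the returned
    # fields are taken from import_dataset.py shown below.
    from pysoda.core.dataset_importing import import_pennsieve_dataset

    soda = {"ps-dataset-selected": {"dataset-name": "my-sparc-dataset"}}
    result = import_pennsieve_dataset(soda)

    print(result["success_message"])
    print(f"{result['import_progress']}/{result['import_total_items']} items imported")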
pysoda/core/dataset_importing/import_dataset.py
@@ -0,0 +1,662 @@
+import os
+import requests
+import pandas as pd
+import itertools
+from ...constants import PENNSIEVE_URL
+from ...utils import create_request_headers, get_access_token, get_dataset_id, PennsieveActionNoPermission
+from ..permissions import pennsieve_get_current_user_permissions
+from ..metadata import load_metadata_to_dataframe
+
+from .. import logger
+
+
+METADATA_FILES_SPARC = [
+    "submission.xlsx",
+    "submission.csv",
+    "submission.json",
+    "dataset_description.xlsx",
+    "dataset_description.csv",
+    "dataset_description.json",
+    "subjects.xlsx",
+    "subjects.csv",
+    "subjects.json",
+    "samples.xlsx",
+    "samples.csv",
+    "samples.json",
+    "README.txt",
+    "CHANGES.txt",
+    "code_description.xlsx",
+    "inputs_metadata.xlsx",
+    "outputs_metadata.xlsx",
+    "manifest.xlsx",
+    "manifest.csv"
+]
+
+
+def import_pennsieve_dataset(soda_json_structure, requested_sparc_only=True):
+    global logger
+    high_level_sparc_folders = [
+        "code",
+        "derivative",
+        "docs",
+        "primary",
+        "protocol",
+        "source",
+    ]
+    manifest_sparc = ["manifest.xlsx", "manifest.csv"]
+    high_level_metadata_sparc = [
+        "submission.xlsx",
+        "submission.csv",
+        "submission.json",
+        "dataset_description.xlsx",
+        "dataset_description.csv",
+        "dataset_description.json",
+        "subjects.xlsx",
+        "subjects.csv",
+        "subjects.json",
+        "samples.xlsx",
+        "samples.csv",
+        "samples.json",
+        "README.txt",
+        "CHANGES.txt",
+        "code_description.xlsx",
+        "inputs_metadata.xlsx",
+        "outputs_metadata.xlsx",
+    ]
+    double_extensions = [
+        ".ome.tiff",
+        ".ome.tif",
+        ".ome.tf2",
+        ".ome.tf8",
+        ".ome.btf",
+        ".ome.xml",
+        ".brukertiff.gz",
+        ".mefd.gz",
+        ".moberg.gz",
+        ".nii.gz",
+        ".mgh.gz",
+        ".tar.gz",
+        ".bcl.gz",
+    ]
+
+    global create_soda_json_completed
+    global create_soda_json_total_items
+    global create_soda_json_progress
+    create_soda_json_progress = 0
+    create_soda_json_total_items = 0
+    create_soda_json_completed = 0
+
+    # ["extensions"] doesn't seem to be returned by the Pennsieve API anymore
+    def verify_file_name(file_name, extension):
+        global logger
+        if extension == "":
+            return file_name
+
+        double_ext = False
+        for ext in double_extensions:
+            if file_name.find(ext) != -1:
+                double_ext = True
+                break
+
+        extension_from_name = ""
+
+        if double_ext == False:
+            extension_from_name = os.path.splitext(file_name)[1]
+        else:
+            extension_from_name = (
+                os.path.splitext(os.path.splitext(file_name)[0])[1]
+                + os.path.splitext(file_name)[1]
+            )
+
+        if extension_from_name == ("." + extension):
+            return file_name
+        else:
+            return file_name + ("." + extension)
+
+
+    def createFolderStructure(subfolder_json, manifest):
+        """
+        Function for creating the Pennsieve folder structure for a given dataset as an object stored locally.
+        Arguments:
+            subfolder_json: The json object containing the folder structure of the dataset
+            pennsieve_client: The Pennsieve client object
+            manifest: The manifest object for the dataset
+        """
+        # the root level folder passes its subfolders into this function, which recursively checks for subfolders while creating the json structure
+        global logger
+        global create_soda_json_progress
+
+        collection_id = subfolder_json["path"]
+
+        limit = 100
+        offset = 0
+        subfolder = []
+        while True:
+            r = requests.get(f"{PENNSIEVE_URL}/packages/{collection_id}?limit={limit}&offset={offset}", headers=create_request_headers(get_access_token()))
+            r.raise_for_status()
+            page = r.json()["children"]
+            subfolder.extend(page)
+            if len(page) < limit:
+                break
+            offset += limit
+
+        for items in subfolder:
+            folder_item_name = items["content"]["name"]
+            create_soda_json_progress += 1
+            item_id = items["content"]["id"]
+            # item is a file; check whether there is additional manifest information to attach to it
+            if item_id[2:9] == "package":
+                if (
+                    folder_item_name[0:8] != "manifest"
+                ):  # manifest files are not included in the json structure
+
+                    # verify the file name first (used for legacy Pennsieve datasets)
+                    if "extension" not in items:
+                        folder_item_name = verify_file_name(folder_item_name, "")
+                    else:
+                        folder_item_name = verify_file_name(folder_item_name, items["extension"])
+
+                    # verify timestamps
+                    timestamp = items["content"]["createdAt"].replace('.', ',')
+
+                    paths_list = [*subfolder_json["pspath"]]
+                    subfolder_json["files"][folder_item_name] = {
+                        "action": ["existing"],
+                        "path": item_id,
+                        "pspath": paths_list,
+                        "timestamp": timestamp,
+                        "location": "ps",
+                        "additional-metadata": "",
+                        "description": "",
+                    }
+
+                    # creates the path for folder_item_name (stored in temp_name)
+                    if len(subfolder_json["files"][folder_item_name]["pspath"]) > 1:
+                        temp_name = '/'.join(subfolder_json["files"][folder_item_name]["pspath"][1:]) + "/" + folder_item_name
+                    else:
+                        temp_name = folder_item_name
+
+                    if len(manifest.keys()) > 0:
+                        # Dictionary that has the required manifest headers in lowercase and without spaces as keys
+                        # and the correct manifest headers as values
+                        defaultManifestHeadersNameMapped = {
+                            "filename": "filename",
+                            "timestamp": "timestamp",
+                            "description": "description",
+                            "filetype": "file type",
+                            "entity": "entity",
+                            "datamodality": "data modality",
+                            "alsoindataset": "also in dataset",
+                            "alsoindatasetpath": "also in dataset path",
+                            "datadictionarypath": "data dictionary path",
+                            "entityistransitive": "entity is transitive",
+                            "additionalmetadata": "additional-metadata",
+                        }
+
+                        # Dictionary that will be used to store the correct manifest headers as keys
+                        # and the values from the manifest as values
+                        updated_manifest = {}
+
+                        # Go through the imported manifest keys and change the keys to the correct name
+                        # For example if the key is "File Name" change it to "filename"
+                        for manifestKey in manifest.keys():
+                            # Make the key lowercase and remove spaces
+                            sterilizedKeyName = manifestKey.lower().replace(" ", "")
+                            if sterilizedKeyName in defaultManifestHeadersNameMapped.keys():
+                                # change the key to the correct name
+                                # For example if the key name is "filetype" change it to "file type"
+                                newManifestKeyName = defaultManifestHeadersNameMapped[sterilizedKeyName]
+                                # Add the new key/value to the updated manifest
+                                updated_manifest[newManifestKeyName] = manifest[manifestKey]
+                            else:
+                                # Keep the key/value the same and add it to the updated manifest
+                                updated_manifest[manifestKey] = manifest[manifestKey]
+
+                        if "filename" in updated_manifest.keys():
+                            for manifestKey in updated_manifest.keys():
+                                location_index = ""
+                                # get the index of the file name in the manifest
+                                if temp_name in updated_manifest["filename"].values():
+                                    location_index = list(updated_manifest["filename"].values()).index(
+                                        temp_name
+                                    )
+                                # This is for the case where the file name in the manifest has a slash at the beginning,
+                                # which is the case for files in the root folder
+                                elif "/" + temp_name in updated_manifest["filename"].values():
+                                    location_index = list(updated_manifest["filename"].values()).index(
+                                        "/" + temp_name
+                                    )
+                                else:
+                                    # break out of the for loop if the file name is not in the manifest
+                                    break
+
+                                # check if the key is in the required manifest headers; if it is, update the folder_item_name value
+                                # corresponding to the key
+                                if manifestKey in defaultManifestHeadersNameMapped.values():
+                                    if updated_manifest[manifestKey][location_index] != "":
+                                        if folder_item_name[0:1] == "/":
+                                            subfolder_json["files"][folder_item_name[1:]][manifestKey] = updated_manifest[manifestKey][location_index]
+                                        else:
+                                            subfolder_json["files"][folder_item_name][manifestKey] = updated_manifest[manifestKey][location_index]
+                                # if the key is not in the required manifest headers, add it to the extra columns of the folder_item_name value
+                                else:
+                                    # if the extra columns key does not exist, create it
+                                    if "extra_columns" not in subfolder_json["files"][folder_item_name]:
+                                        subfolder_json["files"][folder_item_name]["extra_columns"] = {}
+
+                                    if updated_manifest[manifestKey][location_index] != "":
+                                        subfolder_json["files"][folder_item_name]["extra_columns"][manifestKey] = updated_manifest[manifestKey][location_index]
+                                    else:
+                                        subfolder_json["files"][folder_item_name]["extra_columns"][manifestKey] = ""
+                        else:
+                            # filename not in the updated manifest, so recreate standard headers if they don't exist;
+                            # loop through the updated manifest keys and if a header matches a standard header add its content, else recreate it
+                            if len(updated_manifest.keys()) > 0:
+                                location_index = ""
+                                for manifestKey in updated_manifest.keys():
+                                    if temp_name in updated_manifest[manifestKey].values():
+                                        # file_names found
+                                        location_index = list(updated_manifest[manifestKey].values()).index(
+                                            temp_name
+                                        )
+                                    if "/" + temp_name in updated_manifest[manifestKey].values():
+                                        location_index = list(updated_manifest[manifestKey].values()).index(
+                                            "/" + temp_name
+                                        )
+                                    if location_index != "":
+                                        if manifestKey in defaultManifestHeadersNameMapped.values():
+                                            if folder_item_name[0:1] == "/":
+                                                subfolder_json["files"][folder_item_name[1:]][manifestKey] = updated_manifest[manifestKey][location_index]
+                                            else:
+                                                subfolder_json["files"][folder_item_name][manifestKey] = updated_manifest[manifestKey][location_index]
+                                        else:
+                                            if "extra_columns" not in subfolder_json["files"][folder_item_name]:
+                                                subfolder_json["files"][folder_item_name]["extra_columns"] = {}
+                                            subfolder_json["files"][folder_item_name]["extra_columns"][manifestKey] = updated_manifest[manifestKey][location_index]
+
+            else:  # another subfolder found
+                paths_list = [*subfolder_json["pspath"], folder_item_name]
+                subfolder_json["folders"][folder_item_name] = {
+                    "action": ["existing"],
+                    "path": item_id,
+                    "pspath": paths_list,
+                    "files": {},
+                    "folders": {},
+                    "location": "ps",
+                }
+
+        if len(subfolder_json["folders"].keys()) != 0:  # there are subfolders
+            for folder in subfolder_json["folders"].keys():
+                subfolder = subfolder_json["folders"][folder]
+                createFolderStructure(subfolder, manifest)
+
+
+    # check that the Pennsieve dataset is valid
+    try:
+        bf_dataset_name = soda_json_structure["ps-dataset-selected"]["dataset-name"]
+    except Exception as e:
+        raise e
+
+    selected_dataset_id = get_dataset_id(bf_dataset_name)
+
+    # check that the user has permission to edit this dataset
+    role = pennsieve_get_current_user_permissions(selected_dataset_id, get_access_token())["role"]
+    if role not in ["owner", "manager", "editor"]:
+        curatestatus = "Done"
+        raise PennsieveActionNoPermission("You do not have permission to edit or upload to this Pennsieve dataset.")
+
+
+    # the surface layer of the dataset is pulled first; then go through the children to get information on subfolders
+    manifest_dict = {}
+    manifest_error_message = []
+    soda_json_structure["dataset-structure"] = {
+        "files": {},
+        "folders": {},
+    }
+
+    # the root of the dataset is pulled here (high level folders/files are gathered here)
+    # root_folder is the files and folders within root
+    r = requests.get(f"{PENNSIEVE_URL}/datasets/{selected_dataset_id}", headers=create_request_headers(get_access_token()))
+    r.raise_for_status()
+    root_folder = r.json()["children"]
+
+    # Get the amount of files/folders in the dataset
+    r = requests.get(f"{PENNSIEVE_URL}/datasets/{selected_dataset_id}/packageTypeCounts", headers=create_request_headers(get_access_token()))
+    r.raise_for_status()
+    packages_list = r.json()
+
+
+    # root's children files
+    for count in packages_list.values():
+        create_soda_json_total_items += int(count)
+
+    # set the manifest dictionary to an empty dictionary; used to store the manifest information while we import the dataset
+    manifest_dict = {}
+
+
+    # Gather metadata files first
+    for items in root_folder:
+        item_id = items["content"]["id"]
+        item_name = items["content"]["name"]
+
+        # Import manifest at the root of the dataset
+        if item_name in manifest_sparc:
+            # Item is a manifest file
+            df = ""
+            try:
+                if item_name.lower() == "manifest.xlsx":
+                    df = load_metadata_to_dataframe(item_id, "excel", get_access_token())
+                    df = df.fillna("")
+                else:
+                    df = load_metadata_to_dataframe(item_id, "csv", get_access_token())
+                    df = df.fillna("")
+                manifest_dict = df.to_dict()
+            except Exception as e:
+                manifest_error_message.append(item_name)
+
+        # Item is a metadata file
+        if item_name in high_level_metadata_sparc:
+            create_soda_json_progress += 1
+            if "dataset_metadata" not in soda_json_structure.keys():
+                soda_json_structure["dataset_metadata"] = {}
+            soda_json_structure["dataset_metadata"][item_name] = {
+                "location": "ps",
+                "action": ["existing"],
+                "path": item_id,
+            }
+
+    # Process the folder structure
+    for items in root_folder:
+        item_id = items["content"]["id"]
+        item_name = items["content"]["name"]
+
+        # If the package type is Collection, then it is a folder
+        if items["content"]["packageType"] == "Collection" and item_name in high_level_sparc_folders:
+            create_soda_json_progress += 1
+            soda_json_structure["dataset-structure"]["folders"][item_name] = {
+                "location": "ps",
+                "path": item_id,
+                "action": ["existing"],
+                "files": {},
+                "folders": {},
+                "pspath": [item_name],
+            }
+
+            # Check the content of the folder to see if a manifest file exists
+            r = requests.get(f"{PENNSIEVE_URL}/packages/{item_id}", headers=create_request_headers(get_access_token()))
+            r.raise_for_status()
+            folder_content = r.json()["children"]
+
+            if len(folder_content) > 0:
+                high_lvl_folder_dict = soda_json_structure["dataset-structure"]["folders"][item_name]
+
+                createFolderStructure(
+                    high_lvl_folder_dict, manifest_dict
+                )  # Passing the item's JSON and the collection ID
+
+    success_message = (
+        "Data files under valid high-level SPARC folders have been imported"
+    )
+    create_soda_json_completed = 1
+
+    logger.info(f"Time to import {soda_json_structure['ps-dataset-selected']['dataset-name']}")
+    return {
+        "soda_object": soda_json_structure,
+        "success_message": success_message,
+        "manifest_error_message": manifest_error_message,
+        "import_progress": create_soda_json_progress,
+        "import_total_items": create_soda_json_total_items,
+    }
+
+
+create_soda_json_progress = 0
+create_soda_json_total_items = 0
+create_soda_json_completed = 0
+
+
+def create_soda_json_object_backend(
+    soda_json_structure, root_folder_path, irregularFolders, replaced
+):
+    """
+    This function is meant for importing local datasets into SODA.
+    It creates a json object with the structure of the dataset.
+    """
+    global create_soda_json_progress  # amount of items counted during recursion
+    global create_soda_json_total_items  # counts the total items in the folder
+    global create_soda_json_completed  # completed progress is either 0 or 1
+    global METADATA_FILES_SPARC
+
+    high_level_sparc_folders = [
+        "code",
+        "derivative",
+        "docs",
+        "primary",
+        "protocol",
+        "source",
+    ]
+
+    dataset_folder = soda_json_structure["dataset-structure"] = {"folders": {}}
+
+    def recursive_structure_create(dataset_structure, folder_path, root_manifest):
+        global create_soda_json_progress
+        # going within high level folders
+        # add manifest details if a manifest exists
+        manifest_object = {
+            "filename": "",
+            "timestamp": "",
+            "description": "",
+            "file type": "",
+            "entity": "",
+            "data modality": "",
+            "also in dataset": "",
+            "also in dataset path": "",
+            "data dictionary path": "",
+            "entity is transitive": "",
+            "additional-metadata": "",
+        }
+
+        lastSlash = folder_path.rfind("/") + 1
+        folder_name = folder_path[lastSlash:]
+
+        if folder_name in replaced.keys():
+            folder_name = replaced[folder_name]
+
+        # Check if the folder is in irregular folders
+        if folder_path in irregularFolders:
+            index_check = irregularFolders.index(folder_path)
+            modified_name = replaced[os.path.basename(folder_path)]
+            folder_path = irregularFolders[index_check]
+
+
+        entries = os.listdir(folder_path)
+        for entry in entries:
+            item_path = os.path.normpath(os.path.join(folder_path, entry))
+            if os.path.isfile(item_path):
+                # Check the manifest to add metadata
+                if entry[0:1] != "." and entry[0:8] != "manifest":
+                    create_soda_json_progress += 1
+                    # Use the root manifest to find metadata for the file
+                    for row in root_manifest:
+                        extra_columns = False
+                        if len(row) > 11:
+                            extra_columns = True
+                            extra_columns_dict = dict(itertools.islice(row.items(), 5, len(row)))
+
+                        if row["filename"] == entry:
+                            # Add description metadata
+                            manifest_object["description"] = row.get("description", "")
+                            # Add additional metadata
+                            manifest_object["additional-metadata"] = row.get("Additional Metadata", "")
+                            if extra_columns:
+                                manifest_object["extra_columns"] = extra_columns_dict
+
+                    # Create the JSON structure for the file
+                    if "extra_columns" in manifest_object:
+                        dataset_structure["files"][entry] = {
+                            "path": item_path,
+                            "location": "local",
+                            "action": ["existing"],
+                            "description": manifest_object["description"],
+                            "additional-metadata": manifest_object["additional-metadata"],
+                            "extra_columns": manifest_object["extra_columns"],
+                        }
+                    else:
+                        dataset_structure["files"][entry] = {
+                            "path": item_path,
+                            "location": "local",
+                            "action": ["existing"],
+                            "description": manifest_object["description"],
+                            "additional-metadata": manifest_object["additional-metadata"],
+                        }
+            elif os.path.isdir(item_path) is True:
+                create_soda_json_progress += 1
+                if item_path in irregularFolders:
+                    index_check = irregularFolders.index(item_path)
+                    modified_name = replaced[os.path.basename(item_path)]
+
+                    dataset_structure["folders"][modified_name] = {
+                        "folders": {},
+                        "files": {},
+                        "path": item_path,
+                        "location": "local",
+                        "action": ["existing"],
+                        "original-name": entry,
+                    }
+                    for folder in dataset_structure["folders"][modified_name][
+                        "folders"
+                    ]:
+                        updated_path = dataset_structure["folders"][modified_name][
+                            folder
+                        ]["path"]
+                        recursive_structure_create(
+                            dataset_structure["folders"][modified_name][folder],
+                            updated_path,
+                            root_manifest
+                        )
+                else:
+                    dataset_structure["folders"][entry] = {
+                        "folders": {},
+                        "files": {},
+                        "path": item_path,
+                        "location": "local",
+                        "action": ["existing"],
+                    }
+
+        for folder in dataset_structure["folders"]:
+            updated_path = dataset_structure["folders"][folder]["path"]
+            recursive_structure_create(
+                dataset_structure["folders"][folder], updated_path, root_manifest
+            )
+
+    # BEGIN
+
+    # Check for a single manifest file at the root of the dataset
+    root_manifest_csv = os.path.join(root_folder_path, "manifest.csv")
+    root_manifest_xlsx = os.path.join(root_folder_path, "manifest.xlsx")
+
+    soda_json_structure["starting-point"]["manifest"] = {}
+
+    if os.path.exists(root_manifest_csv):
+        csv_data = pd.read_csv(root_manifest_csv)
+        csv_data.fillna("", inplace=True)
+        json_format = csv_data.to_dict(orient="records")
+        soda_json_structure["starting-point"]["manifest"] = json_format
+        soda_json_structure["starting-point"]["path"] = root_manifest_csv
+    elif os.path.exists(root_manifest_xlsx):
+        excel_data = pd.read_excel(root_manifest_xlsx, sheet_name="Sheet1")
+        excel_data.fillna("", inplace=True)
+        json_format = excel_data.to_dict(orient="records")
+        soda_json_structure["starting-point"]["manifest"] = json_format
+        soda_json_structure["starting-point"]["path"] = root_manifest_xlsx
+
+
+
+    # count the amount of items in the folder
+    create_soda_json_total_items = 0
+    for root, dirs, filenames in os.walk(root_folder_path):
+        # walk through all folders and their subfolders
+        for Dir in dirs:
+            # does not count hidden folders or manifest folders
+            if Dir[0:1] != "." and Dir[0:8] != "manifest":
+                create_soda_json_total_items += 1
+        for fileName in filenames:
+            if root == root_folder_path and fileName in METADATA_FILES_SPARC:
+                # goes through all files and does not count hidden files
+                create_soda_json_total_items += 1
+            else:
+                if fileName[0:1] != ".":
+                    create_soda_json_total_items += 1
+
+    # reading high level folders
+    create_soda_json_completed = 0
+    create_soda_json_progress = 0
+    entries = os.listdir(root_folder_path)
+
+
+    for entry in entries:
+        # begin going through high level folders
+        item_path = os.path.normpath(os.path.join(root_folder_path, entry))
+        # high level folder paths
+        if os.path.isfile(item_path) is True:
+            if entry[0:1] != "." and entry in METADATA_FILES_SPARC:
+                # is not a hidden file
+                create_soda_json_progress += 1
+                soda_json_structure["dataset_metadata"][entry] = {
+                    "path": item_path,
+                    "location": "local",
+                    "action": ["existing"],
+                }
+            # do file work here
+        elif os.path.isdir(item_path) is True:
+            create_soda_json_progress += 1
+            # add the item to soda
+            if item_path in irregularFolders:
+                index_check = irregularFolders.index(item_path)
+                modified_name = replaced[index_check]
+                folder_name = modified_name
+                dataset_folder["folders"][folder_name] = {
+                    "folders": {},
+                    "files": {},
+                    "path": item_path,
+                    "location": "local",
+                    "action": ["existing"],
+                    "original-basename": item_path[(item_path.rfind("/") + 1):],
+                }
+            else:
+                if entry in high_level_sparc_folders:
+                    dataset_folder["folders"][entry] = {
+                        "folders": {},
+                        "files": {},
+                        "path": item_path,
+                        "location": "local",
+                        "action": ["existing"],
+                    }
+                    soda_json_structure["starting-point"][entry] = {"path": ""}
+
+    for folder in dataset_folder["folders"]:
+        # go through high level folders again
+        high_lvl_path = root_folder_path + "/" + folder
+        recursive_structure_create(dataset_folder["folders"][folder], high_lvl_path, soda_json_structure["starting-point"]["manifest"])
+
+    create_soda_json_completed = 1
+    return soda_json_structure
+
+
+def monitor_local_json_progress():
+    """
+    Function for monitoring the progress of json object creation.
+    Used for the progress bar.
+    """
+    global create_soda_json_completed
+    global create_soda_json_total_items
+    global create_soda_json_progress
+    progress_percentage = (
+        create_soda_json_progress / create_soda_json_total_items
+    ) * 100
+
+    return {
+        "create_soda_json_progress": create_soda_json_progress,
+        "create_soda_json_total_items": create_soda_json_total_items,
+        "progress_percentage": progress_percentage,
+        "create_soda_json_completed": create_soda_json_completed
+    }
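For orientation, both importers above assemble the same nested "dataset-structure" object. The shape below is inferred from the assignments in import_dataset.py; the IDs and file name are illustrative only, not real identifiers:

    # Sketch of soda_json_structure["dataset-structure"] as built by
    # import_pennsieve_dataset; "location" is "local" and "path" is a
    # filesystem path when built by create_soda_json_object_backend instead.
    dataset_structure = {
        "files": {},
        "folders": {
            "primary": {
                "location": "ps",
                "path": "N:collection:xxxx",   # hypothetical Pennsieve collection ID
                "action": ["existing"],
                "pspath": ["primary"],         # breadcrumb of folder names from the dataset root
                "folders": {},
                "files": {
                    "recording.nii.gz": {
                        "action": ["existing"],
                        "path": "N:package:xxxx",   # hypothetical Pennsieve package ID
                        "pspath": ["primary"],
                        "timestamp": "2024-01-01T00:00:00,000Z",
                        "location": "ps",
                        "additional-metadata": "",
                        "description": "",
                    },
                },
            },
        },
    }
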
pysoda/core/metadata/__init__.py
@@ -0,0 +1,20 @@
+from .submission import create_excel
+from .dataset_description import create_excel
+from .text_metadata import create_text_file
+from .code_description import create_excel
+from .manifest_package import create_high_level_manifest_files, get_auto_generated_manifest_files, load_metadata_to_dataframe, create_high_lvl_manifest_files_existing_ps_starting_point
+from .manifest import create_excel, load_existing_manifest_file
+from .resources import create_excel
+from .performances import create_excel
+from .submission import create_excel
+from .sites import create_excel
+from .text_metadata import create_text_file
+from .constants import (
+    SDS_FILE_RESOURCES,
+    SDS_FILE_PERFORMANCES,
+    SDS_FILE_MANIFEST,
+    SDS_FILE_SITES,
+    SDS_FILE_CODE_DESCRIPTION,
+    SDS_FILE_DATASET_DESCRIPTION,
+    METADATA_UPLOAD_PS_PATH
+)