pysodafair-0.1.62-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. pysoda/__init__.py +0 -0
  2. pysoda/constants.py +3 -0
  3. pysoda/core/__init__.py +10 -0
  4. pysoda/core/dataset_generation/__init__.py +11 -0
  5. pysoda/core/dataset_generation/manifestSession/__init__.py +1 -0
  6. pysoda/core/dataset_generation/manifestSession/manifest_session.py +146 -0
  7. pysoda/core/dataset_generation/upload.py +3951 -0
  8. pysoda/core/dataset_importing/__init__.py +1 -0
  9. pysoda/core/dataset_importing/import_dataset.py +662 -0
  10. pysoda/core/metadata/__init__.py +20 -0
  11. pysoda/core/metadata/code_description.py +109 -0
  12. pysoda/core/metadata/constants.py +32 -0
  13. pysoda/core/metadata/dataset_description.py +188 -0
  14. pysoda/core/metadata/excel_utils.py +41 -0
  15. pysoda/core/metadata/helpers.py +250 -0
  16. pysoda/core/metadata/manifest.py +112 -0
  17. pysoda/core/metadata/manifest_package/__init__.py +2 -0
  18. pysoda/core/metadata/manifest_package/manifest.py +0 -0
  19. pysoda/core/metadata/manifest_package/manifest_import.py +29 -0
  20. pysoda/core/metadata/manifest_package/manifest_writer.py +666 -0
  21. pysoda/core/metadata/performances.py +46 -0
  22. pysoda/core/metadata/resources.py +53 -0
  23. pysoda/core/metadata/samples.py +184 -0
  24. pysoda/core/metadata/sites.py +51 -0
  25. pysoda/core/metadata/subjects.py +172 -0
  26. pysoda/core/metadata/submission.py +91 -0
  27. pysoda/core/metadata/text_metadata.py +47 -0
  28. pysoda/core/metadata_templates/CHANGES +1 -0
  29. pysoda/core/metadata_templates/LICENSE +1 -0
  30. pysoda/core/metadata_templates/README.md +4 -0
  31. pysoda/core/metadata_templates/__init__.py +0 -0
  32. pysoda/core/metadata_templates/code_description.xlsx +0 -0
  33. pysoda/core/metadata_templates/code_parameters.xlsx +0 -0
  34. pysoda/core/metadata_templates/dataset_description.xlsx +0 -0
  35. pysoda/core/metadata_templates/manifest.xlsx +0 -0
  36. pysoda/core/metadata_templates/performances.xlsx +0 -0
  37. pysoda/core/metadata_templates/resources.xlsx +0 -0
  38. pysoda/core/metadata_templates/samples.xlsx +0 -0
  39. pysoda/core/metadata_templates/sites.xlsx +0 -0
  40. pysoda/core/metadata_templates/subjects.xlsx +0 -0
  41. pysoda/core/metadata_templates/subjects_pools_samples_structure.xlsx +0 -0
  42. pysoda/core/metadata_templates/subjects_pools_samples_structure_example.xlsx +0 -0
  43. pysoda/core/metadata_templates/submission.xlsx +0 -0
  44. pysoda/core/permissions/__init__.py +1 -0
  45. pysoda/core/permissions/permissions.py +31 -0
  46. pysoda/core/pysoda/__init__.py +2 -0
  47. pysoda/core/pysoda/soda.py +34 -0
  48. pysoda/core/pysoda/soda_object.py +55 -0
  49. pysoda/core/upload_manifests/__init__.py +1 -0
  50. pysoda/core/upload_manifests/upload_manifests.py +37 -0
  51. pysoda/schema/__init__.py +0 -0
  52. pysoda/schema/code_description.json +629 -0
  53. pysoda/schema/dataset_description.json +295 -0
  54. pysoda/schema/manifest.json +60 -0
  55. pysoda/schema/performances.json +44 -0
  56. pysoda/schema/resources.json +39 -0
  57. pysoda/schema/samples.json +97 -0
  58. pysoda/schema/sites.json +38 -0
  59. pysoda/schema/soda_schema.json +664 -0
  60. pysoda/schema/subjects.json +131 -0
  61. pysoda/schema/submission_schema.json +28 -0
  62. pysoda/utils/__init__.py +9 -0
  63. pysoda/utils/authentication.py +381 -0
  64. pysoda/utils/config.py +68 -0
  65. pysoda/utils/exceptions.py +156 -0
  66. pysoda/utils/logger.py +6 -0
  67. pysoda/utils/metadata_utils.py +74 -0
  68. pysoda/utils/pennsieveAgentUtils.py +11 -0
  69. pysoda/utils/pennsieveUtils.py +118 -0
  70. pysoda/utils/profile.py +28 -0
  71. pysoda/utils/schema_validation.py +133 -0
  72. pysoda/utils/time_utils.py +5 -0
  73. pysoda/utils/upload_utils.py +108 -0
  74. pysodafair-0.1.62.dist-info/METADATA +190 -0
  75. pysodafair-0.1.62.dist-info/RECORD +77 -0
  76. pysodafair-0.1.62.dist-info/WHEEL +4 -0
  77. pysodafair-0.1.62.dist-info/licenses/LICENSE +21 -0
pysoda/core/dataset_importing/__init__.py
@@ -0,0 +1 @@
1
+ from .import_dataset import import_pennsieve_dataset, create_soda_json_object_backend, monitor_local_json_progress
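
With this re-export in place, the importer's entry points can be used directly from the subpackage, for example:

    from pysoda.core.dataset_importing import (
        import_pennsieve_dataset,
        create_soda_json_object_backend,
        monitor_local_json_progress,
    )
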
pysoda/core/dataset_importing/import_dataset.py
@@ -0,0 +1,662 @@
1
+ import os
2
+ import requests
3
+ import pandas as pd
4
+ import itertools
5
+ from ...constants import PENNSIEVE_URL
6
+ from ...utils import create_request_headers, get_access_token, get_dataset_id, PennsieveActionNoPermission
7
+ from ..permissions import pennsieve_get_current_user_permissions
8
+ from ..metadata import load_metadata_to_dataframe
9
+
10
+ from .. import logger
11
+
12
+
13
+ METADATA_FILES_SPARC = [
14
+ "submission.xlsx",
15
+ "submission.csv",
16
+ "submission.json",
17
+ "dataset_description.xlsx",
18
+ "dataset_description.csv",
19
+ "dataset_description.json",
20
+ "subjects.xlsx",
21
+ "subjects.csv",
22
+ "subjects.json",
23
+ "samples.xlsx",
24
+ "samples.csv",
25
+ "samples.json",
26
+ "README.txt",
27
+ "CHANGES.txt",
28
+ "code_description.xlsx",
29
+ "inputs_metadata.xlsx",
30
+ "outputs_metadata.xlsx",
31
+ "manifest.xlsx",
32
+ "manifest.csv"
33
+ ]
34
+
35
+
36
+ def import_pennsieve_dataset(soda_json_structure, requested_sparc_only=True):
37
+ global logger
38
+ high_level_sparc_folders = [
39
+ "code",
40
+ "derivative",
41
+ "docs",
42
+ "primary",
43
+ "protocol",
44
+ "source",
45
+ ]
46
+ manifest_sparc = ["manifest.xlsx", "manifest.csv"]
47
+ high_level_metadata_sparc = [
48
+ "submission.xlsx",
49
+ "submission.csv",
50
+ "submission.json",
51
+ "dataset_description.xlsx",
52
+ "dataset_description.csv",
53
+ "dataset_description.json",
54
+ "subjects.xlsx",
55
+ "subjects.csv",
56
+ "subjects.json",
57
+ "samples.xlsx",
58
+ "samples.csv",
59
+ "samples.json",
60
+ "README.txt",
61
+ "CHANGES.txt",
62
+ "code_description.xlsx",
63
+ "inputs_metadata.xlsx",
64
+ "outputs_metadata.xlsx",
65
+ ]
66
+ double_extensions = [
67
+ ".ome.tiff",
68
+ ".ome.tif",
69
+ ".ome.tf2,",
70
+ ".ome.tf8",
71
+ ".ome.btf",
72
+ ".ome.xml",
73
+ ".brukertiff.gz",
74
+ ".mefd.gz",
75
+ ".moberg.gz",
76
+ ".nii.gz",
77
+ ".mgh.gz",
78
+ ".tar.gz",
79
+ ".bcl.gz",
80
+ ]
81
+
82
+ global create_soda_json_completed
83
+ global create_soda_json_total_items
84
+ global create_soda_json_progress
85
+ create_soda_json_progress = 0
86
+ create_soda_json_total_items = 0
87
+ create_soda_json_completed = 0
88
+
89
+ # ["extensions"] doesn't seem to be returned by the Pennsieve API anymore
90
+ def verify_file_name(file_name, extension):
91
+ global logger
92
+ if extension == "":
93
+ return file_name
94
+
95
+ double_ext = False
96
+ for ext in double_extensions:
97
+ if file_name.find(ext) != -1:
98
+ double_ext = True
99
+ break
100
+
101
+ extension_from_name = ""
102
+
103
+ if double_ext == False:
104
+ extension_from_name = os.path.splitext(file_name)[1]
105
+ else:
106
+ extension_from_name = (
107
+ os.path.splitext(os.path.splitext(file_name)[0])[1]
108
+ + os.path.splitext(file_name)[1]
109
+ )
110
+
111
+ if extension_from_name == ("." + extension):
112
+ return file_name
113
+ else:
114
+ return file_name + ("." + extension)
115
+
116
+
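# Illustrative behavior of verify_file_name above (examples traced from the logic; file names are placeholders):
#   verify_file_name("slices.ome", "tiff") -> "slices.ome.tiff"  (".ome" alone is not the full extension, so ".tiff" is appended)
#   verify_file_name("readme.txt", "txt")  -> "readme.txt"       (extension already present, returned unchanged)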
117
+ def createFolderStructure(subfolder_json, manifest):
118
+ """
119
+ Function for creating the Pennsieve folder structure for a given dataset as an object stored locally.
120
+ Arguments:
121
+ subfolder_json: The json object containing the folder structure of the dataset
122
+ manifest: The manifest object for the dataset
124
+ """
125
+ # the root-level caller passes each subfolder into this function, which recursively descends into nested subfolders while building the json structure
126
+ global logger
127
+ global create_soda_json_progress
128
+
129
+ collection_id = subfolder_json["path"]
130
+
131
+ limit = 100
132
+ offset = 0
133
+ subfolder = []
134
+ while True:
135
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{collection_id}?limit={limit}&offset={offset}", headers=create_request_headers(get_access_token()))
136
+ r.raise_for_status()
137
+ page = r.json()["children"]
138
+ subfolder.extend(page)
139
+ if len(page) < limit:
140
+ break
141
+ offset += limit
142
+
143
+ for items in subfolder:
144
+ folder_item_name = items["content"]["name"]
145
+ create_soda_json_progress += 1
146
+ item_id = items["content"]["id"]
147
+ # item is a file; check whether there is additional manifest information to attach to it
148
+ if item_id[2:9] == "package":
149
+ if (
150
+ folder_item_name[0:8] != "manifest"
151
+ ): # manifest files are not being included in json structure
152
+
153
+ # verify file name first (used for legacy Pennsieve datasets)
154
+ if("extension" not in subfolder):
155
+ folder_item_name = verify_file_name(folder_item_name, "")
156
+ else:
157
+ folder_item_name = verify_file_name(folder_item_name, items["extension"])
158
+
159
+ # normalize the timestamp format ('.' replaced with ',')
160
+ timestamp = items["content"]["createdAt"].replace('.', ',')
161
+
162
+ paths_list = [*subfolder_json["pspath"]]
163
+ subfolder_json["files"][folder_item_name] = {
164
+ "action": ["existing"],
165
+ "path": item_id,
166
+ "pspath": paths_list,
167
+ "timestamp": timestamp,
168
+ "location": "ps",
169
+ "additional-metadata": "",
170
+ "description": "",
171
+ }
172
+
173
+ # creates path for folder_item_name (stored in temp_name)
174
+ if len(subfolder_json["files"][folder_item_name]["pspath"]) > 1:
175
+ temp_name = '/'.join(subfolder_json["files"][folder_item_name]["pspath"][1:]) + "/" + folder_item_name
176
+ else:
177
+ temp_name = folder_item_name
178
+
179
+ if len(manifest.keys()) > 0:
180
+ # Dictionary that has the required manifest headers in lowercase and without spaces as keys
181
+ # and the correct manifest headers as values
182
+ defaultManifestHeadersNameMapped = {
183
+ "filename": "filename",
184
+ "timestamp": "timestamp",
185
+ "description": "description",
186
+ "filetype": "file type",
187
+ "entity": "entity",
188
+ "datamodality": "data modality",
189
+ "alsoindataset": "also in dataset",
190
+ "alsoindatasetpath": "also in dataset path",
191
+ "datadictionarypath": "data dictionary path",
192
+ "entityistransitive": "entity is transitive",
193
+ "additionalmetadata": "additional-metadata",
194
+ }
195
+
196
+ # Dictionary that will be used to store the correct manifest headers as keys
197
+ # and the values from the manifest as values
198
+ updated_manifest = {}
199
+
200
+ # Go through the imported manifest keys and change the keys to the correct name
201
+ # For example if the key is "File Name" change it to "filename"
202
+ for manifestKey in manifest.keys():
203
+ # Make the key lowercase
204
+ sterilizedKeyName = manifestKey.lower().replace(" ", "")
205
+ if sterilizedKeyName in defaultManifestHeadersNameMapped.keys():
206
+ # change the key to the correct name
207
+ # For example if the key name is "filetype" change it to "file type"
208
+ newManifestKeyName = defaultManifestHeadersNameMapped[sterilizedKeyName]
209
+ # Add the new key/value to the updated manifest
210
+ updated_manifest[newManifestKeyName] = manifest[manifestKey]
211
+ else:
212
+ # Keep the key/value the same and add it to the updated manifest
213
+ updated_manifest[manifestKey] = manifest[manifestKey]
214
+
215
+ if "filename" in updated_manifest.keys():
216
+ for manifestKey in updated_manifest.keys():
217
+ location_index = ""
218
+ # get the index of the file name in the manifest
219
+ if (temp_name in updated_manifest["filename"].values()):
220
+ location_index = list(updated_manifest["filename"].values()).index(
221
+ temp_name
222
+ )
223
+ # This is for the case where the file name in the manifest has a slash at the beginning
224
+ # which is the case for files in the root folder
225
+ elif ("/" + temp_name in updated_manifest["filename"].values()):
226
+ location_index = list(updated_manifest["filename"].values()).index(
227
+ "/" + temp_name
228
+ )
229
+ else:
230
+ # break out of the for loop if the file name is not in the manifest
231
+ break
232
+
233
+ # check if the key is one of the required manifest headers; if it is, update the folder_item_name value
234
+ # corresponding to the key
235
+ if manifestKey in defaultManifestHeadersNameMapped.values():
236
+ if updated_manifest[manifestKey][location_index] != "":
237
+ if folder_item_name[0:1] == "/":
238
+ subfolder_json["files"][folder_item_name[:1]][manifestKey] = updated_manifest[manifestKey][location_index]
239
+ else:
240
+ subfolder_json["files"][folder_item_name][manifestKey] = updated_manifest[manifestKey][location_index]
241
+ # if the key is not one of the required manifest headers, add it to the file's extra_columns entry
242
+ else:
243
+ # if the extra columns key does not exist, create it
244
+ if "extra_columns" not in subfolder_json["files"][folder_item_name]:
245
+ subfolder_json["files"][folder_item_name]["extra_columns"] = {}
246
+
247
+ if updated_manifest[manifestKey][location_index] != "":
248
+ subfolder_json["files"][folder_item_name]["extra_columns"][manifestKey] = updated_manifest[manifestKey][location_index]
249
+ else:
250
+ subfolder_json["files"][folder_item_name]["extra_columns"][manifestKey] = ""
251
+ else:
252
+ # "filename" is not in the updated manifest, so recreate the standard headers if they don't exist
253
+ # loop through the updated manifest keys; copy values whose header matches a standard header and store the rest as extra columns
254
+ if len(updated_manifest.keys()) > 0:
255
+ location_index = ""
256
+ for manifestKey in updated_manifest.keys():
257
+ if temp_name in updated_manifest[manifestKey].values():
258
+ # file_names found
259
+ location_index = list(updated_manifest[manifestKey].values()).index(
260
+ temp_name
261
+ )
262
+ if ("/" + temp_name in updated_manifest[manifestKey].values()):
263
+ location_index = list(updated_manifest[manifestKey].values()).index(
264
+ "/" + temp_name
265
+ )
266
+ if location_index != "":
267
+ if manifestKey in defaultManifestHeadersNameMapped.values():
268
+ if folder_item_name[0:1] == "/":
269
+ subfolder_json["files"][folder_item_name[1:]][manifestKey] = updated_manifest[manifestKey][location_index]
270
+ else:
271
+ subfolder_json["files"][folder_item_name][manifestKey] = updated_manifest[manifestKey][location_index]
272
+ else:
273
+ if "extra_columns" not in subfolder_json["files"][folder_item_name]:
274
+ subfolder_json["files"][folder_item_name]["extra_columns"] = {}
275
+ subfolder_json["files"][folder_item_name]["extra_columns"][manifestKey] = updated_manifest[manifestKey][location_index]
276
+
277
+ else: # another subfolder found
278
+ paths_list = [*subfolder_json["pspath"], folder_item_name]
279
+ subfolder_json["folders"][folder_item_name] = {
280
+ "action": ["existing"],
281
+ "path": item_id,
282
+ "pspath": paths_list,
283
+ "files": {},
284
+ "folders": {},
285
+ "location": "ps",
286
+ }
287
+
288
+ if len(subfolder_json["folders"].keys()) != 0: # there are subfolders
289
+ for folder in subfolder_json["folders"].keys():
290
+ subfolder = subfolder_json["folders"][folder]
291
+ createFolderStructure(subfolder, manifest)
292
+
293
+
294
+ # check that the Pennsieve dataset is valid
295
+ try:
296
+ bf_dataset_name = soda_json_structure["ps-dataset-selected"]["dataset-name"]
297
+ except Exception as e:
298
+ raise e
299
+
300
+ selected_dataset_id = get_dataset_id(bf_dataset_name)
301
+
302
+ # check that the user has permission to edit this dataset
303
+ role = pennsieve_get_current_user_permissions(selected_dataset_id, get_access_token())["role"]
304
+ if role not in ["owner", "manager", "editor"]:
305
+ curatestatus = "Done"
306
+ raise PennsieveActionNoPermission("You do not have permission to edit this Pennsieve dataset.")
307
+
308
+
309
+ # the surface layer of the dataset is pulled first; then go through the children to get information on subfolders
310
+ manifest_dict = {}
311
+ manifest_error_message = []
312
+ soda_json_structure["dataset-structure"] = {
313
+ "files": {},
314
+ "folders": {},
315
+ }
316
+
317
+ # root of dataset is pulled here (high level folders/files are gathered here)
318
+ # root_folder is the files and folders within root
319
+ r = requests.get(f"{PENNSIEVE_URL}/datasets/{selected_dataset_id}", headers=create_request_headers(get_access_token()))
320
+ r.raise_for_status()
321
+ root_folder = r.json()["children"]
322
+
323
+ # Get the number of files/folders in the dataset
324
+ r = requests.get(f"{PENNSIEVE_URL}/datasets/{selected_dataset_id}/packageTypeCounts", headers=create_request_headers(get_access_token()))
325
+ r.raise_for_status()
326
+ packages_list = r.json()
327
+
328
+
329
+ # root's children files
330
+ for count in packages_list.values():
331
+ create_soda_json_total_items += int(count)
332
+
333
+ # set the manifest dictionary to an empty dictionary; it stores the manifest information while we import the dataset
334
+ manifest_dict = {}
335
+
336
+
337
+ # Gather metadata files first
338
+ for items in root_folder:
339
+ item_id = items["content"]["id"]
340
+ item_name = items["content"]["name"]
341
+
342
+ # Import manifest at the root of the dataset
343
+ if item_name in manifest_sparc:
344
+ # Item is a manifest file
345
+ df = ""
346
+ try:
347
+ if item_name.lower() == "manifest.xlsx":
348
+ df = load_metadata_to_dataframe(item_id, "excel", get_access_token())
349
+ df = df.fillna("")
350
+ else:
351
+ df = load_metadata_to_dataframe(item_id, "csv", get_access_token())
352
+ df = df.fillna("")
353
+ manifest_dict = df.to_dict()
354
+ except Exception as e:
355
+ manifest_error_message.append(item_name)
356
+
357
+ # Item is a metadata file
358
+ if item_name in high_level_metadata_sparc:
359
+ create_soda_json_progress += 1
360
+ if "dataset_metadata" not in soda_json_structure.keys():
361
+ soda_json_structure["dataset_metadata"] = {}
362
+ soda_json_structure["dataset_metadata"][item_name] = {
363
+ "location": "ps",
364
+ "action": ["existing"],
365
+ "path": item_id,
366
+ }
367
+
368
+ # Process the folder structure
369
+ for items in root_folder:
370
+ item_id = items["content"]["id"]
371
+ item_name = items["content"]["name"]
372
+
373
+ # If package type is Collection, then it is a folder
374
+ if items["content"]["packageType"] == "Collection" and item_name in high_level_sparc_folders:
375
+ create_soda_json_progress += 1
376
+ soda_json_structure["dataset-structure"]["folders"][item_name] = {
377
+ "location": "ps",
378
+ "path": item_id,
379
+ "action": ["existing"],
380
+ "files": {},
381
+ "folders": {},
382
+ "pspath": [item_name],
383
+ }
384
+
385
+ # Check the content of the folder to see if a manifest file exists
386
+ r = requests.get(f"{PENNSIEVE_URL}/packages/{item_id}", headers=create_request_headers(get_access_token()))
387
+ r.raise_for_status()
388
+ folder_content = r.json()["children"]
389
+
390
+ if len(folder_content) > 0:
391
+ high_lvl_folder_dict = soda_json_structure["dataset-structure"]["folders"][item_name]
392
+
393
+ createFolderStructure(
394
+ high_lvl_folder_dict, manifest_dict
395
+ ) # Passing the folder's JSON and the manifest dictionary
396
+
397
+ success_message = (
398
+ "Data files under a valid high-level SPARC folders have been imported"
399
+ )
400
+ create_soda_json_completed = 1
401
+
402
+ logger.info(f"Time to import {soda_json_structure['ps-dataset-selected']['dataset-name']} ")
403
+ return {
404
+ "soda_object": soda_json_structure,
405
+ "success_message": success_message,
406
+ "manifest_error_message": manifest_error_message,
407
+ "import_progress": create_soda_json_progress,
408
+ "import_total_items": create_soda_json_total_items,
409
+ }
410
+
411
+
412
+ create_soda_json_progress = 0
413
+ create_soda_json_total_items = 0
414
+ create_soda_json_completed = 0
415
+
416
+
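# A minimal, hypothetical call of import_pennsieve_dataset defined above (requires a linked
# Pennsieve account/API key; "my-sparc-dataset" is a placeholder dataset name):
#
#   soda = {"ps-dataset-selected": {"dataset-name": "my-sparc-dataset"}}
#   result = import_pennsieve_dataset(soda)
#   print(result["success_message"], result["import_total_items"])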
417
+ def create_soda_json_object_backend(
418
+ soda_json_structure, root_folder_path, irregularFolders, replaced
419
+ ):
420
+ """
421
+ This function is meant for importing local datasets into SODA.
422
+ It creates a json object with the structure of the dataset.
423
+ """
424
+ global create_soda_json_progress # amount of items counted during recursion
425
+ global create_soda_json_total_items # counts the total items in folder
426
+ global create_soda_json_completed # completed progress is either 0 or 1
427
+ global METADATA_FILES_SPARC
428
+
429
+ high_level_sparc_folders = [
430
+ "code",
431
+ "derivative",
432
+ "docs",
433
+ "primary",
434
+ "protocol",
435
+ "source",
436
+ ]
437
+
438
+ dataset_folder = soda_json_structure["dataset-structure"] = {"folders": {}}
439
+
440
+ def recursive_structure_create(dataset_structure, folder_path, root_manifest):
441
+ global create_soda_json_progress
442
+ # going within high level folders
443
+ # add manifest details if manifest exists
444
+ manifest_object = {
445
+ "filename": "",
446
+ "timestamp": "",
447
+ "description": "",
448
+ "file type": "",
449
+ "entity": "",
450
+ "data modality": "",
451
+ "also in dataset": "",
452
+ "also in dataset path": "",
453
+ "data dictionary path": "",
454
+ "entity is transitive": "",
455
+ "additional-metadata": "",
456
+ }
457
+
458
+ lastSlash = folder_path.rfind("/") + 1
459
+ folder_name = folder_path[lastSlash:]
460
+
461
+ if folder_name in replaced.keys():
462
+ folder_name = replaced[folder_name]
463
+
464
+ # Check if folder is in irregular folders
465
+ if folder_path in irregularFolders:
466
+ index_check = irregularFolders.index(folder_path)
467
+ modified_name = replaced[os.path.basename(folder_path)]
468
+ folder_path = irregularFolders[index_check]
469
+
470
+
471
+ entries = os.listdir(folder_path)
472
+ for entry in entries:
473
+ item_path = os.path.normpath(os.path.join(folder_path, entry))
474
+ if os.path.isfile(item_path):
475
+ # Check manifest to add metadata
476
+ if entry[0:1] != "." and entry[0:8] != "manifest":
477
+ create_soda_json_progress += 1
478
+ # Use the root manifest to find metadata for the file
479
+ for row in root_manifest:
480
+ extra_columns = False
481
+ if len(row) > 11:
482
+ extra_columns = True
483
+ extra_columns_dict = dict(itertools.islice(row.items(), 5, len(row)))
484
+
485
+ if row["filename"] == entry:
486
+ # Add description metadata
487
+ manifest_object["description"] = row.get("description", "")
488
+ # Add additional metadata
489
+ manifest_object["additional-metadata"] = row.get("Additional Metadata", "")
490
+ if extra_columns:
491
+ manifest_object["extra_columns"] = extra_columns_dict
492
+
493
+ # Create JSON structure for the file
494
+ if "extra_columns" in manifest_object:
495
+ dataset_structure["files"][entry] = {
496
+ "path": item_path,
497
+ "location": "local",
498
+ "action": ["existing"],
499
+ "description": manifest_object["description"],
500
+ "additional-metadata": manifest_object["additional-metadata"],
501
+ "extra_columns": manifest_object["extra_columns"],
502
+ }
503
+ else:
504
+ dataset_structure["files"][entry] = {
505
+ "path": item_path,
506
+ "location": "local",
507
+ "action": ["existing"],
508
+ "description": manifest_object["description"],
509
+ "additional-metadata": manifest_object["additional-metadata"],
510
+ }
511
+ elif os.path.isdir(item_path) is True:
512
+ create_soda_json_progress += 1
513
+ if item_path in irregularFolders:
514
+ index_check = irregularFolders.index(item_path)
515
+ modified_name = replaced[os.path.basename(item_path)]
516
+
517
+ dataset_structure["folders"][modified_name] = {
518
+ "folders": {},
519
+ "files": {},
520
+ "path": item_path,
521
+ "location": "local",
522
+ "action": ["existing"],
523
+ "original-name": entry,
524
+ }
525
+ for folder in dataset_structure["folders"][modified_name][
526
+ "folders"
527
+ ]:
528
+ updated_path = dataset_structure["folders"][modified_name][
529
+ folder
530
+ ]["path"]
531
+ recursive_structure_create(
532
+ dataset_structure["folders"][modified_name][folder],
533
+ updated_path,
534
+ root_manifest
535
+ )
536
+ else:
537
+ dataset_structure["folders"][entry] = {
538
+ "folders": {},
539
+ "files": {},
540
+ "path": item_path,
541
+ "location": "local",
542
+ "action": ["existing"],
543
+ }
544
+
545
+ for folder in dataset_structure["folders"]:
546
+ updated_path = dataset_structure["folders"][folder]["path"]
547
+ recursive_structure_create(
548
+ dataset_structure["folders"][folder], updated_path, root_manifest
549
+ )
550
+
551
+ # BEGIN
552
+
553
+ # Check for a single manifest file at the root of the dataset
554
+ root_manifest_csv = os.path.join(root_folder_path, "manifest.csv")
555
+ root_manifest_xlsx = os.path.join(root_folder_path, "manifest.xlsx")
556
+
557
+ soda_json_structure["starting-point"]["manifest"] = {}
558
+
559
+ if os.path.exists(root_manifest_csv):
560
+ csv_data = pd.read_csv(root_manifest_csv)
561
+ csv_data.fillna("", inplace=True)
562
+ json_format = csv_data.to_dict(orient="records")
563
+ soda_json_structure["starting-point"]["manifest"] = json_format
564
+ soda_json_structure["starting-point"]["path"] = root_manifest_csv
565
+ elif os.path.exists(root_manifest_xlsx):
566
+ excel_data = pd.read_excel(root_manifest_xlsx, sheet_name="Sheet1")
567
+ excel_data.fillna("", inplace=True)
568
+ json_format = excel_data.to_dict(orient="records")
569
+ soda_json_structure["starting-point"]["manifest"] = json_format
570
+ soda_json_structure["starting-point"]["path"] = root_manifest_xlsx
571
+
572
+
573
+
574
+ # count the number of items in the folder
575
+ create_soda_json_total_items = 0
576
+ for root, dirs, filenames in os.walk(root_folder_path):
577
+ # walk through all folders and their subfolders
578
+ for Dir in dirs:
579
+ # skip hidden folders and manifest folders
580
+ if Dir[0:1] != "." and Dir[0:8] != "manifest":
581
+ create_soda_json_total_items += 1
582
+ for fileName in filenames:
583
+ if root == root_folder_path and fileName in METADATA_FILES_SPARC:
584
+ # count recognized SPARC metadata files at the dataset root
585
+ create_soda_json_total_items += 1
586
+ else:
587
+ if fileName[0:1] != ".":
588
+ create_soda_json_total_items += 1
589
+
590
+ # reading high level folders
591
+ create_soda_json_completed = 0
592
+ create_soda_json_progress = 0
593
+ entries = os.listdir(root_folder_path)
594
+
595
+
596
+ for entry in entries:
597
+ # begin going through high level folders
598
+ item_path = os.path.normpath(os.path.join(root_folder_path, entry))
599
+ # high level folder paths
600
+ if os.path.isfile(item_path) is True:
601
+ if entry[0:1] != "." and entry in METADATA_FILES_SPARC:
602
+ # a non-hidden, recognized SPARC metadata file
603
+ create_soda_json_progress += 1
604
+ soda_json_structure["dataset_metadata"][entry] = {
605
+ "path": item_path,
606
+ "location": "local",
607
+ "action": ["existing"],
608
+ }
609
+ # do file work here
610
+ elif os.path.isdir(item_path) is True:
611
+ create_soda_json_progress += 1
612
+ # add item to soda
613
+ if item_path in irregularFolders:
614
+ index_check = irregularFolders.index(item_path)
615
+ modified_name = replaced[os.path.basename(item_path)]
616
+ folder_name = modified_name
617
+ dataset_folder["folders"][folder_name] = {
618
+ "folders": {},
619
+ "files": {},
620
+ "path": item_path,
621
+ "location": "local",
622
+ "action": ["existing"],
623
+ "original-basename": item_path[(item_path.rfind("/") + 1) :],
624
+ }
625
+ else:
626
+ if entry in high_level_sparc_folders:
627
+ dataset_folder["folders"][entry] = {
628
+ "folders": {},
629
+ "files": {},
630
+ "path": item_path,
631
+ "location": "local",
632
+ "action": ["existing"],
633
+ }
634
+ soda_json_structure["starting-point"][entry] = {"path": ""}
635
+
636
+ for folder in dataset_folder["folders"]:
637
+ # go through high level folders again
638
+ high_lvl_path = root_folder_path + "/" + folder
639
+ recursive_structure_create(dataset_folder["folders"][folder], high_lvl_path, soda_json_structure["starting-point"]["manifest"])
640
+
641
+ create_soda_json_completed = 1
642
+ return soda_json_structure
643
+
644
+
645
+ def monitor_local_json_progress():
646
+ """
647
+ Function for monitoring progress of json_object_creation
648
+ Used for progress bar
649
+ """
650
+ global create_soda_json_completed
651
+ global create_soda_json_total_items
652
+ global create_soda_json_progress
653
+ progress_percentage = (
654
+ create_soda_json_progress / create_soda_json_total_items
655
+ ) * 100
656
+
657
+ return {
658
+ "create_soda_json_progress": create_soda_json_progress,
659
+ "create_soda_json_total_items": create_soda_json_total_items,
660
+ "progress_percentage": progress_percentage,
661
+ "create_soda_json_completed": create_soda_json_completed
662
+ }
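
monitor_local_json_progress reads the module-level counters that create_soda_json_object_backend updates, so progress can be polled while the import runs in another thread. A minimal sketch, assuming a placeholder dataset path and only the fields of the SODA object that the function touches:

    import threading
    import time
    from pysoda.core.dataset_importing import create_soda_json_object_backend, monitor_local_json_progress

    soda = {"starting-point": {}, "dataset_metadata": {}, "dataset-structure": {}}
    worker = threading.Thread(
        target=create_soda_json_object_backend,
        args=(soda, "/path/to/local/dataset", [], {}),  # no irregular folders, no renamed folders
    )
    worker.start()
    while worker.is_alive():
        time.sleep(1)
        try:
            print(monitor_local_json_progress()["progress_percentage"])
        except ZeroDivisionError:
            pass  # total item count not tallied yet
    worker.join()
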
pysoda/core/metadata/__init__.py
@@ -0,0 +1,20 @@
1
+ from .submission import create_excel
2
+ from .dataset_description import create_excel
3
+ from .text_metadata import create_text_file
4
+ from .code_description import create_excel
5
+ from .manifest_package import create_high_level_manifest_files, get_auto_generated_manifest_files, load_metadata_to_dataframe, create_high_lvl_manifest_files_existing_ps_starting_point
6
+ from .manifest import create_excel, load_existing_manifest_file
7
+ from .resources import create_excel
8
+ from .performances import create_excel
9
+ from .submission import create_excel
10
+ from .sites import create_excel
11
+ from .text_metadata import create_text_file
12
+ from .constants import (
13
+ SDS_FILE_RESOURCES,
14
+ SDS_FILE_PERFORMANCES,
15
+ SDS_FILE_MANIFEST,
16
+ SDS_FILE_SITES,
17
+ SDS_FILE_CODE_DESCRIPTION,
18
+ SDS_FILE_DATASET_DESCRIPTION,
19
+ METADATA_UPLOAD_PS_PATH
20
+ )
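
Note that several of these submodules export a function named create_excel, so each successive "from ... import create_excel" rebinds the same package-level name and only the last one (from .sites) remains visible as pysoda.core.metadata.create_excel. A small sketch of how a caller can stay unambiguous by importing from the specific submodules and aliasing (the functions' signatures are not shown here):

    from pysoda.core.metadata.submission import create_excel as create_submission_excel
    from pysoda.core.metadata.dataset_description import create_excel as create_dataset_description_excel
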