pysodafair 0.1.62__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysoda/__init__.py +0 -0
- pysoda/constants.py +3 -0
- pysoda/core/__init__.py +10 -0
- pysoda/core/dataset_generation/__init__.py +11 -0
- pysoda/core/dataset_generation/manifestSession/__init__.py +1 -0
- pysoda/core/dataset_generation/manifestSession/manifest_session.py +146 -0
- pysoda/core/dataset_generation/upload.py +3951 -0
- pysoda/core/dataset_importing/__init__.py +1 -0
- pysoda/core/dataset_importing/import_dataset.py +662 -0
- pysoda/core/metadata/__init__.py +20 -0
- pysoda/core/metadata/code_description.py +109 -0
- pysoda/core/metadata/constants.py +32 -0
- pysoda/core/metadata/dataset_description.py +188 -0
- pysoda/core/metadata/excel_utils.py +41 -0
- pysoda/core/metadata/helpers.py +250 -0
- pysoda/core/metadata/manifest.py +112 -0
- pysoda/core/metadata/manifest_package/__init__.py +2 -0
- pysoda/core/metadata/manifest_package/manifest.py +0 -0
- pysoda/core/metadata/manifest_package/manifest_import.py +29 -0
- pysoda/core/metadata/manifest_package/manifest_writer.py +666 -0
- pysoda/core/metadata/performances.py +46 -0
- pysoda/core/metadata/resources.py +53 -0
- pysoda/core/metadata/samples.py +184 -0
- pysoda/core/metadata/sites.py +51 -0
- pysoda/core/metadata/subjects.py +172 -0
- pysoda/core/metadata/submission.py +91 -0
- pysoda/core/metadata/text_metadata.py +47 -0
- pysoda/core/metadata_templates/CHANGES +1 -0
- pysoda/core/metadata_templates/LICENSE +1 -0
- pysoda/core/metadata_templates/README.md +4 -0
- pysoda/core/metadata_templates/__init__.py +0 -0
- pysoda/core/metadata_templates/code_description.xlsx +0 -0
- pysoda/core/metadata_templates/code_parameters.xlsx +0 -0
- pysoda/core/metadata_templates/dataset_description.xlsx +0 -0
- pysoda/core/metadata_templates/manifest.xlsx +0 -0
- pysoda/core/metadata_templates/performances.xlsx +0 -0
- pysoda/core/metadata_templates/resources.xlsx +0 -0
- pysoda/core/metadata_templates/samples.xlsx +0 -0
- pysoda/core/metadata_templates/sites.xlsx +0 -0
- pysoda/core/metadata_templates/subjects.xlsx +0 -0
- pysoda/core/metadata_templates/subjects_pools_samples_structure.xlsx +0 -0
- pysoda/core/metadata_templates/subjects_pools_samples_structure_example.xlsx +0 -0
- pysoda/core/metadata_templates/submission.xlsx +0 -0
- pysoda/core/permissions/__init__.py +1 -0
- pysoda/core/permissions/permissions.py +31 -0
- pysoda/core/pysoda/__init__.py +2 -0
- pysoda/core/pysoda/soda.py +34 -0
- pysoda/core/pysoda/soda_object.py +55 -0
- pysoda/core/upload_manifests/__init__.py +1 -0
- pysoda/core/upload_manifests/upload_manifests.py +37 -0
- pysoda/schema/__init__.py +0 -0
- pysoda/schema/code_description.json +629 -0
- pysoda/schema/dataset_description.json +295 -0
- pysoda/schema/manifest.json +60 -0
- pysoda/schema/performances.json +44 -0
- pysoda/schema/resources.json +39 -0
- pysoda/schema/samples.json +97 -0
- pysoda/schema/sites.json +38 -0
- pysoda/schema/soda_schema.json +664 -0
- pysoda/schema/subjects.json +131 -0
- pysoda/schema/submission_schema.json +28 -0
- pysoda/utils/__init__.py +9 -0
- pysoda/utils/authentication.py +381 -0
- pysoda/utils/config.py +68 -0
- pysoda/utils/exceptions.py +156 -0
- pysoda/utils/logger.py +6 -0
- pysoda/utils/metadata_utils.py +74 -0
- pysoda/utils/pennsieveAgentUtils.py +11 -0
- pysoda/utils/pennsieveUtils.py +118 -0
- pysoda/utils/profile.py +28 -0
- pysoda/utils/schema_validation.py +133 -0
- pysoda/utils/time_utils.py +5 -0
- pysoda/utils/upload_utils.py +108 -0
- pysodafair-0.1.62.dist-info/METADATA +190 -0
- pysodafair-0.1.62.dist-info/RECORD +77 -0
- pysodafair-0.1.62.dist-info/WHEEL +4 -0
- pysodafair-0.1.62.dist-info/licenses/LICENSE +21 -0
pysoda/utils/exceptions.py ADDED

@@ -0,0 +1,156 @@
+
+# create a custom exception that indicates a property in an object has not been set
+class PropertyNotSetError(Exception):
+    def __init__(self, property_name):
+        self.property_name = property_name
+        self.error_message = f"The property {self.property_name} has not been set."
+
+    def __str__(self):
+        return self.error_message
+
+
+# create a custom exception that indicates that the 'PennsieveAgent' could not be started
+class PennsieveAgentError(Exception):
+    def __init__(self, error_message):
+        self.error_message = error_message
+
+    def __str__(self):
+        return self.error_message
+
+
+class FailedToFetchPennsieveDatasets(Exception):
+    def __init__(self, error_message):
+        self.error_message = error_message
+
+    def __str__(self):
+        return self.error_message
+
+
+class PennsieveDatasetCannotBeFound(Exception):
+    def __init__(self, dataset_name):
+        self.dataset_name = dataset_name
+        self.error_message = f"The Pennsieve dataset {self.dataset_name} could not be found."
+
+    def __str__(self):
+        return self.error_message
+
+
+class ConfigProfileNotSet(Exception):
+    def __init__(self, profile_name):
+        self.profile_name = profile_name
+        self.error_message = f"The profile {self.profile_name} has not been set."
+
+    def __str__(self):
+        return self.error_message
+
+
+class GenerateOptionsNotSet(Exception):
+    def __init__(self):
+        self.error_message = "The generate options have not been set."
+
+    def __str__(self):
+        return self.error_message
+
+
+class PennsieveActionNoPermission(Exception):
+    def __init__(self, action):
+        self.action = action
+        self.error_message = f"Do not have the correct permissions to perform action: {self.action}."
+
+    def __str__(self):
+        return self.error_message
+
+
+class GenericUploadError(Exception):
+    def __init__(self, error_message):
+        self.error_message = error_message
+
+    def __str__(self):
+        return self.error_message
+
+
+class EmptyDatasetError(Exception):
+    def __init__(self, dataset_name, expanded=""):
+        self.dataset_name = dataset_name
+        self.expanded = expanded
+        self.error_message = f"The dataset {self.dataset_name} is empty. {expanded}"
+
+    def __str__(self):
+        return self.error_message
+
+
+class LocalDatasetMissingSpecifiedFiles(Exception):
+    def __init__(self, error_message):
+        self.error_message = error_message
+
+    def __str__(self):
+        return self.error_message
+
+
+class PennsieveUploadException(Exception):
+    def __init__(self, error_message):
+        self.error_message = error_message
+        super().__init__(self.error_message)
+
+    def __str__(self):
+        return self.error_message
+
+
+class PennsieveAccountInformationFailedAuthentication(Exception):
+    def __init__(self, error_message):
+        self.error_message = error_message
+
+    def __str__(self):
+        return self.error_message
+
+
+class PennsieveDatasetNameTaken(Exception):
+    def __init__(self, dataset_name):
+        self.dataset_name = dataset_name
+        self.error_message = f"The Pennsieve dataset name {self.dataset_name} is already taken."
+
+    def __str__(self):
+        return self.error_message
+
+
+class PennsieveDatasetNameInvalid(Exception):
+    def __init__(self, dataset_name):
+        self.dataset_name = dataset_name
+        self.error_message = f"The Pennsieve dataset name {self.dataset_name} is invalid."
+
+    def __str__(self):
+        return self.error_message
+
+
+class PennsieveAccountInvalid(Exception):
+    def __init__(self, account_name):
+        self.account_name = account_name
+        self.error_message = f"The Pennsieve account name {self.account_name} is invalid."
+
+    def __str__(self):
+        return self.error_message
+
+
+class PennsieveDatasetFilesInvalid(Exception):
+    def __init__(self, error_message):
+        self.error_message = error_message
+
+    def __str__(self):
+        return self.error_message
+
+
+def validation_error_message(e):
+    """
+    Build a human readable message for a validation error.
+    input: e (ValidationError): the validation error raised by jsonschema's validate.
+    output: human readable message for the validation error.
+    """
+    msg = "The following error was found in your metadata:"
+    e_type = e.schema_path.pop().strip()
+    if e_type == "type":
+        s = ''
+        while e.schema_path:
+            p_v = e.schema_path.popleft()
+            if p_v.strip() != "properties":
+                if s != '':
+                    s += ' -> '
+                s += p_v
+        msg = f"{msg} {s} needs to be a list of values."
+    if e_type == "required":
+        # peel out the first line from the stringified error message
+        msg = f"{msg} {e.message.splitlines()[0]}"
+    return msg
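For orientation, a minimal usage sketch of validation_error_message. The toy schema and the failing metadata value below are invented for illustration; ValidationError comes from jsonschema:

from jsonschema import validate, ValidationError
from pysoda.utils.exceptions import validation_error_message

toy_schema = {
    "type": "object",
    "properties": {"keywords": {"type": "array"}},
    "required": ["keywords"],
}

try:
    validate(instance={"keywords": "not-a-list"}, schema=toy_schema)
except ValidationError as err:
    print(validation_error_message(err))
    # The following error was found in your metadata: keywords needs to be a list of values.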
pysoda/utils/metadata_utils.py ADDED

@@ -0,0 +1,74 @@
+import os.path
+import requests
+from .authentication import create_request_headers
+from ..constants import PENNSIEVE_URL
+
+
+# check for non-empty fields (cells)
+def column_check(x):
+    return "unnamed" not in x.lower()
+
+
+# obtain the Pennsieve S3 URL for an existing metadata file
+def returnFileURL(ps, item_id):
+    r = requests.get(f"{PENNSIEVE_URL}/packages/{item_id}/view", headers=create_request_headers(ps))
+    r.raise_for_status()
+
+    file_details = r.json()
+    file_id = file_details[0]["content"]["id"]
+    r = requests.get(
+        f"{PENNSIEVE_URL}/packages/{item_id}/files/{file_id}", headers=create_request_headers(ps)
+    )
+    r.raise_for_status()
+
+    file_url_info = r.json()
+    return file_url_info["url"]
+
+
+def remove_high_level_folder_from_path(paths):
+    """
+    Remove the high level folder from the path. This is necessary because the high level folder is not included in the manifest file name entry.
+    """
+    return "" if len(paths) == 1 else "/".join(paths[1:]) + "/"
+
+
+double_extensions = [
+    ".ome.tiff",
+    ".ome.tif",
+    ".ome.tf2",
+    ".ome.tf8",
+    ".ome.btf",
+    ".ome.xml",
+    ".brukertiff.gz",
+    ".mefd.gz",
+    ".moberg.gz",
+    ".nii.gz",
+    ".mgh.gz",
+    ".tar.gz",
+    ".bcl.gz",
+]
+
+
+def get_name_extension(file_name):
+    double_ext = False
+    for ext in double_extensions:
+        if file_name.find(ext) != -1:
+            double_ext = True
+            break
+
+    ext = ""
+    name = ""
+
+    if not double_ext:
+        name = os.path.splitext(file_name)[0]
+        ext = os.path.splitext(file_name)[1]
+    else:
+        ext = (
+            os.path.splitext(os.path.splitext(file_name)[0])[1]
+            + os.path.splitext(file_name)[1]
+        )
+        name = os.path.splitext(os.path.splitext(file_name)[0])[0]
+    return name, ext
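A quick illustration of how get_name_extension keeps a double extension together as a single unit (the file names here are hypothetical):

from pysoda.utils.metadata_utils import get_name_extension

print(get_name_extension("scan_001.ome.tiff"))  # ('scan_001', '.ome.tiff')
print(get_name_extension("notes.txt"))          # ('notes', '.txt')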
pysoda/utils/pennsieveAgentUtils.py ADDED

@@ -0,0 +1,11 @@
+from pennsieve2.pennsieve import Pennsieve
+from .exceptions import PennsieveAgentError
+
+def connect_pennsieve_client(account_name):
+    """
+    Connects the Pennsieve Python client to the Agent and returns the initialized Pennsieve object.
+    """
+    try:
+        return Pennsieve(profile_name=account_name)
+    except Exception as e:
+        raise PennsieveAgentError(f"Could not connect to the Pennsieve agent: {e}")
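A hedged usage sketch: the profile name below is a placeholder, and Pennsieve() only succeeds when the Pennsieve Agent is installed and running:

from pysoda.utils.pennsieveAgentUtils import connect_pennsieve_client
from pysoda.utils.exceptions import PennsieveAgentError

try:
    ps = connect_pennsieve_client("soda-pennsieve")  # placeholder profile name
except PennsieveAgentError as err:
    print(err)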
pysoda/utils/pennsieveUtils.py ADDED

@@ -0,0 +1,118 @@
+import requests
+from ..constants import PENNSIEVE_URL
+from .authentication import get_access_token
+import re
+from .exceptions import PennsieveDatasetCannotBeFound, FailedToFetchPennsieveDatasets
+
+def get_dataset_id(dataset_name_or_id):
+    """
+    Returns the dataset ID for the given dataset name.
+    If the dataset ID was provided instead of the name, the ID will be returned. *Common for Guided Mode*
+
+    Input:
+        dataset_name_or_id: Pennsieve dataset name or ID to get the ID for
+    """
+    # If the input is already a dataset ID, return it
+    if dataset_name_or_id.startswith("N:dataset:"):
+        return dataset_name_or_id
+
+    try:
+        # Attempt to retrieve the user's dataset list from Pennsieve
+        dataset_list = get_users_dataset_list()
+    except Exception as e:
+        raise FailedToFetchPennsieveDatasets(str(e))
+
+    # Iterate through the user's dataset list to find a matching dataset name
+    for dataset in dataset_list:
+        if dataset["content"]["name"] == dataset_name_or_id:
+            return dataset["content"]["id"]
+
+    # If no matching dataset is found, raise a specific error
+    raise PennsieveDatasetCannotBeFound(dataset_name_or_id)
+
+
+def get_users_dataset_list():
+    """
+    Returns a list of datasets the user has access to.
+    """
+    # The number of datasets to retrieve per chunk
+    NUMBER_OF_DATASETS_PER_CHUNK = 200
+    # The total number of datasets the user has access to (set after the first request)
+    NUMBER_OF_DATASETS_USER_HAS_ACCESS_TO = None
+
+    # The offset is the number of datasets to skip before retrieving the next chunk of datasets (starts at 0, then increases by the number of datasets per chunk)
+    current_offset = 0
+    # The list of datasets the user has access to (datasets are added to this list after each request and then returned)
+    datasets = []
+
+    # Get the first chunk of datasets as well as the total number of datasets the user has access to
+    r = requests.get(
+        f"{PENNSIEVE_URL}/datasets/paginated",
+        headers=create_request_headers(get_access_token()),
+        params={"offset": current_offset, "limit": NUMBER_OF_DATASETS_PER_CHUNK},
+    )
+    r.raise_for_status()
+    response_json = r.json()
+    datasets.extend(response_json["datasets"])
+    NUMBER_OF_DATASETS_USER_HAS_ACCESS_TO = response_json["totalCount"]
+
+    # If the user has access to fewer datasets than one chunk holds, we don't need to retrieve any more
+    if NUMBER_OF_DATASETS_USER_HAS_ACCESS_TO < NUMBER_OF_DATASETS_PER_CHUNK:
+        return datasets
+
+    # Otherwise, retrieve chunks of datasets until the number of datasets retrieved
+    # equals the number of datasets the user has access to
+    while len(datasets) < NUMBER_OF_DATASETS_USER_HAS_ACCESS_TO:
+        # Increase the offset by the number of datasets per chunk (e.g. if 200 datasets per chunk, then increase the offset by 200)
+        current_offset += NUMBER_OF_DATASETS_PER_CHUNK
+        r = requests.get(
+            f"{PENNSIEVE_URL}/datasets/paginated",
+            headers=create_request_headers(get_access_token()),
+            params={"offset": current_offset, "limit": NUMBER_OF_DATASETS_PER_CHUNK},
+        )
+        r.raise_for_status()
+        response_json = r.json()
+        datasets.extend(response_json["datasets"])
+
+    return datasets
+
+
+def create_request_headers(ps_or_token):
+    """
+    Creates the HTTP headers needed for making Pennsieve API requests.
+    Input:
+        ps_or_token: an authenticated Pennsieve object, or an API access token string
+    """
+    if isinstance(ps_or_token, str):
+        return {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {ps_or_token}",
+        }
+
+    return {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {ps_or_token.get_user().session_token}",
+    }
+
+
+forbidden_characters_bf = r'\/:*?"<>.,'
+
+
+def check_forbidden_characters_ps(my_string):
+    """
+    Check for forbidden characters in a Pennsieve file/folder name
+
+    Args:
+        my_string: string with characters (string)
+    Returns:
+        False: no forbidden character
+        True: presence of forbidden character(s)
+    """
+    regex = re.compile(f"[{forbidden_characters_bf}]")
+    if regex.search(my_string) is None and "\\" not in r"%r" % my_string:
+        return False
+    else:
+        return True
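The pagination helpers above need a live Pennsieve session, so here is a small self-contained check of the pure name-validation function only (the names are made up):

from pysoda.utils.pennsieveUtils import check_forbidden_characters_ps

print(check_forbidden_characters_ps("sub-01:raw?"))  # True  (':' and '?' are forbidden)
print(check_forbidden_characters_ps("sub-01_raw"))   # False (name is clean)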
pysoda/utils/profile.py ADDED

@@ -0,0 +1,28 @@
+import requests
+from .config import format_agent_profile_name
+from ..constants import PENNSIEVE_URL
+
+
+def create_unique_profile_name(token, machine_username_specifier):
+    # get the user's email
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {token}",
+    }
+
+    r = requests.get(f"{PENNSIEVE_URL}/user", headers=headers)
+    r.raise_for_status()
+
+    user_info = r.json()
+
+    # create a substring of the start of the email to the @ symbol
+    email = user_info["email"]
+    email_sub = email.split("@")[0]
+
+    organization_id = user_info["preferredOrganization"]
+
+    # create an updated profile name that is unique to the user and their workspace
+    return format_agent_profile_name(f"soda-pennsieve-{machine_username_specifier}-{email_sub}-{organization_id.lower()}")
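A sketch of the raw name that gets passed to format_agent_profile_name. All values below are made up, and the final formatting depends on format_agent_profile_name in pysoda/utils/config.py:

machine_username_specifier = "jdoe"
email_sub = "jane"                       # from a user email such as "jane@example.org"
organization_id = "N:organization:abcd"  # "preferredOrganization" from the /user response
print(f"soda-pennsieve-{machine_username_specifier}-{email_sub}-{organization_id.lower()}")
# soda-pennsieve-jdoe-jane-n:organization:abcd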
pysoda/utils/schema_validation.py ADDED

@@ -0,0 +1,133 @@
+import json
+from jsonschema import validate
+import sys
+import os
+
+
+def load_schema(schema_name):
+    schema_path = get_schema_path(schema_name)
+    with open(schema_path, 'r') as schema_file:
+        schema = json.load(schema_file)
+    return schema
+
+
+def get_schema_path(filename):
+    """Get the path to a schema file bundled with the pysoda package."""
+
+    # Method 1: Try a PyInstaller bundle first (PyInstaller sets sys._MEIPASS)
+    if hasattr(sys, '_MEIPASS'):
+        possible_paths = [
+            os.path.join(sys._MEIPASS, "pysoda", "schema", filename),
+            os.path.join(sys._MEIPASS, filename)
+        ]
+        for path in possible_paths:
+            if os.path.exists(path):
+                return path
+
+    # Method 2: Try to import the schema module (works if the PyPI package is properly installed)
+    try:
+        from .. import schema
+        schema_dir = os.path.dirname(schema.__file__)
+        schema_path = os.path.join(schema_dir, filename)
+        if os.path.exists(schema_path):
+            return schema_path
+    except (ImportError, ModuleNotFoundError, AttributeError):
+        pass
+
+    # Method 3: Search relative to this file's directory structure
+    current_file = os.path.abspath(__file__)
+    current_dir = os.path.dirname(current_file)
+
+    search_paths = [
+        os.path.join(current_dir, '..', '..', 'schema', filename),
+        os.path.join(current_dir, 'schema', filename),
+    ]
+
+    for path in search_paths:
+        if os.path.exists(path):
+            return path
+
+    # Method 4: Use importlib.resources (Python 3.7+)
+    try:
+        from importlib import resources
+        with resources.path('schema', filename) as schema_path:
+            if schema_path.exists():
+                return str(schema_path)
+    except (ImportError, ModuleNotFoundError):
+        # Fall back to the remaining methods if importlib.resources is not available
+        pass
+
+    # Method 5: Try to find the schema in an Electron Resources folder
+    try:
+        current_path = current_dir
+        resources_folder = None
+
+        # Walk up the directory tree to find the Resources folder
+        while current_path and current_path != os.path.dirname(current_path):
+            # Check common Electron Resources locations
+            possible_resources = [
+                os.path.join(current_path, 'Resources'),              # macOS
+                os.path.join(current_path, 'resources'),              # Windows/Linux
+                os.path.join(current_path, 'Contents', 'Resources'),  # macOS app bundle
+            ]
+
+            for resource_path in possible_resources:
+                if os.path.exists(resource_path):
+                    resources_folder = resource_path
+                    break
+
+            if resources_folder:
+                break
+
+            current_path = os.path.dirname(current_path)
+
+        # If we found the Resources folder, look for the schema inside it
+        if resources_folder:
+            template_path = os.path.join(resources_folder, 'schema', filename)
+            if os.path.exists(template_path):
+                return template_path
+
+    except Exception:
+        pass
+
+    raise ImportError(f"Could not locate schema file {filename}.")
+
+
+# TODO: Make an enum of the schema names and add extensions to the schema names in the function.....or to the enum.
+def validate_schema(schema, schema_name):
+    """
+    Validate a metadata dictionary against one of the bundled JSON schemas.
+
+    Args:
+        schema (dict): The python dictionary version of the schema or subschema to validate against the json schema.
+        schema_name (str): The file name of the schema to validate against.
+
+    Raises:
+        ValidationError: If the metadata is invalid.
+    """
+    true_schema = load_schema(schema_name)
+    validate(instance=schema, schema=true_schema)
+
+
+def get_sds_headers(schema_name):
+    """
+    Get the headers for the SDS file.
+
+    Args:
+        schema_name (str): The file name of the schema to read the headers from.
+
+    Returns:
+        list: The headers for the SDS file.
+    """
+    true_schema = load_schema(schema_name)
+    sds_headers = true_schema["items"][0]["properties"].keys()
+    return sds_headers
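A hedged usage sketch: the schema file name matches pysoda/schema/submission_schema.json from the listing above, but the record field is invented and may not satisfy the real schema:

from jsonschema import ValidationError
from pysoda.utils.schema_validation import validate_schema

record = {"award_number": "OT2OD000000"}  # invented example field
try:
    validate_schema(record, "submission_schema.json")
except ValidationError as err:
    print(err.message)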
pysoda/utils/upload_utils.py ADDED

@@ -0,0 +1,108 @@
+import requests
+from os.path import expanduser, join
+from .exceptions import FailedToFetchPennsieveDatasets, PennsieveDatasetCannotBeFound
+from .authentication import get_access_token, create_request_headers
+
+from ..constants import PENNSIEVE_URL
+
+userpath = expanduser("~")
+configpath = join(userpath, ".pennsieve", "config.ini")
+
+def generate_options_set(soda_json_structure):
+    return "generate-dataset" in soda_json_structure
+
+def generating_locally(soda_json_structure):
+    return soda_json_structure["generate-dataset"]["destination"] == "local"
+
+def generating_on_ps(soda_json_structure):
+    return soda_json_structure["generate-dataset"]["destination"] == "ps"
+
+def uploading_with_ps_account(soda_json_structure):
+    return "ps-account-selected" in soda_json_structure
+
+def uploading_to_existing_ps_dataset(soda_json_structure):
+    return "ps-dataset-selected" in soda_json_structure
+
+def can_resume_prior_upload(resume_status):
+    # `ums` is a module-level upload session object assigned elsewhere at runtime
+    global ums
+    return resume_status and ums.df_mid_has_progress()
+
+def virtual_dataset_empty(soda_json_structure):
+    return (
+        "dataset-structure" not in soda_json_structure
+        and "metadata-files" not in soda_json_structure
+    )
+
+def get_dataset_id(dataset_name_or_id):
+    """
+    Returns the dataset ID for the given dataset name.
+    If the dataset ID was provided instead of the name, the ID will be returned. *Common for Guided Mode*
+
+    Input:
+        dataset_name_or_id: Pennsieve dataset name or ID to get the ID for
+    """
+    # If the input is already a dataset ID, return it
+    if dataset_name_or_id.startswith("N:dataset:"):
+        return dataset_name_or_id
+
+    # Attempt to retrieve the user's dataset list from Pennsieve
+    dataset_list = get_users_dataset_list()
+
+    # Iterate through the user's dataset list to find a matching dataset name
+    for dataset in dataset_list:
+        if dataset["content"]["name"] == dataset_name_or_id:
+            return dataset["content"]["id"]
+
+    # If no matching dataset is found, raise a specific error
+    raise PennsieveDatasetCannotBeFound(dataset_name_or_id)
+
+
+def get_users_dataset_list():
+    """
+    Returns a list of datasets the user has access to.
+    """
+    # The number of datasets to retrieve per chunk
+    NUMBER_OF_DATASETS_PER_CHUNK = 200
+    # The total number of datasets the user has access to (set after the first request)
+    NUMBER_OF_DATASETS_USER_HAS_ACCESS_TO = None
+
+    # The offset is the number of datasets to skip before retrieving the next chunk of datasets (starts at 0, then increases by the number of datasets per chunk)
+    current_offset = 0
+    # The list of datasets the user has access to (datasets are added to this list after each request and then returned)
+    datasets = []
+
+    try:
+        # Get the first chunk of datasets as well as the total number of datasets the user has access to
+        r = requests.get(
+            f"{PENNSIEVE_URL}/datasets/paginated",
+            headers=create_request_headers(get_access_token()),
+            params={"offset": current_offset, "limit": NUMBER_OF_DATASETS_PER_CHUNK},
+        )
+        r.raise_for_status()
+        response_json = r.json()
+        datasets.extend(response_json["datasets"])
+        NUMBER_OF_DATASETS_USER_HAS_ACCESS_TO = response_json["totalCount"]
+
+        # If the user has access to fewer datasets than one chunk holds, we don't need to retrieve any more
+        if NUMBER_OF_DATASETS_USER_HAS_ACCESS_TO < NUMBER_OF_DATASETS_PER_CHUNK:
+            return datasets
+
+        # Otherwise, retrieve chunks of datasets until the number of datasets retrieved
+        # equals the number of datasets the user has access to
+        while len(datasets) < NUMBER_OF_DATASETS_USER_HAS_ACCESS_TO:
+            # Increase the offset by the number of datasets per chunk (e.g. if 200 datasets per chunk, then increase the offset by 200)
+            current_offset += NUMBER_OF_DATASETS_PER_CHUNK
+            r = requests.get(
+                f"{PENNSIEVE_URL}/datasets/paginated",
+                headers=create_request_headers(get_access_token()),
+                params={"offset": current_offset, "limit": NUMBER_OF_DATASETS_PER_CHUNK},
+            )
+            r.raise_for_status()
+            response_json = r.json()
+            datasets.extend(response_json["datasets"])
+
+        return datasets
+    except Exception:
+        raise FailedToFetchPennsieveDatasets("Error: Failed to retrieve datasets from Pennsieve. Please try again later.")
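A toy soda_json_structure showing how the predicate helpers above read it. The top-level keys mirror the checks in this file; the nested values are made up:

from pysoda.utils.upload_utils import (
    generate_options_set,
    generating_locally,
    generating_on_ps,
    uploading_with_ps_account,
    uploading_to_existing_ps_dataset,
)

soda = {
    "generate-dataset": {"destination": "ps"},
    "ps-account-selected": {"account-name": "soda-pennsieve"},
    "ps-dataset-selected": {"dataset-name": "my-dataset"},
}
assert generate_options_set(soda)
assert generating_on_ps(soda) and not generating_locally(soda)
assert uploading_with_ps_account(soda) and uploading_to_existing_ps_dataset(soda)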