nci-cidc-api-modules 1.1.28__py3-none-any.whl → 1.1.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cidc_api/models/models.py CHANGED
@@ -3103,8 +3103,8 @@ class DownloadableFiles(CommonColumns):
3103
3103
  trial_file_counts = cls.count_by(
3104
3104
  cls.trial_id,
3105
3105
  session=session,
3106
- # Apply the provided filter, and also exclude files with null `data_category`s
3107
- filter_=lambda q: filter_(q).filter(cls.data_category != None),
3106
+ # Apply the provided filter
3107
+ filter_=lambda q: filter_(q),
3108
3108
  )
3109
3109
  trial_facets = build_trial_facets(trial_file_counts)
3110
3110
  return trial_facets
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nci_cidc_api_modules
3
- Version: 1.1.28
3
+ Version: 1.1.30
4
4
  Summary: SQLAlchemy data models and configuration tools used in the NCI CIDC API
5
5
  Home-page: https://github.com/NCI-CIDC/cidc-api-gae
6
6
  License: MIT license
@@ -28,7 +28,7 @@ Requires-Dist: python-dotenv==0.10.3
28
28
  Requires-Dist: requests==2.32.3
29
29
  Requires-Dist: jinja2==3.1.6
30
30
  Requires-Dist: certifi==2024.7.4
31
- Requires-Dist: nci-cidc-schemas==0.27.18
31
+ Requires-Dist: nci-cidc-schemas==0.27.20
32
32
  Dynamic: description
33
33
  Dynamic: description-content-type
34
34
  Dynamic: home-page
@@ -3,12 +3,9 @@ cidc_api/config/db.py,sha256=cyWhWtmXha4OsrwUf6ez8aKSfm7tPSmPDE9JVSBx3Fk,1935
3
3
  cidc_api/config/logging.py,sha256=abhVYtn8lfhIt0tyV2WHFgSmp_s2eeJh7kodB6LH4J0,1149
4
4
  cidc_api/config/secrets.py,sha256=jRFj7W43pWuPf9DZQLCKF7WPXf5cUv-BAaS3ASqhV_Q,1481
5
5
  cidc_api/config/settings.py,sha256=fJQIaCfxsuooEi1pAO8FhHurN0BjP6FZKX8jl7uHGZM,4203
6
- cidc_api/csms/__init__.py,sha256=eJkY6rWNOAUBmSd4G1_U6h7i472druKEtBdVmgFZVPg,20
7
- cidc_api/csms/auth.py,sha256=VTfHlCym_hqVrHXv41Ku9RMAGN9BiNe7ui0o9KZCKtY,3185
8
6
  cidc_api/models/__init__.py,sha256=bl445G8Zic9YbhZ8ZBni07wtBMhLJRMBA-JqjLxx2bw,66
9
- cidc_api/models/csms_api.py,sha256=ovi_jZXZBg6XYEvIupbf5c0WyMbPi4V07OywbleKGqs,30737
10
7
  cidc_api/models/migrations.py,sha256=gp9vtkYbA9FFy2s-7woelAmsvQbJ41LO2_DY-YkFIrQ,11464
11
- cidc_api/models/models.py,sha256=mQ6XTnGwh-h8wi7xcwEuzMZUr8epv9C8f5lkjkIjsVc,129512
8
+ cidc_api/models/models.py,sha256=HBXb5228CeUInaaKOXYBcPz-T9pfwULz_7BaSyJmNDI,129427
12
9
  cidc_api/models/schemas.py,sha256=7tDYtmULuzTt2kg7RorWhte06ffalgpQKrFiDRGcPEQ,2711
13
10
  cidc_api/models/files/__init__.py,sha256=8BMTnUSHzUbz0lBeEQY6NvApxDD3GMWMduoVMos2g4Y,213
14
11
  cidc_api/models/files/details.py,sha256=WrWPxJqlsteinoNbGTaQ3fcxgvChqLGJ9vY7H829jtk,62842
@@ -19,8 +16,8 @@ cidc_api/shared/emails.py,sha256=GY-l0EkoVU_3hjV0g-xo7N9d1iyCdluyq_arftEPPe0,498
19
16
  cidc_api/shared/gcloud_client.py,sha256=i4ZZLoDC_pEwKaMS8218uUJ0fsIi0DKwd-hzGHGQw7g,33139
20
17
  cidc_api/shared/jose.py,sha256=-qzGzEDAlokEp9E7WtBtQkXyyfPWTYXlwYpCqVJWmqM,1830
21
18
  cidc_api/shared/rest_utils.py,sha256=RwR30WOUAYCxL7V-i2totEyeriG30GbBDvBcpLXhM9w,6594
22
- nci_cidc_api_modules-1.1.28.dist-info/licenses/LICENSE,sha256=pNYWVTHaYonnmJyplmeAp7tQAjosmDpAWjb34jjv7Xs,1102
23
- nci_cidc_api_modules-1.1.28.dist-info/METADATA,sha256=arwyhSARRnyRAUizteejzvHG7hcw8iuRKpvifVW1Fmg,41285
24
- nci_cidc_api_modules-1.1.28.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
25
- nci_cidc_api_modules-1.1.28.dist-info/top_level.txt,sha256=rNiRzL0lJGi5Q9tY9uSoMdTbJ-7u5c_D2E86KA94yRA,9
26
- nci_cidc_api_modules-1.1.28.dist-info/RECORD,,
19
+ nci_cidc_api_modules-1.1.30.dist-info/licenses/LICENSE,sha256=pNYWVTHaYonnmJyplmeAp7tQAjosmDpAWjb34jjv7Xs,1102
20
+ nci_cidc_api_modules-1.1.30.dist-info/METADATA,sha256=s378K4KxeTTwVZuypJg2yiWUSO3jBN9zqeMwsuQ06sU,41285
21
+ nci_cidc_api_modules-1.1.30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
22
+ nci_cidc_api_modules-1.1.30.dist-info/top_level.txt,sha256=rNiRzL0lJGi5Q9tY9uSoMdTbJ-7u5c_D2E86KA94yRA,9
23
+ nci_cidc_api_modules-1.1.30.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.8.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
cidc_api/csms/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .auth import *
cidc_api/csms/auth.py DELETED
@@ -1,105 +0,0 @@
1
- __all__ = ["get_token", "get_with_authorization", "get_with_paging"]
2
-
3
- import os
4
- from datetime import datetime, timedelta
5
- from typing import Any, Dict, Iterator
6
-
7
- import requests
8
-
9
- from ..config.settings import (
10
- CSMS_BASE_URL,
11
- CSMS_CLIENT_ID,
12
- CSMS_CLIENT_SECRET,
13
- CSMS_TOKEN_URL,
14
- )
15
-
16
- os.environ["TZ"] = "UTC"
17
-
18
- TIMEOUT_IN_SECONDS = 20
19
- _TOKEN, _TOKEN_EXPIRY = None, datetime.now()
20
-
21
-
22
- def get_token():
23
- global _TOKEN, _TOKEN_EXPIRY
24
- if not _TOKEN or datetime.now() >= _TOKEN_EXPIRY:
25
- res, time = (
26
- requests.post(
27
- CSMS_TOKEN_URL,
28
- headers={"Content-Type": "application/x-www-form-urlencoded"},
29
- data={
30
- "grant_type": "client_credentials",
31
- "client_id": CSMS_CLIENT_ID,
32
- "client_secret": CSMS_CLIENT_SECRET,
33
- },
34
- timeout=TIMEOUT_IN_SECONDS,
35
- ).json(),
36
- datetime.now(),
37
- )
38
-
39
- # res definition from https://developer.okta.com/docs/reference/api/oidc/#response-example-error-7
40
- if "errorCode" in res:
41
- raise RuntimeError(res["errorCode"] + ": " + res.get("errorSummary"))
42
-
43
- _TOKEN = res["access_token"]
44
- _TOKEN_EXPIRY = time + timedelta(seconds=res["expires_in"])
45
-
46
- return _TOKEN
47
-
48
-
49
- def get_with_authorization(url: str, **kwargs) -> requests.Response:
50
- """url should be fully valid or begin with `/` to be prefixed with CSMS_BASE_URL"""
51
- token = get_token()
52
- headers = {
53
- **kwargs.get("headers", {}),
54
- "Authorization": f"Bearer {token}",
55
- "accept": "*/*",
56
- }
57
- kwargs["headers"] = headers
58
- if not url.startswith(CSMS_BASE_URL):
59
- url = CSMS_BASE_URL + url
60
- return requests.get(
61
- url,
62
- **kwargs,
63
- timeout=TIMEOUT_IN_SECONDS,
64
- )
65
-
66
-
67
- def get_with_paging(url: str, limit: int = None, offset: int = 0, **kwargs) -> Iterator[Dict[str, Any]]:
68
- """
69
- Return an iterator of entries via get_with_authorization with handling for CSMS paging
70
-
71
- Parameters
72
- ----------
73
- url: str
74
- url should be fully valid or begin with `/` to be prefixed with CSMS_BASE_URL
75
- limit: int = None
76
- the number of records to return on each page
77
- default: 5000 for samples, 50 for manifests, 1 otherwise
78
- offset: int = 0
79
- which page to return, 0-indexed
80
- increments as needed to continue returning
81
-
82
- Raises
83
- ------
84
- requests.exceptions.HTTPError
85
- via res.raise_for_status()
86
- https://docs.python-requests.org/en/master/user/quickstart/#response-status-codes
87
- """
88
- if not limit:
89
- if "samples" in url:
90
- limit = 5000
91
- elif "manifests" in url:
92
- limit = 50
93
- else:
94
- limit = 1
95
-
96
- kwargs.update({"limit": limit, "offset": offset})
97
-
98
- res = get_with_authorization(url, params=kwargs)
99
- while res.status_code < 300 and len(res.json().get("data", [])) > 0:
100
- # if there's not an error and we're still returning
101
- yield from res.json()["data"]
102
- kwargs["offset"] += 1 # get the next page
103
- res = get_with_authorization(url, params=kwargs)
104
-
105
- res.raise_for_status()
@@ -1,872 +0,0 @@
1
- __all__ = [
2
- "Change",
3
- "detect_manifest_changes",
4
- "insert_manifest_into_blob",
5
- "NewManifestError",
6
- ]
7
-
8
- import os
9
- import re
10
- from collections import defaultdict
11
- from datetime import date, datetime, time
12
- from typing import (
13
- Any,
14
- Callable,
15
- Dict,
16
- Iterable,
17
- Iterator,
18
- List,
19
- Optional,
20
- Set,
21
- Tuple,
22
- Union,
23
- )
24
-
25
- from sqlalchemy.orm.session import Session
26
-
27
- from cidc_schemas.prism.merger import merge_clinical_trial_metadata
28
- from cidc_schemas.prism.core import (
29
- _check_encrypt_init,
30
- _encrypt,
31
- _ENCRYPTED_FIELD_LEN,
32
- load_and_validate_schema,
33
- set_prism_encrypt_key,
34
- )
35
- from .models import TrialMetadata, UploadJobStatus, UploadJobs
36
- from .models import with_default_session
37
- from ..config.logging import get_logger
38
- from ..config.settings import PRISM_ENCRYPT_KEY
39
-
40
-
41
- os.environ["TZ"] = "UTC"
42
- logger = get_logger(__name__)
43
-
44
-
45
- def cimac_id_to_cimac_participant_id(cimac_id, _):
46
- return cimac_id[:7]
47
-
48
-
49
- CIMAC_ID_REGEX = re.compile("^C[A-Z0-9]{3}[A-Z0-9]{3}[A-Z0-9]{2}.[0-9]{2}$")
50
- SAMPLE_SCHEMA: dict = load_and_validate_schema("sample.json")
51
- PARTICIPANT_SCHEMA: dict = load_and_validate_schema("participant.json")
52
- SHIPMENT_SCHEMA: dict = load_and_validate_schema("shipping_core.json")
53
- TARGET_PROPERTIES_MAP: Dict[str, dict] = {
54
- "sample": SAMPLE_SCHEMA["properties"],
55
- "participant": PARTICIPANT_SCHEMA["properties"],
56
- "shipment": SHIPMENT_SCHEMA["properties"],
57
- }
58
-
59
- # make sure that the encryption key is set
60
- # NOTE: Exception is raised in external core module
61
- try:
62
- _check_encrypt_init()
63
- except Exception:
64
- set_prism_encrypt_key(PRISM_ENCRYPT_KEY)
65
-
66
-
67
- def _get_all_values(target: str, old: dict, drop: List[str] = None) -> Dict[str, Any]:
68
- """
69
- Parameters
70
- ----------
71
- target: str in ["sample", "participant", "shipment"]
72
- old: dict
73
- drop: List[str] = []
74
-
75
- Returns
76
- -------
77
- Dict[str, Any]
78
- all of the values from `old` that are in `target` excepting anything keys in `drop`
79
- """
80
-
81
- if drop is None:
82
- drop = []
83
-
84
- ret = {p: old[p] for p in TARGET_PROPERTIES_MAP[target].keys() if p in old and p not in drop}
85
-
86
- return ret
87
-
88
-
89
- class NewManifestError(RuntimeError):
90
- pass
91
-
92
-
93
- def _parse_upload_type(sample: dict, upload_type: Set[str]) -> str:
94
- sample_manifest_type = sample.get("sample_manifest_type")
95
- processed_derivative = sample.get("processed_sample_derivative")
96
- if sample_manifest_type is None:
97
- # safety
98
- return
99
-
100
- if sample_manifest_type == "biofluid_cellular":
101
- upload_type.add("pbmc")
102
- elif sample_manifest_type == "tissue_slides":
103
- upload_type.add("tissue_slide")
104
-
105
- elif processed_derivative == "Germline DNA":
106
- upload_type.add(f"normal_{sample_manifest_type.split()[0].lower()}_dna")
107
- elif processed_derivative == "Tumor DNA":
108
- upload_type.add(f"tumor_{sample_manifest_type.split()[0]}_dna")
109
- elif processed_derivative in ["DNA", "RNA"]:
110
- unprocessed_type = sample.get("type_of_sample")
111
- new_type = "tumor" if "tumor" in unprocessed_type.lower() else "normal"
112
- new_type += "_blood_" if sample_manifest_type.startswith("biofluid") else "_tissue_"
113
- new_type += processed_derivative.lower()
114
-
115
- upload_type.add(new_type)
116
-
117
-
118
- def _get_upload_type(samples: Iterable[Dict[str, Any]]) -> str:
119
- upload_type: Set[str] = set()
120
-
121
- for sample in samples:
122
- processed_type = sample.get("processed_sample_type").lower()
123
- if processed_type == "h&e fixed tissue slide":
124
- processed_type = "h_and_e"
125
-
126
- if processed_type in [
127
- "pbmc",
128
- "plasma",
129
- "tissue_slide",
130
- "normal_blood_dna",
131
- "normal_tissue_dna",
132
- "tumor_tissue_dna",
133
- "tumor_tissue_rna",
134
- "h_and_e",
135
- ]:
136
- upload_type.add(processed_type)
137
- else:
138
- # updates upload_type in-place with the given sample
139
- _parse_upload_type(sample=sample, upload_type=upload_type)
140
-
141
- assert len(upload_type) == 1, f"Inconsistent value determined for upload_type:{upload_type}"
142
- return list(upload_type)[0]
143
-
144
-
145
- def _get_and_check(
146
- obj: Union[Dict[str, Any], List[Dict[str, Any]]],
147
- key: str,
148
- msg: str,
149
- default: Any = None,
150
- check: Callable[[Any], bool] = bool,
151
- ) -> Any:
152
- """
153
- Returns a key from a dictionary if it exists, and raises an error if fails an integrity check
154
- If given a list of dictionaries, asserts that each one provides the same result.
155
- """
156
- if isinstance(obj, list):
157
- ret = {o.get(key, default) for o in obj}
158
- assert len(ret) == 1, f"Inconsistent value provided for {key}"
159
- ret = list(ret)[0]
160
- else:
161
- ret = obj.get(key, default)
162
-
163
- if not check(ret):
164
- raise RuntimeError(msg)
165
-
166
- return ret
167
-
168
-
169
- def _extract_info_from_manifest(manifest: Dict[str, Any]) -> Tuple[str, str, List[Dict[str, Any]]]:
170
- """
171
- Given a manifest, do initial validation and return some key values
172
-
173
- Returns
174
- -------
175
- str : trial_id
176
- the same across all samples
177
- exists in both TrialMetadata and ClinicalTrial tables
178
- str : manifest_id
179
- List[Dict[str, Any]] : samples
180
-
181
- RuntimeErrors Raised
182
- -----------------
183
- - "Cannot add a manifest that is not qc_complete"
184
- if manifest's status is not qc_complete (or null)
185
- - f"Manifest {manifest_id} contains no samples: {manifest}"
186
- - f"No consistent protocol_identifier defined for samples on manifest {manifest_id}"
187
- """
188
- manifest_id = _get_and_check(obj=manifest, key="manifest_id", msg=f"No manifest_id in: {manifest}")
189
- _ = _get_and_check( # don't need to keep status
190
- obj=manifest,
191
- key="status",
192
- msg="Cannot add a manifest that is not qc_complete",
193
- default="qc_complete",
194
- check=lambda v: v == "qc_complete",
195
- )
196
- samples = _get_and_check(
197
- obj=manifest,
198
- key="samples",
199
- msg=f"Manifest {manifest_id} contains no samples: {manifest}",
200
- default=[],
201
- check=lambda v: len(v) != 0,
202
- )
203
- trial_id = _get_and_check(
204
- obj=samples,
205
- key="protocol_identifier",
206
- msg=f"No consistent protocol_identifier defined for samples on manifest {manifest_id}",
207
- )
208
-
209
- return trial_id, manifest_id, samples
210
-
211
-
212
- def _extract_details_from_trial(csms_samples: List[Dict[str, Any]]):
213
- """
214
- Given a trial, return some key values
215
-
216
- Returns
217
- -------
218
- str : assay_priority
219
- str : assay_type
220
-
221
- RuntimeErrors Raised
222
- -----------------
223
- - f"No assay_priority defined for manifest_id={manifest_id} for trial {trial_id}"
224
- - f"No assay_type defined for manifest_id={manifest_id} for trial {trial_id}"
225
- """
226
- assay_priority = _get_and_check(
227
- obj=csms_samples,
228
- key="assay_priority",
229
- msg="will not be thrown",
230
- check=lambda _: True,
231
- )
232
- assay_type = _get_and_check(
233
- obj=csms_samples,
234
- key="assay_type",
235
- msg="will not be thrown",
236
- check=lambda _: True,
237
- )
238
- return assay_priority, assay_type
239
-
240
-
241
- def _process_csms_sample(csms_sample: dict):
242
- event_name = csms_sample.get("standardized_collection_event_name")
243
- if event_name is None:
244
- raise RuntimeError(
245
- f"No standardized_collection_event_name defined for sample {csms_sample.get('cimac_id', '')} on manifest {csms_sample['manifest_id']} for trial {csms_sample['protocol_identifier']}"
246
- )
247
-
248
- csms_sample["collection_event_name"] = event_name
249
-
250
- # encrypt participant ids if not already encrypted
251
- if "participant_id" in csms_sample and len(csms_sample["participant_id"]) != _ENCRYPTED_FIELD_LEN:
252
- csms_sample["participant_id"] = _encrypt(csms_sample["participant_id"])
253
-
254
- # differences in naming convention
255
- processed_sample_type_map: Dict[str, str] = {
256
- "tissue_slide": "Fixed Tissue Slide",
257
- "tumor_tissue_dna": "FFPE Tissue Scroll",
258
- "plasma": "Plasma",
259
- "normal_tissue_dna": "FFPE Tissue Scroll",
260
- "h_and_e": "H&E Fixed Tissue Slide",
261
- "pbmc": "PBMC",
262
- }
263
- if csms_sample["processed_sample_type"] in processed_sample_type_map:
264
- csms_sample["processed_sample_type"] = processed_sample_type_map[csms_sample["processed_sample_type"]]
265
-
266
- # differences in keys
267
- if "fixation_or_stabilization_type" in csms_sample:
268
- csms_sample["fixation_stabilization_type"] = csms_sample.pop("fixation_or_stabilization_type")
269
-
270
- # typing
271
- if "sample_derivative_concentration" in csms_sample:
272
- csms_sample["sample_derivative_concentration"] = float(csms_sample["sample_derivative_concentration"])
273
-
274
- if "parent_sample_id" not in csms_sample:
275
- csms_sample["parent_sample_id"] = "Not Reported"
276
-
277
-
278
- def _convert_csms_samples(
279
- trial_id: str,
280
- manifest_id: str,
281
- csms_samples: List[Dict[str, Any]],
282
- existing_cimac_ids: List[str] = None,
283
- ) -> Iterator[Tuple[str, Dict[str, Any]]]:
284
- """
285
- Convert a list of CSMS-style samples into an iterator returning CIMAC IDs and CIDC-style samples
286
- RuntimeErrors are raised during the call for each sample; full validation is NOT done first.
287
-
288
- Returns
289
- -------
290
- iterator yielding (str, dict)
291
- cimac_id, converted CSMS sample
292
-
293
- RuntimeErrors Raised
294
- -----------------
295
- - f"No standardized_collection_event_name defined for sample {sample['cimac_id']} on manifest {sample['manifest_id']} for trial {sample['protocol_identifier']}"
296
- - f"No cimac_id defined for samples[{n}] on manifest_id={manifest_id} for trial {trial_id}"
297
- - f"Malformatted cimac_id={cimac_id} on manifest_id={manifest_id} for trial {trial_id}"
298
- - f"Sample with cimac_id={cimac_id} already exists for trial {trial_id}\nNew samples: {sample}"
299
- - f"Sample with no local participant_id given:\n{sample}"
300
- if participant_id and trial_participant_id are both undefined
301
- """
302
-
303
- if existing_cimac_ids is None:
304
- existing_cimac_ids = []
305
-
306
- for n, sample in enumerate(csms_samples):
307
- # process the sample
308
- _process_csms_sample(csms_sample=sample)
309
-
310
- # get and validate the CIMAC id
311
- cimac_id = _get_and_check(
312
- obj=sample,
313
- key="cimac_id",
314
- msg=f"No cimac_id defined for samples[{n}] on manifest_id={manifest_id} for trial {trial_id}",
315
- )
316
- if not CIMAC_ID_REGEX.match(cimac_id):
317
- raise RuntimeError(f"Malformatted cimac_id={cimac_id} on manifest_id={manifest_id} for trial {trial_id}")
318
-
319
- if cimac_id in existing_cimac_ids:
320
- raise RuntimeError(
321
- f"Sample with cimac_id={cimac_id} already exists for trial {trial_id}\nNew samples: {sample}"
322
- )
323
-
324
- # yield
325
- yield (cimac_id, sample)
326
-
327
-
328
- @with_default_session
329
- def insert_manifest_into_blob(
330
- manifest: Dict[str, Any],
331
- uploader_email: str,
332
- *,
333
- dry_run: bool = False,
334
- session: Session,
335
- ) -> None:
336
- """
337
- Given a CSMS-style manifest, add it into the JSON metadata blob
338
- If `dry_run`, calls `session.rollback` instead of `session.commit`
339
-
340
- RuntimeErrors Raised
341
- -----------------
342
- - "Cannot add a manifest that is not qc_complete"
343
- if manifest's status is not qc_complete (or null)
344
- - f"Manifest {manifest_id} contains no samples: {manifest}"
345
- - f"No consistent protocol_identifier defined for samples on manifest {manifest_id}"
346
- - f"Clinical trial with protocol identifier={trial_id} does not exist"
347
- if trial is missing from TrialMetadata OR ClinicalTrial OR both
348
-
349
- - Assertion: "Inconsistent value provided for assay_priority"
350
- - Assertion: "Inconsistent value provided for assay_type"
351
-
352
- - f"Manifest with manifest_id={manifest_id} already exists for trial {trial_id}"
353
- - f"No standardized_collection_event_name defined for sample {sample['cimac_id']} on manifest {sample['manifest_id']} for trial {sample['protocol_identifier']}"
354
- - f"No cimac_id defined for samples[{n}] on manifest_id={manifest_id} for trial {trial_id}"
355
- - f"Malformatted cimac_id={cimac_id} on manifest_id={manifest_id} for trial {trial_id}"
356
- - f"Sample with cimac_id={cimac_id} already exists for trial {trial_id}\nNew samples: {sample}"
357
- - f"Sample with no local participant_id given:\n{sample}"
358
- if participant_id and trial_participant_id are both undefined
359
-
360
- - "prism errors: [{errors from merge_clinical_trial_metadata}]"
361
- """
362
-
363
- trial_id, manifest_id, csms_samples = _extract_info_from_manifest(manifest)
364
- trial_md = TrialMetadata.select_for_update_by_trial_id(trial_id, session=session)
365
- if manifest_id in [s["manifest_id"] for s in trial_md.metadata_json["shipments"]]:
366
- raise RuntimeError(f"Manifest with manifest_id={manifest_id} already exists for trial {trial_id}")
367
-
368
- # pull out some additional values we'll need
369
- existing_cimac_ids = [s["cimac_id"] for p in trial_md.metadata_json["participants"] for s in p["samples"]]
370
- assay_priority, assay_type = _extract_details_from_trial(csms_samples)
371
- if assay_priority:
372
- manifest["assay_priority"] = assay_priority
373
- if assay_type:
374
- manifest["assay_type"] = assay_type
375
-
376
- # a patch is just the parts that are new, equivalent to the return of schemas.prismify
377
- patch = {
378
- "protocol_identifier": trial_id,
379
- "shipments": [_get_all_values(target="shipment", old=manifest, drop=["excluded", "json_data"])],
380
- "participants": [],
381
- }
382
-
383
- # sort samples by participants
384
- sample_map: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
385
- for cimac_id, sample in _convert_csms_samples(trial_id, manifest_id, csms_samples, existing_cimac_ids):
386
- sample_map[cimac_id_to_cimac_participant_id(cimac_id, {})].append(sample)
387
-
388
- # each participant has a list of samples
389
- for cimac_participant_id, partic_samples in sample_map.items():
390
- partic = {
391
- "cimac_participant_id": cimac_participant_id,
392
- "participant_id": partic_samples[0]["participant_id"],
393
- **_get_all_values(
394
- target="participant",
395
- old=partic_samples[0],
396
- drop=[
397
- "cimac_participant_id",
398
- "excluded",
399
- "json_data",
400
- "participant_id",
401
- "trial_participant_id",
402
- ],
403
- ),
404
- }
405
- partic["samples"] = [
406
- _get_all_values(
407
- target="sample",
408
- old=sample,
409
- drop=["excluded", "json_data", "manifest_id"],
410
- )
411
- for sample in partic_samples
412
- ]
413
-
414
- patch["participants"].append(partic)
415
-
416
- logger.info("Patch for %s manifest %s:\n%s", trial_id, manifest_id, patch)
417
- # merge and validate the data
418
- merged, errs = merge_clinical_trial_metadata(patch, trial_md.metadata_json)
419
- if errs:
420
- raise RuntimeError({"prism errors": [str(e) for e in errs]})
421
-
422
- # save it, will get rolled back if in a dry run
423
- trial_md.update(changes={"metadata_json": merged}, session=session)
424
-
425
- # create pseudo-UploadJobs that will get rolled back if in a dry run
426
- UploadJobs(
427
- trial_id=trial_id,
428
- _status=UploadJobStatus.MERGE_COMPLETED.value,
429
- multifile=False,
430
- metadata_patch=patch,
431
- upload_type=_get_upload_type(csms_samples),
432
- uploader_email=uploader_email,
433
- ).insert(session=session)
434
-
435
- if dry_run:
436
- session.flush()
437
- session.rollback()
438
- else:
439
- session.commit()
440
-
441
-
442
- class Change:
443
- def __init__(
444
- self,
445
- entity_type: str,
446
- trial_id: str,
447
- manifest_id: str,
448
- cimac_id: str = None,
449
- changes: Dict[str, Tuple[Any, Any]] = None,
450
- ):
451
- if changes is None:
452
- changes = []
453
-
454
- if entity_type not in ["sample", "shipment", "upload"]:
455
- raise ValueError(f"entity_type must be in: sample, shipment, upload\nnot: {entity_type}")
456
-
457
- self.entity_type = entity_type
458
- self.trial_id = trial_id
459
- self.manifest_id = manifest_id
460
- self.cimac_id = cimac_id
461
- self.changes = changes
462
-
463
- def __bool__(self):
464
- return bool(len(self.changes))
465
-
466
- def __repr__(self):
467
- return f"{self.entity_type.title()} changes for {self.trial_id}, {self.manifest_id}, {self.cimac_id}:\n{self.changes}"
468
-
469
- def __eq__(self, other):
470
- return (
471
- self.entity_type == other.entity_type
472
- and self.trial_id == other.trial_id
473
- and self.manifest_id == other.manifest_id
474
- and self.cimac_id == other.cimac_id
475
- and self.changes == other.changes
476
- )
477
-
478
-
479
- def _calc_difference(
480
- entity_type: str,
481
- trial_id: str,
482
- manifest_id: str,
483
- cidc: Dict[str, Any],
484
- csms: Dict[str, Any],
485
- ignore=None,
486
- ) -> Dict[str, Tuple[Any, Any]]:
487
- """
488
- The actual comparison function that handles comparing values
489
-
490
- Handles formatting for date/time/datetime in CIDC
491
- Do not perform a comparison for ignored keys
492
- Add constant critical fields back to anything that changes
493
- """
494
-
495
- if ignore is None:
496
- ignore = [
497
- "barcode",
498
- "biobank_id",
499
- "cimac_participant_id",
500
- "entry_number",
501
- "event",
502
- "excluded",
503
- "json_data",
504
- "modified_time",
505
- "modified_timestamp",
506
- "protocol_identifier",
507
- "qc_comments",
508
- "reason",
509
- "sample_approved",
510
- "sample_manifest_type",
511
- "samples",
512
- "status",
513
- "status_log",
514
- "study_encoding",
515
- "submitter",
516
- "trial_id",
517
- ]
518
-
519
- # handle formatting and ignore
520
- cidc1: Dict[str, Any] = {
521
- k: (datetime.strftime(v, "%Y-%m-%d %H:%M:%S") if isinstance(v, (date, time, datetime)) else v)
522
- for k, v in cidc.items()
523
- if k not in ignore
524
- }
525
- csms1: Dict[str, Any] = {k: v for k, v in csms.items() if k not in ignore}
526
-
527
- # take difference by using symmetric set difference on the items
528
- # use set to not get same key multiple times if values differ
529
- diff_keys: Set[str] = {
530
- k
531
- for k in set(cidc1.keys()).union(set(csms1.keys()))
532
- # guaranteed to be in one or the other, so never None == None
533
- if cidc1.get(k) != csms1.get(k)
534
- }
535
- # then get both values once per key to return
536
- changes: Dict[str, Tuple[Any, Any]] = {k: (cidc.get(k), csms.get(k)) for k in diff_keys}
537
-
538
- return Change(
539
- entity_type=entity_type,
540
- trial_id=trial_id,
541
- manifest_id=manifest_id,
542
- cimac_id=csms["cimac_id"] if entity_type == "sample" else None,
543
- changes=changes,
544
- )
545
-
546
-
547
- def _get_cidc_sample_map(metadata: dict) -> Dict[str, Dict[str, Any]]:
548
- """Returns a map of CIMAC IDs for this shipment to the relevant sample details from CIDC"""
549
- cidc_partic_map = {partic["cimac_participant_id"]: partic for partic in metadata.get("participants", [])}
550
-
551
- ## make maps from cimac_id to a full dict
552
- ## need to add participant-level values
553
- cidc_sample_map = {
554
- sample["cimac_id"]: sample
555
- for partic in metadata.get("participants", [])
556
- for sample in partic.get("samples", [])
557
- }
558
- for cidc_cimac_id in cidc_sample_map.keys():
559
- cimac_participant_id = cimac_id_to_cimac_participant_id(cidc_cimac_id, {})
560
- cidc_sample_map[cidc_cimac_id]["cohort_name"] = cidc_partic_map[cimac_participant_id]["cohort_name"]
561
- cidc_sample_map[cidc_cimac_id]["participant_id"] = cidc_partic_map[cimac_participant_id]["participant_id"]
562
-
563
- return cidc_sample_map
564
-
565
-
566
- def _get_csms_sample_map(trial_id, manifest_id, csms_samples) -> Dict[str, Dict[str, Any]]:
567
- """Returns a map of CIMAC IDs to the relevant sample details from CSMS"""
568
- return {
569
- csms_cimac_id: {
570
- # participant-level critical field
571
- "cohort_name": csms_sample["cohort_name"],
572
- # name changes
573
- "trial_id": csms_sample["protocol_identifier"],
574
- "participant_id": csms_sample["participant_id"],
575
- # not in CSMS
576
- "cimac_participant_id": cimac_id_to_cimac_participant_id(csms_cimac_id, {}),
577
- "sample_manifest_type": csms_sample.get("sample_manifest_type"),
578
- # the rest of the values
579
- **_get_all_values(
580
- target="sample",
581
- old=csms_sample,
582
- drop=[
583
- "cimac_participant_id",
584
- "cohort_name",
585
- "participant_id",
586
- "sample_manifest_type",
587
- "trial_id",
588
- ],
589
- ),
590
- }
591
- for csms_cimac_id, csms_sample in _convert_csms_samples(trial_id, manifest_id, csms_samples)
592
- }
593
-
594
-
595
- def _cross_validate_samples(
596
- trial_id: str,
597
- manifest_id: str,
598
- cidc_sample_map: Dict[str, dict],
599
- csms_sample_map: Dict[str, dict],
600
- *,
601
- session: Session,
602
- ):
603
- # make sure that all of the CIDC samples are still in CSMS
604
- for cimac_id, cidc_sample in cidc_sample_map.items():
605
- if cimac_id not in csms_sample_map:
606
- formatted = (
607
- trial_id,
608
- manifest_id,
609
- cidc_sample["cimac_id"],
610
- )
611
- raise RuntimeError(f"Missing sample: {formatted} on CSMS {(trial_id, manifest_id)}")
612
- # make sure that all of the CSMS samples are in CIDC
613
- all_cidc_sample_map: Dict[str, dict] = {
614
- sample["cimac_id"]: {
615
- **sample,
616
- "trial_id": upload.trial_id,
617
- "manifest_id": upload.metadata_patch["shipments"][0]["manifest_id"],
618
- }
619
- for upload in session.query(UploadJobs).filter(UploadJobs.status == UploadJobStatus.MERGE_COMPLETED.value).all()
620
- for partic in upload.metadata_patch.get("participants", [])
621
- for sample in partic.get("samples", [])
622
- if len(upload.metadata_patch.get("shipments", []))
623
- }
624
- for cimac_id in csms_sample_map:
625
- # as sample maps are pulling only from CIDC for this trial_id / manifest_id
626
- # any missing cimac_id's are a change in critical field
627
- # but the cimac_id might exist elsewhere in CIDC
628
- if cimac_id not in cidc_sample_map:
629
- cidc_sample = all_cidc_sample_map.get(cimac_id, None)
630
-
631
- formatted = (
632
- (
633
- cidc_sample["trial_id"],
634
- cidc_sample["manifest_id"],
635
- cidc_sample["cimac_id"],
636
- )
637
- if cidc_sample is not None
638
- else "<no sample found>"
639
- )
640
- raise RuntimeError(f"Change in critical field for: {formatted} to CSMS {(trial_id, manifest_id, cimac_id)}")
641
-
642
-
643
def _initial_manifest_validation(
    csms_manifest: Dict[str, Any], *, session: Session
) -> Tuple[str, str, Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]], UploadJobs]:
    """
    Collect and cross-check everything needed to compare a CSMS manifest with CIDC.

    Side effect: when the samples yield an assay priority and/or assay type,
    they are written onto ``csms_manifest`` under ``"assay_priority"`` and
    ``"assay_type"``.

    Parameters
    ----------
    csms_manifest : Dict[str, Any]
        the CSMS-style manifest to validate; may be updated in place (see above)
    session : Session
        database session used for the trial lookup and the upload query

    Returns
    -------
    str : trial_id
    str : manifest_id
    Dict[str, Dict[str, Any]] : csms_sample_map
    Dict[str, Dict[str, Any]] : cidc_sample_map
        both maps go from cimac_id to a sample definition dict
    UploadJobs : cidc_shipment
        the merge-completed upload whose first shipment has this manifest_id

    Raises
    ------
    NewManifestError
        if no merge-completed upload for the trial carries the given manifest_id
    RuntimeError
        raised by the helpers called here, with messages such as:
        - "Cannot add a manifest that is not qc_complete"
            if the manifest's status is not qc_complete (or null)
        - f"Manifest {manifest_id} contains no samples: {manifest}"
        - f"No consistent protocol_identifier defined for samples on manifest {manifest_id}"
        - f"Clinical trial with protocol identifier={trial_id} does not exist"
            if the trial is missing from TrialMetadata
        - f"Change in critical field for: {(cidc.trial_id, cidc.manifest_id)} to CSMS {(trial_id, manifest_id)}"
            if the shipment in CIDC has a different trial_id than in CSMS
        - f"Missing sample: {(cidc.trial_id, cidc.manifest_id, cidc.cimac_id)} on CSMS {(trial_id, manifest_id)}"
            if a sample in CIDC is not reflected in CSMS
        - f"Change in critical field for: {(cidc.trial_id, cidc.manifest_id, cidc.cimac_id)} to CSMS {(trial_id, manifest_id, cimac_id)}"
            if a sample in CSMS is not correctly reflected in the current state of CIDC
        - f"No assay_priority defined for manifest_id={manifest_id} for trial {trial_id}"
        - f"No assay_type defined for manifest_id={manifest_id} for trial {trial_id}"
    """
    trial_id, manifest_id, csms_samples = _extract_info_from_manifest(csms_manifest)

    # ----- Get all our information together -----
    # Make sure the trial exists in CIDC; the lookup is expected to raise
    # otherwise. The returned record itself is not needed here.
    TrialMetadata.select_for_update_by_trial_id(trial_id, session=session)

    merged_uploads_query = session.query(UploadJobs).filter(
        UploadJobs.status == UploadJobStatus.MERGE_COMPLETED.value,
        UploadJobs.trial_id == trial_id,
    )

    # Index uploads by the manifest_id of their first shipment. Uploads that
    # carry no shipments are skipped; a later upload with the same manifest_id
    # replaces an earlier one.
    uploads_by_manifest: Dict[str, UploadJobs] = {}
    for upload in merged_uploads_query.all():
        if len(upload.metadata_patch.get("shipments", [])) > 0:
            uploads_by_manifest[upload.metadata_patch["shipments"][0]["manifest_id"]] = upload

    if manifest_id not in uploads_by_manifest:
        # remove this to allow for adding new manifests via this function
        # also need to uncomment new Sample code below
        raise NewManifestError()

    cidc_shipment: UploadJobs = uploads_by_manifest[manifest_id]

    cidc_sample_map = _get_cidc_sample_map(cidc_shipment.metadata_patch)
    csms_sample_map = _get_csms_sample_map(trial_id, manifest_id, csms_samples)

    # raises RuntimeErrors if something is amiss
    _cross_validate_samples(
        trial_id=trial_id,
        manifest_id=manifest_id,
        cidc_sample_map=cidc_sample_map,
        csms_sample_map=csms_sample_map,
        session=session,
    )

    # Only overwrite the manifest-level values when the samples supplied one.
    assay_priority, assay_type = _extract_details_from_trial(csms_samples)
    if assay_priority:
        csms_manifest["assay_priority"] = assay_priority
    if assay_type:
        csms_manifest["assay_type"] = assay_type

    return trial_id, manifest_id, csms_sample_map, cidc_sample_map, cidc_shipment
723
-
724
-
725
def _handle_shipment_differences(
    manifest_id: str,
    csms_manifest: Dict[str, Any],
    cidc_uploadjob: Optional[UploadJobs],
) -> Optional[Change]:
    """
    Compare the CSMS shipment with the one stored on the CIDC upload.

    Parameters
    ----------
    manifest_id : str
        identifier of the manifest being compared
    csms_manifest : Dict[str, Any]
        the shipment as described by CSMS
    cidc_uploadjob : Optional[UploadJobs]
        the CIDC upload holding the shipment, or None if there is none;
        in that case the CIDC side of the comparison is an empty dict

    Returns
    -------
    Optional[Change]
        the detected change, or None if the comparison yields nothing truthy
    """
    if cidc_uploadjob is None:
        # No CIDC upload: compare against an empty shipment. Previously the
        # trial id was still read from the missing upload, which raised
        # AttributeError even though None is an accepted argument.
        # NOTE(review): no trial id is available on this path, so None is
        # passed through -- confirm `_calc_difference` tolerates that.
        cidc_manifest: Dict[str, Any] = {}
        trial_id = None
    else:
        cidc_manifest = cidc_uploadjob.metadata_patch["shipments"][0]
        trial_id = cidc_uploadjob.trial_id

    change: Change = _calc_difference(
        entity_type="shipment",
        trial_id=trial_id,
        manifest_id=manifest_id,
        cidc=cidc_manifest,
        csms=csms_manifest,
        # default ignore
    )
    return change if change else None
744
-
745
-
746
def _handle_sample_differences(
    trial_id: str,
    manifest_id: str,
    csms_sample_map: Dict[str, Dict[str, Any]],
    cidc_sample_map: Dict[str, Dict[str, Any]],
    ret: List[Change],
) -> List[Change]:
    """
    Append one Change to ``ret`` for every CSMS sample that differs from CIDC.

    Unlike _handle_shipment_differences and _handle_upload_differences, this
    receives the list being built by detect_manifest_changes(), extends that
    same list in place, and returns it. The list is left untouched when no
    differences are found.

    Every cimac_id in ``csms_sample_map`` is looked up directly in
    ``cidc_sample_map``, so a missing id raises KeyError.
    """
    per_sample_changes = (
        _calc_difference(
            entity_type="sample",
            trial_id=trial_id,
            manifest_id=manifest_id,
            cidc=cidc_sample_map[cimac_id],
            csms=csms_sample,
            # default ignore
        )
        for cimac_id, csms_sample in csms_sample_map.items()
    )
    # Keep only the comparisons that actually produced a change.
    ret.extend(found for found in per_sample_changes if found)
    return ret
774
-
775
-
776
def _handle_upload_differences(
    trial_id, manifest_id, csms_sample_map, uploader_email, cidc_uploadjob: UploadJobs
) -> Optional[Change]:
    """
    Compare the existing CIDC upload with the upload CSMS would produce.

    A fresh, unsaved UploadJobs is built from the CSMS samples and both sides
    are compared as dicts, skipping the fields listed in ``ignored_fields``.
    An empty dict stands in for the CIDC side when ``cidc_uploadjob`` is None.

    Returns the detected change, or None if the comparison yields nothing truthy.
    """
    ignored_fields = [
        "_created",
        "_etag",
        "id",
        "metadata_patch",
        "token",
        "_updated",
        "uploader_email",
    ]

    expected_upload = UploadJobs(
        trial_id=trial_id,
        _status=UploadJobStatus.MERGE_COMPLETED.value,
        multifile=False,
        upload_type=_get_upload_type(csms_sample_map.values()),
        uploader_email=uploader_email,
        metadata_patch={},
    )

    if cidc_uploadjob is None:
        current_state = {}
    else:
        current_state = cidc_uploadjob.to_dict()
    expected_state = expected_upload.to_dict()

    change: Change = _calc_difference(
        "upload",
        trial_id,
        manifest_id,
        current_state,
        expected_state,
        ignore=ignored_fields,
    )
    return change if change else None
808
-
809
-
810
@with_default_session
def detect_manifest_changes(csms_manifest: Dict[str, Any], uploader_email: str, *, session: Session) -> List[Change]:
    """
    Report how a CSMS-style manifest differs from the current state of the db.

    Shipment-level, sample-level and upload-level differences are collected,
    in that order. A manifest flagged as excluded is not examined at all.

    Parameters
    ----------
    csms_manifest : Dict[str, Any]
        the CSMS-style manifest to compare
    uploader_email : str
        email used when building the upload to compare against
    session : Session
        database session, passed on to the initial validation

    Returns
    -------
    List[Change]
        the changes that were detected; empty for an excluded manifest

    Raises
    ------
    NewManifestError
        if the manifest_id doesn't correspond to anything in CIDC
    RuntimeError
        if the connections between any critical fields have changed,
        namely trial_id, manifest_id, cimac_id; left for a human to handle
    """
    # if it's an excluded manifest, we don't consider it for changes
    # (the check accepts any value, so `msg` is presumably never surfaced)
    is_excluded = _get_and_check(
        obj=csms_manifest,
        key="excluded",
        default=False,
        msg="not called",
        check=lambda _: True,
    )
    if is_excluded:
        return []

    # ----- Initial validation, raises RuntimeError if issues -----
    ret = []
    # will raise NewManifestError if manifest_id not in Shipment table
    validated = _initial_manifest_validation(csms_manifest, session=session)
    trial_id, manifest_id, csms_sample_map, cidc_sample_map, cidc_uploadjob = validated

    # ----- Look for shipment-level differences -----
    shipment_change: Optional[Change] = _handle_shipment_differences(manifest_id, csms_manifest, cidc_uploadjob)
    if shipment_change:
        ret.append(shipment_change)

    # ----- Look for sample-level differences -----
    ret = _handle_sample_differences(trial_id, manifest_id, csms_sample_map, cidc_sample_map, ret)

    # ----- Look for differences in the Upload -----
    upload_change: Optional[Change] = _handle_upload_differences(
        trial_id,
        manifest_id,
        csms_sample_map,
        uploader_email,
        cidc_uploadjob,
    )
    if upload_change:
        ret.append(upload_change)

    # ----- Finish up and return -----
    return ret