nci-cidc-api-modules 1.1.29__py3-none-any.whl → 1.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -70,6 +70,7 @@ GOOGLE_INTAKE_BUCKET = environ["GOOGLE_INTAKE_BUCKET"]
70
70
  GOOGLE_UPLOAD_BUCKET = environ["GOOGLE_UPLOAD_BUCKET"]
71
71
  GOOGLE_UPLOAD_TOPIC = environ["GOOGLE_UPLOAD_TOPIC"]
72
72
  GOOGLE_ACL_DATA_BUCKET = environ["GOOGLE_ACL_DATA_BUCKET"]
73
+ GOOGLE_CLINICAL_DATA_BUCKET = environ["GOOGLE_CLINICAL_DATA_BUCKET"]
73
74
  GOOGLE_EPHEMERAL_BUCKET = environ["GOOGLE_EPHEMERAL_BUCKET"]
74
75
  GOOGLE_UPLOAD_ROLE = environ["GOOGLE_UPLOAD_ROLE"]
75
76
  GOOGLE_LISTER_ROLE = environ["GOOGLE_LISTER_ROLE"]
cidc_api/models/models.py CHANGED
@@ -22,6 +22,7 @@ __all__ = [
22
22
  "Users",
23
23
  "ValidationMultiError",
24
24
  "with_default_session",
25
+ "PreprocessedFiles",
25
26
  ]
26
27
 
27
28
  import hashlib
@@ -3184,3 +3185,70 @@ def upload_manifest_json(
3184
3185
  # Publish that a manifest upload has been received
3185
3186
  publish_patient_sample_update(manifest_upload.id)
3186
3187
  return manifest_upload.id
3188
+
3189
+
3190
+ class PreprocessedFiles(CommonColumns):
3191
+ __tablename__ = "preprocessed_files"
3192
+
3193
+ file_name = Column(String)
3194
+ object_url = Column(String)
3195
+ trial_id = Column(String)
3196
+ file_category = Column(String)
3197
+ uploader_email = Column(String)
3198
+ status = Column(String)
3199
+ version = Column(Integer)
3200
+ released_version = Column(String)
3201
+
3202
+ @staticmethod
3203
+ @with_default_session
3204
+ def create(
3205
+ file_name: str,
3206
+ object_url: str,
3207
+ file_category: str,
3208
+ uploader_email: str,
3209
+ status: str = "pending",
3210
+ trial_id: str = None,
3211
+ version: int = None,
3212
+ released_version: str = None,
3213
+ session: Session = None,
3214
+ ):
3215
+ """Create and insert a new PreprocessedFiles record."""
3216
+ new_file = PreprocessedFiles(
3217
+ file_name=file_name,
3218
+ object_url=object_url,
3219
+ file_category=file_category,
3220
+ uploader_email=uploader_email,
3221
+ status=status,
3222
+ trial_id=trial_id,
3223
+ version=version,
3224
+ released_version=released_version,
3225
+ )
3226
+ new_file.insert(session=session)
3227
+ return new_file
3228
+
3229
+ @classmethod
3230
+ @with_default_session
3231
+ def archive_current_files(cls, file_category: str, session: Session = None):
3232
+ """Update any 'current' files in the given category to 'archived'."""
3233
+ current_files = cls.get_files_by_category_and_status(file_category, "current", session=session)
3234
+ for file in current_files:
3235
+ file.status = "archived"
3236
+ file._updated = datetime.now()
3237
+ session.commit()
3238
+
3239
+ @classmethod
3240
+ @with_default_session
3241
+ def delete_pending_files_by_category(cls, file_category: str, session: Session = None):
3242
+ """Delete all pending files matching given file_category."""
3243
+ records = cls.get_files_by_category_and_status(file_category, "pending", session=session)
3244
+ for record in records:
3245
+ session.delete(record)
3246
+ session.commit()
3247
+
3248
+ @classmethod
3249
+ @with_default_session
3250
+ def get_files_by_category_and_status(
3251
+ cls, file_category: str, status: str, session: Session = None
3252
+ ) -> list["PreprocessedFiles"]:
3253
+ """Return all files matching given file_category and status."""
3254
+ return session.query(cls).filter_by(file_category=file_category, status=status).all()
@@ -2,15 +2,16 @@
2
2
 
3
3
  # pylint: disable=logging-fstring-interpolation
4
4
 
5
- import json
6
- import os
7
- from os import environ
8
5
  import base64
9
6
  import datetime
10
- import warnings
11
7
  import hashlib
8
+ import io
9
+ import json
10
+ import os
11
+ import warnings
12
12
  from collections import namedtuple
13
13
  from concurrent.futures import Future
14
+ from os import environ
14
15
  from typing import (
15
16
  Any,
16
17
  BinaryIO,
@@ -23,19 +24,20 @@ from typing import (
23
24
  Union,
24
25
  )
25
26
 
26
- from werkzeug.datastructures import FileStorage
27
- from sqlalchemy.orm.session import Session
27
+ import googleapiclient.discovery
28
+ import requests
29
+ from cidc_schemas.prism.constants import ASSAY_TO_FILEPATH
30
+ from google.api_core.client_options import ClientOptions
31
+ from google.api_core.iam import Policy
28
32
  from google.cloud import storage, pubsub, bigquery
29
33
  from google.cloud.bigquery.enums import EntityTypes
30
34
  from google.oauth2.service_account import Credentials
31
- from google.api_core.iam import Policy
32
- from google.api_core.client_options import ClientOptions
33
- import googleapiclient.discovery
34
- import requests
35
+ from sqlalchemy.orm.session import Session
36
+ from werkzeug.datastructures import FileStorage
37
+ from werkzeug.utils import secure_filename
35
38
 
36
- from cidc_schemas.prism.constants import ASSAY_TO_FILEPATH
37
39
  from cidc_api.config.secrets import get_secrets_manager
38
-
40
+ from ..config.logging import get_logger
39
41
  from ..config.settings import (
40
42
  DEV_USE_GCS,
41
43
  GOOGLE_INTAKE_ROLE,
@@ -57,7 +59,6 @@ from ..config.settings import (
57
59
  DEV_CFUNCTIONS_SERVER,
58
60
  INACTIVE_USER_DAYS,
59
61
  )
60
- from ..config.logging import get_logger
61
62
 
62
63
  os.environ["TZ"] = "UTC"
63
64
  logger = get_logger(__name__)
@@ -216,6 +217,68 @@ def upload_xlsx_to_gcs(
216
217
  return final_object
217
218
 
218
219
 
220
+ def upload_file_to_gcs(file: FileStorage, bucket_name: str, gcs_folder: str) -> str:
221
+ """Upload a file to the specified GCS folder and return the GCS path from the bucket."""
222
+ # Secure the filename and prepare file
223
+ filename = secure_filename(file.filename)
224
+ gcs_file_path = os.path.join(gcs_folder, filename)
225
+ binary_file = io.BytesIO(file.read())
226
+
227
+ if ENV == "dev" and not DEV_USE_GCS:
228
+ logger.info(f"Would've saved {gcs_file_path} to {bucket_name}")
229
+ return gcs_file_path
230
+
231
+ # Upload to GCS
232
+ blob = _get_bucket(bucket_name).blob(gcs_file_path)
233
+ blob.upload_from_file(binary_file, content_type=file.content_type)
234
+
235
+ return gcs_file_path
236
+
237
+
238
+ def move_gcs_file(bucket_name: str, existing_path: str, to_folder: str, append_timestamp: bool = True) -> str:
239
+ """Move a file within a GCS bucket to a new folder, optionally appending a timestamp to the filename."""
240
+ bucket = _get_bucket(bucket_name)
241
+ filename = os.path.basename(existing_path)
242
+ if append_timestamp:
243
+ filename = _append_iso_timestamp_to_filename(filename)
244
+ # Ensure trailing slash on folder
245
+ if not to_folder.endswith("/"):
246
+ to_folder += "/"
247
+ new_gcs_file_path = f"{to_folder}{filename}"
248
+
249
+ if ENV == "dev" and not DEV_USE_GCS:
250
+ logger.info(f"Would've moved {existing_path} to {new_gcs_file_path} in {bucket_name}")
251
+ return new_gcs_file_path
252
+
253
+ source_blob = bucket.blob(existing_path)
254
+ if not source_blob.exists():
255
+ raise Exception("Expected file not found in GCS")
256
+ new_blob = bucket.blob(new_gcs_file_path)
257
+ # GCS move = rewrite + delete
258
+ new_blob.rewrite(source_blob)
259
+ source_blob.delete()
260
+
261
+ return new_gcs_file_path
262
+
263
+
264
+ def delete_items_from_folder(bucket_name: str, folder: str):
265
+ """Deletes all blobs from the specified folder in the specified bucket."""
266
+ bucket = _get_bucket(bucket_name)
267
+ if ENV == "dev" and not DEV_USE_GCS:
268
+ logger.info(f"Would've deleted file(s) from {folder} in {bucket_name}")
269
+ return
270
+ existing_blobs = bucket.list_blobs(prefix=folder)
271
+ for blob in existing_blobs:
272
+ blob.delete()
273
+
274
+
275
+ def _append_iso_timestamp_to_filename(filename: str) -> str:
276
+ """Append an ISO 8601 timestamp to a filename, preserving its extension."""
277
+ base, ext = os.path.splitext(filename)
278
+ timestamp = datetime.datetime.now().isoformat(timespec="milliseconds").replace(":", "-")
279
+ return f"{base}_{timestamp}{ext}"
280
+
281
+
219
282
  def grant_lister_access(user_email: str) -> None:
220
283
  """
221
284
  Grant a user list access to the GOOGLE_ACL_DATA_BUCKET. List access is
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nci_cidc_api_modules
3
- Version: 1.1.29
3
+ Version: 1.1.31
4
4
  Summary: SQLAlchemy data models and configuration tools used in the NCI CIDC API
5
5
  Home-page: https://github.com/NCI-CIDC/cidc-api-gae
6
6
  License: MIT license
@@ -28,7 +28,7 @@ Requires-Dist: python-dotenv==0.10.3
28
28
  Requires-Dist: requests==2.32.3
29
29
  Requires-Dist: jinja2==3.1.6
30
30
  Requires-Dist: certifi==2024.7.4
31
- Requires-Dist: nci-cidc-schemas==0.27.19
31
+ Requires-Dist: nci-cidc-schemas==0.27.21
32
32
  Dynamic: description
33
33
  Dynamic: description-content-type
34
34
  Dynamic: home-page
@@ -2,13 +2,10 @@ cidc_api/config/__init__.py,sha256=5mX8GAPxUKV84iS-aGOoE-4m68LsOCGCDptXNdlgvj0,1
2
2
  cidc_api/config/db.py,sha256=cyWhWtmXha4OsrwUf6ez8aKSfm7tPSmPDE9JVSBx3Fk,1935
3
3
  cidc_api/config/logging.py,sha256=abhVYtn8lfhIt0tyV2WHFgSmp_s2eeJh7kodB6LH4J0,1149
4
4
  cidc_api/config/secrets.py,sha256=jRFj7W43pWuPf9DZQLCKF7WPXf5cUv-BAaS3ASqhV_Q,1481
5
- cidc_api/config/settings.py,sha256=fJQIaCfxsuooEi1pAO8FhHurN0BjP6FZKX8jl7uHGZM,4203
6
- cidc_api/csms/__init__.py,sha256=eJkY6rWNOAUBmSd4G1_U6h7i472druKEtBdVmgFZVPg,20
7
- cidc_api/csms/auth.py,sha256=VTfHlCym_hqVrHXv41Ku9RMAGN9BiNe7ui0o9KZCKtY,3185
5
+ cidc_api/config/settings.py,sha256=mA-4r7oB60uFepYtl5abbPigjwX8aBz__qCJXdcWWbs,4272
8
6
  cidc_api/models/__init__.py,sha256=bl445G8Zic9YbhZ8ZBni07wtBMhLJRMBA-JqjLxx2bw,66
9
- cidc_api/models/csms_api.py,sha256=ovi_jZXZBg6XYEvIupbf5c0WyMbPi4V07OywbleKGqs,30737
10
7
  cidc_api/models/migrations.py,sha256=gp9vtkYbA9FFy2s-7woelAmsvQbJ41LO2_DY-YkFIrQ,11464
11
- cidc_api/models/models.py,sha256=HBXb5228CeUInaaKOXYBcPz-T9pfwULz_7BaSyJmNDI,129427
8
+ cidc_api/models/models.py,sha256=JAvKhX2VnbhavfPkGelBIa3M8Qi6JaFKvydJmmvqZ1U,131795
12
9
  cidc_api/models/schemas.py,sha256=7tDYtmULuzTt2kg7RorWhte06ffalgpQKrFiDRGcPEQ,2711
13
10
  cidc_api/models/files/__init__.py,sha256=8BMTnUSHzUbz0lBeEQY6NvApxDD3GMWMduoVMos2g4Y,213
14
11
  cidc_api/models/files/details.py,sha256=WrWPxJqlsteinoNbGTaQ3fcxgvChqLGJ9vY7H829jtk,62842
@@ -16,11 +13,11 @@ cidc_api/models/files/facets.py,sha256=JqCmwcjYYSz7XK4bAokSE9i71C8t9EQ4Jtbv7npth
16
13
  cidc_api/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
14
  cidc_api/shared/auth.py,sha256=PHqmVGkqDjbmUofytVFwD_9ssgUomESl3fFtFHPwZYQ,9062
18
15
  cidc_api/shared/emails.py,sha256=GY-l0EkoVU_3hjV0g-xo7N9d1iyCdluyq_arftEPPe0,4989
19
- cidc_api/shared/gcloud_client.py,sha256=i4ZZLoDC_pEwKaMS8218uUJ0fsIi0DKwd-hzGHGQw7g,33139
16
+ cidc_api/shared/gcloud_client.py,sha256=ko-3kGRyHI0RcIgoA0r_qVZUcDUGjd9t6-5pXJc3A7s,35634
20
17
  cidc_api/shared/jose.py,sha256=-qzGzEDAlokEp9E7WtBtQkXyyfPWTYXlwYpCqVJWmqM,1830
21
18
  cidc_api/shared/rest_utils.py,sha256=RwR30WOUAYCxL7V-i2totEyeriG30GbBDvBcpLXhM9w,6594
22
- nci_cidc_api_modules-1.1.29.dist-info/licenses/LICENSE,sha256=pNYWVTHaYonnmJyplmeAp7tQAjosmDpAWjb34jjv7Xs,1102
23
- nci_cidc_api_modules-1.1.29.dist-info/METADATA,sha256=8Hs5qiYZFExpOAL2tSG8cz4_t5k17ocmf8TE1Oa3dr4,41285
24
- nci_cidc_api_modules-1.1.29.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
25
- nci_cidc_api_modules-1.1.29.dist-info/top_level.txt,sha256=rNiRzL0lJGi5Q9tY9uSoMdTbJ-7u5c_D2E86KA94yRA,9
26
- nci_cidc_api_modules-1.1.29.dist-info/RECORD,,
19
+ nci_cidc_api_modules-1.1.31.dist-info/licenses/LICENSE,sha256=pNYWVTHaYonnmJyplmeAp7tQAjosmDpAWjb34jjv7Xs,1102
20
+ nci_cidc_api_modules-1.1.31.dist-info/METADATA,sha256=92LzhqUCRQiUvKbuj0Gs_lZ3OrmPBzBHfrNZQ_EjrwQ,41285
21
+ nci_cidc_api_modules-1.1.31.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
22
+ nci_cidc_api_modules-1.1.31.dist-info/top_level.txt,sha256=rNiRzL0lJGi5Q9tY9uSoMdTbJ-7u5c_D2E86KA94yRA,9
23
+ nci_cidc_api_modules-1.1.31.dist-info/RECORD,,
cidc_api/csms/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .auth import *
cidc_api/csms/auth.py DELETED
@@ -1,105 +0,0 @@
1
- __all__ = ["get_token", "get_with_authorization", "get_with_paging"]
2
-
3
- import os
4
- from datetime import datetime, timedelta
5
- from typing import Any, Dict, Iterator
6
-
7
- import requests
8
-
9
- from ..config.settings import (
10
- CSMS_BASE_URL,
11
- CSMS_CLIENT_ID,
12
- CSMS_CLIENT_SECRET,
13
- CSMS_TOKEN_URL,
14
- )
15
-
16
- os.environ["TZ"] = "UTC"
17
-
18
- TIMEOUT_IN_SECONDS = 20
19
- _TOKEN, _TOKEN_EXPIRY = None, datetime.now()
20
-
21
-
22
- def get_token():
23
- global _TOKEN, _TOKEN_EXPIRY
24
- if not _TOKEN or datetime.now() >= _TOKEN_EXPIRY:
25
- res, time = (
26
- requests.post(
27
- CSMS_TOKEN_URL,
28
- headers={"Content-Type": "application/x-www-form-urlencoded"},
29
- data={
30
- "grant_type": "client_credentials",
31
- "client_id": CSMS_CLIENT_ID,
32
- "client_secret": CSMS_CLIENT_SECRET,
33
- },
34
- timeout=TIMEOUT_IN_SECONDS,
35
- ).json(),
36
- datetime.now(),
37
- )
38
-
39
- # res definition from https://developer.okta.com/docs/reference/api/oidc/#response-example-error-7
40
- if "errorCode" in res:
41
- raise RuntimeError(res["errorCode"] + ": " + res.get("errorSummary"))
42
-
43
- _TOKEN = res["access_token"]
44
- _TOKEN_EXPIRY = time + timedelta(seconds=res["expires_in"])
45
-
46
- return _TOKEN
47
-
48
-
49
- def get_with_authorization(url: str, **kwargs) -> requests.Response:
50
- """url should be fully valid or begin with `/` to be prefixed with CSMS_BASE_URL"""
51
- token = get_token()
52
- headers = {
53
- **kwargs.get("headers", {}),
54
- "Authorization": f"Bearer {token}",
55
- "accept": "*/*",
56
- }
57
- kwargs["headers"] = headers
58
- if not url.startswith(CSMS_BASE_URL):
59
- url = CSMS_BASE_URL + url
60
- return requests.get(
61
- url,
62
- **kwargs,
63
- timeout=TIMEOUT_IN_SECONDS,
64
- )
65
-
66
-
67
- def get_with_paging(url: str, limit: int = None, offset: int = 0, **kwargs) -> Iterator[Dict[str, Any]]:
68
- """
69
- Return an iterator of entries via get_with_authorization with handling for CSMS paging
70
-
71
- Parameters
72
- ----------
73
- url: str
74
- url should be fully valid or begin with `/` to be prefixed with CSMS_BASE_URL
75
- limit: int = None
76
- the number of records to return on each page
77
- default: 5000 for samples, 50 for manifests, 1 otherwise
78
- offset: int = 0
79
- which page to return, 0-indexed
80
- increments as needed to continue returning
81
-
82
- Raises
83
- ------
84
- requests.exceptions.HTTPError
85
- via res.raise_for_status()
86
- https://docs.python-requests.org/en/master/user/quickstart/#response-status-codes
87
- """
88
- if not limit:
89
- if "samples" in url:
90
- limit = 5000
91
- elif "manifests" in url:
92
- limit = 50
93
- else:
94
- limit = 1
95
-
96
- kwargs.update({"limit": limit, "offset": offset})
97
-
98
- res = get_with_authorization(url, params=kwargs)
99
- while res.status_code < 300 and len(res.json().get("data", [])) > 0:
100
- # if there's not an error and we're still returning
101
- yield from res.json()["data"]
102
- kwargs["offset"] += 1 # get the next page
103
- res = get_with_authorization(url, params=kwargs)
104
-
105
- res.raise_for_status()
@@ -1,872 +0,0 @@
1
- __all__ = [
2
- "Change",
3
- "detect_manifest_changes",
4
- "insert_manifest_into_blob",
5
- "NewManifestError",
6
- ]
7
-
8
- import os
9
- import re
10
- from collections import defaultdict
11
- from datetime import date, datetime, time
12
- from typing import (
13
- Any,
14
- Callable,
15
- Dict,
16
- Iterable,
17
- Iterator,
18
- List,
19
- Optional,
20
- Set,
21
- Tuple,
22
- Union,
23
- )
24
-
25
- from sqlalchemy.orm.session import Session
26
-
27
- from cidc_schemas.prism.merger import merge_clinical_trial_metadata
28
- from cidc_schemas.prism.core import (
29
- _check_encrypt_init,
30
- _encrypt,
31
- _ENCRYPTED_FIELD_LEN,
32
- load_and_validate_schema,
33
- set_prism_encrypt_key,
34
- )
35
- from .models import TrialMetadata, UploadJobStatus, UploadJobs
36
- from .models import with_default_session
37
- from ..config.logging import get_logger
38
- from ..config.settings import PRISM_ENCRYPT_KEY
39
-
40
-
41
- os.environ["TZ"] = "UTC"
42
- logger = get_logger(__name__)
43
-
44
-
45
- def cimac_id_to_cimac_participant_id(cimac_id, _):
46
- return cimac_id[:7]
47
-
48
-
49
- CIMAC_ID_REGEX = re.compile("^C[A-Z0-9]{3}[A-Z0-9]{3}[A-Z0-9]{2}.[0-9]{2}$")
50
- SAMPLE_SCHEMA: dict = load_and_validate_schema("sample.json")
51
- PARTICIPANT_SCHEMA: dict = load_and_validate_schema("participant.json")
52
- SHIPMENT_SCHEMA: dict = load_and_validate_schema("shipping_core.json")
53
- TARGET_PROPERTIES_MAP: Dict[str, dict] = {
54
- "sample": SAMPLE_SCHEMA["properties"],
55
- "participant": PARTICIPANT_SCHEMA["properties"],
56
- "shipment": SHIPMENT_SCHEMA["properties"],
57
- }
58
-
59
- # make sure that the encryption key is set
60
- # NOTE: Exception is raised in external core module
61
- try:
62
- _check_encrypt_init()
63
- except Exception:
64
- set_prism_encrypt_key(PRISM_ENCRYPT_KEY)
65
-
66
-
67
- def _get_all_values(target: str, old: dict, drop: List[str] = None) -> Dict[str, Any]:
68
- """
69
- Parameters
70
- ----------
71
- target: str in ["sample", "participant", "shipment"]
72
- old: dict
73
- drop: List[str] = []
74
-
75
- Returns
76
- -------
77
- Dict[str, Any]
78
- all of the values from `old` that are in `target` excepting anything keys in `drop`
79
- """
80
-
81
- if drop is None:
82
- drop = []
83
-
84
- ret = {p: old[p] for p in TARGET_PROPERTIES_MAP[target].keys() if p in old and p not in drop}
85
-
86
- return ret
87
-
88
-
89
- class NewManifestError(RuntimeError):
90
- pass
91
-
92
-
93
- def _parse_upload_type(sample: dict, upload_type: Set[str]) -> str:
94
- sample_manifest_type = sample.get("sample_manifest_type")
95
- processed_derivative = sample.get("processed_sample_derivative")
96
- if sample_manifest_type is None:
97
- # safety
98
- return
99
-
100
- if sample_manifest_type == "biofluid_cellular":
101
- upload_type.add("pbmc")
102
- elif sample_manifest_type == "tissue_slides":
103
- upload_type.add("tissue_slide")
104
-
105
- elif processed_derivative == "Germline DNA":
106
- upload_type.add(f"normal_{sample_manifest_type.split()[0].lower()}_dna")
107
- elif processed_derivative == "Tumor DNA":
108
- upload_type.add(f"tumor_{sample_manifest_type.split()[0]}_dna")
109
- elif processed_derivative in ["DNA", "RNA"]:
110
- unprocessed_type = sample.get("type_of_sample")
111
- new_type = "tumor" if "tumor" in unprocessed_type.lower() else "normal"
112
- new_type += "_blood_" if sample_manifest_type.startswith("biofluid") else "_tissue_"
113
- new_type += processed_derivative.lower()
114
-
115
- upload_type.add(new_type)
116
-
117
-
118
- def _get_upload_type(samples: Iterable[Dict[str, Any]]) -> str:
119
- upload_type: Set[str] = set()
120
-
121
- for sample in samples:
122
- processed_type = sample.get("processed_sample_type").lower()
123
- if processed_type == "h&e fixed tissue slide":
124
- processed_type = "h_and_e"
125
-
126
- if processed_type in [
127
- "pbmc",
128
- "plasma",
129
- "tissue_slide",
130
- "normal_blood_dna",
131
- "normal_tissue_dna",
132
- "tumor_tissue_dna",
133
- "tumor_tissue_rna",
134
- "h_and_e",
135
- ]:
136
- upload_type.add(processed_type)
137
- else:
138
- # updates upload_type in-place with the given sample
139
- _parse_upload_type(sample=sample, upload_type=upload_type)
140
-
141
- assert len(upload_type) == 1, f"Inconsistent value determined for upload_type:{upload_type}"
142
- return list(upload_type)[0]
143
-
144
-
145
- def _get_and_check(
146
- obj: Union[Dict[str, Any], List[Dict[str, Any]]],
147
- key: str,
148
- msg: str,
149
- default: Any = None,
150
- check: Callable[[Any], bool] = bool,
151
- ) -> Any:
152
- """
153
- Returns a key from a dictionary if it exists, and raises an error if fails an integrity check
154
- If given a list of dictionaries, asserts that each one provides the same result.
155
- """
156
- if isinstance(obj, list):
157
- ret = {o.get(key, default) for o in obj}
158
- assert len(ret) == 1, f"Inconsistent value provided for {key}"
159
- ret = list(ret)[0]
160
- else:
161
- ret = obj.get(key, default)
162
-
163
- if not check(ret):
164
- raise RuntimeError(msg)
165
-
166
- return ret
167
-
168
-
169
- def _extract_info_from_manifest(manifest: Dict[str, Any]) -> Tuple[str, str, List[Dict[str, Any]]]:
170
- """
171
- Given a manifest, do initial validation and return some key values
172
-
173
- Returns
174
- -------
175
- str : trial_id
176
- the same across all samples
177
- exists in both TrialMetadata and ClinicalTrial tables
178
- str : manifest_id
179
- List[Dict[str, Any]] : samples
180
-
181
- RuntimeErrors Raised
182
- -----------------
183
- - "Cannot add a manifest that is not qc_complete"
184
- if manifest's status is not qc_complete (or null)
185
- - f"Manifest {manifest_id} contains no samples: {manifest}"
186
- - f"No consistent protocol_identifier defined for samples on manifest {manifest_id}"
187
- """
188
- manifest_id = _get_and_check(obj=manifest, key="manifest_id", msg=f"No manifest_id in: {manifest}")
189
- _ = _get_and_check( # don't need to keep status
190
- obj=manifest,
191
- key="status",
192
- msg="Cannot add a manifest that is not qc_complete",
193
- default="qc_complete",
194
- check=lambda v: v == "qc_complete",
195
- )
196
- samples = _get_and_check(
197
- obj=manifest,
198
- key="samples",
199
- msg=f"Manifest {manifest_id} contains no samples: {manifest}",
200
- default=[],
201
- check=lambda v: len(v) != 0,
202
- )
203
- trial_id = _get_and_check(
204
- obj=samples,
205
- key="protocol_identifier",
206
- msg=f"No consistent protocol_identifier defined for samples on manifest {manifest_id}",
207
- )
208
-
209
- return trial_id, manifest_id, samples
210
-
211
-
212
- def _extract_details_from_trial(csms_samples: List[Dict[str, Any]]):
213
- """
214
- Given a trial, return some key values
215
-
216
- Returns
217
- -------
218
- str : assay_priority
219
- str : assay_type
220
-
221
- RuntimeErrors Raised
222
- -----------------
223
- - f"No assay_priority defined for manifest_id={manifest_id} for trial {trial_id}"
224
- - f"No assay_type defined for manifest_id={manifest_id} for trial {trial_id}"
225
- """
226
- assay_priority = _get_and_check(
227
- obj=csms_samples,
228
- key="assay_priority",
229
- msg="will not be thrown",
230
- check=lambda _: True,
231
- )
232
- assay_type = _get_and_check(
233
- obj=csms_samples,
234
- key="assay_type",
235
- msg="will not be thrown",
236
- check=lambda _: True,
237
- )
238
- return assay_priority, assay_type
239
-
240
-
241
- def _process_csms_sample(csms_sample: dict):
242
- event_name = csms_sample.get("standardized_collection_event_name")
243
- if event_name is None:
244
- raise RuntimeError(
245
- f"No standardized_collection_event_name defined for sample {csms_sample.get('cimac_id', '')} on manifest {csms_sample['manifest_id']} for trial {csms_sample['protocol_identifier']}"
246
- )
247
-
248
- csms_sample["collection_event_name"] = event_name
249
-
250
- # encrypt participant ids if not already encrypted
251
- if "participant_id" in csms_sample and len(csms_sample["participant_id"]) != _ENCRYPTED_FIELD_LEN:
252
- csms_sample["participant_id"] = _encrypt(csms_sample["participant_id"])
253
-
254
- # differences in naming convention
255
- processed_sample_type_map: Dict[str, str] = {
256
- "tissue_slide": "Fixed Tissue Slide",
257
- "tumor_tissue_dna": "FFPE Tissue Scroll",
258
- "plasma": "Plasma",
259
- "normal_tissue_dna": "FFPE Tissue Scroll",
260
- "h_and_e": "H&E Fixed Tissue Slide",
261
- "pbmc": "PBMC",
262
- }
263
- if csms_sample["processed_sample_type"] in processed_sample_type_map:
264
- csms_sample["processed_sample_type"] = processed_sample_type_map[csms_sample["processed_sample_type"]]
265
-
266
- # differences in keys
267
- if "fixation_or_stabilization_type" in csms_sample:
268
- csms_sample["fixation_stabilization_type"] = csms_sample.pop("fixation_or_stabilization_type")
269
-
270
- # typing
271
- if "sample_derivative_concentration" in csms_sample:
272
- csms_sample["sample_derivative_concentration"] = float(csms_sample["sample_derivative_concentration"])
273
-
274
- if "parent_sample_id" not in csms_sample:
275
- csms_sample["parent_sample_id"] = "Not Reported"
276
-
277
-
278
- def _convert_csms_samples(
279
- trial_id: str,
280
- manifest_id: str,
281
- csms_samples: List[Dict[str, Any]],
282
- existing_cimac_ids: List[str] = None,
283
- ) -> Iterator[Tuple[str, Dict[str, Any]]]:
284
- """
285
- Convert a list of CSMS-style samples into an iterator returning CIMAC IDs and CIDC-style samples
286
- RuntimeErrors are raised during the call for each sample; full validation is NOT done first.
287
-
288
- Returns
289
- -------
290
- iterator yielding (str, dict)
291
- cimac_id, converted CSMS sample
292
-
293
- RuntimeErrors Raised
294
- -----------------
295
- - f"No standardized_collection_event_name defined for sample {sample['cimac_id']} on manifest {sample['manifest_id']} for trial {sample['protocol_identifier']}"
296
- - f"No cimac_id defined for samples[{n}] on manifest_id={manifest_id} for trial {trial_id}"
297
- - f"Malformatted cimac_id={cimac_id} on manifest_id={manifest_id} for trial {trial_id}"
298
- - f"Sample with cimac_id={cimac_id} already exists for trial {trial_id}\nNew samples: {sample}"
299
- - f"Sample with no local participant_id given:\n{sample}"
300
- if participant_id and trial_participant_id are both undefined
301
- """
302
-
303
- if existing_cimac_ids is None:
304
- existing_cimac_ids = []
305
-
306
- for n, sample in enumerate(csms_samples):
307
- # process the sample
308
- _process_csms_sample(csms_sample=sample)
309
-
310
- # get and validate the CIMAC id
311
- cimac_id = _get_and_check(
312
- obj=sample,
313
- key="cimac_id",
314
- msg=f"No cimac_id defined for samples[{n}] on manifest_id={manifest_id} for trial {trial_id}",
315
- )
316
- if not CIMAC_ID_REGEX.match(cimac_id):
317
- raise RuntimeError(f"Malformatted cimac_id={cimac_id} on manifest_id={manifest_id} for trial {trial_id}")
318
-
319
- if cimac_id in existing_cimac_ids:
320
- raise RuntimeError(
321
- f"Sample with cimac_id={cimac_id} already exists for trial {trial_id}\nNew samples: {sample}"
322
- )
323
-
324
- # yield
325
- yield (cimac_id, sample)
326
-
327
-
328
- @with_default_session
329
- def insert_manifest_into_blob(
330
- manifest: Dict[str, Any],
331
- uploader_email: str,
332
- *,
333
- dry_run: bool = False,
334
- session: Session,
335
- ) -> None:
336
- """
337
- Given a CSMS-style manifest, add it into the JSON metadata blob
338
- If `dry_run`, calls `session.rollback` instead of `session.commit`
339
-
340
- RuntimeErrors Raised
341
- -----------------
342
- - "Cannot add a manifest that is not qc_complete"
343
- if manifest's status is not qc_complete (or null)
344
- - f"Manifest {manifest_id} contains no samples: {manifest}"
345
- - f"No consistent protocol_identifier defined for samples on manifest {manifest_id}"
346
- - f"Clinical trial with protocol identifier={trial_id} does not exist"
347
- if trial is missing from TrialMetadata OR ClinicalTrial OR both
348
-
349
- - Assertion: "Inconsistent value provided for assay_priority"
350
- - Assertion: "Inconsistent value provided for assay_type"
351
-
352
- - f"Manifest with manifest_id={manifest_id} already exists for trial {trial_id}"
353
- - f"No standardized_collection_event_name defined for sample {sample['cimac_id']} on manifest {sample['manifest_id']} for trial {sample['protocol_identifier']}"
354
- - f"No cimac_id defined for samples[{n}] on manifest_id={manifest_id} for trial {trial_id}"
355
- - f"Malformatted cimac_id={cimac_id} on manifest_id={manifest_id} for trial {trial_id}"
356
- - f"Sample with cimac_id={cimac_id} already exists for trial {trial_id}\nNew samples: {sample}"
357
- - f"Sample with no local participant_id given:\n{sample}"
358
- if participant_id and trial_participant_id are both undefined
359
-
360
- - "prism errors: [{errors from merge_clinical_trial_metadata}]"
361
- """
362
-
363
- trial_id, manifest_id, csms_samples = _extract_info_from_manifest(manifest)
364
- trial_md = TrialMetadata.select_for_update_by_trial_id(trial_id, session=session)
365
- if manifest_id in [s["manifest_id"] for s in trial_md.metadata_json["shipments"]]:
366
- raise RuntimeError(f"Manifest with manifest_id={manifest_id} already exists for trial {trial_id}")
367
-
368
- # pull out some additional values we'll need
369
- existing_cimac_ids = [s["cimac_id"] for p in trial_md.metadata_json["participants"] for s in p["samples"]]
370
- assay_priority, assay_type = _extract_details_from_trial(csms_samples)
371
- if assay_priority:
372
- manifest["assay_priority"] = assay_priority
373
- if assay_type:
374
- manifest["assay_type"] = assay_type
375
-
376
- # a patch is just the parts that are new, equivalent to the return of schemas.prismify
377
- patch = {
378
- "protocol_identifier": trial_id,
379
- "shipments": [_get_all_values(target="shipment", old=manifest, drop=["excluded", "json_data"])],
380
- "participants": [],
381
- }
382
-
383
- # sort samples by participants
384
- sample_map: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
385
- for cimac_id, sample in _convert_csms_samples(trial_id, manifest_id, csms_samples, existing_cimac_ids):
386
- sample_map[cimac_id_to_cimac_participant_id(cimac_id, {})].append(sample)
387
-
388
- # each participant has a list of samples
389
- for cimac_participant_id, partic_samples in sample_map.items():
390
- partic = {
391
- "cimac_participant_id": cimac_participant_id,
392
- "participant_id": partic_samples[0]["participant_id"],
393
- **_get_all_values(
394
- target="participant",
395
- old=partic_samples[0],
396
- drop=[
397
- "cimac_participant_id",
398
- "excluded",
399
- "json_data",
400
- "participant_id",
401
- "trial_participant_id",
402
- ],
403
- ),
404
- }
405
- partic["samples"] = [
406
- _get_all_values(
407
- target="sample",
408
- old=sample,
409
- drop=["excluded", "json_data", "manifest_id"],
410
- )
411
- for sample in partic_samples
412
- ]
413
-
414
- patch["participants"].append(partic)
415
-
416
- logger.info("Patch for %s manifest %s:\n%s", trial_id, manifest_id, patch)
417
- # merge and validate the data
418
- merged, errs = merge_clinical_trial_metadata(patch, trial_md.metadata_json)
419
- if errs:
420
- raise RuntimeError({"prism errors": [str(e) for e in errs]})
421
-
422
- # save it, will get rolled back if in a dry run
423
- trial_md.update(changes={"metadata_json": merged}, session=session)
424
-
425
- # create pseudo-UploadJobs that will get rolled back if in a dry run
426
- UploadJobs(
427
- trial_id=trial_id,
428
- _status=UploadJobStatus.MERGE_COMPLETED.value,
429
- multifile=False,
430
- metadata_patch=patch,
431
- upload_type=_get_upload_type(csms_samples),
432
- uploader_email=uploader_email,
433
- ).insert(session=session)
434
-
435
- if dry_run:
436
- session.flush()
437
- session.rollback()
438
- else:
439
- session.commit()
440
-
441
-
442
class Change:
    """
    A detected difference between CSMS and CIDC for a single entity.

    `entity_type` must be one of "sample", "shipment", or "upload".
    `changes` maps a field name to a (cidc_value, csms_value) pair.
    A Change is falsy when it records no field-level differences.
    """

    def __init__(
        self,
        entity_type: str,
        trial_id: str,
        manifest_id: str,
        cimac_id: str = None,
        changes: Dict[str, Tuple[Any, Any]] = None,
    ):
        # BUG FIX: the fallback default used to be the list [], but `changes`
        # is declared and consumed as a Dict[str, Tuple[Any, Any]]; default
        # to an empty dict so the attribute's type is consistent.
        if changes is None:
            changes = {}

        if entity_type not in ["sample", "shipment", "upload"]:
            raise ValueError(f"entity_type must be in: sample, shipment, upload\nnot: {entity_type}")

        self.entity_type = entity_type
        self.trial_id = trial_id
        self.manifest_id = manifest_id
        self.cimac_id = cimac_id
        self.changes = changes

    def __bool__(self):
        # truthy only when at least one field-level difference was recorded
        return bool(len(self.changes))

    def __repr__(self):
        return f"{self.entity_type.title()} changes for {self.trial_id}, {self.manifest_id}, {self.cimac_id}:\n{self.changes}"

    def __eq__(self, other):
        # ROBUSTNESS: comparing against a non-Change used to raise
        # AttributeError; return NotImplemented per the data-model protocol.
        if not isinstance(other, Change):
            return NotImplemented
        return (
            self.entity_type == other.entity_type
            and self.trial_id == other.trial_id
            and self.manifest_id == other.manifest_id
            and self.cimac_id == other.cimac_id
            and self.changes == other.changes
        )
477
-
478
-
479
def _calc_difference(
    entity_type: str,
    trial_id: str,
    manifest_id: str,
    cidc: Dict[str, Any],
    csms: Dict[str, Any],
    ignore=None,
) -> "Change":
    """
    The actual comparison function that handles comparing values

    Handles formatting for date/time/datetime in CIDC
    Do not perform a comparison for ignored keys
    Add constant critical fields back to anything that changes

    Parameters
    ----------
    entity_type : str
        one of "sample", "shipment", "upload"
    trial_id, manifest_id : str
        critical identifiers carried onto the returned Change
    cidc, csms : Dict[str, Any]
        the two entity snapshots to compare
    ignore : list, optional
        keys to exclude from comparison; defaults to the standard
        CSMS-vs-CIDC bookkeeping fields below

    Returns
    -------
    Change
        per-key (cidc_value, csms_value) differences; falsy when none
        (the previous `Dict[str, Tuple[Any, Any]]` annotation was wrong)
    """

    if ignore is None:
        ignore = [
            "barcode",
            "biobank_id",
            "cimac_participant_id",
            "entry_number",
            "event",
            "excluded",
            "json_data",
            "modified_time",
            "modified_timestamp",
            "protocol_identifier",
            "qc_comments",
            "reason",
            "sample_approved",
            "sample_manifest_type",
            "samples",
            "status",
            "status_log",
            "study_encoding",
            "submitter",
            "trial_id",
        ]

    # handle formatting and ignore
    # BUG FIX: use the value's own strftime so date, time, AND datetime all
    # format correctly — datetime.strftime(v, ...) raised for time instances
    # (time has no timetuple); dates/datetimes format identically either way.
    cidc1: Dict[str, Any] = {
        k: (v.strftime("%Y-%m-%d %H:%M:%S") if isinstance(v, (date, time, datetime)) else v)
        for k, v in cidc.items()
        if k not in ignore
    }
    csms1: Dict[str, Any] = {k: v for k, v in csms.items() if k not in ignore}

    # take difference by using symmetric set difference on the items
    # use set to not get same key multiple times if values differ
    diff_keys: Set[str] = {
        k
        for k in set(cidc1.keys()).union(set(csms1.keys()))
        # guaranteed to be in one or the other, so never None == None
        if cidc1.get(k) != csms1.get(k)
    }
    # then get both values once per key to return
    changes: Dict[str, Tuple[Any, Any]] = {k: (cidc.get(k), csms.get(k)) for k in diff_keys}

    return Change(
        entity_type=entity_type,
        trial_id=trial_id,
        manifest_id=manifest_id,
        cimac_id=csms["cimac_id"] if entity_type == "sample" else None,
        changes=changes,
    )
545
-
546
-
547
def _get_cidc_sample_map(metadata: dict) -> Dict[str, Dict[str, Any]]:
    """Returns a map of CIMAC IDs for this shipment to the relevant sample details from CIDC"""
    participants = metadata.get("participants", [])
    partic_by_id = {p["cimac_participant_id"]: p for p in participants}

    # map each cimac_id to its full sample dict, then copy the needed
    # participant-level values down onto each sample
    sample_map: Dict[str, Dict[str, Any]] = {}
    for partic in participants:
        for sample in partic.get("samples", []):
            sample_map[sample["cimac_id"]] = sample

    for cimac_id, sample in sample_map.items():
        owner = partic_by_id[cimac_id_to_cimac_participant_id(cimac_id, {})]
        sample["cohort_name"] = owner["cohort_name"]
        sample["participant_id"] = owner["participant_id"]

    return sample_map
564
-
565
-
566
def _get_csms_sample_map(trial_id, manifest_id, csms_samples) -> Dict[str, Dict[str, Any]]:
    """Returns a map of CIMAC IDs to the relevant sample details from CSMS"""
    sample_map: Dict[str, Dict[str, Any]] = {}
    for csms_cimac_id, csms_sample in _convert_csms_samples(trial_id, manifest_id, csms_samples):
        entry = {
            # participant-level critical field
            "cohort_name": csms_sample["cohort_name"],
            # name changes
            "trial_id": csms_sample["protocol_identifier"],
            "participant_id": csms_sample["participant_id"],
            # not in CSMS
            "cimac_participant_id": cimac_id_to_cimac_participant_id(csms_cimac_id, {}),
            "sample_manifest_type": csms_sample.get("sample_manifest_type"),
        }
        # the rest of the values, excluding the keys set explicitly above
        entry.update(
            _get_all_values(
                target="sample",
                old=csms_sample,
                drop=[
                    "cimac_participant_id",
                    "cohort_name",
                    "participant_id",
                    "sample_manifest_type",
                    "trial_id",
                ],
            )
        )
        sample_map[csms_cimac_id] = entry
    return sample_map
593
-
594
-
595
def _cross_validate_samples(
    trial_id: str,
    manifest_id: str,
    cidc_sample_map: Dict[str, dict],
    csms_sample_map: Dict[str, dict],
    *,
    session: Session,
):
    """Raise RuntimeError if the CIDC and CSMS sample sets for this manifest disagree."""
    # every sample CIDC knows about for this manifest must still exist in CSMS
    for cimac_id, cidc_sample in cidc_sample_map.items():
        if cimac_id in csms_sample_map:
            continue
        formatted = (
            trial_id,
            manifest_id,
            cidc_sample["cimac_id"],
        )
        raise RuntimeError(f"Missing sample: {formatted} on CSMS {(trial_id, manifest_id)}")

    # build a cimac_id lookup across ALL merge-completed shipment uploads in CIDC
    all_cidc_sample_map: Dict[str, dict] = {}
    merged_uploads = (
        session.query(UploadJobs).filter(UploadJobs.status == UploadJobStatus.MERGE_COMPLETED.value).all()
    )
    for upload in merged_uploads:
        upload_shipments = upload.metadata_patch.get("shipments", [])
        if not len(upload_shipments):
            continue
        for partic in upload.metadata_patch.get("participants", []):
            for sample in partic.get("samples", []):
                all_cidc_sample_map[sample["cimac_id"]] = {
                    **sample,
                    "trial_id": upload.trial_id,
                    "manifest_id": upload_shipments[0]["manifest_id"],
                }

    for cimac_id in csms_sample_map:
        # as sample maps are pulling only from CIDC for this trial_id / manifest_id
        # any missing cimac_id's are a change in critical field
        # but the cimac_id might exist elsewhere in CIDC
        if cimac_id in cidc_sample_map:
            continue
        cidc_sample = all_cidc_sample_map.get(cimac_id, None)
        formatted = (
            (
                cidc_sample["trial_id"],
                cidc_sample["manifest_id"],
                cidc_sample["cimac_id"],
            )
            if cidc_sample is not None
            else "<no sample found>"
        )
        raise RuntimeError(f"Change in critical field for: {formatted} to CSMS {(trial_id, manifest_id, cimac_id)}")
641
-
642
-
643
def _initial_manifest_validation(
    csms_manifest: Dict[str, Any], *, session: Session
) -> Tuple[str, str, Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]], UploadJobs]:
    """
    Gather all of the things we'll need while performing validation of the manifest

    Returns
    -------
    str : trial_id
    str : manifest_id
    Dict[str, Dict[str, Any]] : csms_sample_map
    Dict[str, Dict[str, Any]] : cidc_sample_map
        both map cimac_id's to a sample definition dict
    UploadJobs : cidc_uploadjob


    RuntimeErrors Raised
    -----------------
    - "Cannot add a manifest that is not qc_complete"
        if manifest's status is not qc_complete (or null)
    - f"Manifest {manifest_id} contains no samples: {manifest}"
    - f"No consistent protocol_identifier defined for samples on manifest {manifest_id}"
    - f"Clinical trial with protocol identifier={trial_id} does not exist"
        if trial is missing from TrialMetadata
    - NewManifestError
        if there is no shipment with the given manifest_id
    - f"Change in critical field for: {(cidc.trial_id, cidc.manifest_id)} to CSMS {(trial_id, manifest_id)}"
        if the Shipment in CIDC has a different trial_id than in CSMS
    - f"Missing sample: {(cidc.trial_id, cidc.manifest_id, cidc.cimac_id)} on CSMS {(trial_id, manifest_id)}"
        if an sample in CIDC is not reflected in CSMS
    - f"Change in critical field for: {(cidc.trial_id, cidc.manifest_id, cidc.cimac_id)} to CSMS {(trial_id, manifest_id, cimac_id)}"
        if a sample in CSMS is not correctly reflected in the current state of CIDC
    - f"No assay_priority defined for manifest_id={manifest_id} for trial {trial_id}"
    - f"No assay_type defined for manifest_id={manifest_id} for trial {trial_id}"
    """
    # pull trial_id / manifest_id / samples out of the CSMS payload;
    # raises RuntimeError on malformed manifests (see docstring)
    trial_id, manifest_id, csms_samples = _extract_info_from_manifest(csms_manifest)
    # ----- Get all our information together -----
    # validate that the trial exists in CIDC, or error otherwise;
    # also takes a row lock on the trial for the rest of this transaction
    _ = TrialMetadata.select_for_update_by_trial_id(trial_id, session=session)

    # all merge-completed uploads for this trial; each may carry a shipment
    shipments: List[UploadJobs] = (
        session.query(UploadJobs)
        .filter(
            UploadJobs.status == UploadJobStatus.MERGE_COMPLETED.value,
            UploadJobs.trial_id == trial_id,
        )
        .all()
    )
    # index the uploads by their shipment's manifest_id, skipping uploads
    # whose metadata_patch carries no shipments
    shipments_metadata: Dict[str, dict] = {
        s.metadata_patch["shipments"][0]["manifest_id"]: s
        for s in shipments
        if len(s.metadata_patch.get("shipments", []))
    }

    if manifest_id not in shipments_metadata:
        # remove this to allow for adding new manifests via this function
        # also need to uncomment new Sample code below
        raise NewManifestError()

    cidc_shipment: UploadJobs = shipments_metadata[manifest_id]

    # cimac_id -> sample dict, from each side
    cidc_sample_map = _get_cidc_sample_map(cidc_shipment.metadata_patch)
    csms_sample_map = _get_csms_sample_map(trial_id, manifest_id, csms_samples)

    # raises RuntimeErrors if something is amiss
    _cross_validate_samples(
        trial_id=trial_id,
        manifest_id=manifest_id,
        cidc_sample_map=cidc_sample_map,
        csms_sample_map=csms_sample_map,
        session=session,
    )

    # copy assay details derived from the samples onto the manifest itself,
    # but only when they were actually found
    csms_assay_priority, csms_assay_type = _extract_details_from_trial(csms_samples)
    if csms_assay_priority:
        csms_manifest["assay_priority"] = csms_assay_priority
    if csms_assay_type:
        csms_manifest["assay_type"] = csms_assay_type

    return trial_id, manifest_id, csms_sample_map, cidc_sample_map, cidc_shipment
723
-
724
-
725
def _handle_shipment_differences(
    manifest_id: str,
    csms_manifest: Dict[str, Any],
    cidc_uploadjob: Optional[UploadJobs],
) -> Optional[Change]:
    """
    Compare the given CSMS and CIDC shipments.

    Returns the Change if any field differs, otherwise None.

    BUG FIX: `cidc_uploadjob` is Optional and `cidc_manifest` already handled
    the None case, but `cidc_uploadjob.trial_id` was dereferenced
    unconditionally — raising AttributeError whenever None was passed.
    """
    if cidc_uploadjob is None:
        cidc_manifest: Dict[str, Any] = {}
        cidc_trial_id = None
    else:
        cidc_manifest = cidc_uploadjob.metadata_patch["shipments"][0]
        cidc_trial_id = cidc_uploadjob.trial_id

    change: Change = _calc_difference(
        entity_type="shipment",
        trial_id=cidc_trial_id,
        manifest_id=manifest_id,
        cidc=cidc_manifest,
        csms=csms_manifest,
        # default ignore
    )
    if change:
        return change

    return None
744
-
745
-
746
def _handle_sample_differences(
    trial_id: str,
    manifest_id: str,
    csms_sample_map: Dict[str, Dict[str, Any]],
    cidc_sample_map: Dict[str, Dict[str, Any]],
    ret: List[Change],
) -> List[Change]:
    """
    Compare the given CSMS and CIDC participants and samples

    Unlike _handle_shipment_differences and _handle_upload_differences,
    directly takes the return for detect_manifest_changes() and updates it
    before returning.
    No changes are made if no differences are found.
    """
    # one Change per sample; only truthy ones (actual differences) are kept
    per_sample_diffs = (
        _calc_difference(
            entity_type="sample",
            trial_id=trial_id,
            manifest_id=manifest_id,
            cidc=cidc_sample_map[cimac_id],
            csms=csms_sample,
            # default ignore
        )
        for cimac_id, csms_sample in csms_sample_map.items()
    )
    ret.extend(diff for diff in per_sample_diffs if diff)
    return ret
774
-
775
-
776
def _handle_upload_differences(
    trial_id, manifest_id, csms_sample_map, uploader_email, cidc_uploadjob: UploadJobs
) -> Optional[Change]:
    """Look for the CIDC upload for the given manifest for changes, returning None if no changes or the changes."""
    # a pseudo-UploadJobs representing what the upload would look like if
    # created fresh from the CSMS samples, for field-by-field comparison
    candidate = UploadJobs(
        trial_id=trial_id,
        _status=UploadJobStatus.MERGE_COMPLETED.value,
        multifile=False,
        upload_type=_get_upload_type(csms_sample_map.values()),
        uploader_email=uploader_email,
        metadata_patch={},
    )
    existing = {} if cidc_uploadjob is None else cidc_uploadjob.to_dict()
    diff: Change = _calc_difference(
        "upload",
        trial_id,
        manifest_id,
        existing,
        candidate.to_dict(),
        # bookkeeping fields that legitimately differ between the two
        ignore=[
            "_created",
            "_etag",
            "id",
            "metadata_patch",
            "token",
            "_updated",
            "uploader_email",
        ],
    )
    return diff if diff else None
808
-
809
-
810
@with_default_session
def detect_manifest_changes(csms_manifest: Dict[str, Any], uploader_email: str, *, session: Session) -> List[Change]:
    """
    Given a CSMS-style manifest, see if it has any differences from the current state of the db
    If a new manifest, throws a NewManifestError
    If critical fields are different, throws an error to be handled later by a human
    Returns a list of the changes themselves

    Returns
    -------
    List[Change]
        the changes that were detected

    Raises
    ------
    NewManifestError
        if the manifest_id doesn't correspond to anything in CIDC
    RuntimeError
        if the connections between any critical fields is changed
        namely trial_id, manifest_id, cimac_id
    """
    # excluded manifests are never considered for changes
    if _get_and_check(
        obj=csms_manifest,
        key="excluded",
        default=False,
        msg="not called",
        check=lambda _: True,
    ):
        return []

    # ----- Initial validation, raises RuntimeError if issues -----
    # will raise NewManifestError if manifest_id not in Shipment table
    (
        trial_id,
        manifest_id,
        csms_sample_map,
        cidc_sample_map,
        cidc_uploadjob,
    ) = _initial_manifest_validation(csms_manifest, session=session)

    detected: List[Change] = []

    # ----- Look for shipment-level differences -----
    shipment_change: Optional[Change] = _handle_shipment_differences(manifest_id, csms_manifest, cidc_uploadjob)
    if shipment_change:
        detected.append(shipment_change)

    # ----- Look for sample-level differences -----
    detected = _handle_sample_differences(trial_id, manifest_id, csms_sample_map, cidc_sample_map, detected)

    # ----- Look for differences in the Upload -----
    upload_change: Optional[Change] = _handle_upload_differences(
        trial_id,
        manifest_id,
        csms_sample_map,
        uploader_email,
        cidc_uploadjob,
    )
    if upload_change:
        detected.append(upload_change)

    # ----- Finish up and return -----
    return detected