PyPI - nci-cidc-api-modules - Versions diffs - 1.2.34__py3-none-any.whl → 1.2.53__py3-none-any.whl - Mend

nci-cidc-api-modules 1.2.34__py3-none-any.whl → 1.2.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (151) hide show

boot.py +14 -0
cidc_api/__init__.py +1 -0
cidc_api/config/db.py +21 -1
cidc_api/config/settings.py +5 -10
cidc_api/models/__init__.py +0 -2
cidc_api/models/data.py +15 -6
cidc_api/models/db/stage1/__init__.py +56 -0
cidc_api/models/db/stage1/additional_treatment_orm.py +22 -0
cidc_api/models/db/stage1/adverse_event_orm.py +46 -0
cidc_api/models/db/stage1/base_orm.py +7 -0
cidc_api/models/db/stage1/baseline_clinical_assessment_orm.py +22 -0
cidc_api/models/db/stage1/comorbidity_orm.py +23 -0
cidc_api/models/db/stage1/consent_group_orm.py +32 -0
cidc_api/models/db/stage1/demographic_orm.py +47 -0
cidc_api/models/db/stage1/disease_orm.py +52 -0
cidc_api/models/db/stage1/exposure_orm.py +22 -0
cidc_api/models/db/stage1/gvhd_diagnosis_acute_orm.py +34 -0
cidc_api/models/db/stage1/gvhd_diagnosis_chronic_orm.py +36 -0
cidc_api/models/db/stage1/gvhd_organ_acute_orm.py +21 -0
cidc_api/models/db/stage1/gvhd_organ_chronic_orm.py +21 -0
cidc_api/models/db/stage1/medical_history_orm.py +30 -0
cidc_api/models/db/stage1/other_malignancy_orm.py +29 -0
cidc_api/models/db/stage1/participant_orm.py +77 -0
cidc_api/models/db/stage1/prior_treatment_orm.py +29 -0
cidc_api/models/db/stage1/radiotherapy_dose_orm.py +39 -0
cidc_api/models/db/stage1/response_by_system_orm.py +30 -0
cidc_api/models/db/stage1/response_orm.py +28 -0
cidc_api/models/db/stage1/specimen_orm.py +46 -0
cidc_api/models/db/stage1/stem_cell_transplant_orm.py +25 -0
cidc_api/models/db/stage1/surgery_orm.py +27 -0
cidc_api/models/db/stage1/therapy_agent_dose_orm.py +31 -0
cidc_api/models/db/stage1/treatment_orm.py +38 -0
cidc_api/models/db/stage1/trial_orm.py +35 -0
cidc_api/models/db/stage2/additional_treatment_orm.py +6 -7
cidc_api/models/db/stage2/administrative_person_orm.py +4 -4
cidc_api/models/db/stage2/administrative_role_assignment_orm.py +4 -4
cidc_api/models/db/stage2/adverse_event_orm.py +11 -13
cidc_api/models/db/stage2/arm_orm.py +3 -3
cidc_api/models/db/stage2/base_orm.py +7 -0
cidc_api/models/db/stage2/baseline_clinical_assessment_orm.py +5 -7
cidc_api/models/db/stage2/cohort_orm.py +3 -3
cidc_api/models/db/stage2/comorbidity_orm.py +6 -8
cidc_api/models/db/stage2/consent_group_orm.py +4 -4
cidc_api/models/db/stage2/contact_orm.py +16 -20
cidc_api/models/db/stage2/demographic_orm.py +3 -3
cidc_api/models/db/stage2/disease_orm.py +4 -4
cidc_api/models/db/stage2/exposure_orm.py +3 -3
cidc_api/models/db/stage2/file_orm.py +6 -9
cidc_api/models/db/stage2/gvhd_diagnosis_acute_orm.py +4 -4
cidc_api/models/db/stage2/gvhd_diagnosis_chronic_orm.py +4 -6
cidc_api/models/db/stage2/gvhd_organ_acute_orm.py +3 -3
cidc_api/models/db/stage2/gvhd_organ_chronic_orm.py +3 -3
cidc_api/models/db/stage2/institution_orm.py +7 -7
cidc_api/models/db/stage2/medical_history_orm.py +9 -9
cidc_api/models/db/stage2/other_clinical_endpoint_orm.py +8 -12
cidc_api/models/db/stage2/other_malignancy_orm.py +8 -10
cidc_api/models/db/stage2/participant_orm.py +23 -24
cidc_api/models/db/stage2/prior_treatment_orm.py +12 -13
cidc_api/models/db/stage2/publication_orm.py +9 -11
cidc_api/models/db/stage2/radiotherapy_dose_orm.py +8 -9
cidc_api/models/db/stage2/response_by_system_orm.py +3 -3
cidc_api/models/db/stage2/response_orm.py +3 -3
cidc_api/models/db/stage2/shipment_orm.py +17 -17
cidc_api/models/db/stage2/shipment_specimen_orm.py +4 -4
cidc_api/models/db/stage2/specimen_orm.py +7 -6
cidc_api/models/db/stage2/stem_cell_transplant_orm.py +6 -7
cidc_api/models/db/stage2/surgery_orm.py +6 -7
cidc_api/models/db/stage2/therapy_agent_dose_orm.py +7 -8
cidc_api/models/db/stage2/treatment_orm.py +15 -15
cidc_api/models/db/stage2/trial_orm.py +15 -17
cidc_api/models/errors.py +7 -0
cidc_api/models/files/facets.py +4 -0
cidc_api/models/models.py +167 -11
cidc_api/models/pydantic/base.py +109 -0
cidc_api/models/pydantic/stage1/__init__.py +56 -0
cidc_api/models/pydantic/stage1/additional_treatment.py +23 -0
cidc_api/models/pydantic/stage1/adverse_event.py +127 -0
cidc_api/models/pydantic/stage1/baseline_clinical_assessment.py +23 -0
cidc_api/models/pydantic/stage1/comorbidity.py +43 -0
cidc_api/models/pydantic/stage1/consent_group.py +30 -0
cidc_api/models/pydantic/stage1/demographic.py +140 -0
cidc_api/models/pydantic/stage1/disease.py +200 -0
cidc_api/models/pydantic/stage1/exposure.py +38 -0
cidc_api/models/pydantic/stage1/gvhd_diagnosis_acute.py +33 -0
cidc_api/models/pydantic/stage1/gvhd_diagnosis_chronic.py +32 -0
cidc_api/models/pydantic/stage1/gvhd_organ_acute.py +22 -0
cidc_api/models/pydantic/stage1/gvhd_organ_chronic.py +23 -0
cidc_api/models/pydantic/stage1/medical_history.py +43 -0
cidc_api/models/pydantic/stage1/other_malignancy.py +55 -0
cidc_api/models/pydantic/stage1/participant.py +63 -0
cidc_api/models/pydantic/stage1/prior_treatment.py +45 -0
cidc_api/models/pydantic/stage1/radiotherapy_dose.py +92 -0
cidc_api/models/pydantic/stage1/response.py +84 -0
cidc_api/models/pydantic/stage1/response_by_system.py +220 -0
cidc_api/models/pydantic/stage1/specimen.py +31 -0
cidc_api/models/pydantic/stage1/stem_cell_transplant.py +35 -0
cidc_api/models/pydantic/stage1/surgery.py +57 -0
cidc_api/models/pydantic/stage1/therapy_agent_dose.py +80 -0
cidc_api/models/pydantic/stage1/treatment.py +64 -0
cidc_api/models/pydantic/stage1/trial.py +45 -0
cidc_api/models/pydantic/stage2/additional_treatment.py +2 -4
cidc_api/models/pydantic/stage2/administrative_person.py +1 -1
cidc_api/models/pydantic/stage2/administrative_role_assignment.py +2 -2
cidc_api/models/pydantic/stage2/adverse_event.py +1 -1
cidc_api/models/pydantic/stage2/arm.py +2 -2
cidc_api/models/pydantic/stage2/baseline_clinical_assessment.py +1 -1
cidc_api/models/pydantic/stage2/cohort.py +1 -1
cidc_api/models/pydantic/stage2/comorbidity.py +1 -1
cidc_api/models/pydantic/stage2/consent_group.py +2 -2
cidc_api/models/pydantic/stage2/contact.py +1 -1
cidc_api/models/pydantic/stage2/demographic.py +1 -1
cidc_api/models/pydantic/stage2/disease.py +1 -1
cidc_api/models/pydantic/stage2/exposure.py +1 -1
cidc_api/models/pydantic/stage2/file.py +2 -2
cidc_api/models/pydantic/stage2/gvhd_diagnosis_acute.py +1 -1
cidc_api/models/pydantic/stage2/gvhd_diagnosis_chronic.py +1 -1
cidc_api/models/pydantic/stage2/gvhd_organ_acute.py +1 -1
cidc_api/models/pydantic/stage2/gvhd_organ_chronic.py +1 -1
cidc_api/models/pydantic/stage2/institution.py +1 -1
cidc_api/models/pydantic/stage2/medical_history.py +1 -1
cidc_api/models/pydantic/stage2/other_clinical_endpoint.py +1 -1
cidc_api/models/pydantic/stage2/other_malignancy.py +1 -1
cidc_api/models/pydantic/stage2/participant.py +6 -3
cidc_api/models/pydantic/stage2/prior_treatment.py +6 -15
cidc_api/models/pydantic/stage2/publication.py +2 -2
cidc_api/models/pydantic/stage2/radiotherapy_dose.py +1 -1
cidc_api/models/pydantic/stage2/response.py +2 -2
cidc_api/models/pydantic/stage2/response_by_system.py +1 -1
cidc_api/models/pydantic/stage2/shipment.py +2 -2
cidc_api/models/pydantic/stage2/shipment_specimen.py +1 -1
cidc_api/models/pydantic/stage2/specimen.py +6 -3
cidc_api/models/pydantic/stage2/stem_cell_transplant.py +2 -2
cidc_api/models/pydantic/stage2/surgery.py +1 -1
cidc_api/models/pydantic/stage2/therapy_agent_dose.py +1 -1
cidc_api/models/pydantic/stage2/treatment.py +1 -1
cidc_api/models/pydantic/stage2/trial.py +8 -10
cidc_api/models/types.py +30 -16
cidc_api/shared/assay_handling.py +68 -0
cidc_api/shared/auth.py +5 -5
cidc_api/shared/file_handling.py +18 -4
cidc_api/shared/gcloud_client.py +96 -16
cidc_api/shared/utils.py +18 -9
cidc_api/telemetry.py +101 -0
{nci_cidc_api_modules-1.2.34.dist-info → nci_cidc_api_modules-1.2.53.dist-info}/METADATA +25 -15
nci_cidc_api_modules-1.2.53.dist-info/RECORD +167 -0
{nci_cidc_api_modules-1.2.34.dist-info → nci_cidc_api_modules-1.2.53.dist-info}/WHEEL +1 -1
{nci_cidc_api_modules-1.2.34.dist-info → nci_cidc_api_modules-1.2.53.dist-info}/top_level.txt +1 -0
cidc_api/models/db/base_orm.py +0 -25
cidc_api/models/pydantic/stage2/base.py +0 -48
nci_cidc_api_modules-1.2.34.dist-info/RECORD +0 -109
{nci_cidc_api_modules-1.2.34.dist-info → nci_cidc_api_modules-1.2.53.dist-info}/licenses/LICENSE +0 -0

cidc_api/models/pydantic/stage2/trial.py CHANGED Viewed

@@ -2,8 +2,8 @@ from datetime import datetime
 from pydantic import BeforeValidator
 from typing import List, Annotated
-from .base import Base
-from cidc_api.models.types import TrialOrganization, TrialFundingAgency, AssayType, AgeGroup
+from cidc_api.models.pydantic.base import Base
+from cidc_api.models.types import TrialOrganization, TrialFundingAgency, AssayType, AgeGroup, PrimaryPurposeType
 class Trial(Base):
@@ -12,7 +12,7 @@ class Trial(Base):
     # The unique identifier for the clinical trial. e.g. "GU16-287","BACCI"
     # CDE: https://cadsr.cancer.gov/onedata/dmdirect/NIH/NCI/CO/CDEDD?filter=CDEDD.ITEM_ID=5054234%20and%20ver_nr=1
-    trial_id: str | None = None  # TODO: Fix stage2 trial_id to not be nullable, once stage 1 models are complete
+    trial_id: str | None = None
     # The version number of the trial dataset. e.g. "1.0"
     version: str | None = None
@@ -69,6 +69,10 @@ class Trial(Base):
     # CDE: https://cadsr.cancer.gov/onedata/dmdirect/NIH/NCI/CO/CDEDD?filter=CDEDD.ITEM_ID=16333703%20and%20ver_nr=1
     dates_of_conduct_end: datetime | None = None
+    # A classification of the study based upon the primary intent of the study's activities.
+    # CDE: https://cadsr.cancer.gov/onedata/dmdirect/NIH/NCI/CO/CDEDD?filter=CDEDD.ITEM_ID=11160683%20and%20ver_nr=1
+    primary_purpose_type: PrimaryPurposeType
     # The image of the trial data schema
     schema_file_id: int | None = None
@@ -81,11 +85,5 @@ class Trial(Base):
     # The list of assays that CIDC expects to receive for this trial.
     expected_assays: List[AssayType] = []
-    # Is the cancer studying a liquid tumor type?
-    is_liquid_tumor_trial: bool = False
     # The dbgap study accession number associated with the trial.
-    dbgap_study_accession: str | None = None
-    # The internal version identifier for this specific trial dataset.
-    version: str
+    dbgap_study_accession: str

cidc_api/models/types.py CHANGED Viewed

@@ -26,6 +26,7 @@ AgeGroup = Literal[
     "Pediatric",
 ]
 TrialOrganization = Literal[
     "ECOG-ACRIN",
     "SWOG",
@@ -59,6 +60,34 @@ TrialFundingAgency = Literal[
 ]
+PrimaryPurposeType = Literal[
+    "Adverse Effect Mitigation Study",
+    "Ancillary Study",
+    "Basic Science  Research ",
+    "Correlative Study",
+    "Cure Study",
+    "Device Feasibility Study",
+    "Diagnosis Study",
+    "Disease Modifying Treatment Study",
+    "Early Detection Study",
+    "Education Training Clinical Study",
+    "Epidemiology  Research ",
+    "Genomics Research",
+    "Health Services Research",
+    "Imaging Research",
+    "Interventional Study",
+    "Observational Study",
+    "Outcomes Research",
+    "Prevention Study",
+    "Proteomic Research",
+    "Rehabilitation Clinical Study ",
+    "Screening Study",
+    "Supportive Care Study",
+    "Transcriptomics Research",
+    "Treatment Study",
+]
 AssayType = Literal[
     "Olink",
     "WES",
@@ -84,6 +113,7 @@ AssayType = Literal[
     "snRNA-Seq",
     "Visium",
     "Olink HT",
+    "TCRseq RNA",
 ]
@@ -285,7 +315,6 @@ CancerStageAJCC = Literal[
 CancerStageFIGO = Literal[
-    "value",
     "Stage I",
     "Stage IA",
     "Stage IA1",
@@ -1016,21 +1045,6 @@ GVHDDiagnosisChronicGlobalSeverity = Literal["Mild", "Moderate", "Severe"]
 GVHDOrganChronicScore = Literal["0", "1", "2", "3"]
-PriorTreatmentType = Literal[
-    "Surgery",
-    "Radiotherapy",
-    "Immunotherapy",
-    "Chemotherapy",
-    "Targeted therapy",
-    "Other therapy",
-    "Radiopharmaceutical",
-    "Stem cell transplant",
-    "Immunosuppressive therapy/GVHD prophylaxis for transplant",
-    "Conditioning therapy",
-    "Post-transplant salvage therapy",
-]
 ConditioningRegimenType = Literal["Myeloablative", "Reduced-intensity", "Non-myeloablative", "Other"]
 StemCellDonorType = Literal["Autologous", "Allogeneic"]

cidc_api/shared/assay_handling.py ADDED Viewed

@@ -0,0 +1,68 @@
+from datetime import datetime
+from urllib.parse import quote
+from werkzeug.exceptions import BadRequest
+from cidc_api.models import IngestionJobs
+from . import gcloud_client
+from ..shared.auth import get_current_user
+JOB_TYPE_ASSAY = "assay"
+JOB_TYPE_CLINICAL = "clinical"
+ALLOWED_JOB_TYPES = {JOB_TYPE_CLINICAL, JOB_TYPE_ASSAY}
+def resolve_job_type_and_assay_fields(data: dict) -> tuple[str, str | None, str | None]:
+    """Decide job_type and gather assay_type/batch_id from request JSON."""
+    assay_type = data.get("assay_type")
+    # An explicit job_type wins; otherwise infer "assay" when assay_type is present, else "clinical".
+    job_type = data.get("job_type") or (JOB_TYPE_ASSAY if assay_type else JOB_TYPE_CLINICAL)
+    if job_type not in ALLOWED_JOB_TYPES:
+        raise BadRequest("Invalid job_type. Allowed values are 'clinical' or 'assay'.")
+    if job_type == JOB_TYPE_ASSAY and (not assay_type or not isinstance(assay_type, str)):
+        raise BadRequest("assay_type must be provided for job_type='assay'.")
+    assay_type = assay_type.strip() if assay_type else None
+    batch_id = data.get("batch_id").strip() if isinstance(data.get("batch_id"), str) else None
+    return job_type, assay_type, batch_id
+def prepare_assay_job(trial_id: str, assay_type: str, batch_id: str) -> tuple[str, str, str, datetime, int, str]:
+    """
+    Validate assay job uniqueness and generate submission_id, start_date, version, and the trial’s GCS intake path.
+    """
+    if not assay_type:
+        raise BadRequest("assay_type must be provided for job_type='assay'.")
+    # Enforce uniqueness of (trial_id, assay_type, batch_id) when batch_id is present.
+    if batch_id:
+        existing_job = IngestionJobs.get_unique_assay_job(trial_id, assay_type, batch_id)
+        if existing_job:
+            raise BadRequest(
+                f"Assay job {existing_job.id} already exists for this exact trial_id/assay_type/batch_id combination."
+            )
+    submission_id = IngestionJobs.next_assay_submission_id(trial_id, assay_type)
+    job_status = "INITIAL SUBMISSION"
+    error_status = "Upload Incomplete"  # job starts with 'Incomplete' notifier
+    start_date = datetime.now()
+    version = 1
+    # Create or retrieve intake bucket corresponding to the trial
+    intake_bucket = gcloud_client.create_intake_bucket(get_current_user().email, trial_id=trial_id)
+    gcs_path = f"{intake_bucket.name}/{assay_type}/{submission_id}"
+    return submission_id, job_status, error_status, start_date, version, gcs_path
+def get_google_links(intake_path: str) -> tuple[str, str]:
+    """Build the GCS URI and GCS Console URL corresponding to the intake path."""
+    gcs_uri = f"gs://{intake_path}"
+    # Encode path to ensure link opens correctly
+    encoded_path = quote(intake_path)
+    console_url = f"https://console.cloud.google.com/storage/browser/{encoded_path}"
+    return gcs_uri, console_url

cidc_api/shared/auth.py CHANGED Viewed

@@ -1,16 +1,14 @@
 from functools import wraps
 from typing import List
-from packaging import version
 from flask import g, request, current_app as app, Flask
+from packaging import version
 from werkzeug.exceptions import Unauthorized, BadRequest, PreconditionFailed
-from ..models import Users, UserSchema
 from ..config.logging import get_logger
+from ..models import Users, UserSchema
 from ..shared.jose import decode_id_token
+from ..telemetry import trace_
 logger = get_logger(__name__)
@@ -144,6 +142,7 @@ def get_current_user() -> Users:
 _user_schema = UserSchema()
+@trace_()
 def authenticate() -> Users:
     id_token = _extract_token()
     token_payload = decode_id_token(id_token)
@@ -172,6 +171,7 @@ def _extract_token() -> str:
 ### Authorization logic ###
+@trace_()
 def authorize(user: Users, allowed_roles: List[str], resource: str, method: str) -> bool:
     """Check if the current user is authorized to act on the current request's resource.
     Raises Unauthorized

cidc_api/shared/file_handling.py CHANGED Viewed

@@ -10,22 +10,35 @@ from ..config.settings import GOOGLE_CLINICAL_DATA_BUCKET
 from ..models import PreprocessedFiles, TRIAL_APPENDIX_A_CELL_THAT_ENDS_THE_HEADER
 from ..shared.auth import get_current_user
 from ..shared.gcloud_client import upload_file_to_gcs, move_gcs_file
+from ..telemetry import trace_
 logger = get_logger(__name__)
+MASTER_APPENDIX_A_VERSION_PREFIX = "Master Appendix A Version:"
+@trace_()
 def set_current_file(
-    file: FileStorage, file_category: str, gcs_folder: str, session: Session, uploader_email: str, job_id: int = None
+    file: FileStorage,
+    file_category: str,
+    gcs_folder: str,
+    session: Session,
+    uploader_email: str,
+    job_id: int = None,
+    append_timestamp: bool = None,
 ) -> PreprocessedFiles:
     """
     Archives any existing 'current' files for the given category and job,
     then uploads the new file as the latest 'current' version.
     """
     latest_version = PreprocessedFiles.archive_current_files(file_category, job_id=job_id, session=session)
-    latest_file = create_file(file, gcs_folder, file_category, session, uploader_email, job_id, latest_version + 1)
+    latest_file = create_file(
+        file, gcs_folder, file_category, session, uploader_email, job_id, latest_version + 1, append_timestamp
+    )
     return latest_file
+@trace_()
 def create_file(
     file: FileStorage,
     gcs_folder: str,
@@ -34,11 +47,12 @@ def create_file(
     uploader_email: str,
     job_id: int = None,
     version: int = None,
+    append_timestamp: bool = None,
 ) -> PreprocessedFiles:
     """Upload file to GCS and create corresponding metadata record in the database."""
     status = "pending" if gcs_folder.endswith("pending/") else "current"
-    # only need timestamp for current/versioned files
-    append_timestamp = status == "current"
+    # only need timestamp for current/versioned files, if not specified otherwise
+    append_timestamp = append_timestamp if append_timestamp is not None else (status == "current")
     # create file in GCS
     gcs_file_path = upload_file_to_gcs(file, GOOGLE_CLINICAL_DATA_BUCKET, gcs_folder, append_timestamp=append_timestamp)
     # create corresponding record in db

cidc_api/shared/gcloud_client.py CHANGED Viewed

@@ -1,13 +1,15 @@
 """Utilities for interacting with the Google Cloud Platform APIs."""
-# pylint: disable=logging-fstring-interpolation,too-many-lines
+# pylint: disable=logging-fstring-interpolation,too-many-lines, broad-exception-raised
+import asyncio
 import base64
 import datetime
 import hashlib
 import io
 import json
 import os
+import re
 import warnings
 from collections import namedtuple
 from concurrent.futures import Future
@@ -25,6 +27,8 @@ from typing import (
 )
 import googleapiclient.discovery
+from gcloud.aio.storage import Storage
+from pandas.core.frame import DataFrame
 import pandas as pd
 import requests
 from cidc_schemas.prism.constants import ASSAY_TO_FILEPATH
@@ -56,6 +60,8 @@ from ..config.settings import (
     GOOGLE_GRANT_DOWNLOAD_PERMISSIONS_TOPIC,
     GOOGLE_HL_CLINICAL_VALIDATION_TOPIC,
     GOOGLE_DL_CLINICAL_VALIDATION_TOPIC,
+    GOOGLE_ASSAY_METADATA_VALIDATION_TOPIC,
+    GOOGLE_CLINICAL_DATA_INGESTION_PROCESSING_TOPIC,
     TESTING,
     ENV,
     IS_EMAIL_ON,
@@ -361,15 +367,34 @@ def get_intake_bucket_name(user_email: str) -> str:
     return bucket_name
-def create_intake_bucket(user_email: str) -> storage.Bucket:
+def get_trial_intake_bucket_name(trial_id: str) -> str:
     """
-    Create a new data intake bucket for this user, or get the existing one.
+    Return a sanitized GCS bucket name for a given trial_id.
+    Produces:  <GOOGLE_INTAKE_BUCKET>-<sanitized_trial_id>
+    where the trial_id segment is lowercased and restricted to [a-z0-9-].
+    """
+    # Replace non-allowed bucket chars with "-"
+    sanitized_id = re.sub(r"[^a-z0-9-]", "-", trial_id.lower())
+    # Collapse repeated "-" and trim from both ends
+    sanitized_id = re.sub(r"-+", "-", sanitized_id).strip("-")
+    return f"{GOOGLE_INTAKE_BUCKET}-{sanitized_id}"
+def create_intake_bucket(user_email: str, trial_id: str = None) -> storage.Bucket:
+    """
+    Create (or retrieve) the appropriate data intake bucket.
+    If a trial_id is provided, a trial-specific bucket is used;
+    otherwise a user-specific intake bucket is used.
     Grant the user GCS object admin permissions on the bucket, or refresh those
     permissions if they've already been granted.
     Created with uniform bucket-level IAM access, so expiring permission.
     """
     storage_client = _get_storage_client()
-    bucket_name = get_intake_bucket_name(user_email)
+    # Get trial-specific bucket name if trial_id is given, otherwise a user-specific bucket name.
+    bucket_name = get_trial_intake_bucket_name(trial_id) if trial_id else get_intake_bucket_name(user_email)
     bucket = storage_client.bucket(bucket_name)
     if not bucket.exists():
@@ -423,25 +448,61 @@ def upload_xlsx_to_intake_bucket(user_email: str, trial_id: str, upload_type: st
     return f"https://console.cloud.google.com/storage/browser/_details/{bucket_name}/{blob_name}"
-def gcs_xlsx_or_csv_file_to_pandas_dataframe(bucket_name: str, blob_name: str):
-    """Reads an XLSX or CSV file from Google Cloud Storage into a Pandas DataFrame."""
-    temp_file = get_file_bytes_from_gcs(bucket_name, blob_name)
-    # TODO: specify sheet in xlsx file and/or accept tsv and xls files
-    if blob_name[-3:] == "csv":
-        return strip_whitespaces(pd.read_csv(temp_file))
-    elif blob_name[-4:] == "xlsx":
-        return strip_whitespaces(pd.read_excel(temp_file))
+def prepare_dataframe(extension, bytes) -> DataFrame:
+    if extension == "csv":
+        return strip_whitespaces(pd.read_csv(bytes, dtype=str, keep_default_na=False))
+    elif extension == "xlsx":
+        return strip_whitespaces(pd.read_excel(bytes, dtype=str, keep_default_na=False))
     else:
         raise Exception("Can only read csv or xlsx files")
+def gcs_xlsx_or_csv_file_to_pandas_dataframe(bucket_name: str, blob_name: str) -> DataFrame:
+    """Reads an XLSX or CSV file from Google Cloud Storage into a Pandas DataFrame."""
+    contents = get_file_bytes_from_gcs(bucket_name, blob_name)
+    extension = blob_name.split(".")[-1]
+    return prepare_dataframe(extension, contents)
 def get_file_bytes_from_gcs(bucket_name: str, blob_name: str) -> io.BytesIO:
     """Reads a file from Google Cloud Storage and returns it as BytesIO."""
     sheet_data = storage.Client().bucket(bucket_name).blob(blob_name).download_as_bytes()
     return io.BytesIO(sheet_data)
+async def async_gcs_files_to_pandas_dataframes(bucket_name: str, blob_names: List[str]) -> List[DataFrame]:
+    """Async reads a XLSX or CSV files from Google Cloud Storage into a list of Pandas DataFrames."""
+    all_contents = await asyncio.gather(
+        *[async_get_file_bytes_from_gcs(bucket_name, blob_name) for blob_name in blob_names]
+    )
+    dataframes = []
+    for blob_name, contents in zip(blob_names, all_contents):
+        extension = blob_name.split(".")[-1]
+        try:
+            dataframes.append(prepare_dataframe(extension, contents))
+        except pd.errors.EmptyDataError:
+            logger.warning(f"The dataframe retrieved from {blob_name} was empty!")
+    return dataframes
+async def async_gcs_files_to_bytes(bucket_name: str, blob_names: List[str]) -> List[DataFrame]:
+    """Async reads a XLSX or CSV files from Google Cloud Storage into a list of raw bytes"""
+    all_contents = await asyncio.gather(
+        *[async_get_file_bytes_from_gcs(bucket_name, blob_name) for blob_name in blob_names]
+    )
+    return all_contents
+async def async_get_file_bytes_from_gcs(bucket_name: str, blob_name: str) -> io.BytesIO:
+    """Async reads a file from Google Cloud Storage and returns it as BytesIO."""
+    async with Storage() as client:
+        sheet_data = await client.download(bucket_name, blob_name)
+    return io.BytesIO(sheet_data)
 def _execute_multiblob_acl_change(
     user_email_list: List[str],
     blob_list: List[storage.Blob],
@@ -614,6 +675,7 @@ def _build_trial_upload_prefixes(
     trial_set: Set[str] = set()
     upload_set: Set[str] = set()
     if not trial_id:
+        # import is here because of circular import
         from ..models.models import TrialMetadata
         trial_set = {str(t.trial_id) for t in session.query(TrialMetadata).add_columns(TrialMetadata.trial_id)}
@@ -886,6 +948,7 @@ def get_signed_url(
     bucket_name: str = GOOGLE_ACL_DATA_BUCKET,
     method: str = "GET",
     expiry_mins: int = 30,
+    use_short_filename: bool = False,
 ) -> str:
     """
     Generate a signed URL for `object_name` to give a client temporary access.
@@ -900,7 +963,11 @@ def get_signed_url(
     # Generate the signed URL, allowing a client to use `method` for `expiry_mins` minutes
     expiration = datetime.timedelta(minutes=expiry_mins)
-    full_filename = object_name.replace("/", "_").replace('"', "_").replace(" ", "_")
+    if use_short_filename:
+        filename = os.path.basename(object_name)
+    else:
+        # full filename with path included
+        filename = object_name.replace("/", "_").replace('"', "_").replace(" ", "_")
     other_kwargs = {}
     if os.environ.get("DEV_GOOGLE_STORAGE", None):
         other_kwargs["api_access_endpoint"] = (os.environ.get("DEV_GOOGLE_STORAGE") or "") + (
@@ -910,7 +977,7 @@ def get_signed_url(
         version="v2",
         expiration=expiration,
         method=method,
-        response_disposition=f'attachment; filename="{full_filename}"',
+        response_disposition=f'attachment; filename="{filename}"',
         **other_kwargs,
     )
     logger.info(f"generated signed URL for {object_name}: {url}")
@@ -920,7 +987,8 @@ def get_signed_url(
 def _encode_and_publish(content: str, topic: str) -> Future:
     """Convert `content` to bytes and publish it to `topic`."""
-    pubsub_publisher = pubsub.PublisherClient()
+    publisher_options = pubsub.types.PublisherOptions(enable_open_telemetry_tracing=ENV == "dev-int")
+    pubsub_publisher = pubsub.PublisherClient(publisher_options=publisher_options)
     topic = pubsub_publisher.topic_path(GOOGLE_CLOUD_PROJECT, topic)
     data = bytes(content, "utf-8")
@@ -994,6 +1062,18 @@ def publish_detailed_validation(job_id: int) -> None:
     _report = _encode_and_publish(str(job_id), GOOGLE_DL_CLINICAL_VALIDATION_TOPIC)
+def publish_assay_metadata_validation(job_id: int) -> None:
+    """Publish to the assay_metadata_validation topic that a job's assay metadata file is ready to be validated."""
+    # Start validation asynchronously
+    _report = _encode_and_publish(str(job_id), GOOGLE_ASSAY_METADATA_VALIDATION_TOPIC)
+def publish_clinical_data_ingestion(job_id: int) -> None:
+    """Start ingestion of clinical data job"""
+    # Start asynchronously
+    _report = _encode_and_publish(str(job_id), GOOGLE_CLINICAL_DATA_INGESTION_PROCESSING_TOPIC)
 def send_email(to_emails: List[str], subject: str, html_content: str, **kw) -> None:
     """
     Publish an email-to-send to the emails topic.

cidc_api/shared/utils.py CHANGED Viewed

@@ -1,11 +1,20 @@
-def strip_whitespaces(df):
-    def stripper(x):
-        if x and isinstance(x, str):
-            return x.strip()
-        else:
-            return x
-    df.rename(columns=stripper, inplace=True)
-    df = df.map(stripper)
+from cidc_api.telemetry import trace_
+def _stripper(x):
+    if x and isinstance(x, str):
+        return x.strip()
+    else:
+        return x
+@trace_("sheet")
+def strip_whitespaces(df, sheet=None):
+    if sheet:
+        df = df[sheet]
+    df.rename(columns=_stripper, inplace=True)
+    df = df.map(_stripper)
+    df.replace("", None, inplace=True)
     return df

cidc_api/telemetry.py ADDED Viewed

@@ -0,0 +1,101 @@
+# standard modules
+from functools import wraps
+# external modules
+from opentelemetry import trace
+from opentelemetry.sdk.resources import Resource
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+# local modules
+from .config.settings import ENV, TESTING
+# pylint: disable=import-outside-toplevel
+def instrument_flask(app):
+    from opentelemetry.instrumentation.flask import FlaskInstrumentor
+    from opentelemetry.propagate import set_global_textmap
+    from opentelemetry.propagators.cloud_trace_propagator import CloudTraceFormatPropagator
+    FlaskInstrumentor().instrument_app(app)
+    # use the X-Cloud-Trace-Context header
+    set_global_textmap(CloudTraceFormatPropagator())
+def instrument_requests():
+    from opentelemetry.instrumentation.requests import RequestsInstrumentor
+    def _request_hook(span, request_obj):
+        span.update_name(f"requests {request_obj.method}")
+    RequestsInstrumentor().instrument(request_hook=_request_hook)
+def instrument_sqlachemy(engine):
+    from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
+    SQLAlchemyInstrumentor().instrument(engine=engine)
+resource = Resource(attributes={"service.name": f"CIDC-{ENV}"})
+provider = TracerProvider(resource=resource)
+if ENV == "dev" and not TESTING:
+    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+    COLLECTOR_ENDPOINT = "127.0.0.1"
+    COLLECTOR_GRPC_PORT = 6004
+    # send spans to local exporter
+    # 1. download latest version from https://github.com/open-telemetry/opentelemetry-collector-releases/releases (otelcol-contrib_0.140.1_darwin_arm64)
+    # 2. start exporter from otel folder with `./otelcol-contrib --config=config.yaml`
+    # 3. download and start Jaeger (all-in-one image) - https://www.jaegertracing.io/download/
+    exporter = OTLPSpanExporter(endpoint=f"http://{COLLECTOR_ENDPOINT}:{COLLECTOR_GRPC_PORT}", insecure=True)
+    processor = BatchSpanProcessor(exporter)
+    provider.add_span_processor(processor)
+if ENV == "dev-int":
+    from opentelemetry.exporter.cloud_trace import CloudTraceSpanExporter
+    # send span to Cloud Trace service - https://console.cloud.google.com/traces/explorer
+    exporter = CloudTraceSpanExporter()
+    processor = BatchSpanProcessor(exporter)
+    provider.add_span_processor(processor)
+# NOTE: we don't run telemetry in upper tiers; a provider with no span processor exports nothing (effectively a no-op)
+trace.set_tracer_provider(provider)
+tracer = trace.get_tracer(__name__)
+def trace_(*args):
+    def decorator_factory(func):
+        @wraps(func)
+        def wrapper(*args_, **kwargs_):
+            func_name = f"{func.__module__.split(".")[-1]}.{func.__name__}"
+            with tracer.start_as_current_span(func_name) as span:
+                for arg in args:
+                    value = kwargs_.get(arg)
+                    # track id of argument if exists
+                    if hasattr(value, "id"):
+                        value = getattr(value, "id")
+                    span.set_attributes({arg: value})
+                result = func(*args_, **kwargs_)
+                if isinstance(result, (str, int, float, bool)):
+                    span.set_attribute("result", result)
+            return result
+        return wrapper
+    return decorator_factory

nci-cidc-api-modules 1.2.34__py3-none-any.whl → 1.2.53__py3-none-any.whl

nci-cidc-api-modules 1.2.34__py3-none-any.whl → 1.2.53__py3-none-any.whl