nci-cidc-api-modules 1.1.34__tar.gz → 1.1.37__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {nci_cidc_api_modules-1.1.34/nci_cidc_api_modules.egg-info → nci_cidc_api_modules-1.1.37}/PKG-INFO +5 -5
  2. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/config/db.py +3 -4
  3. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/models/files/details.py +31 -0
  4. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/models/files/facets.py +57 -0
  5. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/models/models.py +300 -58
  6. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/models/schemas.py +1 -0
  7. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/shared/emails.py +1 -1
  8. nci_cidc_api_modules-1.1.37/cidc_api/shared/file_handling.py +56 -0
  9. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/shared/gcloud_client.py +18 -1
  10. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37/nci_cidc_api_modules.egg-info}/PKG-INFO +5 -5
  11. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/nci_cidc_api_modules.egg-info/SOURCES.txt +1 -0
  12. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/nci_cidc_api_modules.egg-info/requires.txt +4 -4
  13. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/requirements.modules.txt +4 -4
  14. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/tests/test_api.py +16 -8
  15. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/LICENSE +0 -0
  16. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/MANIFEST.in +0 -0
  17. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/README.md +0 -0
  18. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/config/__init__.py +0 -0
  19. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/config/logging.py +0 -0
  20. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/config/secrets.py +0 -0
  21. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/config/settings.py +0 -0
  22. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/models/__init__.py +0 -0
  23. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/models/files/__init__.py +0 -0
  24. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/models/migrations.py +0 -0
  25. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/shared/__init__.py +0 -0
  26. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/shared/auth.py +0 -0
  27. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/shared/jose.py +0 -0
  28. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/cidc_api/shared/rest_utils.py +0 -0
  29. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/nci_cidc_api_modules.egg-info/dependency_links.txt +0 -0
  30. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/nci_cidc_api_modules.egg-info/not-zip-safe +0 -0
  31. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/nci_cidc_api_modules.egg-info/top_level.txt +0 -0
  32. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/pyproject.toml +0 -0
  33. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/setup.cfg +0 -0
  34. {nci_cidc_api_modules-1.1.34 → nci_cidc_api_modules-1.1.37}/setup.py +0 -0
--- nci_cidc_api_modules-1.1.34/nci_cidc_api_modules.egg-info/PKG-INFO
+++ nci_cidc_api_modules-1.1.37/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nci_cidc_api_modules
-Version: 1.1.34
+Version: 1.1.37
 Summary: SQLAlchemy data models and configuration tools used in the NCI CIDC API
 Home-page: https://github.com/NCI-CIDC/cidc-api-gae
 License: MIT license
@@ -10,10 +10,10 @@ License-File: LICENSE
 Requires-Dist: werkzeug==3.0.6
 Requires-Dist: flask==3.0.3
 Requires-Dist: flask-migrate==3.1.0
-Requires-Dist: flask-sqlalchemy==3.0.2
-Requires-Dist: sqlalchemy==1.4.54
+Requires-Dist: flask-sqlalchemy==3.1.1
+Requires-Dist: sqlalchemy==2.0.41
 Requires-Dist: marshmallow==3.19.0
-Requires-Dist: marshmallow-sqlalchemy==0.22.3
+Requires-Dist: marshmallow-sqlalchemy==1.4.2
 Requires-Dist: google-cloud-storage==2.18.0
 Requires-Dist: google-cloud-secret-manager==2.20.1
 Requires-Dist: google-cloud-pubsub==2.22.0
@@ -28,7 +28,7 @@ Requires-Dist: python-dotenv==0.10.3
 Requires-Dist: requests==2.32.4
 Requires-Dist: jinja2==3.1.6
 Requires-Dist: certifi==2024.7.4
-Requires-Dist: nci-cidc-schemas==0.27.25
+Requires-Dist: nci-cidc-schemas==0.27.27
 Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
--- nci_cidc_api_modules-1.1.34/cidc_api/config/db.py
+++ nci_cidc_api_modules-1.1.37/cidc_api/config/db.py
@@ -4,13 +4,12 @@ from flask import Flask
 from flask_sqlalchemy import SQLAlchemy
 from flask_migrate import Migrate, upgrade
 from sqlalchemy.engine.url import URL
-from sqlalchemy.ext.declarative import declarative_base
-
+from sqlalchemy.orm import declarative_base
 
 from .secrets import get_secrets_manager
 
 db = SQLAlchemy()
-BaseModel = declarative_base(bind=db)
+BaseModel = declarative_base()
 db.Model = BaseModel
 
 
@@ -54,7 +53,7 @@ def get_sqlalchemy_database_uri(testing: bool = False) -> str:
         "Either POSTGRES_URI or CLOUD_SQL_INSTANCE_NAME must be defined to connect " + "to a database."
     )
 
-    db_uri = str(URL(**config))
+    db_uri = str(URL.create(**config).render_as_string(hide_password=False))
 
     assert db_uri
 
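Note: these db.py changes are the core of the SQLAlchemy 1.4 → 2.0 migration pinned above. `declarative_base` moved to `sqlalchemy.orm` and no longer accepts a `bind=` argument, and `URL` can no longer be instantiated directly; it must be built with `URL.create()`. A minimal sketch of the 2.0-style pattern (the connection values are placeholders, not the package's real config):

    from sqlalchemy.engine.url import URL
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()  # 2.0 style: no bind= argument

    # URL() is not directly constructible in SQLAlchemy 2.0; use URL.create().
    url = URL.create(
        drivername="postgresql",  # placeholder values for illustration
        username="user",
        password="secret",
        host="localhost",
        database="cidc",
    )
    # render_as_string(hide_password=False) keeps credentials in the URI and
    # already returns a str, so the outer str() in the hunk above is redundant.
    db_uri = url.render_as_string(hide_password=False)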
--- nci_cidc_api_modules-1.1.34/cidc_api/models/files/details.py
+++ nci_cidc_api_modules-1.1.37/cidc_api/models/files/details.py
@@ -993,4 +993,35 @@ details_dict = {
         "",
         "",
     ),
+    # scrna
+    "/scrnaseq/samples_metadata.csv": FileDetails("source", "", ""),
+    "/scrnaseq/read_1.gz": FileDetails("source", "", ""),
+    "/scrnaseq/read_2.gz": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/samples_metadata.csv": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/config.yaml": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/R_package_versions.csv": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/integration.rds": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/integration_heatmap_plots.zip": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/integration_markers.zip": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/integration_split_percent_plots.zip": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/integration_split_umap_plots.zip": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/integration_umap_plots.zip": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/clustering.rds": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/report.html": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/star_sorted_by_cord.bam": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/star_sorted_by_cord.bam.bai": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/log_final.out": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/log.out": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/log_progress.out": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/sj_out.tab": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/barcodes.stats": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/gene_features.stats": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/gene_summary.csv": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/gene_umi_per_cell_sorted.txt": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/gene_filtered_features.tsv": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/gene_filtered_barcodes.tsv": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/gene_filtered_matrix.mtx": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/gene_raw_features.tsv": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/gene_raw_barcodes.tsv": FileDetails("source", "", ""),
+    "/scrnaseq_analysis/gene_raw_matrix.mtx": FileDetails("source", "", ""),
 }
--- nci_cidc_api_modules-1.1.34/cidc_api/models/files/facets.py
+++ nci_cidc_api_modules-1.1.37/cidc_api/models/files/facets.py
@@ -346,6 +346,21 @@ assay_facets: Facets = {
             "H and E file from MIBI analysis",
         ),
     },
+    "scRNA": {
+        "Samples Metadata": FacetConfig(["/scrnaseq/samples_metadata.csv"], "Sample metadata for scRNA run"),
+        "Read 1 gz": FacetConfig(["/scrnaseq/read_1.gz"], "Gz file for read 1"),
+        "Read 2 gz": FacetConfig(["/scrnaseq/read_2.gz"], "Gz file for read 2"),
+    },
+    "Visium": {
+        "Samples Metadata": FacetConfig(["/visium/samples_metadata.csv"], "Sample metadata for visium run"),
+        "Read 1 fastq gz": FacetConfig(["/visium/R1_001.fastq.gz"], "Gz file for read 1"),
+        "Read 2 fastq gz": FacetConfig(["/visium/R2_001.fastq.gz"], "Gz file for read 2"),
+        "loupe alignment file": FacetConfig(["/visium/loupe_alignment_file.json"]),
+        "brightfield image": FacetConfig(["/visium/brightfield.tiff"]),
+        "dark image": FacetConfig(["/visium/dark_image.tiff"]),
+        "colorized image": FacetConfig(["/visium/colorized.tiff"]),
+        "cytassist image": FacetConfig(["/visium/cytassist.tiff"]),
+    },
     "mIHC": {
         "Samples Report": FacetConfig(["/mihc/sample_report.csv"], "Samples report for mIHC run"),
         "Multitiffs": FacetConfig(["/mihc/multitiffs.tar.gz"], "Multi Tiffs file from mIHC run"),
@@ -549,6 +564,48 @@ analysis_ready_facets = {
     "WES Analysis": FacetConfig(["/wes/analysis/report.tar.gz"]),
     "TCR": FacetConfig(["/tcr_analysis/report_trial.tar.gz"]),
     "mIF": FacetConfig(["/mif/roi_/cell_seg_data.txt"]),
+    "scRNA": FacetConfig(
+        [
+            "/scrnaseq_analysis/samples_metadata.csv",
+            "/scrnaseq_analysis/config.yaml",
+            "/scrnaseq_analysis/R_package_versions.csv",
+            "/scrnaseq_analysis/integration.rds",
+            "/scrnaseq_analysis/integration_heatmap_plots.zip",
+            "/scrnaseq_analysis/integration_markers.zip",
+            "/scrnaseq_analysis/integration_split_percent_plots.zip",
+            "/scrnaseq_analysis/integration_split_umap_plots.zip",
+            "/scrnaseq_analysis/integration_umap_plots.zip",
+            "/scrnaseq_analysis/clustering.rds",
+            "/scrnaseq_analysis/report.html",
+            "/scrnaseq_analysis/star_sorted_by_cord.bam",
+            "/scrnaseq_analysis/star_sorted_by_cord.bam.bai",
+            "/scrnaseq_analysis/log_final.out",
+            "/scrnaseq_analysis/log.out",
+            "/scrnaseq_analysis/log_progress.out",
+            "/scrnaseq_analysis/sj_out.tab",
+            "/scrnaseq_analysis/barcodes.stats",
+            "/scrnaseq_analysis/gene_features.stats",
+            "/scrnaseq_analysis/gene_summary.csv",
+            "/scrnaseq_analysis/gene_umi_per_cell_sorted.txt",
+            "/scrnaseq_analysis/gene_filtered_features.tsv",
+            "/scrnaseq_analysis/gene_filtered_barcodes.tsv",
+            "/scrnaseq_analysis/gene_filtered_matrix.mtx",
+            "/scrnaseq_analysis/gene_raw_features.tsv",
+            "/scrnaseq_analysis/gene_raw_barcodes.tsv",
+            "/scrnaseq_analysis/gene_raw_matrix.mtx",
+        ]
+    ),
+    "Visium": FacetConfig(
+        [
+            "/visium_analysis/samples_metadata.csv",
+            "/visium_analysis/config.yaml",
+            "/visium_analysis/R_package_versions.csv",
+            "/visium_analysis/merged.rds",
+            "/visium_analysis/spatial_variable_features.rds",
+            "/visium_analysis/report.html",
+            "/visium_analysis/visium_spaceranger_output.zip",
+        ]
+    ),
 }
 
 facets_dict: Dict[str, Facets] = {
--- nci_cidc_api_modules-1.1.34/cidc_api/models/models.py
+++ nci_cidc_api_modules-1.1.37/cidc_api/models/models.py
@@ -23,6 +23,14 @@ __all__ = [
     "ValidationMultiError",
    "with_default_session",
     "PreprocessedFiles",
+    "IngestionJobs",
+    "JobFileCategories",
+    "TRIAL_APPENDIX_A",
+    "REQUEST_LETTER",
+    "ADMIN_FILE_CATEGORIES",
+    "FINAL_JOB_STATUS",
+    "INGESTION_JOB_STATUSES",
+    "INGESTION_JOB_COLORS",
 ]
 
 import hashlib
@@ -36,6 +44,7 @@ from functools import wraps
 from typing import (
     Any,
     BinaryIO,
+    ClassVar,
     Dict,
     Optional,
     List,
@@ -54,32 +63,33 @@ from google.cloud.storage import Blob
 from jsonschema.exceptions import ValidationError
 from sqlalchemy import (
     and_,
-    Column,
+    asc,
+    case,
+    desc,
+    func,
+    literal,
+    literal_column,
+    not_,
+    or_,
+    select,
+    text,
+    true,
+    tuple_,
+    update,
+    BigInteger,
     Boolean,
+    CheckConstraint,
+    Column,
     DateTime,
-    Integer,
-    BigInteger,
-    String,
     Enum,
-    Index,
-    func,
-    CheckConstraint,
     ForeignKey,
     ForeignKeyConstraint,
+    Index,
+    Integer,
+    MetaData,
     PrimaryKeyConstraint,
-    tuple_,
-    asc,
-    desc,
-    update,
-    case,
-    select,
-    literal_column,
-    not_,
-    literal,
-    or_,
+    String,
     Table,
-    MetaData,
-    true,
 )
 from sqlalchemy.dialects.postgresql import JSONB, UUID
 from sqlalchemy.engine import ResultProxy
@@ -96,8 +106,6 @@ from sqlalchemy.sql import (
     # break up this giant file.
     and_ as sql_and,
     or_ as sql_or,
-    # select, # ALREADY IMPORTED
-    text,
 )
 from sqlalchemy.sql.elements import BooleanClauseList
 from sqlalchemy.sql.functions import coalesce
@@ -119,6 +127,7 @@ from ..config.settings import (
     MAX_PAGINATION_PAGE_SIZE,
     TESTING,
     INACTIVE_USER_DAYS,
+    GOOGLE_CLINICAL_DATA_BUCKET,
 )
 from ..shared import emails
 from ..shared.gcloud_client import (
@@ -132,6 +141,7 @@ from ..shared.gcloud_client import (
     revoke_intake_access,
     revoke_lister_access,
     revoke_bigquery_access,
+    gcs_xlsx_or_csv_file_to_pandas_dataframe,
 )
 
 os.environ["TZ"] = "UTC"
@@ -309,7 +319,7 @@ class CommonColumns(BaseModel):  # type: ignore
     @with_default_session
     def find_by_id(cls, id: int, session: Session):
         """Find the record with this id"""
-        return session.query(cls).get(id)
+        return session.get(cls, id)
 
     @classmethod
     @with_default_session
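Note: `Query.get()` was removed in SQLAlchemy 2.0; `Session.get()` is the direct replacement this hunk applies. A minimal sketch (`Widget` is a hypothetical mapped class for illustration):

    # 2.0 style: fetch by primary key via the Session
    widget = session.get(Widget, 42)  # returns the instance or None

    # 1.x equivalent, removed in 2.0:
    # widget = session.query(Widget).get(42)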
@@ -1207,9 +1217,10 @@ class TrialMetadata(CommonColumns):
             raise NoResultFound(f"No trial found with id {trial_id}")
         return unprism.unprism_samples(trial.metadata_json)
 
-    file_bundle: Optional[FileBundle]
-    num_participants: Optional[int]
-    num_samples: Optional[int]
+    file_bundle: ClassVar[Optional[FileBundle]]
+    num_participants: ClassVar[Optional[int]]
+    num_samples: ClassVar[Optional[int]]
+    ready_for_submission: ClassVar[Optional[Boolean]]
 
     # List of metadata JSON fields that should not be sent to clients
     # in queries that list trial metadata, because they may contain a lot
@@ -1330,11 +1341,26 @@ class TrialMetadata(CommonColumns):
                     del trial.file_bundle[assay][purpose]
                 if not trial.file_bundle[assay]:
                     del trial.file_bundle[assay]
+            # Check if trial is ready for submission
+            setattr(trial, "ready_for_submission", trial.ready_for_submission())
 
             trials.append(trial)
 
         return trials
 
+    @with_default_session
+    def ready_for_submission(self, session: Session) -> Boolean:
+        open_job = IngestionJobs.get_open_job_by_trial(self.trial_id)
+        if not open_job:
+            return False
+        appendix_a_files = PreprocessedFiles.get_files_by_category_and_status(
+            "trial_appendix_a", "current", job_id=open_job.id
+        )
+        trial_letters = PreprocessedFiles.get_files_by_category_and_status(
+            "request_letter", "current", job_id=open_job.id
+        )
+        return appendix_a_files and trial_letters and open_job.status == "DRAFT"
+
     @with_default_session
     def insert(
         self,
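Note: `setattr(trial, "ready_for_submission", trial.ready_for_submission())` stores a plain bool on the instance under the same name as the method, shadowing it on that object; this appears to mirror how `file_bundle`, `num_participants`, and `num_samples` are attached for serialization (see the new dump-only field in schemas.py below), while the `ClassVar` annotations keep the declarative mapper from treating these names as columns. A minimal sketch of the shadowing behavior:

    class Trial:
        def ready_for_submission(self) -> bool:
            return False

    t = Trial()
    # Replace the bound method with its result on this instance only:
    setattr(t, "ready_for_submission", t.ready_for_submission())
    print(t.ready_for_submission)  # False -- now a bool attribute, not a method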
@@ -1711,6 +1737,30 @@ class TrialMetadata(CommonColumns):
         jsonb_array_elements(batch->'records') record
     """
 
+    # Find all samples associated with scrnaseq analysis uploads.
+    scrnaseq_analysis_subquery = """
+        select
+            trial_id,
+            'scrnaseq_analysis' as key,
+            record->>'cimac_id' as cimac_id
+        from
+            trial_metadata,
+            jsonb_array_elements(metadata_json#>'{analysis,scrnaseq_analysis}') batch,
+            jsonb_array_elements(batch->'records') record
+    """
+
+    # Find all samples associated with visium analysis uploads.
+    visium_analysis_subquery = """
+        select
+            trial_id,
+            'visium_analysis' as key,
+            record->>'cimac_id' as cimac_id
+        from
+            trial_metadata,
+            jsonb_array_elements(metadata_json#>'{analysis,visium_analysis}') batch,
+            jsonb_array_elements(batch->'records') record
+    """
+
     # Build up a JSON object mapping analysis types to arrays of excluded samples.
     # The resulting object will have structure like:
     # {
@@ -1866,6 +1916,10 @@ class TrialMetadata(CommonColumns):
                 {cytof_analysis_subquery}
                 union all
                 {atacseq_analysis_subquery}
+                union all
+                {scrnaseq_analysis_subquery}
+                union all
+                {visium_analysis_subquery}
             ) assays_and_analysis
             group by
                 trial_id, key
@@ -1924,7 +1978,7 @@ class TrialMetadata(CommonColumns):
         - `"wes_tumor_only_analysis"` counts (tumor) samples with tumor-only analysis
         For `"total_[participants/samples]"`, ALL (ie tumor AND normal) WES assay samples are included.
         """
-        summaries_query = "SELECT result FROM trial_summaries_mv"
+        summaries_query = text("SELECT result FROM trial_summaries_mv")
         # Retrieve trial-level summary results from data cached in trial_summaries_mv materialized view.
         # The source of the SQL query used in trial_summaries_mv is get_summaries_query()
         summaries = [summary for (summary,) in session.execute(summaries_query) if summary]
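Note: SQLAlchemy 2.0 no longer executes bare SQL strings; raw SQL must be wrapped in `text()`, which this hunk and the materialized-view refresh below both do. A minimal sketch:

    from sqlalchemy import text

    # 2.0 style: wrap raw SQL in text() before executing
    rows = session.execute(text("SELECT result FROM trial_summaries_mv"))

    # Bare strings now raise ObjectNotExecutableError:
    # session.execute("SELECT result FROM trial_summaries_mv")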
@@ -2256,6 +2310,7 @@ class DownloadableFiles(CommonColumns):
         "FileGroups",
         secondary="files_to_file_groups",
         back_populates="downloadable_files",
+        cascade="save-update",
     )
 
     FILE_EXT_REGEX = r"\.([^./]*(\.gz)?)$"
@@ -2751,7 +2806,7 @@ class DownloadableFiles(CommonColumns):
         """
 
         where_clause = DownloadableFiles._generate_where_clause_with_permissions(user)
-        statement = select([DownloadableFiles.id]).where(sql_and(DownloadableFiles.id.in_(ids), where_clause))
+        statement = select(DownloadableFiles.id).where(sql_and(DownloadableFiles.id.in_(ids), where_clause))
 
         return [row[0] for row in session.execute(statement).fetchall()]
 
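Note: this hunk and the larger rewrite below drop the 1.x "legacy" calling form `select([...])` for the 2.0 positional form `select(...)`. A minimal sketch:

    from sqlalchemy import select

    # 2.0 style: columns as positional arguments
    stmt = select(DownloadableFiles.id, DownloadableFiles.trial_id)

    # 1.x style, removed in 2.0:
    # stmt = select([DownloadableFiles.id, DownloadableFiles.trial_id])

    ids = [row[0] for row in session.execute(stmt)]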
@@ -2783,7 +2838,7 @@ class DownloadableFiles(CommonColumns):
 
         for file in files_to_delete:
             file.delete(commit=True)
-        session.execute("REFRESH MATERIALIZED VIEW CONCURRENTLY trial_summaries_mv")
+        session.execute(text("REFRESH MATERIALIZED VIEW CONCURRENTLY trial_summaries_mv"))
 
     @classmethod
     @with_default_session
@@ -3054,23 +3109,19 @@ class DownloadableFiles(CommonColumns):
 
         id_bundles = (
             select(
-                [
-                    cls.trial_id,
-                    cls.data_category_prefix.label(type_col.key),
-                    cls.file_purpose.label(purp_col.key),
-                    func.json_agg(cls.id).label(ids_col.key),
-                ]
+                cls.trial_id,
+                cls.data_category_prefix.label(type_col.key),
+                cls.file_purpose.label(purp_col.key),
+                func.json_agg(cls.id).label(ids_col.key),
             )
             .group_by(cls.trial_id, cls.data_category_prefix, cls.file_purpose)
             .alias("id_bundles")
         )
         purpose_bundles = (
             select(
-                [
-                    tid_col,
-                    type_col,
-                    func.json_object_agg(func.coalesce(purp_col, "miscellaneous"), ids_col).label(purps_col.key),
-                ]
+                tid_col,
+                type_col,
+                func.json_object_agg(func.coalesce(purp_col, "miscellaneous"), ids_col).label(purps_col.key),
             )
             .select_from(id_bundles)
             .group_by(tid_col, type_col)
@@ -3078,10 +3129,8 @@ class DownloadableFiles(CommonColumns):
         )
         file_bundles = (
             select(
-                [
-                    tid_col.label(tid_col.key),
-                    func.json_object_agg(func.coalesce(type_col, "other"), purps_col).label("file_bundle"),
-                ]
+                tid_col.label(tid_col.key),
+                func.json_object_agg(func.coalesce(type_col, "other"), purps_col).label("file_bundle"),
             )
             .select_from(purpose_bundles)
             .group_by(tid_col)
@@ -3131,13 +3180,13 @@ class DownloadableFiles(CommonColumns):
 # Query clause for computing a downloadable file's data category.
 # Used above in the DownloadableFiles.data_category computed property.
 DATA_CATEGORY_CASE_CLAUSE = case(
-    [(DownloadableFiles.facet_group == k, v) for k, v in facet_groups_to_categories.items()]
+    *[(DownloadableFiles.facet_group == k, v) for k, v in facet_groups_to_categories.items()]
 )
 
 # Query clause for computing a downloadable file's file purpose.
 # Used above in the DownloadableFiles.file_purpose computed property.
 FILE_PURPOSE_CASE_CLAUSE = case(
-    [
+    *[
         (DownloadableFiles.facet_group == facet_group, file_details.file_purpose)
         for facet_group, file_details in details_dict.items()
     ]
3146
3195
 
3147
3196
  def result_proxy_to_models(result_proxy: ResultProxy, model: BaseModel) -> List[BaseModel]:
3148
3197
  """Materialize a sqlalchemy `result_proxy` iterable as a list of `model` instances"""
3149
- return [model(**dict(row_proxy)) for row_proxy in result_proxy.all()]
3198
+ return [model(**dict(row_proxy._mapping)) for row_proxy in result_proxy.all()]
3150
3199
 
3151
3200
 
3152
3201
  @with_default_session
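Note: 2.0 `Row` objects are tuple-like and no longer act as mappings, so `dict(row)` fails; the keyed view moved to `Row._mapping`, hence this change. A minimal sketch:

    from sqlalchemy import text

    row = session.execute(text("SELECT 1 AS id, 'x' AS name")).first()
    dict(row._mapping)  # {'id': 1, 'name': 'x'} -- the 2.0 way
    # dict(row)         # TypeError in 2.0: Row iterates as a tuple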
@@ -3187,12 +3236,24 @@ def upload_manifest_json(
     return manifest_upload.id
 
 
+TRIAL_APPENDIX_A = "trial_appendix_a"
+REQUEST_LETTER = "request_letter"
+ADMIN_FILE_CATEGORIES = [TRIAL_APPENDIX_A, REQUEST_LETTER]
+
+
 class PreprocessedFiles(CommonColumns):
     __tablename__ = "preprocessed_files"
+    __table_args__ = (
+        ForeignKeyConstraint(
+            ["job_id"],
+            ["ingestion_jobs.id"],
+            name="preprocessed_files_job_id_fkey",
+        ),
+    )
 
     file_name = Column(String)
     object_url = Column(String)
-    trial_id = Column(String)
+    job_id = Column(Integer)
     file_category = Column(String)
     uploader_email = Column(String)
     status = Column(String)
@@ -3207,7 +3268,7 @@ class PreprocessedFiles(CommonColumns):
         file_category: str,
         uploader_email: str,
         status: str = "pending",
-        trial_id: str = None,
+        job_id: int = None,
         version: int = None,
         released_version: str = None,
         session: Session = None,
@@ -3219,7 +3280,7 @@ class PreprocessedFiles(CommonColumns):
             file_category=file_category,
             uploader_email=uploader_email,
             status=status,
-            trial_id=trial_id,
+            job_id=job_id,
             version=version,
             released_version=released_version,
         )
@@ -3228,10 +3289,10 @@ class PreprocessedFiles(CommonColumns):
 
     @classmethod
     @with_default_session
-    def archive_current_files(cls, file_category: str, session: Session = None):
+    def archive_current_files(cls, file_category: str, job_id: int = None, session: Session = None):
         """Update any 'current' files in the given category to 'archived'. Returns latest existing version number."""
         current_version = 0
-        current_files = cls.get_files_by_category_and_status(file_category, "current", session=session)
+        current_files = cls.get_files_by_category_and_status(file_category, "current", job_id=job_id, session=session)
         for file in current_files:
             file.status = "archived"
             file._updated = datetime.now()
@@ -3241,22 +3302,35 @@ class PreprocessedFiles(CommonColumns):
 
     @classmethod
     @with_default_session
-    def delete_pending_files_by_category(cls, file_category: str, trial_id: str = None, session: Session = None):
-        """Delete all pending files matching given file_category and optional trial_id."""
-        records = cls.get_files_by_category_and_status(file_category, "pending", trial_id=trial_id, session=session)
+    def delete_pending_files_by_category(cls, file_category: str, job_id: int = None, session: Session = None):
+        """Delete all pending files matching given file_category and optional job_id."""
+        records = cls.get_files_by_category_and_status(file_category, "pending", job_id=job_id, session=session)
+        for record in records:
+            session.delete(record)
+        session.commit()
+
+    @classmethod
+    @with_default_session
+    def delete_files_by_category(cls, file_category: str, job_id: int = None, session: Session = None):
+        """Delete all files matching a given file_category and job_id (or system files if job_id is None)."""
+        query = session.query(cls).filter_by(file_category=file_category)
+        query = cls.add_job_filter(query, job_id)
+        records = query.all()
+        if not records:
+            return False
         for record in records:
             session.delete(record)
         session.commit()
+        return True
 
     @classmethod
     @with_default_session
     def get_files_by_category_and_status(
-        cls, file_category: str, status: str, trial_id: str = None, session: Session = None
+        cls, file_category: str, status: str, job_id: int = None, session: Session = None
     ) -> list["PreprocessedFiles"]:
-        """Return all files matching given file_category and status, optionally filtered by trial_id."""
+        """Return all files matching file_category and status, with job_id filter (job_id is NULL if not provided)."""
         query = session.query(cls).filter_by(file_category=file_category, status=status)
-        if trial_id:
-            query = query.filter_by(trial_id=trial_id)
+        query = cls.add_job_filter(query, job_id)
         return query.all()
 
     @classmethod
@@ -3266,3 +3340,171 @@ class PreprocessedFiles(CommonColumns):
     ) -> Optional["PreprocessedFiles"]:
         """Return the file matching the given category and version number."""
         return session.query(cls).filter_by(file_category=file_category, version=version).one_or_none()
+
+    @classmethod
+    @with_default_session
+    def get_system_reference_files(cls, status: str = "current", session: Session = None) -> list["PreprocessedFiles"]:
+        """Return static reference files that are not linked to any job and not Master Appendix A."""
+        return (
+            session.query(cls)
+            .filter(cls.job_id.is_(None))
+            .filter(cls.file_category != "master_appendix_a")
+            .filter_by(status=status)
+            .all()
+        )
+
+    # TODO: logic for pending vs current files after high level validation
+    @classmethod
+    @with_default_session
+    def get_pending_non_admin_files(cls, job_id: int, session: Session) -> list["PreprocessedFiles"]:
+        return (
+            session.query(cls)
+            .filter(cls.job_id == job_id)
+            .filter(cls.status == "pending", cls.file_category.notin_(ADMIN_FILE_CATEGORIES))
+            .all()
+        )
+
+    @classmethod
+    def add_job_filter(cls, query, job_id):
+        """
+        Add a job_id filter to the SQLAlchemy query:
+        - If job_id is provided, filters for exact match.
+        - If not, filters for system-wide files (where job_id IS NULL).
+        """
+        if job_id is not None:
+            return query.filter_by(job_id=job_id)
+        else:
+            return query.filter(cls.job_id.is_(None))
+
+
+INGESTION_JOB_STATUSES = [
+    "DRAFT",
+    "INITIAL SUBMISSION",
+    "VALIDATION REVIEW",
+    "REVISION SUBMISSION",
+    "INGESTION",
+    "PUBLISHED",
+]
+
+# Business decision to pass hex codes from the backend though that should be done by the front end...
+INGESTION_JOB_COLORS = {
+    "DRAFT": "",
+    "INITIAL SUBMISSION": "#ACCAD7",
+    "VALIDATION REVIEW": "#DABE90",
+    "REVISION SUBMISSION": "#C8BAE5",
+    "INGESTION": "#8FCEC7",
+    "PUBLISHED": "#90D9E6",
+}
+# TODO If have "CANCELLED" concept or other final status, add here
+FINAL_JOB_STATUS = ["PUBLISHED"]
+
+
+class IngestionJobs(CommonColumns):
+    __tablename__ = "ingestion_jobs"
+    __table_args__ = (
+        ForeignKeyConstraint(
+            ["trial_id"],
+            ["trial_metadata.trial_id"],
+            name="ingestion_jobs_trial_id_fkey",
+        ),
+    )
+
+    status = Column("status", Enum(*INGESTION_JOB_STATUSES, name="status"), nullable=False)
+    trial_id = Column(String, nullable=False)
+    version = Column(Integer, nullable=False)
+
+    @staticmethod
+    @with_default_session
+    def create(trial_id: str, status: str, version: int, session: Session = None):
+        new_job = IngestionJobs(trial_id=trial_id, status=status, version=version)
+        new_job.insert(session=session)
+        return new_job
+
+    @with_default_session
+    def transition_status(self, status: str, session: Session):
+        # create required categories after opening job for submission
+        if self.status == "DRAFT" and status == "INITIAL SUBMISSION":
+            for category in self.derive_required_categories_from_appendix_a():
+                JobFileCategories.create(category=category, job_id=self.id, type="required")
+        self.status = status
+        self.update(session=session)
+
+    def derive_required_categories_from_appendix_a(self) -> List:
+        appendix_a = PreprocessedFiles.get_files_by_category_and_status(TRIAL_APPENDIX_A, "current", job_id=self.id)[0]
+        df = gcs_xlsx_or_csv_file_to_pandas_dataframe(GOOGLE_CLINICAL_DATA_BUCKET, appendix_a.object_url)
+        categories = []
+        headers_ended = False
+        for index, row in df.iterrows():
+            cell = str(row.iloc[0])
+            if headers_ended:
+                if not cell == "nan" and cell not in categories and cell != "Specialized_Data":
+                    categories.append(cell)
+            elif cell == "PATIENT-LEVEL DATA":
+                headers_ended = True
+        return categories
+
+    @classmethod
+    @with_default_session
+    def get_jobs_by_trial(cls, trial_id: str, session: Session = None) -> list["IngestionJobs"]:
+        return session.query(cls).filter(cls.trial_id == trial_id).order_by(cls.version.desc()).all()
+
+    @classmethod
+    @with_default_session
+    def get_open_job_by_trial(cls, trial_id: str, session: Session = None) -> Optional["IngestionJobs"]:
+        """Return the open job for a given trial if it exists."""
+        return (
+            session.query(cls)
+            .filter(
+                cls.trial_id == trial_id,
+                cls.status.notin_(FINAL_JOB_STATUS),
+            )
+            .order_by(cls._created.desc())
+            .first()
+        )
+
+    # TODO: figure out which users have access to which jobs
+    @classmethod
+    @with_default_session
+    def get_open_jobs_for_user(cls, user: Users, session: Session = None) -> list["IngestionJobs"]:
+        return session.query(cls).filter(cls.status.notin_(["DRAFT"])).order_by(cls._created.desc()).all()
+
+
+class JobFileCategories(CommonColumns):
+    __tablename__ = "job_file_categories"
+    __table_args__ = (
+        ForeignKeyConstraint(
+            ["job_id"],
+            ["ingestion_jobs.id"],
+        ),
+        Index(
+            "idx_categories_job_id" "job_id",
+            "category",
+            unique=True,
+        ),
+    )
+
+    category = Column(String)
+    job_id = Column(Integer)
+    type = Column(Enum("required", "optional", name="type"))
+
+    @staticmethod
+    @with_default_session
+    def create(
+        category: str,
+        job_id: int,
+        type: str,
+        session: Session = None,
+    ):
+        new_category = JobFileCategories(
+            category=category,
+            job_id=job_id,
+            type=type,
+        )
+        new_category.insert(session=session)
+        return new_category
+
+    @classmethod
+    @with_default_session
+    def categories_for_job(cls, job_id: int, type: str, session: Session = None):
+        categories = session.query(cls).filter(cls.job_id == job_id, cls.type == type).all()
+        return [c.category for c in categories]
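Note: taken together, the new models describe a per-trial submission workflow: an IngestionJobs row advances through INGESTION_JOB_STATUSES, PreprocessedFiles rows attach to a job via job_id (rows with a NULL job_id are system-wide reference files), and JobFileCategories snapshots the categories a job requires, derived from its Appendix A spreadsheet on the DRAFT → INITIAL SUBMISSION transition. Also note that `Index("idx_categories_job_id" "job_id", ...)` relies on adjacent string-literal concatenation, which looks like a missing comma: as written the index is named idx_categories_job_idjob_id and covers only "category". A hypothetical sketch of the intended flow (all values illustrative):

    # Open a draft job for a trial:
    job = IngestionJobs.create(trial_id="10021", status="DRAFT", version=1)

    # Attach a 'current' Appendix A to the job (set_current_file in
    # shared/file_handling.py wraps the archive + upload steps):
    PreprocessedFiles.create(
        file_name="appendix_a.xlsx",
        object_url="jobs/1/appendix_a.xlsx",  # illustrative GCS path
        file_category=TRIAL_APPENDIX_A,
        uploader_email="uploader@example.com",
        status="current",
        job_id=job.id,
    )

    # DRAFT -> INITIAL SUBMISSION derives required categories from Appendix A:
    job.transition_status("INITIAL SUBMISSION")
    required = JobFileCategories.categories_for_job(job.id, "required")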
--- nci_cidc_api_modules-1.1.34/cidc_api/models/schemas.py
+++ nci_cidc_api_modules-1.1.37/cidc_api/models/schemas.py
@@ -104,6 +104,7 @@ class TrialMetadataSchema(BaseSchema):
     file_bundle = fields.Dict(dump_only=True)
     num_participants = fields.Int(dump_only=True)
     num_samples = fields.Int(dump_only=True)
+    ready_for_submission = fields.Bool(dump_only=True)
 
 
 TrialMetadataListSchema = _make_list_schema(TrialMetadataSchema())
--- nci_cidc_api_modules-1.1.34/cidc_api/shared/emails.py
+++ nci_cidc_api_modules-1.1.37/cidc_api/shared/emails.py
@@ -16,7 +16,7 @@ from ..config.settings import ENV
 # - errors from CSMS in update_cidc_from_csms,
 # - errors from kicking off permissions in grant_download_permissions, and
 # - errors from implementing permissions in worker > permissions_worker
-CIDC_MAILING_LIST = ["essex-alert@cimac-network.org", "mustafa.kucukkal@nih.gov"]
+CIDC_MAILING_LIST = ["essex-alert@cimac-network.org"]
 
 
 def sendable(email_template):
--- /dev/null
+++ nci_cidc_api_modules-1.1.37/cidc_api/shared/file_handling.py
@@ -0,0 +1,56 @@
+from werkzeug.datastructures import FileStorage
+from werkzeug.exceptions import BadRequest
+
+from ..config.settings import GOOGLE_CLINICAL_DATA_BUCKET
+from ..models import PreprocessedFiles
+from ..shared.auth import get_current_user
+from ..shared.gcloud_client import upload_file_to_gcs
+
+
+def set_current_file(file: FileStorage, file_category: str, gcs_folder: str, job_id: int = None) -> PreprocessedFiles:
+    """
+    Archives any existing 'current' files for the given category and job,
+    then uploads the new file as the latest 'current' version.
+    """
+    latest_version = PreprocessedFiles.archive_current_files(file_category, job_id=job_id)
+    latest_file = create_file(file, gcs_folder, file_category, job_id, latest_version + 1)
+    return latest_file
+
+
+def create_file(
+    file: FileStorage, gcs_folder: str, file_category: str, job_id: int = None, version: int = None
+) -> PreprocessedFiles:
+    """Upload file to GCS and create corresponding metadata record in the database."""
+    status = "pending" if gcs_folder.endswith("pending/") else "current"
+    # only need timestamp for current/approved files
+    append_timestamp = status == "current"
+    # create file in GCS
+    gcs_file_path = upload_file_to_gcs(file, GOOGLE_CLINICAL_DATA_BUCKET, gcs_folder, append_timestamp=append_timestamp)
+    # create corresponding record in db
+    file = PreprocessedFiles.create(
+        file_name=file.filename,
+        object_url=gcs_file_path,
+        file_category=file_category,
+        uploader_email=get_current_user().email,
+        status=status,
+        job_id=job_id,
+        version=version,
+    )
+    return file
+
+
+def validate_file_extension(filename: str, allowed_extensions: list[str]):
+    if not filename or not any(filename.lower().endswith(ext) for ext in allowed_extensions):
+        raise BadRequest(f"Invalid file type. Must be one of: {allowed_extensions}")
+
+
+def format_common_preprocessed_file_response(file: PreprocessedFiles):
+    """Format a common response for a single PreprocessedFiles record."""
+    return {
+        "file_name": file.file_name,
+        "gcs_uri": f"gs://{GOOGLE_CLINICAL_DATA_BUCKET}/{file.object_url}",
+        "status": file.status,
+        "file_category": file.file_category,
+        "uploader_email": file.uploader_email,
+        "date": file._created.isoformat(),
+    }
--- nci_cidc_api_modules-1.1.34/cidc_api/shared/gcloud_client.py
+++ nci_cidc_api_modules-1.1.37/cidc_api/shared/gcloud_client.py
@@ -26,6 +26,7 @@ from typing import (
 
 import googleapiclient.discovery
 import requests
+import pandas as pd
 from cidc_schemas.prism.constants import ASSAY_TO_FILEPATH
 from google.api_core.client_options import ClientOptions
 from google.api_core.iam import Policy
@@ -217,10 +218,12 @@ def upload_xlsx_to_gcs(
     return final_object
 
 
-def upload_file_to_gcs(file: FileStorage, bucket_name: str, gcs_folder: str) -> str:
+def upload_file_to_gcs(file: FileStorage, bucket_name: str, gcs_folder: str, append_timestamp: bool = False) -> str:
     """Upload a file to the specified GCS folder and return the GCS path from the bucket."""
     # Secure the filename and prepare file
     filename = secure_filename(file.filename)
+    if append_timestamp:
+        filename = _append_iso_timestamp_to_filename(filename)
     gcs_file_path = os.path.join(gcs_folder, filename)
     binary_file = io.BytesIO(file.read())
 
@@ -416,6 +419,20 @@ def upload_xlsx_to_intake_bucket(user_email: str, trial_id: str, upload_type: st
     return f"https://console.cloud.google.com/storage/browser/_details/{bucket_name}/{blob_name}"
 
 
+def gcs_xlsx_or_csv_file_to_pandas_dataframe(bucket_name: str, blob_name: str):
+    """Reads an XLSX file from Google Cloud Storage into a Pandas DataFrame."""
+    sheet_data = storage.Client().bucket(bucket_name).blob(blob_name).download_as_bytes()
+    temp_file = io.BytesIO(sheet_data)
+
+    # TODO: specify sheet in xlsx file and/or accept tsv and xls files
+    if blob_name[-3:] == "csv":
+        return pd.read_csv(temp_file)
+    elif blob_name[-4:] == "xlsx":
+        return pd.read_excel(temp_file)
+    else:
+        raise Exception("Can only read csv or xlsx files")
+
+
 def _execute_multiblob_acl_change(
     user_email_list: List[str],
     blob_list: List[storage.Blob],
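Note: the fixed-width suffix slices (`blob_name[-3:] == "csv"`) work, but `str.endswith` is the more robust spelling, since it also handles names shorter than the suffix; a sketch of the equivalent dispatch under the same assumptions:

    import io

    import pandas as pd

    def read_tabular_bytes(data: bytes, name: str) -> pd.DataFrame:
        buf = io.BytesIO(data)
        if name.endswith(".csv"):
            return pd.read_csv(buf)
        if name.endswith(".xlsx"):
            return pd.read_excel(buf)  # first sheet by default
        raise ValueError("Can only read csv or xlsx files")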
--- nci_cidc_api_modules-1.1.34/PKG-INFO
+++ nci_cidc_api_modules-1.1.37/nci_cidc_api_modules.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nci_cidc_api_modules
-Version: 1.1.34
+Version: 1.1.37
 Summary: SQLAlchemy data models and configuration tools used in the NCI CIDC API
 Home-page: https://github.com/NCI-CIDC/cidc-api-gae
 License: MIT license
@@ -10,10 +10,10 @@ License-File: LICENSE
 Requires-Dist: werkzeug==3.0.6
 Requires-Dist: flask==3.0.3
 Requires-Dist: flask-migrate==3.1.0
-Requires-Dist: flask-sqlalchemy==3.0.2
-Requires-Dist: sqlalchemy==1.4.54
+Requires-Dist: flask-sqlalchemy==3.1.1
+Requires-Dist: sqlalchemy==2.0.41
 Requires-Dist: marshmallow==3.19.0
-Requires-Dist: marshmallow-sqlalchemy==0.22.3
+Requires-Dist: marshmallow-sqlalchemy==1.4.2
 Requires-Dist: google-cloud-storage==2.18.0
 Requires-Dist: google-cloud-secret-manager==2.20.1
 Requires-Dist: google-cloud-pubsub==2.22.0
@@ -28,7 +28,7 @@ Requires-Dist: python-dotenv==0.10.3
 Requires-Dist: requests==2.32.4
 Requires-Dist: jinja2==3.1.6
 Requires-Dist: certifi==2024.7.4
-Requires-Dist: nci-cidc-schemas==0.27.25
+Requires-Dist: nci-cidc-schemas==0.27.27
 Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
--- nci_cidc_api_modules-1.1.34/nci_cidc_api_modules.egg-info/SOURCES.txt
+++ nci_cidc_api_modules-1.1.37/nci_cidc_api_modules.egg-info/SOURCES.txt
@@ -19,6 +19,7 @@ cidc_api/models/files/facets.py
 cidc_api/shared/__init__.py
 cidc_api/shared/auth.py
 cidc_api/shared/emails.py
+cidc_api/shared/file_handling.py
 cidc_api/shared/gcloud_client.py
 cidc_api/shared/jose.py
 cidc_api/shared/rest_utils.py
--- nci_cidc_api_modules-1.1.34/nci_cidc_api_modules.egg-info/requires.txt
+++ nci_cidc_api_modules-1.1.37/nci_cidc_api_modules.egg-info/requires.txt
@@ -1,10 +1,10 @@
 werkzeug==3.0.6
 flask==3.0.3
 flask-migrate==3.1.0
-flask-sqlalchemy==3.0.2
-sqlalchemy==1.4.54
+flask-sqlalchemy==3.1.1
+sqlalchemy==2.0.41
 marshmallow==3.19.0
-marshmallow-sqlalchemy==0.22.3
+marshmallow-sqlalchemy==1.4.2
 google-cloud-storage==2.18.0
 google-cloud-secret-manager==2.20.1
 google-cloud-pubsub==2.22.0
@@ -19,4 +19,4 @@ python-dotenv==0.10.3
 requests==2.32.4
 jinja2==3.1.6
 certifi==2024.7.4
-nci-cidc-schemas==0.27.25
+nci-cidc-schemas==0.27.27
--- nci_cidc_api_modules-1.1.34/requirements.modules.txt
+++ nci_cidc_api_modules-1.1.37/requirements.modules.txt
@@ -1,10 +1,10 @@
 werkzeug==3.0.6
 flask==3.0.3
 flask-migrate==3.1.0
-flask-sqlalchemy==3.0.2
-sqlalchemy==1.4.54
+flask-sqlalchemy==3.1.1
+sqlalchemy==2.0.41
 marshmallow==3.19.0
-marshmallow-sqlalchemy==0.22.3
+marshmallow-sqlalchemy==1.4.2
 google-cloud-storage==2.18.0
 google-cloud-secret-manager==2.20.1
 google-cloud-pubsub==2.22.0
@@ -19,4 +19,4 @@ python-dotenv==0.10.3
 requests==2.32.4
 jinja2==3.1.6
 certifi==2024.7.4
-nci-cidc-schemas==0.27.25
+nci-cidc-schemas==0.27.27
--- nci_cidc_api_modules-1.1.34/tests/test_api.py
+++ nci_cidc_api_modules-1.1.37/tests/test_api.py
@@ -5,16 +5,13 @@ to data resources, like endpoints that handle upload-related functionality.
 """
 
 import os
-
-os.environ["TZ"] = "UTC"
+import uuid
 from copy import deepcopy
 from unittest.mock import MagicMock
 from datetime import datetime
 from dateutil.parser import parse as parse_date
 
-
 import pytest
-from werkzeug.exceptions import BadRequest
 
 from cidc_api.models import (
     Users,
@@ -29,6 +26,7 @@ from cidc_api.models import (
 
 from .utils import mock_current_user, mock_gcloud_client
 
+os.environ["TZ"] = "UTC"
 TEST_RECORD_ID = 1
 
 # Configuration for resource tests below. For each resource, the following keywords are supported:
@@ -128,7 +126,9 @@ permissions = {
     "filters": {"empty": {"user_id": 2}, "one": {"user_id": TEST_RECORD_ID}},
 }
 
-upload_token = "53b455a5-d25b-428b-8c83-86a3120188da"
+# UUID object is returned with sqlchemy 2
+upload_token = uuid.uuid4()
+
 upload_jobs = {
     "json": {
         "id": TEST_RECORD_ID,
@@ -139,7 +139,7 @@ upload_jobs = {
         "gcs_xlsx_uri": "",
         "multifile": False,
         "status": UploadJobStatus.STARTED.value,
-        "token": upload_token,
+        "token": str(upload_token),
     },
     "model": UploadJobs,
     "lookup_func": lambda cfg: f"{cfg['id']}?token={upload_token}",
@@ -265,6 +265,7 @@ def test_resource_and_item_get(resource, config, cidc_api, clean_db, monkeypatch
         assert_dict_contains(item, json)
     else:
         assert_dict_contains(item, config["json"])
+
    if config.get("pagination"):
        assert response.json["_meta"]["total"] == 3
    elif resource == "users":
@@ -414,14 +415,16 @@ def test_endpoint_urls(cidc_api):
        "/clinical_data/files/master_appendix_a/pending",
        "/clinical_data/files/master_appendix_a/versions",
        "/clinical_data/files/master_appendix_a/versions/<int:version>",
-        "/clinical_data/jobs",
-        "/clinical_data/trials/<string:trial_id>/files/trial_appendix_a/pending",
        "/downloadable_files/",
        "/downloadable_files/filelist",
        "/downloadable_files/compressed_batch",
        "/downloadable_files/download_url",
        "/downloadable_files/facet_groups_for_links",
        "/downloadable_files/filter_facets",
+        "/files/<string:data_type>",
+        "/files/<string:data_type>/<string:file_category>",
+        "/files/<string:data_type>/<string:file_category>/versions",
+        "/files/<string:data_type>/current",
        "/downloadable_files/<int:downloadable_file>",
        "/downloadable_files/<int:downloadable_file>/related_files",
        "/info/assays",
@@ -438,6 +441,11 @@ def test_endpoint_urls(cidc_api):
        "/ingestion/poll_upload_merge_status/<int:upload_job>",
        "/ingestion/intake_bucket",
        "/ingestion/intake_metadata",
+        "/jobs/",
+        "/jobs/<int:job_id>",
+        "/jobs/<int:job_id>/files",
+        "/jobs/<int:job_id>/status",
+        "/jobs/trials/<string:trial_id>/current",
        "/permissions/",
        "/permissions/<int:permission>",
        "/samples/",