PyPI - nci-cidc-api-modules - Versions diffs - 1.2.15__tar.gz → 1.2.17__tar.gz - Mend

nci-cidc-api-modules 1.2.15 (tar.gz) → 1.2.17 (tar.gz)

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (36) hide show

{nci_cidc_api_modules-1.2.15/nci_cidc_api_modules.egg-info → nci_cidc_api_modules-1.2.17}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nci_cidc_api_modules
-Version: 1.2.15
+Version: 1.2.17
 Summary: SQLAlchemy data models and configuration tools used in the NCI CIDC API
 Home-page: https://github.com/NCI-CIDC/cidc-api-gae
 License: MIT license
@@ -12,7 +12,7 @@ Requires-Dist: cloud-sql-python-connector[pg8000]>=1.18.5
 Requires-Dist: flask>=3.1.2
 Requires-Dist: flask-migrate>=4.1.0
 Requires-Dist: flask-sqlalchemy>=3.1.1
-Requires-Dist: google-auth>=2.42.0
+Requires-Dist: google-auth==2.41.1
 Requires-Dist: google-api-python-client>=2.185.0
 Requires-Dist: google-cloud-bigquery>=3.38.0
 Requires-Dist: google-cloud-pubsub>=2.32.0

{nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/config/db.py RENAMED Viewed

@@ -10,8 +10,7 @@ from google.cloud.sql.connector import Connector, IPTypes
 from .secrets import get_secrets_manager
 db = SQLAlchemy()
-BaseModel = declarative_base()
-db.Model = BaseModel
+BaseModel = db.Model
 connector = Connector()
@@ -31,7 +30,6 @@ def getconn():
 def init_db(app: Flask):
     """Connect `app` to the database and run migrations"""
     db.init_app(app)
-    db.Model = BaseModel
     Migrate(app, db, app.config["MIGRATIONS_PATH"])
     with app.app_context():
         upgrade(app.config["MIGRATIONS_PATH"])

{nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/models/models.py RENAMED Viewed

@@ -26,6 +26,7 @@ __all__ = [
     "FileValidationErrors",
     "IngestionJobs",
     "JobFileCategories",
+    "CategoryDataElements",
     "ValidationConfigs",
     "TRIAL_APPENDIX_A",
     "TRIAL_APPENDIX_A_CELL_THAT_ENDS_THE_HEADER",
@@ -95,7 +96,7 @@ from sqlalchemy import (
     String,
     Table,
 )
-from sqlalchemy.dialects.postgresql import JSONB, UUID
+from sqlalchemy.dialects.postgresql import JSONB, UUID, CITEXT
 from sqlalchemy.engine import ResultProxy
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.hybrid import hybrid_property
@@ -131,7 +132,6 @@ from ..config.settings import (
     MAX_PAGINATION_PAGE_SIZE,
     TESTING,
     INACTIVE_USER_DAYS,
-    GOOGLE_CLINICAL_DATA_BUCKET,
 )
 from ..shared import emails
 from ..shared.gcloud_client import (
@@ -145,7 +145,6 @@ from ..shared.gcloud_client import (
     revoke_intake_access,
     revoke_lister_access,
     revoke_bigquery_access,
-    gcs_xlsx_or_csv_file_to_pandas_dataframe,
 )
 os.environ["TZ"] = "UTC"
@@ -382,7 +381,7 @@ class Users(CommonColumns):
     last_n = Column(String)
     organization = Column(Enum(*ORGS, name="orgs"))
     approval_date = Column(DateTime)
-    role = Column(Enum(*ROLES, name="role"))
+    role = Column(Enum(*ROLES, name="roles"))
     disabled = Column(Boolean, default=False, server_default="false")
     @validates("approval_date")
@@ -607,6 +606,22 @@ class Permissions(CommonColumns):
             unique=True,
             postgresql_where=file_group_id.isnot(None),
         ),
+        Index(
+            "unique_trial_id_upload_type_is_null_perms",
+            "granted_to_user",
+            "trial_id",
+            literal_column("(upload_type IS NULL)"),
+            unique=True,
+            postgresql_where="(upload_type IS NULL)",
+        ),
+        Index(
+            "unique_upload_type_trial_id_is_null_perms",
+            "granted_to_user",
+            literal_column("(trial_id IS NULL)"),
+            "upload_type",
+            unique=True,
+            postgresql_where="(trial_id IS NULL)",
+        ),
     )
     # Shorthand to make code related to trial- and upload-type-level permissions
@@ -2294,7 +2309,7 @@ class DownloadableFiles(CommonColumns):
     additional_metadata = Column(JSONB, nullable=False)
     # TODO rename upload_type, because we store manifests in there too.
     # NOTE: this column actually has type CITEXT.
-    upload_type = Column(String, nullable=False)
+    upload_type = Column(CITEXT, nullable=False)
     md5_hash = Column(String, nullable=True)
     crc32c_hash = Column(String, nullable=True)
     trial_id = Column(String, nullable=False)
@@ -2314,7 +2329,7 @@ class DownloadableFiles(CommonColumns):
     #   used instead of data_format.
     # The columns are left as optional for short term backwards compatibility.
     file_name = Column(String, nullable=True)
-    data_format = Column(String, nullable=True)
+    data_format = Column(CITEXT, nullable=True)
     file_groups = relationship(
         "FileGroups",
@@ -3262,11 +3277,11 @@ class PreprocessedFiles(CommonColumns):
         ),
     )
-    file_name = Column(String)
-    object_url = Column(String)
+    file_name = Column(String, nullable=False)
+    object_url = Column(String, nullable=False)
     job_id = Column(Integer)
-    file_category = Column(String)
-    uploader_email = Column(String)
+    file_category = Column(String, nullable=False)
+    uploader_email = Column(String, nullable=False)
     status = Column(String)
     version = Column(Integer)
     released_version = Column(String)
@@ -3434,7 +3449,7 @@ class IngestionJobs(CommonColumns):
         ),
     )
-    status = Column("status", Enum(*INGESTION_JOB_STATUSES, name="status"), nullable=False)
+    status = Column("status", Enum(*INGESTION_JOB_STATUSES, name="ingestion_job_status"), nullable=False)
     trial_id = Column(String, nullable=False)
     version = Column(Integer, nullable=False)
     pending = Column(Boolean, nullable=False, default=False)
@@ -3450,30 +3465,9 @@ class IngestionJobs(CommonColumns):
     @with_default_session
     def transition_status(self, status: str, session: Session):
-        # create required categories after opening job for submission
-        if self.status == "DRAFT" and status == "INITIAL SUBMISSION":
-            for category in self.derive_required_categories_from_appendix_a():
-                JobFileCategories.create(category=category, job_id=self.id, type="required")
         self.status = status
         self.update(session=session)
-    def derive_required_categories_from_appendix_a(self) -> List:
-        appendix_a = PreprocessedFiles.get_files_by_category_and_status(TRIAL_APPENDIX_A, "current", job_id=self.id)[0]
-        df = gcs_xlsx_or_csv_file_to_pandas_dataframe(GOOGLE_CLINICAL_DATA_BUCKET, appendix_a.object_url)
-        categories = []
-        headers_ended = False
-        for _index, row in df.iterrows():
-            cell = str(row.iloc[0])
-            if headers_ended:
-                if cell != "nan" and cell not in categories:
-                    categories.append(cell)
-            elif cell.lower() == TRIAL_APPENDIX_A_CELL_THAT_ENDS_THE_HEADER.lower():
-                headers_ended = True
-        if "data_dictionary" not in categories:
-            # Ensure Data_Dictionary is always a required file category
-            categories.append("data_dictionary")
-        return categories
     @classmethod
     @with_default_session
     def atomic_set_job_as_pending(cls, job_id: int, session: Session) -> Boolean:
@@ -3544,15 +3538,17 @@ class JobFileCategories(CommonColumns):
             ["ingestion_jobs.id"],
         ),
         Index(
-            "idx_categories_job_id" "job_id",
+            "idx_categories_job_id",
+            "job_id",
             "category",
             unique=True,
         ),
     )
     category = Column(String)
-    job_id = Column(Integer)
-    type = Column(Enum("required", "optional", name="type"))
+    job_id = Column(Integer, nullable=False)
+    type = Column(Enum("required", "optional", name="type"), nullable=False)
+    is_custom = Column(Boolean, nullable=False, default=False, server_default="false")
     @staticmethod
     @with_default_session
@@ -3560,12 +3556,14 @@ class JobFileCategories(CommonColumns):
         category: str,
         job_id: int,
         type: str,
+        is_custom: bool = False,
         session: Session = None,
     ):
         new_category = JobFileCategories(
             category=category,
             job_id=job_id,
             type=type,
+            is_custom=is_custom,
         )
         new_category.insert(session=session)
         return new_category
@@ -3576,6 +3574,39 @@ class JobFileCategories(CommonColumns):
         categories = session.query(cls).filter(cls.job_id == job_id, cls.type == type).all()
         return [c.category for c in categories]
+    @classmethod
+    @with_default_session
+    def full_categories_for_job(cls, job_id: int, session: Session = None):
+        return session.query(cls).filter_by(job_id=job_id).all()
+class CategoryDataElements(CommonColumns):
+    __tablename__ = "category_data_elements"
+    __table_args__ = (
+        ForeignKeyConstraint(
+            ["category_id"],
+            ["job_file_categories.id"],
+            ondelete="CASCADE",
+        ),
+        Index(
+            "idx_elements_category_id",
+            "category_id",
+            "name",
+            unique=True,
+        ),
+    )
+    category_id = Column(Integer, nullable=False)
+    name = Column(String, nullable=False)
+    is_custom = Column(Boolean, nullable=False, default=False, server_default="false")
+    element_type = Column(String, nullable=False)
+    cardinality = Column(String, nullable=True)
+    @classmethod
+    @with_default_session
+    def elements_for_category(cls, category_id: int, session: Session = None):
+        return session.query(cls).filter_by(category_id=category_id).all()
 class FileValidationErrors(CommonColumns):
     __tablename__ = "file_validation_errors"

{nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/shared/file_handling.py RENAMED Viewed

@@ -1,14 +1,15 @@
 from pathlib import Path
+from pandas import Series, DataFrame
+from sqlalchemy.orm.session import Session
 from werkzeug.datastructures import FileStorage
 from werkzeug.exceptions import BadRequest, InternalServerError
 from ..config.logging import get_logger
 from ..config.settings import GOOGLE_CLINICAL_DATA_BUCKET
-from ..models import PreprocessedFiles
+from ..models import PreprocessedFiles, TRIAL_APPENDIX_A_CELL_THAT_ENDS_THE_HEADER
 from ..shared.auth import get_current_user
 from ..shared.gcloud_client import upload_file_to_gcs, move_gcs_file
-from sqlalchemy.orm.session import Session
 logger = get_logger(__name__)
@@ -104,3 +105,37 @@ def strip_filename_and_pending_folder(path_str):
     if path.parent.name != "pending":
         raise ValueError("Expected 'pending' folder above file")
     return str(path.parent.parent)
+def get_row_at_condition(df: DataFrame, condition):
+    condition_met_index = df[condition].index[0]
+    row_at_condition_series = df.iloc[condition_met_index]
+    return row_at_condition_series
+def get_column(header_row_series: Series, header_name: str, use_raw_header_val: bool = False):
+    for idx, raw_header in enumerate(header_row_series):
+        if str(raw_header).lower() == header_name.lower():
+            return raw_header if use_raw_header_val else header_row_series.index[idx]
+    return None
+def get_column_from_appendix_a(appendix_a_df: DataFrame, header_name: str):
+    category_column = appendix_a_df.columns[0]
+    aa_header_condition = appendix_a_df[category_column] == TRIAL_APPENDIX_A_CELL_THAT_ENDS_THE_HEADER
+    header_row_series = get_row_at_condition(appendix_a_df, aa_header_condition)
+    return get_column(header_row_series, header_name)
+def get_column_from_first_row(df: DataFrame, header_name: str):
+    use_raw_header_val = False
+    if df.columns.inferred_type == "integer":
+        # If columns are integers (i.e. file was read without headers), treat the first row as header values.
+        header_row_series = df.iloc[0]
+    else:
+        # Otherwise columns already are headers
+        header_row_series = Series(df.columns)
+        use_raw_header_val = True
+    return get_column(header_row_series, header_name, use_raw_header_val=use_raw_header_val)

{nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/shared/gcloud_client.py RENAMED Viewed

@@ -1,6 +1,6 @@
 """Utilities for interacting with the Google Cloud Platform APIs."""
-# pylint: disable=logging-fstring-interpolation
+# pylint: disable=logging-fstring-interpolation,too-many-lines
 import base64
 import datetime
@@ -37,8 +37,8 @@ from sqlalchemy.orm.session import Session
 from werkzeug.datastructures import FileStorage
 from werkzeug.utils import secure_filename
-from cidc_api.config.secrets import get_secrets_manager
 from ..config.logging import get_logger
+from ..config.secrets import get_secrets_manager
 from ..config.settings import (
     DEV_USE_GCS,
     GOOGLE_INTAKE_ROLE,
@@ -62,6 +62,8 @@ from ..config.settings import (
     DEV_CFUNCTIONS_SERVER,
     INACTIVE_USER_DAYS,
 )
+from ..shared.utils import strip_whitespaces
 os.environ["TZ"] = "UTC"
 logger = get_logger(__name__)
@@ -427,9 +429,9 @@ def gcs_xlsx_or_csv_file_to_pandas_dataframe(bucket_name: str, blob_name: str):
     # TODO: specify sheet in xlsx file and/or accept tsv and xls files
     if blob_name[-3:] == "csv":
-        return pd.read_csv(temp_file)
+        return strip_whitespaces(pd.read_csv(temp_file))
     elif blob_name[-4:] == "xlsx":
-        return pd.read_excel(temp_file)
+        return strip_whitespaces(pd.read_excel(temp_file))
     else:
         raise Exception("Can only read csv or xlsx files")

nci_cidc_api_modules-1.2.17/cidc_api/shared/utils.py ADDED Viewed

@@ -0,0 +1,8 @@
+def strip_whitespaces(df):
+    def stripper(x):
+        if x and isinstance(x, str):
+            return x.strip()
+        else:
+            return x
+    return df.map(stripper)

{nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17/nci_cidc_api_modules.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nci_cidc_api_modules
-Version: 1.2.15
+Version: 1.2.17
 Summary: SQLAlchemy data models and configuration tools used in the NCI CIDC API
 Home-page: https://github.com/NCI-CIDC/cidc-api-gae
 License: MIT license
@@ -12,7 +12,7 @@ Requires-Dist: cloud-sql-python-connector[pg8000]>=1.18.5
 Requires-Dist: flask>=3.1.2
 Requires-Dist: flask-migrate>=4.1.0
 Requires-Dist: flask-sqlalchemy>=3.1.1
-Requires-Dist: google-auth>=2.42.0
+Requires-Dist: google-auth==2.41.1
 Requires-Dist: google-api-python-client>=2.185.0
 Requires-Dist: google-cloud-bigquery>=3.38.0
 Requires-Dist: google-cloud-pubsub>=2.32.0

{nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/nci_cidc_api_modules.egg-info/SOURCES.txt RENAMED Viewed

@@ -24,6 +24,7 @@ cidc_api/shared/file_handling.py
 cidc_api/shared/gcloud_client.py
 cidc_api/shared/jose.py
 cidc_api/shared/rest_utils.py
+cidc_api/shared/utils.py
 nci_cidc_api_modules.egg-info/PKG-INFO
 nci_cidc_api_modules.egg-info/SOURCES.txt
 nci_cidc_api_modules.egg-info/dependency_links.txt

{nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/nci_cidc_api_modules.egg-info/requires.txt RENAMED Viewed

@@ -3,7 +3,7 @@ cloud-sql-python-connector[pg8000]>=1.18.5
 flask>=3.1.2
 flask-migrate>=4.1.0
 flask-sqlalchemy>=3.1.1
-google-auth>=2.42.0
+google-auth==2.41.1
 google-api-python-client>=2.185.0
 google-cloud-bigquery>=3.38.0
 google-cloud-pubsub>=2.32.0

{nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/requirements.modules.txt RENAMED Viewed

@@ -3,7 +3,7 @@ cloud-sql-python-connector[pg8000]>=1.18.5
 flask>=3.1.2
 flask-migrate>=4.1.0
 flask-sqlalchemy>=3.1.1
-google-auth>=2.42.0
+google-auth==2.41.1 # There is a bug in 2.42.X that causes local to fail when connecting to dev
 google-api-python-client>=2.185.0
 google-cloud-bigquery>=3.38.0
 google-cloud-pubsub>=2.32.0