nci-cidc-api-modules 1.1.35__py3-none-any.whl → 1.1.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cidc_api/config/db.py CHANGED
@@ -4,13 +4,12 @@ from flask import Flask
4
4
  from flask_sqlalchemy import SQLAlchemy
5
5
  from flask_migrate import Migrate, upgrade
6
6
  from sqlalchemy.engine.url import URL
7
- from sqlalchemy.ext.declarative import declarative_base
8
-
7
+ from sqlalchemy.orm import declarative_base
9
8
 
10
9
  from .secrets import get_secrets_manager
11
10
 
12
11
  db = SQLAlchemy()
13
- BaseModel = declarative_base(bind=db)
12
+ BaseModel = declarative_base()
14
13
  db.Model = BaseModel
15
14
 
16
15
 
@@ -54,7 +53,7 @@ def get_sqlalchemy_database_uri(testing: bool = False) -> str:
54
53
  "Either POSTGRES_URI or CLOUD_SQL_INSTANCE_NAME must be defined to connect " + "to a database."
55
54
  )
56
55
 
57
- db_uri = str(URL(**config))
56
+ db_uri = str(URL.create(**config).render_as_string(hide_password=False))
58
57
 
59
58
  assert db_uri
60
59
 
cidc_api/models/models.py CHANGED
@@ -23,6 +23,14 @@ __all__ = [
23
23
  "ValidationMultiError",
24
24
  "with_default_session",
25
25
  "PreprocessedFiles",
26
+ "IngestionJobs",
27
+ "JobFileCategories",
28
+ "TRIAL_APPENDIX_A",
29
+ "REQUEST_LETTER",
30
+ "ADMIN_FILE_CATEGORIES",
31
+ "FINAL_JOB_STATUS",
32
+ "INGESTION_JOB_STATUSES",
33
+ "INGESTION_JOB_COLORS",
26
34
  ]
27
35
 
28
36
  import hashlib
@@ -36,6 +44,7 @@ from functools import wraps
36
44
  from typing import (
37
45
  Any,
38
46
  BinaryIO,
47
+ ClassVar,
39
48
  Dict,
40
49
  Optional,
41
50
  List,
@@ -54,32 +63,33 @@ from google.cloud.storage import Blob
54
63
  from jsonschema.exceptions import ValidationError
55
64
  from sqlalchemy import (
56
65
  and_,
57
- Column,
66
+ asc,
67
+ case,
68
+ desc,
69
+ func,
70
+ literal,
71
+ literal_column,
72
+ not_,
73
+ or_,
74
+ select,
75
+ text,
76
+ true,
77
+ tuple_,
78
+ update,
79
+ BigInteger,
58
80
  Boolean,
81
+ CheckConstraint,
82
+ Column,
59
83
  DateTime,
60
- Integer,
61
- BigInteger,
62
- String,
63
84
  Enum,
64
- Index,
65
- func,
66
- CheckConstraint,
67
85
  ForeignKey,
68
86
  ForeignKeyConstraint,
87
+ Index,
88
+ Integer,
89
+ MetaData,
69
90
  PrimaryKeyConstraint,
70
- tuple_,
71
- asc,
72
- desc,
73
- update,
74
- case,
75
- select,
76
- literal_column,
77
- not_,
78
- literal,
79
- or_,
91
+ String,
80
92
  Table,
81
- MetaData,
82
- true,
83
93
  )
84
94
  from sqlalchemy.dialects.postgresql import JSONB, UUID
85
95
  from sqlalchemy.engine import ResultProxy
@@ -96,8 +106,6 @@ from sqlalchemy.sql import (
96
106
  # break up this giant file.
97
107
  and_ as sql_and,
98
108
  or_ as sql_or,
99
- # select, # ALREADY IMPORTED
100
- text,
101
109
  )
102
110
  from sqlalchemy.sql.elements import BooleanClauseList
103
111
  from sqlalchemy.sql.functions import coalesce
@@ -119,6 +127,7 @@ from ..config.settings import (
119
127
  MAX_PAGINATION_PAGE_SIZE,
120
128
  TESTING,
121
129
  INACTIVE_USER_DAYS,
130
+ GOOGLE_CLINICAL_DATA_BUCKET,
122
131
  )
123
132
  from ..shared import emails
124
133
  from ..shared.gcloud_client import (
@@ -132,6 +141,7 @@ from ..shared.gcloud_client import (
132
141
  revoke_intake_access,
133
142
  revoke_lister_access,
134
143
  revoke_bigquery_access,
144
+ gcs_xlsx_or_csv_file_to_pandas_dataframe,
135
145
  )
136
146
 
137
147
  os.environ["TZ"] = "UTC"
@@ -309,7 +319,7 @@ class CommonColumns(BaseModel): # type: ignore
309
319
  @with_default_session
310
320
  def find_by_id(cls, id: int, session: Session):
311
321
  """Find the record with this id"""
312
- return session.query(cls).get(id)
322
+ return session.get(cls, id)
313
323
 
314
324
  @classmethod
315
325
  @with_default_session
@@ -347,6 +357,7 @@ class CIDCRole(EnumBaseClass):
347
357
  NCI_BIOBANK_USER = "nci-biobank-user"
348
358
  NETWORK_VIEWER = "network-viewer"
349
359
  PACT_USER = "pact-user"
360
+ CLINICAL_TRIAL_USER = "clinical-trial-user"
350
361
 
351
362
 
352
363
  ROLES = [role.value for role in CIDCRole]
@@ -1207,9 +1218,10 @@ class TrialMetadata(CommonColumns):
1207
1218
  raise NoResultFound(f"No trial found with id {trial_id}")
1208
1219
  return unprism.unprism_samples(trial.metadata_json)
1209
1220
 
1210
- file_bundle: Optional[FileBundle]
1211
- num_participants: Optional[int]
1212
- num_samples: Optional[int]
1221
+ file_bundle: ClassVar[Optional[FileBundle]]
1222
+ num_participants: ClassVar[Optional[int]]
1223
+ num_samples: ClassVar[Optional[int]]
1224
+ ready_for_submission: ClassVar[Optional[bool]]
1213
1225
 
1214
1226
  # List of metadata JSON fields that should not be sent to clients
1215
1227
  # in queries that list trial metadata, because they may contain a lot
@@ -1330,11 +1342,26 @@ class TrialMetadata(CommonColumns):
1330
1342
  del trial.file_bundle[assay][purpose]
1331
1343
  if not trial.file_bundle[assay]:
1332
1344
  del trial.file_bundle[assay]
1345
+ # Check if trial is ready for submission
1346
+ setattr(trial, "ready_for_submission", trial.ready_for_submission())
1333
1347
 
1334
1348
  trials.append(trial)
1335
1349
 
1336
1350
  return trials
1337
1351
 
1352
+ @with_default_session
1353
+ def ready_for_submission(self, session: Session) -> bool:
1354
+ open_job = IngestionJobs.get_open_job_by_trial(self.trial_id)
1355
+ if not open_job:
1356
+ return False
1357
+ appendix_a_files = PreprocessedFiles.get_files_by_category_and_status(
1358
+ "trial_appendix_a", "current", job_id=open_job.id
1359
+ )
1360
+ trial_letters = PreprocessedFiles.get_files_by_category_and_status(
1361
+ "request_letter", "current", job_id=open_job.id
1362
+ )
1363
+ return appendix_a_files and trial_letters and open_job.status == "DRAFT"
1364
+
1338
1365
  @with_default_session
1339
1366
  def insert(
1340
1367
  self,
@@ -1711,6 +1738,30 @@ class TrialMetadata(CommonColumns):
1711
1738
  jsonb_array_elements(batch->'records') record
1712
1739
  """
1713
1740
 
1741
+ # Find all samples associated with scrnaseq analysis uploads.
1742
+ scrnaseq_analysis_subquery = """
1743
+ select
1744
+ trial_id,
1745
+ 'scrnaseq_analysis' as key,
1746
+ record->>'cimac_id' as cimac_id
1747
+ from
1748
+ trial_metadata,
1749
+ jsonb_array_elements(metadata_json#>'{analysis,scrnaseq_analysis}') batch,
1750
+ jsonb_array_elements(batch->'records') record
1751
+ """
1752
+
1753
+ # Find all samples associated with visium analysis uploads.
1754
+ visium_analysis_subquery = """
1755
+ select
1756
+ trial_id,
1757
+ 'visium_analysis' as key,
1758
+ record->>'cimac_id' as cimac_id
1759
+ from
1760
+ trial_metadata,
1761
+ jsonb_array_elements(metadata_json#>'{analysis,visium_analysis}') batch,
1762
+ jsonb_array_elements(batch->'records') record
1763
+ """
1764
+
1714
1765
  # Build up a JSON object mapping analysis types to arrays of excluded samples.
1715
1766
  # The resulting object will have structure like:
1716
1767
  # {
@@ -1866,6 +1917,10 @@ class TrialMetadata(CommonColumns):
1866
1917
  {cytof_analysis_subquery}
1867
1918
  union all
1868
1919
  {atacseq_analysis_subquery}
1920
+ union all
1921
+ {scrnaseq_analysis_subquery}
1922
+ union all
1923
+ {visium_analysis_subquery}
1869
1924
  ) assays_and_analysis
1870
1925
  group by
1871
1926
  trial_id, key
@@ -1924,7 +1979,7 @@ class TrialMetadata(CommonColumns):
1924
1979
  - `"wes_tumor_only_analysis"` counts (tumor) samples with tumor-only analysis
1925
1980
  For `"total_[participants/samples]"`, ALL (ie tumor AND normal) WES assay samples are included.
1926
1981
  """
1927
- summaries_query = "SELECT result FROM trial_summaries_mv"
1982
+ summaries_query = text("SELECT result FROM trial_summaries_mv")
1928
1983
  # Retrieve trial-level summary results from data cached in trial_summaries_mv materialized view.
1929
1984
  # The source of the SQL query used in trial_summaries_mv is get_summaries_query()
1930
1985
  summaries = [summary for (summary,) in session.execute(summaries_query) if summary]
@@ -2256,6 +2311,7 @@ class DownloadableFiles(CommonColumns):
2256
2311
  "FileGroups",
2257
2312
  secondary="files_to_file_groups",
2258
2313
  back_populates="downloadable_files",
2314
+ cascade="save-update",
2259
2315
  )
2260
2316
 
2261
2317
  FILE_EXT_REGEX = r"\.([^./]*(\.gz)?)$"
@@ -2751,7 +2807,7 @@ class DownloadableFiles(CommonColumns):
2751
2807
  """
2752
2808
 
2753
2809
  where_clause = DownloadableFiles._generate_where_clause_with_permissions(user)
2754
- statement = select([DownloadableFiles.id]).where(sql_and(DownloadableFiles.id.in_(ids), where_clause))
2810
+ statement = select(DownloadableFiles.id).where(sql_and(DownloadableFiles.id.in_(ids), where_clause))
2755
2811
 
2756
2812
  return [row[0] for row in session.execute(statement).fetchall()]
2757
2813
 
@@ -2783,7 +2839,7 @@ class DownloadableFiles(CommonColumns):
2783
2839
 
2784
2840
  for file in files_to_delete:
2785
2841
  file.delete(commit=True)
2786
- session.execute("REFRESH MATERIALIZED VIEW CONCURRENTLY trial_summaries_mv")
2842
+ session.execute(text("REFRESH MATERIALIZED VIEW CONCURRENTLY trial_summaries_mv"))
2787
2843
 
2788
2844
  @classmethod
2789
2845
  @with_default_session
@@ -3054,23 +3110,19 @@ class DownloadableFiles(CommonColumns):
3054
3110
 
3055
3111
  id_bundles = (
3056
3112
  select(
3057
- [
3058
- cls.trial_id,
3059
- cls.data_category_prefix.label(type_col.key),
3060
- cls.file_purpose.label(purp_col.key),
3061
- func.json_agg(cls.id).label(ids_col.key),
3062
- ]
3113
+ cls.trial_id,
3114
+ cls.data_category_prefix.label(type_col.key),
3115
+ cls.file_purpose.label(purp_col.key),
3116
+ func.json_agg(cls.id).label(ids_col.key),
3063
3117
  )
3064
3118
  .group_by(cls.trial_id, cls.data_category_prefix, cls.file_purpose)
3065
3119
  .alias("id_bundles")
3066
3120
  )
3067
3121
  purpose_bundles = (
3068
3122
  select(
3069
- [
3070
- tid_col,
3071
- type_col,
3072
- func.json_object_agg(func.coalesce(purp_col, "miscellaneous"), ids_col).label(purps_col.key),
3073
- ]
3123
+ tid_col,
3124
+ type_col,
3125
+ func.json_object_agg(func.coalesce(purp_col, "miscellaneous"), ids_col).label(purps_col.key),
3074
3126
  )
3075
3127
  .select_from(id_bundles)
3076
3128
  .group_by(tid_col, type_col)
@@ -3078,10 +3130,8 @@ class DownloadableFiles(CommonColumns):
3078
3130
  )
3079
3131
  file_bundles = (
3080
3132
  select(
3081
- [
3082
- tid_col.label(tid_col.key),
3083
- func.json_object_agg(func.coalesce(type_col, "other"), purps_col).label("file_bundle"),
3084
- ]
3133
+ tid_col.label(tid_col.key),
3134
+ func.json_object_agg(func.coalesce(type_col, "other"), purps_col).label("file_bundle"),
3085
3135
  )
3086
3136
  .select_from(purpose_bundles)
3087
3137
  .group_by(tid_col)
@@ -3131,13 +3181,13 @@ class DownloadableFiles(CommonColumns):
3131
3181
  # Query clause for computing a downloadable file's data category.
3132
3182
  # Used above in the DownloadableFiles.data_category computed property.
3133
3183
  DATA_CATEGORY_CASE_CLAUSE = case(
3134
- [(DownloadableFiles.facet_group == k, v) for k, v in facet_groups_to_categories.items()]
3184
+ *[(DownloadableFiles.facet_group == k, v) for k, v in facet_groups_to_categories.items()]
3135
3185
  )
3136
3186
 
3137
3187
  # Query clause for computing a downloadable file's file purpose.
3138
3188
  # Used above in the DownloadableFiles.file_purpose computed property.
3139
3189
  FILE_PURPOSE_CASE_CLAUSE = case(
3140
- [
3190
+ *[
3141
3191
  (DownloadableFiles.facet_group == facet_group, file_details.file_purpose)
3142
3192
  for facet_group, file_details in details_dict.items()
3143
3193
  ]
@@ -3146,7 +3196,7 @@ FILE_PURPOSE_CASE_CLAUSE = case(
3146
3196
 
3147
3197
  def result_proxy_to_models(result_proxy: ResultProxy, model: BaseModel) -> List[BaseModel]:
3148
3198
  """Materialize a sqlalchemy `result_proxy` iterable as a list of `model` instances"""
3149
- return [model(**dict(row_proxy)) for row_proxy in result_proxy.all()]
3199
+ return [model(**dict(row_proxy._mapping)) for row_proxy in result_proxy.all()]
3150
3200
 
3151
3201
 
3152
3202
  @with_default_session
@@ -3166,7 +3216,7 @@ def upload_manifest_json(
3166
3216
  * The updated trial metadata object is updated in the `TrialMetadata` table.
3167
3217
  """
3168
3218
  try:
3169
- TrialMetadata.patch_manifest(trial_id, md_patch, session=session, commit=False)
3219
+ TrialMetadata.patch_manifest(trial_id, md_patch, session=session, commit=True)
3170
3220
  except ValidationError as e:
3171
3221
  raise BadRequest(json_validation.format_validation_error(e)) from e
3172
3222
  except ValidationMultiError as e:
@@ -3187,12 +3237,24 @@ def upload_manifest_json(
3187
3237
  return manifest_upload.id
3188
3238
 
3189
3239
 
3240
+ TRIAL_APPENDIX_A = "trial_appendix_a"
3241
+ REQUEST_LETTER = "request_letter"
3242
+ ADMIN_FILE_CATEGORIES = [TRIAL_APPENDIX_A, REQUEST_LETTER]
3243
+
3244
+
3190
3245
  class PreprocessedFiles(CommonColumns):
3191
3246
  __tablename__ = "preprocessed_files"
3247
+ __table_args__ = (
3248
+ ForeignKeyConstraint(
3249
+ ["job_id"],
3250
+ ["ingestion_jobs.id"],
3251
+ name="preprocessed_files_job_id_fkey",
3252
+ ),
3253
+ )
3192
3254
 
3193
3255
  file_name = Column(String)
3194
3256
  object_url = Column(String)
3195
- trial_id = Column(String)
3257
+ job_id = Column(Integer)
3196
3258
  file_category = Column(String)
3197
3259
  uploader_email = Column(String)
3198
3260
  status = Column(String)
@@ -3207,7 +3269,7 @@ class PreprocessedFiles(CommonColumns):
3207
3269
  file_category: str,
3208
3270
  uploader_email: str,
3209
3271
  status: str = "pending",
3210
- trial_id: str = None,
3272
+ job_id: int = None,
3211
3273
  version: int = None,
3212
3274
  released_version: str = None,
3213
3275
  session: Session = None,
@@ -3219,7 +3281,7 @@ class PreprocessedFiles(CommonColumns):
3219
3281
  file_category=file_category,
3220
3282
  uploader_email=uploader_email,
3221
3283
  status=status,
3222
- trial_id=trial_id,
3284
+ job_id=job_id,
3223
3285
  version=version,
3224
3286
  released_version=released_version,
3225
3287
  )
@@ -3228,10 +3290,10 @@ class PreprocessedFiles(CommonColumns):
3228
3290
 
3229
3291
  @classmethod
3230
3292
  @with_default_session
3231
- def archive_current_files(cls, file_category: str, session: Session = None):
3293
+ def archive_current_files(cls, file_category: str, job_id: int = None, session: Session = None):
3232
3294
  """Update any 'current' files in the given category to 'archived'. Returns latest existing version number."""
3233
3295
  current_version = 0
3234
- current_files = cls.get_files_by_category_and_status(file_category, "current", session=session)
3296
+ current_files = cls.get_files_by_category_and_status(file_category, "current", job_id=job_id, session=session)
3235
3297
  for file in current_files:
3236
3298
  file.status = "archived"
3237
3299
  file._updated = datetime.now()
@@ -3241,22 +3303,35 @@ class PreprocessedFiles(CommonColumns):
3241
3303
 
3242
3304
  @classmethod
3243
3305
  @with_default_session
3244
- def delete_pending_files_by_category(cls, file_category: str, trial_id: str = None, session: Session = None):
3245
- """Delete all pending files matching given file_category and optional trial_id."""
3246
- records = cls.get_files_by_category_and_status(file_category, "pending", trial_id=trial_id, session=session)
3306
+ def delete_pending_files_by_category(cls, file_category: str, job_id: int = None, session: Session = None):
3307
+ """Delete all pending files matching given file_category and optional job_id."""
3308
+ records = cls.get_files_by_category_and_status(file_category, "pending", job_id=job_id, session=session)
3309
+ for record in records:
3310
+ session.delete(record)
3311
+ session.commit()
3312
+
3313
+ @classmethod
3314
+ @with_default_session
3315
+ def delete_files_by_category(cls, file_category: str, job_id: int = None, session: Session = None):
3316
+ """Delete all files matching a given file_category and job_id (or system files if job_id is None)."""
3317
+ query = session.query(cls).filter_by(file_category=file_category)
3318
+ query = cls.add_job_filter(query, job_id)
3319
+ records = query.all()
3320
+ if not records:
3321
+ return False
3247
3322
  for record in records:
3248
3323
  session.delete(record)
3249
3324
  session.commit()
3325
+ return True
3250
3326
 
3251
3327
  @classmethod
3252
3328
  @with_default_session
3253
3329
  def get_files_by_category_and_status(
3254
- cls, file_category: str, status: str, trial_id: str = None, session: Session = None
3330
+ cls, file_category: str, status: str, job_id: int = None, session: Session = None
3255
3331
  ) -> list["PreprocessedFiles"]:
3256
- """Return all files matching given file_category and status, optionally filtered by trial_id."""
3332
+ """Return all files matching file_category and status, with job_id filter (job_id is NULL if not provided)."""
3257
3333
  query = session.query(cls).filter_by(file_category=file_category, status=status)
3258
- if trial_id:
3259
- query = query.filter_by(trial_id=trial_id)
3334
+ query = cls.add_job_filter(query, job_id)
3260
3335
  return query.all()
3261
3336
 
3262
3337
  @classmethod
@@ -3266,3 +3341,171 @@ class PreprocessedFiles(CommonColumns):
3266
3341
  ) -> Optional["PreprocessedFiles"]:
3267
3342
  """Return the file matching the given category and version number."""
3268
3343
  return session.query(cls).filter_by(file_category=file_category, version=version).one_or_none()
3344
+
3345
+ @classmethod
3346
+ @with_default_session
3347
+ def get_system_reference_files(cls, status: str = "current", session: Session = None) -> list["PreprocessedFiles"]:
3348
+ """Return static reference files that are not linked to any job and not Master Appendix A."""
3349
+ return (
3350
+ session.query(cls)
3351
+ .filter(cls.job_id.is_(None))
3352
+ .filter(cls.file_category != "master_appendix_a")
3353
+ .filter_by(status=status)
3354
+ .all()
3355
+ )
3356
+
3357
+ # TODO: logic for pending vs current files after high level validation
3358
+ @classmethod
3359
+ @with_default_session
3360
+ def get_pending_non_admin_files(cls, job_id: int, session: Session) -> list["PreprocessedFiles"]:
3361
+ return (
3362
+ session.query(cls)
3363
+ .filter(cls.job_id == job_id)
3364
+ .filter(cls.status == "pending", cls.file_category.notin_(ADMIN_FILE_CATEGORIES))
3365
+ .all()
3366
+ )
3367
+
3368
+ @classmethod
3369
+ def add_job_filter(cls, query, job_id):
3370
+ """
3371
+ Add a job_id filter to the SQLAlchemy query:
3372
+ - If job_id is provided, filters for exact match.
3373
+ - If not, filters for system-wide files (where job_id IS NULL).
3374
+ """
3375
+ if job_id is not None:
3376
+ return query.filter_by(job_id=job_id)
3377
+ else:
3378
+ return query.filter(cls.job_id.is_(None))
3379
+
3380
+
3381
+ INGESTION_JOB_STATUSES = [
3382
+ "DRAFT",
3383
+ "INITIAL SUBMISSION",
3384
+ "VALIDATION REVIEW",
3385
+ "REVISION SUBMISSION",
3386
+ "INGESTION",
3387
+ "PUBLISHED",
3388
+ ]
3389
+
3390
+ # Business decision to pass hex codes from the backend though that should be done by the front end...
3391
+ INGESTION_JOB_COLORS = {
3392
+ "DRAFT": "",
3393
+ "INITIAL SUBMISSION": "#ACCAD7",
3394
+ "VALIDATION REVIEW": "#DABE90",
3395
+ "REVISION SUBMISSION": "#C8BAE5",
3396
+ "INGESTION": "#8FCEC7",
3397
+ "PUBLISHED": "#90D9E6",
3398
+ }
3399
+ # TODO If have "CANCELLED" concept or other final status, add here
3400
+ FINAL_JOB_STATUS = ["PUBLISHED"]
3401
+
3402
+
3403
+ class IngestionJobs(CommonColumns):
3404
+ __tablename__ = "ingestion_jobs"
3405
+ __table_args__ = (
3406
+ ForeignKeyConstraint(
3407
+ ["trial_id"],
3408
+ ["trial_metadata.trial_id"],
3409
+ name="ingestion_jobs_trial_id_fkey",
3410
+ ),
3411
+ )
3412
+
3413
+ status = Column("status", Enum(*INGESTION_JOB_STATUSES, name="status"), nullable=False)
3414
+ trial_id = Column(String, nullable=False)
3415
+ version = Column(Integer, nullable=False)
3416
+
3417
+ @staticmethod
3418
+ @with_default_session
3419
+ def create(trial_id: str, status: str, version: int, session: Session = None):
3420
+ new_job = IngestionJobs(trial_id=trial_id, status=status, version=version)
3421
+ new_job.insert(session=session)
3422
+ return new_job
3423
+
3424
+ @with_default_session
3425
+ def transition_status(self, status: str, session: Session):
3426
+ # create required categories after opening job for submission
3427
+ if self.status == "DRAFT" and status == "INITIAL SUBMISSION":
3428
+ for category in self.derive_required_categories_from_appendix_a():
3429
+ JobFileCategories.create(category=category, job_id=self.id, type="required")
3430
+ self.status = status
3431
+ self.update(session=session)
3432
+
3433
+ def derive_required_categories_from_appendix_a(self) -> List:
3434
+ appendix_a = PreprocessedFiles.get_files_by_category_and_status(TRIAL_APPENDIX_A, "current", job_id=self.id)[0]
3435
+ df = gcs_xlsx_or_csv_file_to_pandas_dataframe(GOOGLE_CLINICAL_DATA_BUCKET, appendix_a.object_url)
3436
+ categories = []
3437
+ headers_ended = False
3438
+ for index, row in df.iterrows():
3439
+ cell = str(row.iloc[0])
3440
+ if headers_ended:
3441
+ if not cell == "nan" and cell not in categories and cell != "Specialized_Data":
3442
+ categories.append(cell)
3443
+ elif cell == "PATIENT-LEVEL DATA":
3444
+ headers_ended = True
3445
+ return categories
3446
+
3447
+ @classmethod
3448
+ @with_default_session
3449
+ def get_jobs_by_trial(cls, trial_id: str, session: Session = None) -> list["IngestionJobs"]:
3450
+ return session.query(cls).filter(cls.trial_id == trial_id).order_by(cls.version.desc()).all()
3451
+
3452
+ @classmethod
3453
+ @with_default_session
3454
+ def get_open_job_by_trial(cls, trial_id: str, session: Session = None) -> Optional["IngestionJobs"]:
3455
+ """Return the open job for a given trial if it exists."""
3456
+ return (
3457
+ session.query(cls)
3458
+ .filter(
3459
+ cls.trial_id == trial_id,
3460
+ cls.status.notin_(FINAL_JOB_STATUS),
3461
+ )
3462
+ .order_by(cls._created.desc())
3463
+ .first()
3464
+ )
3465
+
3466
+ # TODO: figure out which users have access to which jobs
3467
+ @classmethod
3468
+ @with_default_session
3469
+ def get_open_jobs_for_user(cls, user: Users, session: Session = None) -> list["IngestionJobs"]:
3470
+ return session.query(cls).filter(cls.status.notin_(["DRAFT"])).order_by(cls._created.desc()).all()
3471
+
3472
+
3473
+ class JobFileCategories(CommonColumns):
3474
+ __tablename__ = "job_file_categories"
3475
+ __table_args__ = (
3476
+ ForeignKeyConstraint(
3477
+ ["job_id"],
3478
+ ["ingestion_jobs.id"],
3479
+ ),
3480
+ Index(
3481
+ "idx_categories_job_id", "job_id",
3482
+ "category",
3483
+ unique=True,
3484
+ ),
3485
+ )
3486
+
3487
+ category = Column(String)
3488
+ job_id = Column(Integer)
3489
+ type = Column(Enum("required", "optional", name="type"))
3490
+
3491
+ @staticmethod
3492
+ @with_default_session
3493
+ def create(
3494
+ category: str,
3495
+ job_id: int,
3496
+ type: str,
3497
+ session: Session = None,
3498
+ ):
3499
+ new_category = JobFileCategories(
3500
+ category=category,
3501
+ job_id=job_id,
3502
+ type=type,
3503
+ )
3504
+ new_category.insert(session=session)
3505
+ return new_category
3506
+
3507
+ @classmethod
3508
+ @with_default_session
3509
+ def categories_for_job(cls, job_id: int, type: str, session: Session = None):
3510
+ categories = session.query(cls).filter(cls.job_id == job_id, cls.type == type).all()
3511
+ return [c.category for c in categories]
@@ -104,6 +104,7 @@ class TrialMetadataSchema(BaseSchema):
104
104
  file_bundle = fields.Dict(dump_only=True)
105
105
  num_participants = fields.Int(dump_only=True)
106
106
  num_samples = fields.Int(dump_only=True)
107
+ ready_for_submission = fields.Bool(dump_only=True)
107
108
 
108
109
 
109
110
  TrialMetadataListSchema = _make_list_schema(TrialMetadataSchema())
cidc_api/shared/emails.py CHANGED
@@ -16,7 +16,7 @@ from ..config.settings import ENV
16
16
  # - errors from CSMS in update_cidc_from_csms,
17
17
  # - errors from kicking off permissions in grant_download_permissions, and
18
18
  # - errors from implementing permissions in worker > permissions_worker
19
- CIDC_MAILING_LIST = ["essex-alert@cimac-network.org", "mustafa.kucukkal@nih.gov"]
19
+ CIDC_MAILING_LIST = ["essex-alert@cimac-network.org"]
20
20
 
21
21
 
22
22
  def sendable(email_template):
@@ -0,0 +1,90 @@
1
+ from werkzeug.datastructures import FileStorage
2
+ from werkzeug.exceptions import BadRequest
3
+
4
+ from ..config.settings import GOOGLE_CLINICAL_DATA_BUCKET
5
+ from ..models import PreprocessedFiles
6
+ from ..shared.auth import get_current_user
7
+ from ..shared.gcloud_client import upload_file_to_gcs
8
+
9
+
10
+ def set_current_file(file: FileStorage, file_category: str, gcs_folder: str, job_id: int = None) -> PreprocessedFiles:
11
+ """
12
+ Archives any existing 'current' files for the given category and job,
13
+ then uploads the new file as the latest 'current' version.
14
+ """
15
+ latest_version = PreprocessedFiles.archive_current_files(file_category, job_id=job_id)
16
+ latest_file = create_file(file, gcs_folder, file_category, job_id, latest_version + 1)
17
+ return latest_file
18
+
19
+
20
+ def create_file(
21
+ file: FileStorage, gcs_folder: str, file_category: str, job_id: int = None, version: int = None
22
+ ) -> PreprocessedFiles:
23
+ """Upload file to GCS and create corresponding metadata record in the database."""
24
+ status = "pending" if gcs_folder.endswith("pending/") else "current"
25
+ # only need timestamp for current/approved files
26
+ append_timestamp = status == "current"
27
+ # create file in GCS
28
+ gcs_file_path = upload_file_to_gcs(file, GOOGLE_CLINICAL_DATA_BUCKET, gcs_folder, append_timestamp=append_timestamp)
29
+ # create corresponding record in db
30
+ file = PreprocessedFiles.create(
31
+ file_name=file.filename,
32
+ object_url=gcs_file_path,
33
+ file_category=file_category,
34
+ uploader_email=get_current_user().email,
35
+ status=status,
36
+ job_id=job_id,
37
+ version=version,
38
+ )
39
+ return file
40
+
41
+
42
+ def validate_file_extension(filename: str, allowed_extensions: list[str]):
43
+ if not filename or not any(filename.lower().endswith(ext) for ext in allowed_extensions):
44
+ raise BadRequest(f"Invalid file type. Must be one of: {allowed_extensions}")
45
+
46
+
47
+ def format_common_preprocessed_file_response(file: PreprocessedFiles):
48
+ """Format a common response for a single PreprocessedFiles record."""
49
+ return {
50
+ "file_name": file.file_name,
51
+ "gcs_uri": f"gs://{GOOGLE_CLINICAL_DATA_BUCKET}/{file.object_url}",
52
+ "status": file.status,
53
+ "file_category": file.file_category,
54
+ "uploader_email": file.uploader_email,
55
+ "date": file._created.isoformat(),
56
+ }
57
+
58
+
59
+ # TODO Below functions approve_pending_file and delete_pending_files were copied from deleted clinical_data.py
60
+ # Consider re-implementing with pending files in clinical data file uploads, or remove
61
+ # def approve_pending_file(pending_file: FileStorage):
62
+ # original_filename = pending_file.file_name
63
+ # pending_gcs_path = pending_file.object_url
64
+ # try:
65
+ # new_gcs_path = gcloud_client.move_gcs_file(
66
+ # GOOGLE_CLINICAL_DATA_BUCKET, pending_gcs_path, f"{MASTER_APPENDIX_A}/"
67
+ # )
68
+ # except Exception as e:
69
+ # logger.error(str(e))
70
+ # raise InternalServerError(str(e))
71
+ # # Move any 'current' file(s) to 'archived' status
72
+ # latest_version = PreprocessedFiles.archive_current_files(MASTER_APPENDIX_A)
73
+ # # Insert new "approved" DB record
74
+ # PreprocessedFiles.create(
75
+ # file_name=original_filename,
76
+ # object_url=new_gcs_path,
77
+ # file_category=MASTER_APPENDIX_A,
78
+ # uploader_email=get_current_user().email,
79
+ # status="current",
80
+ # version=latest_version + 1,
81
+ # )
82
+ # # Delete pending record
83
+ # pending_file.delete()
84
+ # return new_gcs_path
85
+ #
86
+ #
87
+ # def delete_pending_files(pending_folder: str, file_category: str):
88
+ # """Deletes specified pending file(s) from GCS and associated db record(s)."""
89
+ # gcloud_client.delete_items_from_folder(GOOGLE_CLINICAL_DATA_BUCKET, pending_folder)
90
+ # PreprocessedFiles.delete_pending_files_by_category(file_category)
@@ -26,6 +26,7 @@ from typing import (
26
26
 
27
27
  import googleapiclient.discovery
28
28
  import requests
29
+ import pandas as pd
29
30
  from cidc_schemas.prism.constants import ASSAY_TO_FILEPATH
30
31
  from google.api_core.client_options import ClientOptions
31
32
  from google.api_core.iam import Policy
@@ -217,10 +218,12 @@ def upload_xlsx_to_gcs(
217
218
  return final_object
218
219
 
219
220
 
220
- def upload_file_to_gcs(file: FileStorage, bucket_name: str, gcs_folder: str) -> str:
221
+ def upload_file_to_gcs(file: FileStorage, bucket_name: str, gcs_folder: str, append_timestamp: bool = False) -> str:
221
222
  """Upload a file to the specified GCS folder and return the GCS path from the bucket."""
222
223
  # Secure the filename and prepare file
223
224
  filename = secure_filename(file.filename)
225
+ if append_timestamp:
226
+ filename = _append_iso_timestamp_to_filename(filename)
224
227
  gcs_file_path = os.path.join(gcs_folder, filename)
225
228
  binary_file = io.BytesIO(file.read())
226
229
 
@@ -416,6 +419,20 @@ def upload_xlsx_to_intake_bucket(user_email: str, trial_id: str, upload_type: st
416
419
  return f"https://console.cloud.google.com/storage/browser/_details/{bucket_name}/{blob_name}"
417
420
 
418
421
 
422
+ def gcs_xlsx_or_csv_file_to_pandas_dataframe(bucket_name: str, blob_name: str):
423
+ """Reads an XLSX or CSV file from Google Cloud Storage into a Pandas DataFrame."""
424
+ sheet_data = storage.Client().bucket(bucket_name).blob(blob_name).download_as_bytes()
425
+ temp_file = io.BytesIO(sheet_data)
426
+
427
+ # TODO: specify sheet in xlsx file and/or accept tsv and xls files
428
+ if blob_name[-3:] == "csv":
429
+ return pd.read_csv(temp_file)
430
+ elif blob_name[-4:] == "xlsx":
431
+ return pd.read_excel(temp_file)
432
+ else:
433
+ raise Exception("Can only read csv or xlsx files")
434
+
435
+
419
436
  def _execute_multiblob_acl_change(
420
437
  user_email_list: List[str],
421
438
  blob_list: List[storage.Blob],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nci_cidc_api_modules
3
- Version: 1.1.35
3
+ Version: 1.1.38
4
4
  Summary: SQLAlchemy data models and configuration tools used in the NCI CIDC API
5
5
  Home-page: https://github.com/NCI-CIDC/cidc-api-gae
6
6
  License: MIT license
@@ -10,10 +10,10 @@ License-File: LICENSE
10
10
  Requires-Dist: werkzeug==3.0.6
11
11
  Requires-Dist: flask==3.0.3
12
12
  Requires-Dist: flask-migrate==3.1.0
13
- Requires-Dist: flask-sqlalchemy==3.0.2
14
- Requires-Dist: sqlalchemy==1.4.54
13
+ Requires-Dist: flask-sqlalchemy==3.1.1
14
+ Requires-Dist: sqlalchemy==2.0.41
15
15
  Requires-Dist: marshmallow==3.19.0
16
- Requires-Dist: marshmallow-sqlalchemy==0.22.3
16
+ Requires-Dist: marshmallow-sqlalchemy==1.4.2
17
17
  Requires-Dist: google-cloud-storage==2.18.0
18
18
  Requires-Dist: google-cloud-secret-manager==2.20.1
19
19
  Requires-Dist: google-cloud-pubsub==2.22.0
@@ -28,7 +28,7 @@ Requires-Dist: python-dotenv==0.10.3
28
28
  Requires-Dist: requests==2.32.4
29
29
  Requires-Dist: jinja2==3.1.6
30
30
  Requires-Dist: certifi==2024.7.4
31
- Requires-Dist: nci-cidc-schemas==0.27.26
31
+ Requires-Dist: nci-cidc-schemas==0.27.27
32
32
  Dynamic: description
33
33
  Dynamic: description-content-type
34
34
  Dynamic: home-page
@@ -1,23 +1,24 @@
1
1
  cidc_api/config/__init__.py,sha256=5mX8GAPxUKV84iS-aGOoE-4m68LsOCGCDptXNdlgvj0,148
2
- cidc_api/config/db.py,sha256=cyWhWtmXha4OsrwUf6ez8aKSfm7tPSmPDE9JVSBx3Fk,1935
2
+ cidc_api/config/db.py,sha256=5rf7kIowkiJIqJj2_JtO1cY9L55IjJBonJ-vThA4cGo,1960
3
3
  cidc_api/config/logging.py,sha256=abhVYtn8lfhIt0tyV2WHFgSmp_s2eeJh7kodB6LH4J0,1149
4
4
  cidc_api/config/secrets.py,sha256=jRFj7W43pWuPf9DZQLCKF7WPXf5cUv-BAaS3ASqhV_Q,1481
5
5
  cidc_api/config/settings.py,sha256=mA-4r7oB60uFepYtl5abbPigjwX8aBz__qCJXdcWWbs,4272
6
6
  cidc_api/models/__init__.py,sha256=bl445G8Zic9YbhZ8ZBni07wtBMhLJRMBA-JqjLxx2bw,66
7
7
  cidc_api/models/migrations.py,sha256=gp9vtkYbA9FFy2s-7woelAmsvQbJ41LO2_DY-YkFIrQ,11464
8
- cidc_api/models/models.py,sha256=JceOfSBetV6ifhYIPL4Qp0T3YpG55M016YvEajLUQ8o,132587
9
- cidc_api/models/schemas.py,sha256=7tDYtmULuzTt2kg7RorWhte06ffalgpQKrFiDRGcPEQ,2711
8
+ cidc_api/models/models.py,sha256=D4GmcQSLKGBi0k3-w3TZk7J46zKR80JViN2_vTBZ1ZQ,141678
9
+ cidc_api/models/schemas.py,sha256=6IE2dJoEMcMbi0Vr1V3cYKnPKU0hv9vRKBixOZHe88s,2766
10
10
  cidc_api/models/files/__init__.py,sha256=8BMTnUSHzUbz0lBeEQY6NvApxDD3GMWMduoVMos2g4Y,213
11
11
  cidc_api/models/files/details.py,sha256=sZkGM7iEV4-J6IDQCdiMV6KBDLbPxCOqUMaU3aY9rX8,65153
12
12
  cidc_api/models/files/facets.py,sha256=WqjfqtYJgY2tBnZ598Yc0eJdQUo2slFNLyTDaqPx_DE,32318
13
13
  cidc_api/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
14
  cidc_api/shared/auth.py,sha256=PHqmVGkqDjbmUofytVFwD_9ssgUomESl3fFtFHPwZYQ,9062
15
- cidc_api/shared/emails.py,sha256=GY-l0EkoVU_3hjV0g-xo7N9d1iyCdluyq_arftEPPe0,4989
16
- cidc_api/shared/gcloud_client.py,sha256=oD7Y3Glp1ZrPYkonv7DvX1koGtF30lgm3ENXYQ7G5cI,35634
15
+ cidc_api/shared/emails.py,sha256=HQIixEUsR8yyu7Iv8S81RjtvEQeGuzQHzBfGsWIfP7k,4961
16
+ cidc_api/shared/file_handling.py,sha256=lbdY4XH-otpNjWDY1g6EQoVWtEYM_9j95OlQXeeFkhE,3808
17
+ cidc_api/shared/gcloud_client.py,sha256=tgi6Ja31EUQcJueAIYHc3VyrMchoMZCdui1eruakCLg,36351
17
18
  cidc_api/shared/jose.py,sha256=-qzGzEDAlokEp9E7WtBtQkXyyfPWTYXlwYpCqVJWmqM,1830
18
19
  cidc_api/shared/rest_utils.py,sha256=RwR30WOUAYCxL7V-i2totEyeriG30GbBDvBcpLXhM9w,6594
19
- nci_cidc_api_modules-1.1.35.dist-info/licenses/LICENSE,sha256=pNYWVTHaYonnmJyplmeAp7tQAjosmDpAWjb34jjv7Xs,1102
20
- nci_cidc_api_modules-1.1.35.dist-info/METADATA,sha256=udkDYU-7dGtR_Msaw1vAFABYr8M5rq_RSbb8kjkyDbI,41285
21
- nci_cidc_api_modules-1.1.35.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
22
- nci_cidc_api_modules-1.1.35.dist-info/top_level.txt,sha256=rNiRzL0lJGi5Q9tY9uSoMdTbJ-7u5c_D2E86KA94yRA,9
23
- nci_cidc_api_modules-1.1.35.dist-info/RECORD,,
20
+ nci_cidc_api_modules-1.1.38.dist-info/licenses/LICENSE,sha256=pNYWVTHaYonnmJyplmeAp7tQAjosmDpAWjb34jjv7Xs,1102
21
+ nci_cidc_api_modules-1.1.38.dist-info/METADATA,sha256=Vlhx5ZliL-b3-3Umhqm1q22BztzUG-mc2dTIyyHzZ24,41284
22
+ nci_cidc_api_modules-1.1.38.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
23
+ nci_cidc_api_modules-1.1.38.dist-info/top_level.txt,sha256=rNiRzL0lJGi5Q9tY9uSoMdTbJ-7u5c_D2E86KA94yRA,9
24
+ nci_cidc_api_modules-1.1.38.dist-info/RECORD,,