nci-cidc-api-modules 1.2.15__tar.gz → 1.2.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {nci_cidc_api_modules-1.2.15/nci_cidc_api_modules.egg-info → nci_cidc_api_modules-1.2.17}/PKG-INFO +2 -2
  2. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/config/db.py +1 -3
  3. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/models/models.py +66 -35
  4. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/shared/file_handling.py +37 -2
  5. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/shared/gcloud_client.py +6 -4
  6. nci_cidc_api_modules-1.2.17/cidc_api/shared/utils.py +8 -0
  7. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17/nci_cidc_api_modules.egg-info}/PKG-INFO +2 -2
  8. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/nci_cidc_api_modules.egg-info/SOURCES.txt +1 -0
  9. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/nci_cidc_api_modules.egg-info/requires.txt +1 -1
  10. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/requirements.modules.txt +1 -1
  11. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/LICENSE +0 -0
  12. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/MANIFEST.in +0 -0
  13. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/README.md +0 -0
  14. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/config/__init__.py +0 -0
  15. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/config/logging.py +0 -0
  16. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/config/secrets.py +0 -0
  17. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/config/settings.py +0 -0
  18. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/models/__init__.py +0 -0
  19. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/models/files/__init__.py +0 -0
  20. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/models/files/details.py +0 -0
  21. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/models/files/facets.py +0 -0
  22. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/models/migrations.py +0 -0
  23. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/models/schemas.py +0 -0
  24. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/shared/__init__.py +0 -0
  25. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/shared/auth.py +0 -0
  26. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/shared/email_layout.html +0 -0
  27. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/shared/emails.py +0 -0
  28. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/shared/jose.py +0 -0
  29. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/cidc_api/shared/rest_utils.py +0 -0
  30. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/nci_cidc_api_modules.egg-info/dependency_links.txt +0 -0
  31. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/nci_cidc_api_modules.egg-info/not-zip-safe +0 -0
  32. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/nci_cidc_api_modules.egg-info/top_level.txt +0 -0
  33. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/pyproject.toml +0 -0
  34. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/setup.cfg +0 -0
  35. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/setup.py +0 -0
  36. {nci_cidc_api_modules-1.2.15 → nci_cidc_api_modules-1.2.17}/tests/test_api.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nci_cidc_api_modules
3
- Version: 1.2.15
3
+ Version: 1.2.17
4
4
  Summary: SQLAlchemy data models and configuration tools used in the NCI CIDC API
5
5
  Home-page: https://github.com/NCI-CIDC/cidc-api-gae
6
6
  License: MIT license
@@ -12,7 +12,7 @@ Requires-Dist: cloud-sql-python-connector[pg8000]>=1.18.5
12
12
  Requires-Dist: flask>=3.1.2
13
13
  Requires-Dist: flask-migrate>=4.1.0
14
14
  Requires-Dist: flask-sqlalchemy>=3.1.1
15
- Requires-Dist: google-auth>=2.42.0
15
+ Requires-Dist: google-auth==2.41.1
16
16
  Requires-Dist: google-api-python-client>=2.185.0
17
17
  Requires-Dist: google-cloud-bigquery>=3.38.0
18
18
  Requires-Dist: google-cloud-pubsub>=2.32.0
@@ -10,8 +10,7 @@ from google.cloud.sql.connector import Connector, IPTypes
10
10
  from .secrets import get_secrets_manager
11
11
 
12
12
  db = SQLAlchemy()
13
- BaseModel = declarative_base()
14
- db.Model = BaseModel
13
+ BaseModel = db.Model
15
14
 
16
15
  connector = Connector()
17
16
 
@@ -31,7 +30,6 @@ def getconn():
31
30
  def init_db(app: Flask):
32
31
  """Connect `app` to the database and run migrations"""
33
32
  db.init_app(app)
34
- db.Model = BaseModel
35
33
  Migrate(app, db, app.config["MIGRATIONS_PATH"])
36
34
  with app.app_context():
37
35
  upgrade(app.config["MIGRATIONS_PATH"])
@@ -26,6 +26,7 @@ __all__ = [
26
26
  "FileValidationErrors",
27
27
  "IngestionJobs",
28
28
  "JobFileCategories",
29
+ "CategoryDataElements",
29
30
  "ValidationConfigs",
30
31
  "TRIAL_APPENDIX_A",
31
32
  "TRIAL_APPENDIX_A_CELL_THAT_ENDS_THE_HEADER",
@@ -95,7 +96,7 @@ from sqlalchemy import (
95
96
  String,
96
97
  Table,
97
98
  )
98
- from sqlalchemy.dialects.postgresql import JSONB, UUID
99
+ from sqlalchemy.dialects.postgresql import JSONB, UUID, CITEXT
99
100
  from sqlalchemy.engine import ResultProxy
100
101
  from sqlalchemy.exc import IntegrityError
101
102
  from sqlalchemy.ext.hybrid import hybrid_property
@@ -131,7 +132,6 @@ from ..config.settings import (
131
132
  MAX_PAGINATION_PAGE_SIZE,
132
133
  TESTING,
133
134
  INACTIVE_USER_DAYS,
134
- GOOGLE_CLINICAL_DATA_BUCKET,
135
135
  )
136
136
  from ..shared import emails
137
137
  from ..shared.gcloud_client import (
@@ -145,7 +145,6 @@ from ..shared.gcloud_client import (
145
145
  revoke_intake_access,
146
146
  revoke_lister_access,
147
147
  revoke_bigquery_access,
148
- gcs_xlsx_or_csv_file_to_pandas_dataframe,
149
148
  )
150
149
 
151
150
  os.environ["TZ"] = "UTC"
@@ -382,7 +381,7 @@ class Users(CommonColumns):
382
381
  last_n = Column(String)
383
382
  organization = Column(Enum(*ORGS, name="orgs"))
384
383
  approval_date = Column(DateTime)
385
- role = Column(Enum(*ROLES, name="role"))
384
+ role = Column(Enum(*ROLES, name="roles"))
386
385
  disabled = Column(Boolean, default=False, server_default="false")
387
386
 
388
387
  @validates("approval_date")
@@ -607,6 +606,22 @@ class Permissions(CommonColumns):
607
606
  unique=True,
608
607
  postgresql_where=file_group_id.isnot(None),
609
608
  ),
609
+ Index(
610
+ "unique_trial_id_upload_type_is_null_perms",
611
+ "granted_to_user",
612
+ "trial_id",
613
+ literal_column("(upload_type IS NULL)"),
614
+ unique=True,
615
+ postgresql_where="(upload_type IS NULL)",
616
+ ),
617
+ Index(
618
+ "unique_upload_type_trial_id_is_null_perms",
619
+ "granted_to_user",
620
+ literal_column("(trial_id IS NULL)"),
621
+ "upload_type",
622
+ unique=True,
623
+ postgresql_where="(trial_id IS NULL)",
624
+ ),
610
625
  )
611
626
 
612
627
  # Shorthand to make code related to trial- and upload-type-level permissions
@@ -2294,7 +2309,7 @@ class DownloadableFiles(CommonColumns):
2294
2309
  additional_metadata = Column(JSONB, nullable=False)
2295
2310
  # TODO rename upload_type, because we store manifests in there too.
2296
2311
  # NOTE: this column actually has type CITEXT.
2297
- upload_type = Column(String, nullable=False)
2312
+ upload_type = Column(CITEXT, nullable=False)
2298
2313
  md5_hash = Column(String, nullable=True)
2299
2314
  crc32c_hash = Column(String, nullable=True)
2300
2315
  trial_id = Column(String, nullable=False)
@@ -2314,7 +2329,7 @@ class DownloadableFiles(CommonColumns):
2314
2329
  # used instead of data_format.
2315
2330
  # The columns are left as optional for short term backwards compatibility.
2316
2331
  file_name = Column(String, nullable=True)
2317
- data_format = Column(String, nullable=True)
2332
+ data_format = Column(CITEXT, nullable=True)
2318
2333
 
2319
2334
  file_groups = relationship(
2320
2335
  "FileGroups",
@@ -3262,11 +3277,11 @@ class PreprocessedFiles(CommonColumns):
3262
3277
  ),
3263
3278
  )
3264
3279
 
3265
- file_name = Column(String)
3266
- object_url = Column(String)
3280
+ file_name = Column(String, nullable=False)
3281
+ object_url = Column(String, nullable=False)
3267
3282
  job_id = Column(Integer)
3268
- file_category = Column(String)
3269
- uploader_email = Column(String)
3283
+ file_category = Column(String, nullable=False)
3284
+ uploader_email = Column(String, nullable=False)
3270
3285
  status = Column(String)
3271
3286
  version = Column(Integer)
3272
3287
  released_version = Column(String)
@@ -3434,7 +3449,7 @@ class IngestionJobs(CommonColumns):
3434
3449
  ),
3435
3450
  )
3436
3451
 
3437
- status = Column("status", Enum(*INGESTION_JOB_STATUSES, name="status"), nullable=False)
3452
+ status = Column("status", Enum(*INGESTION_JOB_STATUSES, name="ingestion_job_status"), nullable=False)
3438
3453
  trial_id = Column(String, nullable=False)
3439
3454
  version = Column(Integer, nullable=False)
3440
3455
  pending = Column(Boolean, nullable=False, default=False)
@@ -3450,30 +3465,9 @@ class IngestionJobs(CommonColumns):
3450
3465
 
3451
3466
  @with_default_session
3452
3467
  def transition_status(self, status: str, session: Session):
3453
- # create required categories after opening job for submission
3454
- if self.status == "DRAFT" and status == "INITIAL SUBMISSION":
3455
- for category in self.derive_required_categories_from_appendix_a():
3456
- JobFileCategories.create(category=category, job_id=self.id, type="required")
3457
3468
  self.status = status
3458
3469
  self.update(session=session)
3459
3470
 
3460
- def derive_required_categories_from_appendix_a(self) -> List:
3461
- appendix_a = PreprocessedFiles.get_files_by_category_and_status(TRIAL_APPENDIX_A, "current", job_id=self.id)[0]
3462
- df = gcs_xlsx_or_csv_file_to_pandas_dataframe(GOOGLE_CLINICAL_DATA_BUCKET, appendix_a.object_url)
3463
- categories = []
3464
- headers_ended = False
3465
- for _index, row in df.iterrows():
3466
- cell = str(row.iloc[0])
3467
- if headers_ended:
3468
- if cell != "nan" and cell not in categories:
3469
- categories.append(cell)
3470
- elif cell.lower() == TRIAL_APPENDIX_A_CELL_THAT_ENDS_THE_HEADER.lower():
3471
- headers_ended = True
3472
- if "data_dictionary" not in categories:
3473
- # Ensure Data_Dictionary is always a required file category
3474
- categories.append("data_dictionary")
3475
- return categories
3476
-
3477
3471
  @classmethod
3478
3472
  @with_default_session
3479
3473
  def atomic_set_job_as_pending(cls, job_id: int, session: Session) -> Boolean:
@@ -3544,15 +3538,17 @@ class JobFileCategories(CommonColumns):
3544
3538
  ["ingestion_jobs.id"],
3545
3539
  ),
3546
3540
  Index(
3547
- "idx_categories_job_id" "job_id",
3541
+ "idx_categories_job_id",
3542
+ "job_id",
3548
3543
  "category",
3549
3544
  unique=True,
3550
3545
  ),
3551
3546
  )
3552
3547
 
3553
3548
  category = Column(String)
3554
- job_id = Column(Integer)
3555
- type = Column(Enum("required", "optional", name="type"))
3549
+ job_id = Column(Integer, nullable=False)
3550
+ type = Column(Enum("required", "optional", name="type"), nullable=False)
3551
+ is_custom = Column(Boolean, nullable=False, default=False, server_default="false")
3556
3552
 
3557
3553
  @staticmethod
3558
3554
  @with_default_session
@@ -3560,12 +3556,14 @@ class JobFileCategories(CommonColumns):
3560
3556
  category: str,
3561
3557
  job_id: int,
3562
3558
  type: str,
3559
+ is_custom: bool = False,
3563
3560
  session: Session = None,
3564
3561
  ):
3565
3562
  new_category = JobFileCategories(
3566
3563
  category=category,
3567
3564
  job_id=job_id,
3568
3565
  type=type,
3566
+ is_custom=is_custom,
3569
3567
  )
3570
3568
  new_category.insert(session=session)
3571
3569
  return new_category
@@ -3576,6 +3574,39 @@ class JobFileCategories(CommonColumns):
3576
3574
  categories = session.query(cls).filter(cls.job_id == job_id, cls.type == type).all()
3577
3575
  return [c.category for c in categories]
3578
3576
 
3577
+ @classmethod
3578
+ @with_default_session
3579
+ def full_categories_for_job(cls, job_id: int, session: Session = None):
3580
+ return session.query(cls).filter_by(job_id=job_id).all()
3581
+
3582
+
3583
+ class CategoryDataElements(CommonColumns):
3584
+ __tablename__ = "category_data_elements"
3585
+ __table_args__ = (
3586
+ ForeignKeyConstraint(
3587
+ ["category_id"],
3588
+ ["job_file_categories.id"],
3589
+ ondelete="CASCADE",
3590
+ ),
3591
+ Index(
3592
+ "idx_elements_category_id",
3593
+ "category_id",
3594
+ "name",
3595
+ unique=True,
3596
+ ),
3597
+ )
3598
+
3599
+ category_id = Column(Integer, nullable=False)
3600
+ name = Column(String, nullable=False)
3601
+ is_custom = Column(Boolean, nullable=False, default=False, server_default="false")
3602
+ element_type = Column(String, nullable=False)
3603
+ cardinality = Column(String, nullable=True)
3604
+
3605
+ @classmethod
3606
+ @with_default_session
3607
+ def elements_for_category(cls, category_id: int, session: Session = None):
3608
+ return session.query(cls).filter_by(category_id=category_id).all()
3609
+
3579
3610
 
3580
3611
  class FileValidationErrors(CommonColumns):
3581
3612
  __tablename__ = "file_validation_errors"
@@ -1,14 +1,15 @@
1
1
  from pathlib import Path
2
2
 
3
+ from pandas import Series, DataFrame
4
+ from sqlalchemy.orm.session import Session
3
5
  from werkzeug.datastructures import FileStorage
4
6
  from werkzeug.exceptions import BadRequest, InternalServerError
5
7
 
6
8
  from ..config.logging import get_logger
7
9
  from ..config.settings import GOOGLE_CLINICAL_DATA_BUCKET
8
- from ..models import PreprocessedFiles
10
+ from ..models import PreprocessedFiles, TRIAL_APPENDIX_A_CELL_THAT_ENDS_THE_HEADER
9
11
  from ..shared.auth import get_current_user
10
12
  from ..shared.gcloud_client import upload_file_to_gcs, move_gcs_file
11
- from sqlalchemy.orm.session import Session
12
13
 
13
14
  logger = get_logger(__name__)
14
15
 
@@ -104,3 +105,37 @@ def strip_filename_and_pending_folder(path_str):
104
105
  if path.parent.name != "pending":
105
106
  raise ValueError("Expected 'pending' folder above file")
106
107
  return str(path.parent.parent)
108
+
109
+
110
+ def get_row_at_condition(df: DataFrame, condition):
111
+ condition_met_index = df[condition].index[0]
112
+ row_at_condition_series = df.iloc[condition_met_index]
113
+
114
+ return row_at_condition_series
115
+
116
+
117
+ def get_column(header_row_series: Series, header_name: str, use_raw_header_val: bool = False):
118
+ for idx, raw_header in enumerate(header_row_series):
119
+ if str(raw_header).lower() == header_name.lower():
120
+ return raw_header if use_raw_header_val else header_row_series.index[idx]
121
+ return None
122
+
123
+
124
+ def get_column_from_appendix_a(appendix_a_df: DataFrame, header_name: str):
125
+ category_column = appendix_a_df.columns[0]
126
+ aa_header_condition = appendix_a_df[category_column] == TRIAL_APPENDIX_A_CELL_THAT_ENDS_THE_HEADER
127
+ header_row_series = get_row_at_condition(appendix_a_df, aa_header_condition)
128
+ return get_column(header_row_series, header_name)
129
+
130
+
131
+ def get_column_from_first_row(df: DataFrame, header_name: str):
132
+ use_raw_header_val = False
133
+ if df.columns.inferred_type == "integer":
134
+ # If columns are integers (i.e. file was read without headers), treat the first row as header values.
135
+ header_row_series = df.iloc[0]
136
+ else:
137
+ # Otherwise columns already are headers
138
+ header_row_series = Series(df.columns)
139
+ use_raw_header_val = True
140
+
141
+ return get_column(header_row_series, header_name, use_raw_header_val=use_raw_header_val)
@@ -1,6 +1,6 @@
1
1
  """Utilities for interacting with the Google Cloud Platform APIs."""
2
2
 
3
- # pylint: disable=logging-fstring-interpolation
3
+ # pylint: disable=logging-fstring-interpolation,too-many-lines
4
4
 
5
5
  import base64
6
6
  import datetime
@@ -37,8 +37,8 @@ from sqlalchemy.orm.session import Session
37
37
  from werkzeug.datastructures import FileStorage
38
38
  from werkzeug.utils import secure_filename
39
39
 
40
- from cidc_api.config.secrets import get_secrets_manager
41
40
  from ..config.logging import get_logger
41
+ from ..config.secrets import get_secrets_manager
42
42
  from ..config.settings import (
43
43
  DEV_USE_GCS,
44
44
  GOOGLE_INTAKE_ROLE,
@@ -62,6 +62,8 @@ from ..config.settings import (
62
62
  DEV_CFUNCTIONS_SERVER,
63
63
  INACTIVE_USER_DAYS,
64
64
  )
65
+ from ..shared.utils import strip_whitespaces
66
+
65
67
 
66
68
  os.environ["TZ"] = "UTC"
67
69
  logger = get_logger(__name__)
@@ -427,9 +429,9 @@ def gcs_xlsx_or_csv_file_to_pandas_dataframe(bucket_name: str, blob_name: str):
427
429
 
428
430
  # TODO: specify sheet in xlsx file and/or accept tsv and xls files
429
431
  if blob_name[-3:] == "csv":
430
- return pd.read_csv(temp_file)
432
+ return strip_whitespaces(pd.read_csv(temp_file))
431
433
  elif blob_name[-4:] == "xlsx":
432
- return pd.read_excel(temp_file)
434
+ return strip_whitespaces(pd.read_excel(temp_file))
433
435
  else:
434
436
  raise Exception("Can only read csv or xlsx files")
435
437
 
@@ -0,0 +1,8 @@
1
+ def strip_whitespaces(df):
2
+ def stripper(x):
3
+ if x and isinstance(x, str):
4
+ return x.strip()
5
+ else:
6
+ return x
7
+
8
+ return df.map(stripper)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nci_cidc_api_modules
3
- Version: 1.2.15
3
+ Version: 1.2.17
4
4
  Summary: SQLAlchemy data models and configuration tools used in the NCI CIDC API
5
5
  Home-page: https://github.com/NCI-CIDC/cidc-api-gae
6
6
  License: MIT license
@@ -12,7 +12,7 @@ Requires-Dist: cloud-sql-python-connector[pg8000]>=1.18.5
12
12
  Requires-Dist: flask>=3.1.2
13
13
  Requires-Dist: flask-migrate>=4.1.0
14
14
  Requires-Dist: flask-sqlalchemy>=3.1.1
15
- Requires-Dist: google-auth>=2.42.0
15
+ Requires-Dist: google-auth==2.41.1
16
16
  Requires-Dist: google-api-python-client>=2.185.0
17
17
  Requires-Dist: google-cloud-bigquery>=3.38.0
18
18
  Requires-Dist: google-cloud-pubsub>=2.32.0
@@ -24,6 +24,7 @@ cidc_api/shared/file_handling.py
24
24
  cidc_api/shared/gcloud_client.py
25
25
  cidc_api/shared/jose.py
26
26
  cidc_api/shared/rest_utils.py
27
+ cidc_api/shared/utils.py
27
28
  nci_cidc_api_modules.egg-info/PKG-INFO
28
29
  nci_cidc_api_modules.egg-info/SOURCES.txt
29
30
  nci_cidc_api_modules.egg-info/dependency_links.txt
@@ -3,7 +3,7 @@ cloud-sql-python-connector[pg8000]>=1.18.5
3
3
  flask>=3.1.2
4
4
  flask-migrate>=4.1.0
5
5
  flask-sqlalchemy>=3.1.1
6
- google-auth>=2.42.0
6
+ google-auth==2.41.1
7
7
  google-api-python-client>=2.185.0
8
8
  google-cloud-bigquery>=3.38.0
9
9
  google-cloud-pubsub>=2.32.0
@@ -3,7 +3,7 @@ cloud-sql-python-connector[pg8000]>=1.18.5
3
3
  flask>=3.1.2
4
4
  flask-migrate>=4.1.0
5
5
  flask-sqlalchemy>=3.1.1
6
- google-auth>=2.42.0
6
+ google-auth==2.41.1 # There is a bug in 2.42.X that causes local to fail when connecting to dev
7
7
  google-api-python-client>=2.185.0
8
8
  google-cloud-bigquery>=3.38.0
9
9
  google-cloud-pubsub>=2.32.0