nci-cidc-api-modules 1.0.9__py3-none-any.whl → 1.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cidc_api/models/models.py CHANGED
@@ -1467,100 +1467,68 @@ class TrialMetadata(CommonColumns):
1467
1467
  }
1468
1468
 
1469
1469
  @staticmethod
1470
- @with_default_session
1471
- def get_summaries(session: Session) -> List[dict]:
1472
- """
1473
- Return a list of trial summaries, where each summary has structure like:
1474
- ```python
1475
- {
1476
- "trial_id": ...,
1477
- "expected_assays": ..., # list of assays the trial should have data for
1478
- "file_size_bytes": ..., # total file size for the trial
1479
- "clinical_participants": ..., # number of participants with clinical data
1480
- "total_participants": ..., # number of unique participants with assay data
1481
- "total_samples": ..., # number of samples with assay data
1482
- "cytof": ..., # cytof sample count
1483
- ... # other assays and analysis
1484
- }
1485
- ```
1486
- NOTE: if the metadata model for any existing assays substantially changes,
1487
- or if new assays are introduced that don't follow the typical structure
1488
- (batches containing sample-level records), then this method will need to
1489
- be updated to accommodate those changes.
1490
-
1491
- Only the assays are used for calculating `"total_participants"` and `"total_samples"`,
1492
- as all analyses are derived from assay data.
1493
- Each assay/analysis subquery is expected to return a set with `trial_id`, `key`,
1494
- and `cimac_id` which are used for both assay-level and overall counting.
1495
-
1496
- There is a bit of complexity with the way that WES samples are counted:
1497
- - `"wes"` only counts tumor samples slated for paired wes_analysis
1498
- - `"wes_tumor_only"` counts all tumor samples NOT slated for paired wes_analysis
1499
- - `"wes_analysis"` counts tumor samples with paired wes_analysis
1500
- - `"wes_tumor_only_analysis"` counts (tumor) samples with tumor-only analysis
1501
- For `"total_[participants/samples]"`, ALL (ie tumor AND normal) WES assay samples are included.
1502
- """
1470
+ def get_summaries_query() -> str:
1503
1471
  # Compute the total amount of data in bytes stored for each trial
1504
1472
  files_subquery = """
1505
- select
1506
- trial_id,
1507
- sum(file_size_bytes) as value
1508
- from
1509
- downloadable_files
1510
- group by
1511
- trial_id
1512
- """
1473
+ select
1474
+ trial_id,
1475
+ sum(file_size_bytes) as value
1476
+ from
1477
+ downloadable_files
1478
+ group by
1479
+ trial_id
1480
+ """
1513
1481
 
1514
1482
  # Count how many participants have associated clinical data. The same
1515
1483
  # participant may appear in multiple clinical data files, so deduplicate
1516
1484
  # participants before counting them.
1517
1485
  clinical_subquery = """
1518
- select
1519
- trial_id,
1520
- count(distinct participants) as value
1521
- from
1522
- trial_metadata,
1523
- jsonb_array_elements(metadata_json#>'{clinical_data,records}') as records,
1524
- jsonb_array_elements(records#>'{clinical_file,participants}') as participants
1525
- group by
1526
- trial_id
1527
- """
1486
+ select
1487
+ trial_id,
1488
+ count(distinct participants) as value
1489
+ from
1490
+ trial_metadata,
1491
+ jsonb_array_elements(metadata_json#>'{clinical_data,records}') as records,
1492
+ jsonb_array_elements(records#>'{clinical_file,participants}') as participants
1493
+ group by
1494
+ trial_id
1495
+ """
1528
1496
 
1529
1497
  # Find all samples associated with each assay type for
1530
1498
  # assays whose metadata follows the typical structure: an array of batches,
1531
1499
  # with each batch containing an array of records, where each record
1532
1500
  # corresponds to a unique sample with a cimac_id.
1533
1501
  generic_assay_subquery = """
1534
- select
1535
- trial_id,
1536
- case
1537
- when key = 'hande' then 'h&e'
1538
- else key
1539
- end as key,
1540
- record->>'cimac_id' as cimac_id
1541
- from
1542
- trial_metadata,
1543
- jsonb_each(metadata_json->'assays') assays,
1544
- jsonb_array_elements(value) batches,
1545
- jsonb_array_elements(batches->'records') record
1546
- where key not in ('olink', 'nanostring', 'elisa', 'wes', 'misc_data')
1547
- """
1502
+ select
1503
+ trial_id,
1504
+ case
1505
+ when key = 'hande' then 'h&e'
1506
+ else key
1507
+ end as key,
1508
+ record->>'cimac_id' as cimac_id
1509
+ from
1510
+ trial_metadata,
1511
+ jsonb_each(metadata_json->'assays') assays,
1512
+ jsonb_array_elements(value) batches,
1513
+ jsonb_array_elements(batches->'records') record
1514
+ where key not in ('olink', 'nanostring', 'elisa', 'wes', 'misc_data')
1515
+ """
1548
1516
 
1549
1517
  # Find all samples associated with nanostring uploads.
1550
1518
  # Nanostring metadata has a slightly different structure than typical
1551
1519
  # assays, where each batch has an array of runs, and each run has
1552
1520
  # an array of sample-level entries each with a cimac_id.
1553
1521
  nanostring_subquery = """
1554
- select
1555
- trial_id,
1556
- 'nanostring' as key,
1557
- sample->>'cimac_id' as cimac_id
1558
- from
1559
- trial_metadata,
1560
- jsonb_array_elements(metadata_json#>'{assays,nanostring}') batches,
1561
- jsonb_array_elements(batches->'runs') runs,
1562
- jsonb_array_elements(runs->'samples') sample
1563
- """
1522
+ select
1523
+ trial_id,
1524
+ 'nanostring' as key,
1525
+ sample->>'cimac_id' as cimac_id
1526
+ from
1527
+ trial_metadata,
1528
+ jsonb_array_elements(metadata_json#>'{assays,nanostring}') batches,
1529
+ jsonb_array_elements(batches->'runs') runs,
1530
+ jsonb_array_elements(runs->'samples') sample
1531
+ """
1564
1532
 
1565
1533
  # Find all samples associated with olink uploads.
1566
1534
  # Unlike other assays, olink metadata is an object at the top level
@@ -1570,222 +1538,222 @@ class TrialMetadata(CommonColumns):
1570
1538
  # the samples corresponding to a given record are stored
1571
1539
  # like: record["files"]["assay_npx"]["samples"].
1572
1540
  olink_subquery = """
1573
- select
1574
- trial_id,
1575
- 'olink' as key,
1576
- sample as cimac_id
1577
- from
1578
- trial_metadata,
1579
- jsonb_array_elements(metadata_json#>'{assays,olink,batches}') batches,
1580
- jsonb_array_elements(batches->'records') records,
1581
- jsonb_array_elements_text(records#>'{files,assay_npx,samples}') sample
1582
- """
1541
+ select
1542
+ trial_id,
1543
+ 'olink' as key,
1544
+ sample as cimac_id
1545
+ from
1546
+ trial_metadata,
1547
+ jsonb_array_elements(metadata_json#>'{assays,olink,batches}') batches,
1548
+ jsonb_array_elements(batches->'records') records,
1549
+ jsonb_array_elements_text(records#>'{files,assay_npx,samples}') sample
1550
+ """
1583
1551
 
1584
1552
  # Find all samples associated with elisa uploads.
1585
1553
  # Unlike other assays, elisa metadata is an array of entries, each containing a single data file.
1586
1554
  # The samples corresponding to a given entry are stored like:
1587
1555
  # entry["assay_xlsx"]["samples"].
1588
1556
  elisa_subquery = """
1589
- select
1590
- trial_id,
1591
- 'elisa' as key,
1592
- sample as cimac_id
1593
- from
1594
- trial_metadata,
1595
- jsonb_array_elements(metadata_json#>'{assays,elisa}') entry,
1596
- jsonb_array_elements_text(entry#>'{assay_xlsx,samples}') sample
1597
- """
1557
+ select
1558
+ trial_id,
1559
+ 'elisa' as key,
1560
+ sample as cimac_id
1561
+ from
1562
+ trial_metadata,
1563
+ jsonb_array_elements(metadata_json#>'{assays,elisa}') entry,
1564
+ jsonb_array_elements_text(entry#>'{assay_xlsx,samples}') sample
1565
+ """
1598
1566
 
1599
1567
  # Find the tumor samples that have associated paired-analysis data.
1600
1568
  wes_analysis_subquery = """
1601
- select
1602
- trial_id,
1603
- 'wes_analysis' as key,
1604
- pair#>>'{tumor,cimac_id}' as cimac_id
1605
- from
1606
- trial_metadata,
1607
- jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,pair_runs}') pair
1608
- where
1609
- pair#>>'{report,report}' is not null
1610
- union all
1611
- select
1612
- trial_id,
1613
- 'wes_analysis' as key,
1614
- pair#>>'{tumor,cimac_id}' as cimac_id
1615
- from
1616
- trial_metadata,
1617
- jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,pair_runs}') pair
1618
- where
1619
- pair#>>'{report,report}' is not null
1620
- """
1569
+ select
1570
+ trial_id,
1571
+ 'wes_analysis' as key,
1572
+ pair#>>'{tumor,cimac_id}' as cimac_id
1573
+ from
1574
+ trial_metadata,
1575
+ jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,pair_runs}') pair
1576
+ where
1577
+ pair#>>'{report,report}' is not null
1578
+ union all
1579
+ select
1580
+ trial_id,
1581
+ 'wes_analysis' as key,
1582
+ pair#>>'{tumor,cimac_id}' as cimac_id
1583
+ from
1584
+ trial_metadata,
1585
+ jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,pair_runs}') pair
1586
+ where
1587
+ pair#>>'{report,report}' is not null
1588
+ """
1621
1589
 
1622
1590
  # Find the tumor samples that have associated tumor-only analysis data.
1623
1591
  wes_tumor_only_analysis_subquery = """
1624
- select
1625
- trial_id,
1626
- 'wes_tumor_only_analysis' as key,
1627
- run#>>'{tumor,cimac_id}' as cimac_id
1628
- from
1629
- trial_metadata,
1630
- jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis,runs}') run
1631
- where
1632
- run#>>'{report,report}' is not null
1633
- union all
1634
- select
1635
- trial_id,
1636
- 'wes_tumor_only_analysis' as key,
1637
- run#>>'{tumor,cimac_id}' as cimac_id
1638
- from
1639
- trial_metadata,
1640
- jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis_old,runs}') run
1641
- where
1642
- run#>>'{report,report}' is not null
1643
- """
1592
+ select
1593
+ trial_id,
1594
+ 'wes_tumor_only_analysis' as key,
1595
+ run#>>'{tumor,cimac_id}' as cimac_id
1596
+ from
1597
+ trial_metadata,
1598
+ jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis,runs}') run
1599
+ where
1600
+ run#>>'{report,report}' is not null
1601
+ union all
1602
+ select
1603
+ trial_id,
1604
+ 'wes_tumor_only_analysis' as key,
1605
+ run#>>'{tumor,cimac_id}' as cimac_id
1606
+ from
1607
+ trial_metadata,
1608
+ jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis_old,runs}') run
1609
+ where
1610
+ run#>>'{report,report}' is not null
1611
+ """
1644
1612
 
1645
1613
  # Find the tumor samples that will have associated paired-analysis data.
1646
1614
  # We are asserting that a tumor sample will not be used for multiple analyses.
1647
1615
  # This is similar to the wes_analysis_subquery but without the requirement for a report,
1648
1616
  # which is the defining feature of analysis.
1649
1617
  wes_subquery = """
1650
- select
1651
- trial_id,
1652
- 'wes' as key,
1653
- pair#>>'{tumor,cimac_id}' as cimac_id
1654
- from
1655
- trial_metadata,
1656
- jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,pair_runs}') pair
1657
- union all
1658
- select
1659
- trial_id,
1660
- 'wes' as key,
1661
- pair#>>'{tumor,cimac_id}' as cimac_id
1662
- from
1663
- trial_metadata,
1664
- jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,pair_runs}') pair
1665
- """
1666
-
1667
- # Find the tumor samples that WON'T have associated paired-analysis data.
1668
- # Get all tumor samples with WES data not in the equivalent of wes_subquery.
1669
- wes_tumor_assay_subquery = """
1670
- select
1671
- trial_metadata.trial_id,
1672
- 'wes_tumor_only' as key,
1673
- record->>'cimac_id' as cimac_id
1674
- from
1675
- trial_metadata,
1676
- jsonb_array_elements(metadata_json#>'{assays,wes}') batch,
1677
- jsonb_array_elements(batch->'records') record
1678
- join (
1679
- select
1680
- trial_id,
1681
- sample->>'cimac_id' as cimac_id
1682
- from
1683
- trial_metadata,
1684
- jsonb_array_elements(metadata_json->'participants') participant,
1685
- jsonb_array_elements(participant->'samples') sample
1686
-
1687
- where
1688
- sample->>'processed_sample_derivative' = 'Tumor DNA'
1689
- or
1690
- sample->>'processed_sample_derivative' = 'Tumor RNA'
1691
- ) sample_data
1692
- on
1693
- sample_data.cimac_id = record->>'cimac_id'
1694
- where
1695
- sample_data.trial_id = trial_metadata.trial_id
1696
- and
1697
- record->>'cimac_id' not in (
1698
1618
  select
1699
- pair#>>'{tumor,cimac_id}'
1619
+ trial_id,
1620
+ 'wes' as key,
1621
+ pair#>>'{tumor,cimac_id}' as cimac_id
1700
1622
  from
1701
1623
  trial_metadata,
1702
1624
  jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,pair_runs}') pair
1703
1625
  union all
1704
1626
  select
1705
- pair#>>'{tumor,cimac_id}'
1627
+ trial_id,
1628
+ 'wes' as key,
1629
+ pair#>>'{tumor,cimac_id}' as cimac_id
1706
1630
  from
1707
1631
  trial_metadata,
1708
1632
  jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,pair_runs}') pair
1709
- )
1710
- """
1633
+ """
1634
+
1635
+ # Find the tumor samples that WON'T have associated paired-analysis data.
1636
+ # Get all tumor samples with WES data not in the equivalent of wes_subquery.
1637
+ wes_tumor_assay_subquery = """
1638
+ select
1639
+ trial_metadata.trial_id,
1640
+ 'wes_tumor_only' as key,
1641
+ record->>'cimac_id' as cimac_id
1642
+ from
1643
+ trial_metadata,
1644
+ jsonb_array_elements(metadata_json#>'{assays,wes}') batch,
1645
+ jsonb_array_elements(batch->'records') record
1646
+ join (
1647
+ select
1648
+ trial_id,
1649
+ sample->>'cimac_id' as cimac_id
1650
+ from
1651
+ trial_metadata,
1652
+ jsonb_array_elements(metadata_json->'participants') participant,
1653
+ jsonb_array_elements(participant->'samples') sample
1654
+
1655
+ where
1656
+ sample->>'processed_sample_derivative' = 'Tumor DNA'
1657
+ or
1658
+ sample->>'processed_sample_derivative' = 'Tumor RNA'
1659
+ ) sample_data
1660
+ on
1661
+ sample_data.cimac_id = record->>'cimac_id'
1662
+ where
1663
+ sample_data.trial_id = trial_metadata.trial_id
1664
+ and
1665
+ record->>'cimac_id' not in (
1666
+ select
1667
+ pair#>>'{tumor,cimac_id}'
1668
+ from
1669
+ trial_metadata,
1670
+ jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,pair_runs}') pair
1671
+ union all
1672
+ select
1673
+ pair#>>'{tumor,cimac_id}'
1674
+ from
1675
+ trial_metadata,
1676
+ jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,pair_runs}') pair
1677
+ )
1678
+ """
1711
1679
 
1712
1680
  # Find ALL normal samples that have WES data.
1713
1681
  # This is included in counting for total_participants and total_samples,
1714
1682
  # but does not affect the assay-level counts, which are tumor sample-specific for WES.
1715
1683
  wes_normal_assay_subquery = """
1716
- select
1717
- trial_id,
1718
- 'wes_normal' as key,
1719
- record->>'cimac_id' as cimac_id
1720
- from
1721
- trial_metadata,
1722
- jsonb_array_elements(metadata_json#>'{assays,wes}') batch,
1723
- jsonb_array_elements(batch->'records') record
1724
- join (
1725
1684
  select
1726
- sample->>'cimac_id' as cimac_id
1685
+ trial_id,
1686
+ 'wes_normal' as key,
1687
+ record->>'cimac_id' as cimac_id
1727
1688
  from
1728
1689
  trial_metadata,
1729
- jsonb_array_elements(metadata_json->'participants') participant,
1730
- jsonb_array_elements(participant->'samples') sample
1731
- where
1732
- sample->>'processed_sample_derivative' <> 'Tumor DNA'
1733
- and
1734
- sample->>'processed_sample_derivative' <> 'Tumor RNA'
1735
- ) sample_data
1736
- on
1737
- sample_data.cimac_id = record->>'cimac_id'
1738
- """
1690
+ jsonb_array_elements(metadata_json#>'{assays,wes}') batch,
1691
+ jsonb_array_elements(batch->'records') record
1692
+ join (
1693
+ select
1694
+ sample->>'cimac_id' as cimac_id
1695
+ from
1696
+ trial_metadata,
1697
+ jsonb_array_elements(metadata_json->'participants') participant,
1698
+ jsonb_array_elements(participant->'samples') sample
1699
+ where
1700
+ sample->>'processed_sample_derivative' <> 'Tumor DNA'
1701
+ and
1702
+ sample->>'processed_sample_derivative' <> 'Tumor RNA'
1703
+ ) sample_data
1704
+ on
1705
+ sample_data.cimac_id = record->>'cimac_id'
1706
+ """
1739
1707
 
1740
1708
  # Find all samples associated with RNA analysis uploads.
1741
1709
  # There is ONLY level_1
1742
1710
  rna_level1_analysis_subquery = """
1743
- select
1744
- trial_id,
1745
- 'rna_level1_analysis' as key,
1746
- run->>'cimac_id' as cimac_id
1747
- from
1748
- trial_metadata,
1749
- jsonb_array_elements(metadata_json#>'{analysis,rna_analysis,level_1}') run
1750
- """
1711
+ select
1712
+ trial_id,
1713
+ 'rna_level1_analysis' as key,
1714
+ run->>'cimac_id' as cimac_id
1715
+ from
1716
+ trial_metadata,
1717
+ jsonb_array_elements(metadata_json#>'{analysis,rna_analysis,level_1}') run
1718
+ """
1751
1719
 
1752
1720
  # Find all samples associated with TCR analysis uploads.
1753
1721
  tcr_analysis_subquery = """
1754
- select
1755
- trial_id,
1756
- 'tcr_analysis' as key,
1757
- record->>'cimac_id' as cimac_id
1758
- from
1759
- trial_metadata,
1760
- jsonb_array_elements(metadata_json#>'{analysis,tcr_analysis,batches}') batch,
1761
- jsonb_array_elements(batch->'records') record
1762
- """
1722
+ select
1723
+ trial_id,
1724
+ 'tcr_analysis' as key,
1725
+ record->>'cimac_id' as cimac_id
1726
+ from
1727
+ trial_metadata,
1728
+ jsonb_array_elements(metadata_json#>'{analysis,tcr_analysis,batches}') batch,
1729
+ jsonb_array_elements(batch->'records') record
1730
+ """
1763
1731
 
1764
1732
  # Find all samples associated with CyTOF analysis uploads.
1765
1733
  cytof_analysis_subquery = """
1766
- select
1767
- trial_id,
1768
- 'cytof_analysis' as key,
1769
- record->>'cimac_id' as cimac_id
1770
- from
1771
- trial_metadata,
1772
- jsonb_array_elements(metadata_json#>'{assays,cytof}') batch,
1773
- jsonb_array_elements(batch->'records') record
1774
- where
1775
- record->'output_files' is not null
1776
- """
1734
+ select
1735
+ trial_id,
1736
+ 'cytof_analysis' as key,
1737
+ record->>'cimac_id' as cimac_id
1738
+ from
1739
+ trial_metadata,
1740
+ jsonb_array_elements(metadata_json#>'{assays,cytof}') batch,
1741
+ jsonb_array_elements(batch->'records') record
1742
+ where
1743
+ record->'output_files' is not null
1744
+ """
1777
1745
 
1778
1746
  # Find all samples associated with ATACseq analysis uploads.
1779
1747
  atacseq_analysis_subquery = """
1780
- select
1781
- trial_id,
1782
- 'atacseq_analysis' as key,
1783
- record->>'cimac_id' as cimac_id
1784
- from
1785
- trial_metadata,
1786
- jsonb_array_elements(metadata_json#>'{analysis,atacseq_analysis}') batch,
1787
- jsonb_array_elements(batch->'records') record
1788
- """
1748
+ select
1749
+ trial_id,
1750
+ 'atacseq_analysis' as key,
1751
+ record->>'cimac_id' as cimac_id
1752
+ from
1753
+ trial_metadata,
1754
+ jsonb_array_elements(metadata_json#>'{analysis,atacseq_analysis}') batch,
1755
+ jsonb_array_elements(batch->'records') record
1756
+ """
1789
1757
 
1790
1758
  # Build up a JSON object mapping analysis types to arrays of excluded samples.
1791
1759
  # The resulting object will have structure like:
@@ -1795,79 +1763,79 @@ class TrialMetadata(CommonColumns):
1795
1763
  # ...
1796
1764
  # }
1797
1765
  excluded_samples_subquery = """
1798
- select
1799
- trial_id,
1800
- jsonb_object_agg(key, value) as value
1801
- from (
1802
- select
1803
- trial_id,
1804
- key,
1805
- jsonb_agg(sample) as value
1806
- from (
1807
- select
1808
- trial_id,
1809
- 'cytof_analysis' as key,
1810
- jsonb_array_elements(batch->'excluded_samples') as sample
1811
- from
1812
- trial_metadata,
1813
- jsonb_array_elements(metadata_json#>'{assays,cytof}') batch
1814
- union all
1815
- select
1816
- trial_id,
1817
- 'wes_analysis' as key,
1818
- jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,excluded_samples}') as sample
1819
- from
1820
- trial_metadata
1821
- union all
1822
1766
  select
1823
1767
  trial_id,
1824
- 'wes_analysis' as key,
1825
- jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,excluded_samples}') as sample
1826
- from
1827
- trial_metadata
1828
- union all
1829
- select
1830
- trial_id,
1831
- 'wes_tumor_only_analysis' as key,
1832
- jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis,excluded_samples}') as sample
1833
- from
1834
- trial_metadata
1835
- union all
1836
- select
1837
- trial_id,
1838
- 'wes_tumor_only_analysis' as key,
1839
- jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis_old,excluded_samples}') as sample
1840
- from
1841
- trial_metadata
1842
- union all
1768
+ jsonb_object_agg(key, value) as value
1769
+ from (
1770
+ select
1771
+ trial_id,
1772
+ key,
1773
+ jsonb_agg(sample) as value
1774
+ from (
1775
+ select
1776
+ trial_id,
1777
+ 'cytof_analysis' as key,
1778
+ jsonb_array_elements(batch->'excluded_samples') as sample
1779
+ from
1780
+ trial_metadata,
1781
+ jsonb_array_elements(metadata_json#>'{assays,cytof}') batch
1782
+ union all
1783
+ select
1784
+ trial_id,
1785
+ 'wes_analysis' as key,
1786
+ jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,excluded_samples}') as sample
1787
+ from
1788
+ trial_metadata
1789
+ union all
1790
+ select
1791
+ trial_id,
1792
+ 'wes_analysis' as key,
1793
+ jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,excluded_samples}') as sample
1794
+ from
1795
+ trial_metadata
1796
+ union all
1797
+ select
1798
+ trial_id,
1799
+ 'wes_tumor_only_analysis' as key,
1800
+ jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis,excluded_samples}') as sample
1801
+ from
1802
+ trial_metadata
1803
+ union all
1804
+ select
1805
+ trial_id,
1806
+ 'wes_tumor_only_analysis' as key,
1807
+ jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis_old,excluded_samples}') as sample
1808
+ from
1809
+ trial_metadata
1810
+ union all
1811
+ select
1812
+ trial_id,
1813
+ 'rna_level1_analysis' as key,
1814
+ jsonb_array_elements(metadata_json#>'{analysis,rna_analysis,excluded_samples}') as sample
1815
+ from
1816
+ trial_metadata
1817
+ union all
1818
+ select
1819
+ trial_id,
1820
+ 'tcr_analysis' as key,
1821
+ jsonb_array_elements(batches->'excluded_samples') as sample
1822
+ from
1823
+ trial_metadata,
1824
+ jsonb_array_elements(metadata_json#>'{analysis,tcr_analysis,batches}') batches
1825
+ ) excluded_q1
1826
+ group by trial_id, key
1827
+ ) excluded_q2
1828
+ group by trial_id
1829
+ """
1830
+
1831
+ # Extract an array of expected assays or an empty array if expected assays is null.
1832
+ expected_assays_subquery = """
1843
1833
  select
1844
1834
  trial_id,
1845
- 'rna_level1_analysis' as key,
1846
- jsonb_array_elements(metadata_json#>'{analysis,rna_analysis,excluded_samples}') as sample
1835
+ coalesce(metadata_json->'expected_assays', '[]'::jsonb) as expected_assays
1847
1836
  from
1848
1837
  trial_metadata
1849
- union all
1850
- select
1851
- trial_id,
1852
- 'tcr_analysis' as key,
1853
- jsonb_array_elements(batches->'excluded_samples') as sample
1854
- from
1855
- trial_metadata,
1856
- jsonb_array_elements(metadata_json#>'{analysis,tcr_analysis,batches}') batches
1857
- ) excluded_q1
1858
- group by trial_id, key
1859
- ) excluded_q2
1860
- group by trial_id
1861
- """
1862
-
1863
- # Extract an array of expected assays or an empty array if expected assays is null.
1864
- expected_assays_subquery = """
1865
- select
1866
- trial_id,
1867
- coalesce(metadata_json->'expected_assays', '[]'::jsonb) as expected_assays
1868
- from
1869
- trial_metadata
1870
- """
1838
+ """
1871
1839
 
1872
1840
  # All the subqueries produce the same set of columns, so UNION ALL
1873
1841
  # them together into a single query, aggregating results into
@@ -1875,98 +1843,136 @@ class TrialMetadata(CommonColumns):
1875
1843
  # NOTE: we use UNION ALL for assay-level counts instead of just UNION to
1876
1844
  # prevent any unwanted de-duplication within subquery results.
1877
1845
  combined_query = f"""
1878
- select
1879
- jsonb_object_agg('trial_id', expected_assays.trial_id)
1880
- || jsonb_object_agg('excluded_samples', coalesce(excluded_sample_lists.value, '{{}}'::jsonb))
1881
- || jsonb_object_agg('expected_assays', coalesce(expected_assays, '[]'::jsonb))
1882
- || jsonb_object_agg('file_size_bytes', coalesce(file_sizes.value, 0))
1883
- || jsonb_object_agg('clinical_participants', coalesce(clinical_participants.value, 0))
1884
- || jsonb_build_object('total_participants', coalesce(total_participants, 0))
1885
- || jsonb_build_object('total_samples', coalesce(total_samples, 0))
1886
- || coalesce(sample_counts.sample_counts, '{{}}'::jsonb)
1887
- from ({expected_assays_subquery}) expected_assays
1888
- full join (
1889
- select
1890
- trial_id,
1891
- count(distinct cimac_id) as total_samples,
1892
- count(distinct left(cimac_id, 7)) as total_participants
1893
- from (
1894
- {generic_assay_subquery}
1895
- union
1896
- {nanostring_subquery}
1897
- union
1898
- {olink_subquery}
1899
- union
1900
- {elisa_subquery}
1901
- union
1902
- {wes_subquery}
1903
- union
1904
- {wes_tumor_assay_subquery}
1905
- union
1906
- {wes_normal_assay_subquery}
1907
- ) assays
1908
- group by
1909
- trial_id
1910
- ) total_counts
1911
- on expected_assays.trial_id = total_counts.trial_id
1912
- full join (
1913
- select
1914
- trial_id,
1915
- jsonb_object_agg(key, num_sample) as sample_counts
1916
- from (
1917
1846
  select
1918
- trial_id,
1919
- key,
1920
- count(distinct cimac_id) as num_sample
1921
- from (
1922
- {generic_assay_subquery}
1923
- union all
1924
- {nanostring_subquery}
1925
- union all
1926
- {olink_subquery}
1927
- union all
1928
- {elisa_subquery}
1929
- union all
1930
- {wes_subquery}
1931
- union all
1932
- {wes_tumor_assay_subquery}
1933
- union all
1934
- {wes_analysis_subquery}
1935
- union all
1936
- {wes_tumor_only_analysis_subquery}
1937
- union all
1938
- {rna_level1_analysis_subquery}
1939
- union all
1940
- {tcr_analysis_subquery}
1941
- union all
1942
- {cytof_analysis_subquery}
1943
- union all
1944
- {atacseq_analysis_subquery}
1945
- ) assays_and_analysis
1847
+ jsonb_object_agg('trial_id', expected_assays.trial_id)
1848
+ || jsonb_object_agg('excluded_samples', coalesce(excluded_sample_lists.value, '{{}}'::jsonb))
1849
+ || jsonb_object_agg('expected_assays', coalesce(expected_assays, '[]'::jsonb))
1850
+ || jsonb_object_agg('file_size_bytes', coalesce(file_sizes.value, 0))
1851
+ || jsonb_object_agg('clinical_participants', coalesce(clinical_participants.value, 0))
1852
+ || jsonb_build_object('total_participants', coalesce(total_participants, 0))
1853
+ || jsonb_build_object('total_samples', coalesce(total_samples, 0))
1854
+ || coalesce(sample_counts.sample_counts, '{{}}'::jsonb) as result
1855
+ from ({expected_assays_subquery}) expected_assays
1856
+ full join (
1857
+ select
1858
+ trial_id,
1859
+ count(distinct cimac_id) as total_samples,
1860
+ count(distinct left(cimac_id, 7)) as total_participants
1861
+ from (
1862
+ {generic_assay_subquery}
1863
+ union
1864
+ {nanostring_subquery}
1865
+ union
1866
+ {olink_subquery}
1867
+ union
1868
+ {elisa_subquery}
1869
+ union
1870
+ {wes_subquery}
1871
+ union
1872
+ {wes_tumor_assay_subquery}
1873
+ union
1874
+ {wes_normal_assay_subquery}
1875
+ ) assays
1876
+ group by
1877
+ trial_id
1878
+ ) total_counts
1879
+ on expected_assays.trial_id = total_counts.trial_id
1880
+ full join (
1881
+ select
1882
+ trial_id,
1883
+ jsonb_object_agg(key, num_sample) as sample_counts
1884
+ from (
1885
+ select
1886
+ trial_id,
1887
+ key,
1888
+ count(distinct cimac_id) as num_sample
1889
+ from (
1890
+ {generic_assay_subquery}
1891
+ union all
1892
+ {nanostring_subquery}
1893
+ union all
1894
+ {olink_subquery}
1895
+ union all
1896
+ {elisa_subquery}
1897
+ union all
1898
+ {wes_subquery}
1899
+ union all
1900
+ {wes_tumor_assay_subquery}
1901
+ union all
1902
+ {wes_analysis_subquery}
1903
+ union all
1904
+ {wes_tumor_only_analysis_subquery}
1905
+ union all
1906
+ {rna_level1_analysis_subquery}
1907
+ union all
1908
+ {tcr_analysis_subquery}
1909
+ union all
1910
+ {cytof_analysis_subquery}
1911
+ union all
1912
+ {atacseq_analysis_subquery}
1913
+ ) assays_and_analysis
1914
+ group by
1915
+ trial_id, key
1916
+ ) q
1917
+ group by
1918
+ trial_id
1919
+ ) sample_counts
1920
+ on expected_assays.trial_id = sample_counts.trial_id
1921
+ full join ({excluded_samples_subquery}) excluded_sample_lists
1922
+ on expected_assays.trial_id = excluded_sample_lists.trial_id
1923
+ full join ({files_subquery}) file_sizes
1924
+ on expected_assays.trial_id = file_sizes.trial_id
1925
+ full join ({clinical_subquery}) clinical_participants
1926
+ on expected_assays.trial_id = clinical_participants.trial_id
1946
1927
  group by
1947
- trial_id, key
1948
- ) q
1949
- group by
1950
- trial_id
1951
- ) sample_counts
1952
- on expected_assays.trial_id = sample_counts.trial_id
1953
- full join ({excluded_samples_subquery}) excluded_sample_lists
1954
- on expected_assays.trial_id = excluded_sample_lists.trial_id
1955
- full join ({files_subquery}) file_sizes
1956
- on expected_assays.trial_id = file_sizes.trial_id
1957
- full join ({clinical_subquery}) clinical_participants
1958
- on expected_assays.trial_id = clinical_participants.trial_id
1959
- group by
1960
- expected_assays.trial_id,
1961
- total_participants,
1962
- total_samples,
1963
- sample_counts.sample_counts
1964
- ;
1965
- """
1966
-
1967
- # Run the query and extract the trial-level summary dictionaries
1928
+ expected_assays.trial_id,
1929
+ total_participants,
1930
+ total_samples,
1931
+ sample_counts.sample_counts
1932
+ ;
1933
+ """
1934
+
1935
+ return combined_query
1936
+
1937
+ @staticmethod
1938
+ @with_default_session
1939
+ def get_summaries(session: Session) -> List[dict]:
1940
+ """
1941
+ Return a list of trial summaries, where each summary has structure like:
1942
+ ```python
1943
+ {
1944
+ "trial_id": ...,
1945
+ "expected_assays": ..., # list of assays the trial should have data for
1946
+ "file_size_bytes": ..., # total file size for the trial
1947
+ "clinical_participants": ..., # number of participants with clinical data
1948
+ "total_participants": ..., # number of unique participants with assay data
1949
+ "total_samples": ..., # number of samples with assay data
1950
+ "cytof": ..., # cytof sample count
1951
+ ... # other assays and analysis
1952
+ }
1953
+ ```
1954
+ NOTE: if the metadata model for any existing assays substantially changes,
1955
+ or if new assays are introduced that don't follow the typical structure
1956
+ (batches containing sample-level records), then this method will need to
1957
+ be updated to accommodate those changes.
1958
+
1959
+ Only the assays are used for calculating `"total_participants"` and `"total_samples"`,
1960
+ as all analyses are derived from assay data.
1961
+ Each assay/analysis subquery is expected to return a set with `trial_id`, `key`,
1962
+ and `cimac_id` which are used for both assay-level and overall counting.
1963
+
1964
+ There is a bit of complexity with the way that WES samples are counted:
1965
+ - `"wes"` only counts tumor samples slated for paired wes_analysis
1966
+ - `"wes_tumor_only"` counts all tumor samples NOT slated for paired wes_analysis
1967
+ - `"wes_analysis"` counts tumor samples with paired wes_analysis
1968
+ - `"wes_tumor_only_analysis"` counts (tumor) samples with tumor-only analysis
1969
+ For `"total_[participants/samples]"`, ALL (ie tumor AND normal) WES assay samples are included.
1970
+ """
1971
+ summaries_query = "SELECT result FROM trial_summaries_mv"
1972
+ # Retrieve trial-level summary results from data cached in the trial_summaries_mv materialized view.
1973
+ # The source of the SQL query used in trial_summaries_mv is get_summaries_query()
1968
1974
  summaries = [
1969
- summary for (summary,) in session.execute(combined_query) if summary
1975
+ summary for (summary,) in session.execute(summaries_query) if summary
1970
1976
  ]
1971
1977
 
1972
1978
  # Shortcut to impute 0 values for assays where trials don't yet have data
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nci_cidc_api_modules
3
- Version: 1.0.9
3
+ Version: 1.0.12
4
4
  Summary: SQLAlchemy data models and configuration tools used in the NCI CIDC API
5
5
  Home-page: https://github.com/NCI-CIDC/cidc-api-gae
6
6
  License: MIT license
@@ -21,10 +21,11 @@ Requires-Dist: google-cloud-bigquery ==3.18.0
21
21
  Requires-Dist: google-api-python-client ==2.64.0
22
22
  Requires-Dist: packaging >=20.0.0
23
23
  Requires-Dist: pyarrow ==14.0.1
24
+ Requires-Dist: numpy <2,>=1.16.5
24
25
  Requires-Dist: pandas <2,>=1
25
26
  Requires-Dist: python-dotenv ==0.10.3
26
- Requires-Dist: requests ==2.31.0
27
- Requires-Dist: jinja2 ==3.1.3
27
+ Requires-Dist: requests ==2.32.3
28
+ Requires-Dist: jinja2 ==3.1.4
28
29
  Requires-Dist: nci-cidc-schemas ==0.26.33
29
30
 
30
31
  # NCI CIDC API <!-- omit in TOC -->
@@ -8,7 +8,7 @@ cidc_api/csms/auth.py,sha256=25Yma2Kz3KLENAPSeBYacFuSZXng-EDgmgInKBsRyP0,3191
8
8
  cidc_api/models/__init__.py,sha256=bl445G8Zic9YbhZ8ZBni07wtBMhLJRMBA-JqjLxx2bw,66
9
9
  cidc_api/models/csms_api.py,sha256=Wp4b53vwOqSlOIaoAYGlI1p8ZfXRXmVJ6MLcsvzq0LA,31664
10
10
  cidc_api/models/migrations.py,sha256=gp9vtkYbA9FFy2s-7woelAmsvQbJ41LO2_DY-YkFIrQ,11464
11
- cidc_api/models/models.py,sha256=AYt0rIzaeQ0HHlTSeerbTpYwMJwqt93aadOuFLLEqBA,120820
11
+ cidc_api/models/models.py,sha256=Hjp9sieGdldNbUzneFi-7vRYyo9wwr0D-0m_UbxsDEk,124106
12
12
  cidc_api/models/schemas.py,sha256=7tDYtmULuzTt2kg7RorWhte06ffalgpQKrFiDRGcPEQ,2711
13
13
  cidc_api/models/files/__init__.py,sha256=8BMTnUSHzUbz0lBeEQY6NvApxDD3GMWMduoVMos2g4Y,213
14
14
  cidc_api/models/files/details.py,sha256=eg1u8uZwtxb0m9mFobcTL_mnPBMq1MPZv3NN3KWMGOI,62309
@@ -18,8 +18,8 @@ cidc_api/shared/auth.py,sha256=VMd_3QJE2iG16QxuGzHBV9MzJJItOZNn9gcw0_iUBLI,11647
18
18
  cidc_api/shared/emails.py,sha256=5dyuKlpcg1M4P_RrAt0ss2hiCqb-Y7p2XXR1d9uBXg8,4868
19
19
  cidc_api/shared/gcloud_client.py,sha256=7dDs0crLMJKdIp4IDSfrZBMB3h-zvWNieB81azoeLO4,33746
20
20
  cidc_api/shared/rest_utils.py,sha256=LMfBpvJRjkfQjCzVXuhTTe4Foz4wlvaKg6QntyR-Hkc,6648
21
- nci_cidc_api_modules-1.0.9.dist-info/LICENSE,sha256=pNYWVTHaYonnmJyplmeAp7tQAjosmDpAWjb34jjv7Xs,1102
22
- nci_cidc_api_modules-1.0.9.dist-info/METADATA,sha256=qgcUc4UDf8wzx1U9aksotUyA1HvGjKfLoBcgSg2ap7w,40474
23
- nci_cidc_api_modules-1.0.9.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
24
- nci_cidc_api_modules-1.0.9.dist-info/top_level.txt,sha256=rNiRzL0lJGi5Q9tY9uSoMdTbJ-7u5c_D2E86KA94yRA,9
25
- nci_cidc_api_modules-1.0.9.dist-info/RECORD,,
21
+ nci_cidc_api_modules-1.0.12.dist-info/LICENSE,sha256=pNYWVTHaYonnmJyplmeAp7tQAjosmDpAWjb34jjv7Xs,1102
22
+ nci_cidc_api_modules-1.0.12.dist-info/METADATA,sha256=SPwKY2ReUVP7izlWlwHgVmLTSPxN95TSsFIWSQstems,40508
23
+ nci_cidc_api_modules-1.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
24
+ nci_cidc_api_modules-1.0.12.dist-info/top_level.txt,sha256=rNiRzL0lJGi5Q9tY9uSoMdTbJ-7u5c_D2E86KA94yRA,9
25
+ nci_cidc_api_modules-1.0.12.dist-info/RECORD,,