nci-cidc-api-modules 1.0.9__py3-none-any.whl → 1.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cidc_api/models/models.py +405 -399
- {nci_cidc_api_modules-1.0.9.dist-info → nci_cidc_api_modules-1.0.12.dist-info}/METADATA +4 -3
- {nci_cidc_api_modules-1.0.9.dist-info → nci_cidc_api_modules-1.0.12.dist-info}/RECORD +6 -6
- {nci_cidc_api_modules-1.0.9.dist-info → nci_cidc_api_modules-1.0.12.dist-info}/LICENSE +0 -0
- {nci_cidc_api_modules-1.0.9.dist-info → nci_cidc_api_modules-1.0.12.dist-info}/WHEEL +0 -0
- {nci_cidc_api_modules-1.0.9.dist-info → nci_cidc_api_modules-1.0.12.dist-info}/top_level.txt +0 -0
cidc_api/models/models.py
CHANGED
@@ -1467,100 +1467,68 @@ class TrialMetadata(CommonColumns):
|
|
1467
1467
|
}
|
1468
1468
|
|
1469
1469
|
@staticmethod
|
1470
|
-
|
1471
|
-
def get_summaries(session: Session) -> List[dict]:
|
1472
|
-
"""
|
1473
|
-
Return a list of trial summaries, where each summary has structure like:
|
1474
|
-
```python
|
1475
|
-
{
|
1476
|
-
"trial_id": ...,
|
1477
|
-
"expected_assays": ..., # list of assays the trial should have data for
|
1478
|
-
"file_size_bytes": ..., # total file size for the trial
|
1479
|
-
"clinical_participants": ..., # number of participants with clinical data
|
1480
|
-
"total_participants": ..., # number of unique participants with assay data
|
1481
|
-
"total_samples": ..., # number of samples with assay data
|
1482
|
-
"cytof": ..., # cytof sample count
|
1483
|
-
... # other assays and analysis
|
1484
|
-
}
|
1485
|
-
```
|
1486
|
-
NOTE: if the metadata model for any existing assays substantially changes,
|
1487
|
-
or if new assays are introduced that don't follow the typical structure
|
1488
|
-
(batches containing sample-level records), then this method will need to
|
1489
|
-
be updated to accommodate those changes.
|
1490
|
-
|
1491
|
-
Only the assays are used for calculating `"total_participants"` and `"total_samples"`,
|
1492
|
-
as all analyses are derived from assay data.
|
1493
|
-
Each assay/analysis subquery is expected to return a set with `trial_id`, `key`,
|
1494
|
-
and `cimac_id` which are used for both assay-level and overall counting.
|
1495
|
-
|
1496
|
-
There is a bit of complexity with the way that WES samples are counted:
|
1497
|
-
- `"wes"` only counts tumor samples slated for paired wes_analysis
|
1498
|
-
- `"wes_tumor_only"` counts all tumor samples NOT slated for paired wes_analysis
|
1499
|
-
- `"wes_analysis"` counts tumor samples with paired wes_analysis
|
1500
|
-
- `"wes_tumor_only_analysis"` counts (tumor) samples with tumor-only analysis
|
1501
|
-
For `"total_[participants/samples]"`, ALL (ie tumor AND normal) WES assay samples are included.
|
1502
|
-
"""
|
1470
|
+
def get_summaries_query() -> str:
|
1503
1471
|
# Compute the total amount of data in bytes stored for each trial
|
1504
1472
|
files_subquery = """
|
1505
|
-
|
1506
|
-
|
1507
|
-
|
1508
|
-
|
1509
|
-
|
1510
|
-
|
1511
|
-
|
1512
|
-
|
1473
|
+
select
|
1474
|
+
trial_id,
|
1475
|
+
sum(file_size_bytes) as value
|
1476
|
+
from
|
1477
|
+
downloadable_files
|
1478
|
+
group by
|
1479
|
+
trial_id
|
1480
|
+
"""
|
1513
1481
|
|
1514
1482
|
# Count how many participants have associated clinical data. The same
|
1515
1483
|
# participant may appear in multiple clinical data files, so deduplicate
|
1516
1484
|
# participants before counting them.
|
1517
1485
|
clinical_subquery = """
|
1518
|
-
|
1519
|
-
|
1520
|
-
|
1521
|
-
|
1522
|
-
|
1523
|
-
|
1524
|
-
|
1525
|
-
|
1526
|
-
|
1527
|
-
|
1486
|
+
select
|
1487
|
+
trial_id,
|
1488
|
+
count(distinct participants) as value
|
1489
|
+
from
|
1490
|
+
trial_metadata,
|
1491
|
+
jsonb_array_elements(metadata_json#>'{clinical_data,records}') as records,
|
1492
|
+
jsonb_array_elements(records#>'{clinical_file,participants}') as participants
|
1493
|
+
group by
|
1494
|
+
trial_id
|
1495
|
+
"""
|
1528
1496
|
|
1529
1497
|
# Find all samples associated with each assay type for
|
1530
1498
|
# assays whose metadata follows the typical structure: an array of batches,
|
1531
1499
|
# with each batch containing an array of records, where each record
|
1532
1500
|
# corresponds to a unique sample with a cimac_id.
|
1533
1501
|
generic_assay_subquery = """
|
1534
|
-
|
1535
|
-
|
1536
|
-
|
1537
|
-
|
1538
|
-
|
1539
|
-
|
1540
|
-
|
1541
|
-
|
1542
|
-
|
1543
|
-
|
1544
|
-
|
1545
|
-
|
1546
|
-
|
1547
|
-
|
1502
|
+
select
|
1503
|
+
trial_id,
|
1504
|
+
case
|
1505
|
+
when key = 'hande' then 'h&e'
|
1506
|
+
else key
|
1507
|
+
end as key,
|
1508
|
+
record->>'cimac_id' as cimac_id
|
1509
|
+
from
|
1510
|
+
trial_metadata,
|
1511
|
+
jsonb_each(metadata_json->'assays') assays,
|
1512
|
+
jsonb_array_elements(value) batches,
|
1513
|
+
jsonb_array_elements(batches->'records') record
|
1514
|
+
where key not in ('olink', 'nanostring', 'elisa', 'wes', 'misc_data')
|
1515
|
+
"""
|
1548
1516
|
|
1549
1517
|
# Find all samples associated with nanostring uploads.
|
1550
1518
|
# Nanostring metadata has a slightly different structure than typical
|
1551
1519
|
# assays, where each batch has an array of runs, and each run has
|
1552
1520
|
# an array of sample-level entries each with a cimac_id.
|
1553
1521
|
nanostring_subquery = """
|
1554
|
-
|
1555
|
-
|
1556
|
-
|
1557
|
-
|
1558
|
-
|
1559
|
-
|
1560
|
-
|
1561
|
-
|
1562
|
-
|
1563
|
-
|
1522
|
+
select
|
1523
|
+
trial_id,
|
1524
|
+
'nanostring' as key,
|
1525
|
+
sample->>'cimac_id' as cimac_id
|
1526
|
+
from
|
1527
|
+
trial_metadata,
|
1528
|
+
jsonb_array_elements(metadata_json#>'{assays,nanostring}') batches,
|
1529
|
+
jsonb_array_elements(batches->'runs') runs,
|
1530
|
+
jsonb_array_elements(runs->'samples') sample
|
1531
|
+
"""
|
1564
1532
|
|
1565
1533
|
# Find all samples associated with olink uploads.
|
1566
1534
|
# Unlike other assays, olink metadata is an object at the top level
|
@@ -1570,222 +1538,222 @@ class TrialMetadata(CommonColumns):
|
|
1570
1538
|
# the samples corresponding to a given record are stored
|
1571
1539
|
# like: record["files"]["assay_npx"]["samples"].
|
1572
1540
|
olink_subquery = """
|
1573
|
-
|
1574
|
-
|
1575
|
-
|
1576
|
-
|
1577
|
-
|
1578
|
-
|
1579
|
-
|
1580
|
-
|
1581
|
-
|
1582
|
-
|
1541
|
+
select
|
1542
|
+
trial_id,
|
1543
|
+
'olink' as key,
|
1544
|
+
sample as cimac_id
|
1545
|
+
from
|
1546
|
+
trial_metadata,
|
1547
|
+
jsonb_array_elements(metadata_json#>'{assays,olink,batches}') batches,
|
1548
|
+
jsonb_array_elements(batches->'records') records,
|
1549
|
+
jsonb_array_elements_text(records#>'{files,assay_npx,samples}') sample
|
1550
|
+
"""
|
1583
1551
|
|
1584
1552
|
# Find all samples associated with elisa uploads.
|
1585
1553
|
# Unlike other assays, elisa metadata is an array of entries, each containing a single data file.
|
1586
1554
|
# The samples corresponding to a given entry are stored like:
|
1587
1555
|
# entry["assay_xlsx"]["samples"].
|
1588
1556
|
elisa_subquery = """
|
1589
|
-
|
1590
|
-
|
1591
|
-
|
1592
|
-
|
1593
|
-
|
1594
|
-
|
1595
|
-
|
1596
|
-
|
1597
|
-
|
1557
|
+
select
|
1558
|
+
trial_id,
|
1559
|
+
'elisa' as key,
|
1560
|
+
sample as cimac_id
|
1561
|
+
from
|
1562
|
+
trial_metadata,
|
1563
|
+
jsonb_array_elements(metadata_json#>'{assays,elisa}') entry,
|
1564
|
+
jsonb_array_elements_text(entry#>'{assay_xlsx,samples}') sample
|
1565
|
+
"""
|
1598
1566
|
|
1599
1567
|
# Find the tumor samples that have associated paired-analysis data.
|
1600
1568
|
wes_analysis_subquery = """
|
1601
|
-
|
1602
|
-
|
1603
|
-
|
1604
|
-
|
1605
|
-
|
1606
|
-
|
1607
|
-
|
1608
|
-
|
1609
|
-
|
1610
|
-
|
1611
|
-
|
1612
|
-
|
1613
|
-
|
1614
|
-
|
1615
|
-
|
1616
|
-
|
1617
|
-
|
1618
|
-
|
1619
|
-
|
1620
|
-
|
1569
|
+
select
|
1570
|
+
trial_id,
|
1571
|
+
'wes_analysis' as key,
|
1572
|
+
pair#>>'{tumor,cimac_id}' as cimac_id
|
1573
|
+
from
|
1574
|
+
trial_metadata,
|
1575
|
+
jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,pair_runs}') pair
|
1576
|
+
where
|
1577
|
+
pair#>>'{report,report}' is not null
|
1578
|
+
union all
|
1579
|
+
select
|
1580
|
+
trial_id,
|
1581
|
+
'wes_analysis' as key,
|
1582
|
+
pair#>>'{tumor,cimac_id}' as cimac_id
|
1583
|
+
from
|
1584
|
+
trial_metadata,
|
1585
|
+
jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,pair_runs}') pair
|
1586
|
+
where
|
1587
|
+
pair#>>'{report,report}' is not null
|
1588
|
+
"""
|
1621
1589
|
|
1622
1590
|
# Find the tumor samples that have associated tumor-only analysis data.
|
1623
1591
|
wes_tumor_only_analysis_subquery = """
|
1624
|
-
|
1625
|
-
|
1626
|
-
|
1627
|
-
|
1628
|
-
|
1629
|
-
|
1630
|
-
|
1631
|
-
|
1632
|
-
|
1633
|
-
|
1634
|
-
|
1635
|
-
|
1636
|
-
|
1637
|
-
|
1638
|
-
|
1639
|
-
|
1640
|
-
|
1641
|
-
|
1642
|
-
|
1643
|
-
|
1592
|
+
select
|
1593
|
+
trial_id,
|
1594
|
+
'wes_tumor_only_analysis' as key,
|
1595
|
+
run#>>'{tumor,cimac_id}' as cimac_id
|
1596
|
+
from
|
1597
|
+
trial_metadata,
|
1598
|
+
jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis,runs}') run
|
1599
|
+
where
|
1600
|
+
run#>>'{report,report}' is not null
|
1601
|
+
union all
|
1602
|
+
select
|
1603
|
+
trial_id,
|
1604
|
+
'wes_tumor_only_analysis' as key,
|
1605
|
+
run#>>'{tumor,cimac_id}' as cimac_id
|
1606
|
+
from
|
1607
|
+
trial_metadata,
|
1608
|
+
jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis_old,runs}') run
|
1609
|
+
where
|
1610
|
+
run#>>'{report,report}' is not null
|
1611
|
+
"""
|
1644
1612
|
|
1645
1613
|
# Find the tumor samples that will have associated paired-analysis data.
|
1646
1614
|
# We are asserting that a tumor sample will not be used for multiple analyses.
|
1647
1615
|
# This is similar to the wes_analysis_subquery but without the requirement for a report,
|
1648
1616
|
# which is the defining feature of analysis.
|
1649
1617
|
wes_subquery = """
|
1650
|
-
select
|
1651
|
-
trial_id,
|
1652
|
-
'wes' as key,
|
1653
|
-
pair#>>'{tumor,cimac_id}' as cimac_id
|
1654
|
-
from
|
1655
|
-
trial_metadata,
|
1656
|
-
jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,pair_runs}') pair
|
1657
|
-
union all
|
1658
|
-
select
|
1659
|
-
trial_id,
|
1660
|
-
'wes' as key,
|
1661
|
-
pair#>>'{tumor,cimac_id}' as cimac_id
|
1662
|
-
from
|
1663
|
-
trial_metadata,
|
1664
|
-
jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,pair_runs}') pair
|
1665
|
-
"""
|
1666
|
-
|
1667
|
-
# Find the tumor samples that WON'T have associated paired-analysis data.
|
1668
|
-
# Get all tumor samples with WES data not in the equivalent of wes_subquery.
|
1669
|
-
wes_tumor_assay_subquery = """
|
1670
|
-
select
|
1671
|
-
trial_metadata.trial_id,
|
1672
|
-
'wes_tumor_only' as key,
|
1673
|
-
record->>'cimac_id' as cimac_id
|
1674
|
-
from
|
1675
|
-
trial_metadata,
|
1676
|
-
jsonb_array_elements(metadata_json#>'{assays,wes}') batch,
|
1677
|
-
jsonb_array_elements(batch->'records') record
|
1678
|
-
join (
|
1679
|
-
select
|
1680
|
-
trial_id,
|
1681
|
-
sample->>'cimac_id' as cimac_id
|
1682
|
-
from
|
1683
|
-
trial_metadata,
|
1684
|
-
jsonb_array_elements(metadata_json->'participants') participant,
|
1685
|
-
jsonb_array_elements(participant->'samples') sample
|
1686
|
-
|
1687
|
-
where
|
1688
|
-
sample->>'processed_sample_derivative' = 'Tumor DNA'
|
1689
|
-
or
|
1690
|
-
sample->>'processed_sample_derivative' = 'Tumor RNA'
|
1691
|
-
) sample_data
|
1692
|
-
on
|
1693
|
-
sample_data.cimac_id = record->>'cimac_id'
|
1694
|
-
where
|
1695
|
-
sample_data.trial_id = trial_metadata.trial_id
|
1696
|
-
and
|
1697
|
-
record->>'cimac_id' not in (
|
1698
1618
|
select
|
1699
|
-
|
1619
|
+
trial_id,
|
1620
|
+
'wes' as key,
|
1621
|
+
pair#>>'{tumor,cimac_id}' as cimac_id
|
1700
1622
|
from
|
1701
1623
|
trial_metadata,
|
1702
1624
|
jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,pair_runs}') pair
|
1703
1625
|
union all
|
1704
1626
|
select
|
1705
|
-
|
1627
|
+
trial_id,
|
1628
|
+
'wes' as key,
|
1629
|
+
pair#>>'{tumor,cimac_id}' as cimac_id
|
1706
1630
|
from
|
1707
1631
|
trial_metadata,
|
1708
1632
|
jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,pair_runs}') pair
|
1709
|
-
|
1710
|
-
|
1633
|
+
"""
|
1634
|
+
|
1635
|
+
# Find the tumor samples that WON'T have associated paired-analysis data.
|
1636
|
+
# Get all tumor samples with WES data not in the equivalent of wes_subquery.
|
1637
|
+
wes_tumor_assay_subquery = """
|
1638
|
+
select
|
1639
|
+
trial_metadata.trial_id,
|
1640
|
+
'wes_tumor_only' as key,
|
1641
|
+
record->>'cimac_id' as cimac_id
|
1642
|
+
from
|
1643
|
+
trial_metadata,
|
1644
|
+
jsonb_array_elements(metadata_json#>'{assays,wes}') batch,
|
1645
|
+
jsonb_array_elements(batch->'records') record
|
1646
|
+
join (
|
1647
|
+
select
|
1648
|
+
trial_id,
|
1649
|
+
sample->>'cimac_id' as cimac_id
|
1650
|
+
from
|
1651
|
+
trial_metadata,
|
1652
|
+
jsonb_array_elements(metadata_json->'participants') participant,
|
1653
|
+
jsonb_array_elements(participant->'samples') sample
|
1654
|
+
|
1655
|
+
where
|
1656
|
+
sample->>'processed_sample_derivative' = 'Tumor DNA'
|
1657
|
+
or
|
1658
|
+
sample->>'processed_sample_derivative' = 'Tumor RNA'
|
1659
|
+
) sample_data
|
1660
|
+
on
|
1661
|
+
sample_data.cimac_id = record->>'cimac_id'
|
1662
|
+
where
|
1663
|
+
sample_data.trial_id = trial_metadata.trial_id
|
1664
|
+
and
|
1665
|
+
record->>'cimac_id' not in (
|
1666
|
+
select
|
1667
|
+
pair#>>'{tumor,cimac_id}'
|
1668
|
+
from
|
1669
|
+
trial_metadata,
|
1670
|
+
jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,pair_runs}') pair
|
1671
|
+
union all
|
1672
|
+
select
|
1673
|
+
pair#>>'{tumor,cimac_id}'
|
1674
|
+
from
|
1675
|
+
trial_metadata,
|
1676
|
+
jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,pair_runs}') pair
|
1677
|
+
)
|
1678
|
+
"""
|
1711
1679
|
|
1712
1680
|
# Find ALL normal samples that have WES data.
|
1713
1681
|
# This is included in counting for total_participants and total_samples,
|
1714
1682
|
# but does not affect the assay-level counts, which are tumor sample-specific for WES.
|
1715
1683
|
wes_normal_assay_subquery = """
|
1716
|
-
select
|
1717
|
-
trial_id,
|
1718
|
-
'wes_normal' as key,
|
1719
|
-
record->>'cimac_id' as cimac_id
|
1720
|
-
from
|
1721
|
-
trial_metadata,
|
1722
|
-
jsonb_array_elements(metadata_json#>'{assays,wes}') batch,
|
1723
|
-
jsonb_array_elements(batch->'records') record
|
1724
|
-
join (
|
1725
1684
|
select
|
1726
|
-
|
1685
|
+
trial_id,
|
1686
|
+
'wes_normal' as key,
|
1687
|
+
record->>'cimac_id' as cimac_id
|
1727
1688
|
from
|
1728
1689
|
trial_metadata,
|
1729
|
-
jsonb_array_elements(metadata_json
|
1730
|
-
jsonb_array_elements(
|
1731
|
-
|
1732
|
-
|
1733
|
-
|
1734
|
-
|
1735
|
-
|
1736
|
-
|
1737
|
-
|
1738
|
-
|
1690
|
+
jsonb_array_elements(metadata_json#>'{assays,wes}') batch,
|
1691
|
+
jsonb_array_elements(batch->'records') record
|
1692
|
+
join (
|
1693
|
+
select
|
1694
|
+
sample->>'cimac_id' as cimac_id
|
1695
|
+
from
|
1696
|
+
trial_metadata,
|
1697
|
+
jsonb_array_elements(metadata_json->'participants') participant,
|
1698
|
+
jsonb_array_elements(participant->'samples') sample
|
1699
|
+
where
|
1700
|
+
sample->>'processed_sample_derivative' <> 'Tumor DNA'
|
1701
|
+
and
|
1702
|
+
sample->>'processed_sample_derivative' <> 'Tumor RNA'
|
1703
|
+
) sample_data
|
1704
|
+
on
|
1705
|
+
sample_data.cimac_id = record->>'cimac_id'
|
1706
|
+
"""
|
1739
1707
|
|
1740
1708
|
# Find all samples associated with RNA analysis uploads.
|
1741
1709
|
# There is ONLY level_1
|
1742
1710
|
rna_level1_analysis_subquery = """
|
1743
|
-
|
1744
|
-
|
1745
|
-
|
1746
|
-
|
1747
|
-
|
1748
|
-
|
1749
|
-
|
1750
|
-
|
1711
|
+
select
|
1712
|
+
trial_id,
|
1713
|
+
'rna_level1_analysis' as key,
|
1714
|
+
run->>'cimac_id' as cimac_id
|
1715
|
+
from
|
1716
|
+
trial_metadata,
|
1717
|
+
jsonb_array_elements(metadata_json#>'{analysis,rna_analysis,level_1}') run
|
1718
|
+
"""
|
1751
1719
|
|
1752
1720
|
# Find all samples associated with TCR analysis uploads.
|
1753
1721
|
tcr_analysis_subquery = """
|
1754
|
-
|
1755
|
-
|
1756
|
-
|
1757
|
-
|
1758
|
-
|
1759
|
-
|
1760
|
-
|
1761
|
-
|
1762
|
-
|
1722
|
+
select
|
1723
|
+
trial_id,
|
1724
|
+
'tcr_analysis' as key,
|
1725
|
+
record->>'cimac_id' as cimac_id
|
1726
|
+
from
|
1727
|
+
trial_metadata,
|
1728
|
+
jsonb_array_elements(metadata_json#>'{analysis,tcr_analysis,batches}') batch,
|
1729
|
+
jsonb_array_elements(batch->'records') record
|
1730
|
+
"""
|
1763
1731
|
|
1764
1732
|
# Find all samples associated with CyTOF analysis uploads.
|
1765
1733
|
cytof_analysis_subquery = """
|
1766
|
-
|
1767
|
-
|
1768
|
-
|
1769
|
-
|
1770
|
-
|
1771
|
-
|
1772
|
-
|
1773
|
-
|
1774
|
-
|
1775
|
-
|
1776
|
-
|
1734
|
+
select
|
1735
|
+
trial_id,
|
1736
|
+
'cytof_analysis' as key,
|
1737
|
+
record->>'cimac_id' as cimac_id
|
1738
|
+
from
|
1739
|
+
trial_metadata,
|
1740
|
+
jsonb_array_elements(metadata_json#>'{assays,cytof}') batch,
|
1741
|
+
jsonb_array_elements(batch->'records') record
|
1742
|
+
where
|
1743
|
+
record->'output_files' is not null
|
1744
|
+
"""
|
1777
1745
|
|
1778
1746
|
# Find all samples associated with ATACseq analysis uploads.
|
1779
1747
|
atacseq_analysis_subquery = """
|
1780
|
-
|
1781
|
-
|
1782
|
-
|
1783
|
-
|
1784
|
-
|
1785
|
-
|
1786
|
-
|
1787
|
-
|
1788
|
-
|
1748
|
+
select
|
1749
|
+
trial_id,
|
1750
|
+
'atacseq_analysis' as key,
|
1751
|
+
record->>'cimac_id' as cimac_id
|
1752
|
+
from
|
1753
|
+
trial_metadata,
|
1754
|
+
jsonb_array_elements(metadata_json#>'{analysis,atacseq_analysis}') batch,
|
1755
|
+
jsonb_array_elements(batch->'records') record
|
1756
|
+
"""
|
1789
1757
|
|
1790
1758
|
# Build up a JSON object mapping analysis types to arrays of excluded samples.
|
1791
1759
|
# The resulting object will have structure like:
|
@@ -1795,79 +1763,79 @@ class TrialMetadata(CommonColumns):
|
|
1795
1763
|
# ...
|
1796
1764
|
# }
|
1797
1765
|
excluded_samples_subquery = """
|
1798
|
-
select
|
1799
|
-
trial_id,
|
1800
|
-
jsonb_object_agg(key, value) as value
|
1801
|
-
from (
|
1802
|
-
select
|
1803
|
-
trial_id,
|
1804
|
-
key,
|
1805
|
-
jsonb_agg(sample) as value
|
1806
|
-
from (
|
1807
|
-
select
|
1808
|
-
trial_id,
|
1809
|
-
'cytof_analysis' as key,
|
1810
|
-
jsonb_array_elements(batch->'excluded_samples') as sample
|
1811
|
-
from
|
1812
|
-
trial_metadata,
|
1813
|
-
jsonb_array_elements(metadata_json#>'{assays,cytof}') batch
|
1814
|
-
union all
|
1815
|
-
select
|
1816
|
-
trial_id,
|
1817
|
-
'wes_analysis' as key,
|
1818
|
-
jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,excluded_samples}') as sample
|
1819
|
-
from
|
1820
|
-
trial_metadata
|
1821
|
-
union all
|
1822
1766
|
select
|
1823
1767
|
trial_id,
|
1824
|
-
|
1825
|
-
|
1826
|
-
|
1827
|
-
|
1828
|
-
|
1829
|
-
|
1830
|
-
|
1831
|
-
|
1832
|
-
|
1833
|
-
|
1834
|
-
|
1835
|
-
|
1836
|
-
|
1837
|
-
|
1838
|
-
|
1839
|
-
|
1840
|
-
|
1841
|
-
|
1842
|
-
|
1768
|
+
jsonb_object_agg(key, value) as value
|
1769
|
+
from (
|
1770
|
+
select
|
1771
|
+
trial_id,
|
1772
|
+
key,
|
1773
|
+
jsonb_agg(sample) as value
|
1774
|
+
from (
|
1775
|
+
select
|
1776
|
+
trial_id,
|
1777
|
+
'cytof_analysis' as key,
|
1778
|
+
jsonb_array_elements(batch->'excluded_samples') as sample
|
1779
|
+
from
|
1780
|
+
trial_metadata,
|
1781
|
+
jsonb_array_elements(metadata_json#>'{assays,cytof}') batch
|
1782
|
+
union all
|
1783
|
+
select
|
1784
|
+
trial_id,
|
1785
|
+
'wes_analysis' as key,
|
1786
|
+
jsonb_array_elements(metadata_json#>'{analysis,wes_analysis,excluded_samples}') as sample
|
1787
|
+
from
|
1788
|
+
trial_metadata
|
1789
|
+
union all
|
1790
|
+
select
|
1791
|
+
trial_id,
|
1792
|
+
'wes_analysis' as key,
|
1793
|
+
jsonb_array_elements(metadata_json#>'{analysis,wes_analysis_old,excluded_samples}') as sample
|
1794
|
+
from
|
1795
|
+
trial_metadata
|
1796
|
+
union all
|
1797
|
+
select
|
1798
|
+
trial_id,
|
1799
|
+
'wes_tumor_only_analysis' as key,
|
1800
|
+
jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis,excluded_samples}') as sample
|
1801
|
+
from
|
1802
|
+
trial_metadata
|
1803
|
+
union all
|
1804
|
+
select
|
1805
|
+
trial_id,
|
1806
|
+
'wes_tumor_only_analysis' as key,
|
1807
|
+
jsonb_array_elements(metadata_json#>'{analysis,wes_tumor_only_analysis_old,excluded_samples}') as sample
|
1808
|
+
from
|
1809
|
+
trial_metadata
|
1810
|
+
union all
|
1811
|
+
select
|
1812
|
+
trial_id,
|
1813
|
+
'rna_level1_analysis' as key,
|
1814
|
+
jsonb_array_elements(metadata_json#>'{analysis,rna_analysis,excluded_samples}') as sample
|
1815
|
+
from
|
1816
|
+
trial_metadata
|
1817
|
+
union all
|
1818
|
+
select
|
1819
|
+
trial_id,
|
1820
|
+
'tcr_analysis' as key,
|
1821
|
+
jsonb_array_elements(batches->'excluded_samples') as sample
|
1822
|
+
from
|
1823
|
+
trial_metadata,
|
1824
|
+
jsonb_array_elements(metadata_json#>'{analysis,tcr_analysis,batches}') batches
|
1825
|
+
) excluded_q1
|
1826
|
+
group by trial_id, key
|
1827
|
+
) excluded_q2
|
1828
|
+
group by trial_id
|
1829
|
+
"""
|
1830
|
+
|
1831
|
+
# Extract an array of expected assays or an empty array if expected assays is null.
|
1832
|
+
expected_assays_subquery = """
|
1843
1833
|
select
|
1844
1834
|
trial_id,
|
1845
|
-
'
|
1846
|
-
jsonb_array_elements(metadata_json#>'{analysis,rna_analysis,excluded_samples}') as sample
|
1835
|
+
coalesce(metadata_json->'expected_assays', '[]'::jsonb) as expected_assays
|
1847
1836
|
from
|
1848
1837
|
trial_metadata
|
1849
|
-
|
1850
|
-
select
|
1851
|
-
trial_id,
|
1852
|
-
'tcr_analysis' as key,
|
1853
|
-
jsonb_array_elements(batches->'excluded_samples') as sample
|
1854
|
-
from
|
1855
|
-
trial_metadata,
|
1856
|
-
jsonb_array_elements(metadata_json#>'{analysis,tcr_analysis,batches}') batches
|
1857
|
-
) excluded_q1
|
1858
|
-
group by trial_id, key
|
1859
|
-
) excluded_q2
|
1860
|
-
group by trial_id
|
1861
|
-
"""
|
1862
|
-
|
1863
|
-
# Extract an array of expected assays or an empty array if expected assays is null.
|
1864
|
-
expected_assays_subquery = """
|
1865
|
-
select
|
1866
|
-
trial_id,
|
1867
|
-
coalesce(metadata_json->'expected_assays', '[]'::jsonb) as expected_assays
|
1868
|
-
from
|
1869
|
-
trial_metadata
|
1870
|
-
"""
|
1838
|
+
"""
|
1871
1839
|
|
1872
1840
|
# All the subqueries produce the same set of columns, so UNION ALL
|
1873
1841
|
# them together into a single query, aggregating results into
|
@@ -1875,98 +1843,136 @@ class TrialMetadata(CommonColumns):
|
|
1875
1843
|
# NOTE: we use UNION ALL for assay-level counts instead of just UNION to
|
1876
1844
|
# prevent any unwanted de-duplication within subquery results.
|
1877
1845
|
combined_query = f"""
|
1878
|
-
select
|
1879
|
-
jsonb_object_agg('trial_id', expected_assays.trial_id)
|
1880
|
-
|| jsonb_object_agg('excluded_samples', coalesce(excluded_sample_lists.value, '{{}}'::jsonb))
|
1881
|
-
|| jsonb_object_agg('expected_assays', coalesce(expected_assays, '[]'::jsonb))
|
1882
|
-
|| jsonb_object_agg('file_size_bytes', coalesce(file_sizes.value, 0))
|
1883
|
-
|| jsonb_object_agg('clinical_participants', coalesce(clinical_participants.value, 0))
|
1884
|
-
|| jsonb_build_object('total_participants', coalesce(total_participants, 0))
|
1885
|
-
|| jsonb_build_object('total_samples', coalesce(total_samples, 0))
|
1886
|
-
|| coalesce(sample_counts.sample_counts, '{{}}'::jsonb)
|
1887
|
-
from ({expected_assays_subquery}) expected_assays
|
1888
|
-
full join (
|
1889
|
-
select
|
1890
|
-
trial_id,
|
1891
|
-
count(distinct cimac_id) as total_samples,
|
1892
|
-
count(distinct left(cimac_id, 7)) as total_participants
|
1893
|
-
from (
|
1894
|
-
{generic_assay_subquery}
|
1895
|
-
union
|
1896
|
-
{nanostring_subquery}
|
1897
|
-
union
|
1898
|
-
{olink_subquery}
|
1899
|
-
union
|
1900
|
-
{elisa_subquery}
|
1901
|
-
union
|
1902
|
-
{wes_subquery}
|
1903
|
-
union
|
1904
|
-
{wes_tumor_assay_subquery}
|
1905
|
-
union
|
1906
|
-
{wes_normal_assay_subquery}
|
1907
|
-
) assays
|
1908
|
-
group by
|
1909
|
-
trial_id
|
1910
|
-
) total_counts
|
1911
|
-
on expected_assays.trial_id = total_counts.trial_id
|
1912
|
-
full join (
|
1913
|
-
select
|
1914
|
-
trial_id,
|
1915
|
-
jsonb_object_agg(key, num_sample) as sample_counts
|
1916
|
-
from (
|
1917
1846
|
select
|
1918
|
-
trial_id,
|
1919
|
-
|
1920
|
-
|
1921
|
-
|
1922
|
-
|
1923
|
-
|
1924
|
-
|
1925
|
-
|
1926
|
-
|
1927
|
-
|
1928
|
-
|
1929
|
-
|
1930
|
-
|
1931
|
-
|
1932
|
-
|
1933
|
-
|
1934
|
-
|
1935
|
-
|
1936
|
-
|
1937
|
-
|
1938
|
-
|
1939
|
-
|
1940
|
-
|
1941
|
-
|
1942
|
-
|
1943
|
-
|
1944
|
-
|
1945
|
-
|
1847
|
+
jsonb_object_agg('trial_id', expected_assays.trial_id)
|
1848
|
+
|| jsonb_object_agg('excluded_samples', coalesce(excluded_sample_lists.value, '{{}}'::jsonb))
|
1849
|
+
|| jsonb_object_agg('expected_assays', coalesce(expected_assays, '[]'::jsonb))
|
1850
|
+
|| jsonb_object_agg('file_size_bytes', coalesce(file_sizes.value, 0))
|
1851
|
+
|| jsonb_object_agg('clinical_participants', coalesce(clinical_participants.value, 0))
|
1852
|
+
|| jsonb_build_object('total_participants', coalesce(total_participants, 0))
|
1853
|
+
|| jsonb_build_object('total_samples', coalesce(total_samples, 0))
|
1854
|
+
|| coalesce(sample_counts.sample_counts, '{{}}'::jsonb) as result
|
1855
|
+
from ({expected_assays_subquery}) expected_assays
|
1856
|
+
full join (
|
1857
|
+
select
|
1858
|
+
trial_id,
|
1859
|
+
count(distinct cimac_id) as total_samples,
|
1860
|
+
count(distinct left(cimac_id, 7)) as total_participants
|
1861
|
+
from (
|
1862
|
+
{generic_assay_subquery}
|
1863
|
+
union
|
1864
|
+
{nanostring_subquery}
|
1865
|
+
union
|
1866
|
+
{olink_subquery}
|
1867
|
+
union
|
1868
|
+
{elisa_subquery}
|
1869
|
+
union
|
1870
|
+
{wes_subquery}
|
1871
|
+
union
|
1872
|
+
{wes_tumor_assay_subquery}
|
1873
|
+
union
|
1874
|
+
{wes_normal_assay_subquery}
|
1875
|
+
) assays
|
1876
|
+
group by
|
1877
|
+
trial_id
|
1878
|
+
) total_counts
|
1879
|
+
on expected_assays.trial_id = total_counts.trial_id
|
1880
|
+
full join (
|
1881
|
+
select
|
1882
|
+
trial_id,
|
1883
|
+
jsonb_object_agg(key, num_sample) as sample_counts
|
1884
|
+
from (
|
1885
|
+
select
|
1886
|
+
trial_id,
|
1887
|
+
key,
|
1888
|
+
count(distinct cimac_id) as num_sample
|
1889
|
+
from (
|
1890
|
+
{generic_assay_subquery}
|
1891
|
+
union all
|
1892
|
+
{nanostring_subquery}
|
1893
|
+
union all
|
1894
|
+
{olink_subquery}
|
1895
|
+
union all
|
1896
|
+
{elisa_subquery}
|
1897
|
+
union all
|
1898
|
+
{wes_subquery}
|
1899
|
+
union all
|
1900
|
+
{wes_tumor_assay_subquery}
|
1901
|
+
union all
|
1902
|
+
{wes_analysis_subquery}
|
1903
|
+
union all
|
1904
|
+
{wes_tumor_only_analysis_subquery}
|
1905
|
+
union all
|
1906
|
+
{rna_level1_analysis_subquery}
|
1907
|
+
union all
|
1908
|
+
{tcr_analysis_subquery}
|
1909
|
+
union all
|
1910
|
+
{cytof_analysis_subquery}
|
1911
|
+
union all
|
1912
|
+
{atacseq_analysis_subquery}
|
1913
|
+
) assays_and_analysis
|
1914
|
+
group by
|
1915
|
+
trial_id, key
|
1916
|
+
) q
|
1917
|
+
group by
|
1918
|
+
trial_id
|
1919
|
+
) sample_counts
|
1920
|
+
on expected_assays.trial_id = sample_counts.trial_id
|
1921
|
+
full join ({excluded_samples_subquery}) excluded_sample_lists
|
1922
|
+
on expected_assays.trial_id = excluded_sample_lists.trial_id
|
1923
|
+
full join ({files_subquery}) file_sizes
|
1924
|
+
on expected_assays.trial_id = file_sizes.trial_id
|
1925
|
+
full join ({clinical_subquery}) clinical_participants
|
1926
|
+
on expected_assays.trial_id = clinical_participants.trial_id
|
1946
1927
|
group by
|
1947
|
-
trial_id,
|
1948
|
-
|
1949
|
-
|
1950
|
-
|
1951
|
-
|
1952
|
-
|
1953
|
-
|
1954
|
-
|
1955
|
-
|
1956
|
-
|
1957
|
-
|
1958
|
-
|
1959
|
-
|
1960
|
-
|
1961
|
-
|
1962
|
-
|
1963
|
-
|
1964
|
-
|
1965
|
-
|
1966
|
-
|
1967
|
-
|
1928
|
+
expected_assays.trial_id,
|
1929
|
+
total_participants,
|
1930
|
+
total_samples,
|
1931
|
+
sample_counts.sample_counts
|
1932
|
+
;
|
1933
|
+
"""
|
1934
|
+
|
1935
|
+
return combined_query
|
1936
|
+
|
1937
|
+
@staticmethod
|
1938
|
+
@with_default_session
|
1939
|
+
def get_summaries(session: Session) -> List[dict]:
|
1940
|
+
"""
|
1941
|
+
Return a list of trial summaries, where each summary has structure like:
|
1942
|
+
```python
|
1943
|
+
{
|
1944
|
+
"trial_id": ...,
|
1945
|
+
"expected_assays": ..., # list of assays the trial should have data for
|
1946
|
+
"file_size_bytes": ..., # total file size for the trial
|
1947
|
+
"clinical_participants": ..., # number of participants with clinical data
|
1948
|
+
"total_participants": ..., # number of unique participants with assay data
|
1949
|
+
"total_samples": ..., # number of samples with assay data
|
1950
|
+
"cytof": ..., # cytof sample count
|
1951
|
+
... # other assays and analysis
|
1952
|
+
}
|
1953
|
+
```
|
1954
|
+
NOTE: if the metadata model for any existing assays substantially changes,
|
1955
|
+
or if new assays are introduced that don't follow the typical structure
|
1956
|
+
(batches containing sample-level records), then this method will need to
|
1957
|
+
be updated to accommodate those changes.
|
1958
|
+
|
1959
|
+
Only the assays are used for calculating `"total_participants"` and `"total_samples"`,
|
1960
|
+
as all analyses are derived from assay data.
|
1961
|
+
Each assay/analysis subquery is expected to return a set with `trial_id`, `key`,
|
1962
|
+
and `cimac_id` which are used for both assay-level and overall counting.
|
1963
|
+
|
1964
|
+
There is a bit of complexity with the way that WES samples are counted:
|
1965
|
+
- `"wes"` only counts tumor samples slated for paired wes_analysis
|
1966
|
+
- `"wes_tumor_only"` counts all tumor samples NOT slated for paired wes_analysis
|
1967
|
+
- `"wes_analysis"` counts tumor samples with paired wes_analysis
|
1968
|
+
- `"wes_tumor_only_analysis"` counts (tumor) samples with tumor-only analysis
|
1969
|
+
For `"total_[participants/samples]"`, ALL (ie tumor AND normal) WES assay samples are included.
|
1970
|
+
"""
|
1971
|
+
summaries_query = "SELECT result FROM trial_summaries_mv"
|
1972
|
+
# Retrieve trial-level summary results from data cached in trial_summaries_mv materialized view.
|
1973
|
+
# The source of the SQL query used in trial_summaries_mv is get_summaries_query()
|
1968
1974
|
summaries = [
|
1969
|
-
summary for (summary,) in session.execute(
|
1975
|
+
summary for (summary,) in session.execute(summaries_query) if summary
|
1970
1976
|
]
|
1971
1977
|
|
1972
1978
|
# Shortcut to impute 0 values for assays where trials don't yet have data
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: nci_cidc_api_modules
|
3
|
-
Version: 1.0.9
|
3
|
+
Version: 1.0.12
|
4
4
|
Summary: SQLAlchemy data models and configuration tools used in the NCI CIDC API
|
5
5
|
Home-page: https://github.com/NCI-CIDC/cidc-api-gae
|
6
6
|
License: MIT license
|
@@ -21,10 +21,11 @@ Requires-Dist: google-cloud-bigquery ==3.18.0
|
|
21
21
|
Requires-Dist: google-api-python-client ==2.64.0
|
22
22
|
Requires-Dist: packaging >=20.0.0
|
23
23
|
Requires-Dist: pyarrow ==14.0.1
|
24
|
+
Requires-Dist: numpy <2,>=1.16.5
|
24
25
|
Requires-Dist: pandas <2,>=1
|
25
26
|
Requires-Dist: python-dotenv ==0.10.3
|
26
|
-
Requires-Dist: requests ==2.
|
27
|
-
Requires-Dist: jinja2 ==3.1.
|
27
|
+
Requires-Dist: requests ==2.32.3
|
28
|
+
Requires-Dist: jinja2 ==3.1.4
|
28
29
|
Requires-Dist: nci-cidc-schemas ==0.26.33
|
29
30
|
|
30
31
|
# NCI CIDC API <!-- omit in TOC -->
|
@@ -8,7 +8,7 @@ cidc_api/csms/auth.py,sha256=25Yma2Kz3KLENAPSeBYacFuSZXng-EDgmgInKBsRyP0,3191
|
|
8
8
|
cidc_api/models/__init__.py,sha256=bl445G8Zic9YbhZ8ZBni07wtBMhLJRMBA-JqjLxx2bw,66
|
9
9
|
cidc_api/models/csms_api.py,sha256=Wp4b53vwOqSlOIaoAYGlI1p8ZfXRXmVJ6MLcsvzq0LA,31664
|
10
10
|
cidc_api/models/migrations.py,sha256=gp9vtkYbA9FFy2s-7woelAmsvQbJ41LO2_DY-YkFIrQ,11464
|
11
|
-
cidc_api/models/models.py,sha256=
|
11
|
+
cidc_api/models/models.py,sha256=Hjp9sieGdldNbUzneFi-7vRYyo9wwr0D-0m_UbxsDEk,124106
|
12
12
|
cidc_api/models/schemas.py,sha256=7tDYtmULuzTt2kg7RorWhte06ffalgpQKrFiDRGcPEQ,2711
|
13
13
|
cidc_api/models/files/__init__.py,sha256=8BMTnUSHzUbz0lBeEQY6NvApxDD3GMWMduoVMos2g4Y,213
|
14
14
|
cidc_api/models/files/details.py,sha256=eg1u8uZwtxb0m9mFobcTL_mnPBMq1MPZv3NN3KWMGOI,62309
|
@@ -18,8 +18,8 @@ cidc_api/shared/auth.py,sha256=VMd_3QJE2iG16QxuGzHBV9MzJJItOZNn9gcw0_iUBLI,11647
|
|
18
18
|
cidc_api/shared/emails.py,sha256=5dyuKlpcg1M4P_RrAt0ss2hiCqb-Y7p2XXR1d9uBXg8,4868
|
19
19
|
cidc_api/shared/gcloud_client.py,sha256=7dDs0crLMJKdIp4IDSfrZBMB3h-zvWNieB81azoeLO4,33746
|
20
20
|
cidc_api/shared/rest_utils.py,sha256=LMfBpvJRjkfQjCzVXuhTTe4Foz4wlvaKg6QntyR-Hkc,6648
|
21
|
-
nci_cidc_api_modules-1.0.
|
22
|
-
nci_cidc_api_modules-1.0.
|
23
|
-
nci_cidc_api_modules-1.0.
|
24
|
-
nci_cidc_api_modules-1.0.
|
25
|
-
nci_cidc_api_modules-1.0.
|
21
|
+
nci_cidc_api_modules-1.0.12.dist-info/LICENSE,sha256=pNYWVTHaYonnmJyplmeAp7tQAjosmDpAWjb34jjv7Xs,1102
|
22
|
+
nci_cidc_api_modules-1.0.12.dist-info/METADATA,sha256=SPwKY2ReUVP7izlWlwHgVmLTSPxN95TSsFIWSQstems,40508
|
23
|
+
nci_cidc_api_modules-1.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
24
|
+
nci_cidc_api_modules-1.0.12.dist-info/top_level.txt,sha256=rNiRzL0lJGi5Q9tY9uSoMdTbJ-7u5c_D2E86KA94yRA,9
|
25
|
+
nci_cidc_api_modules-1.0.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|
{nci_cidc_api_modules-1.0.9.dist-info → nci_cidc_api_modules-1.0.12.dist-info}/top_level.txt
RENAMED
File without changes
|