idc-index-data 18.0.1__tar.gz → 18.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of idc-index-data might be problematic. Click here for more details.

Files changed (32) hide show
  1. idc_index_data-18.2.0/.github/workflows/external-indices.yml +60 -0
  2. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/CMakeLists.txt +3 -0
  3. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/PKG-INFO +1 -1
  4. idc_index_data-18.2.0/assets/README.md +3 -0
  5. idc_index_data-18.2.0/assets/sm_index.sql +132 -0
  6. idc_index_data-18.2.0/assets/sm_instance_index.sql +124 -0
  7. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/pyproject.toml +1 -1
  8. idc_index_data-18.2.0/scripts/python/external-indices.py +26 -0
  9. idc_index_data-18.2.0/scripts/sql/prior_versions_index.sql +89 -0
  10. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/src/idc_index_data/__init__.py +4 -0
  11. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/tests/test_package.py +5 -0
  12. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/.git_archival.txt +0 -0
  13. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/.gitattributes +0 -0
  14. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/.github/CONTRIBUTING.md +0 -0
  15. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/.github/dependabot.yml +0 -0
  16. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/.github/matchers/pylint.json +0 -0
  17. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/.github/workflows/cd.yml +0 -0
  18. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/.github/workflows/ci.yml +0 -0
  19. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/.github/workflows/keep-alive.yml +0 -0
  20. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/.gitignore +0 -0
  21. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/.pre-commit-config.yaml +0 -0
  22. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/.readthedocs.yaml +0 -0
  23. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/LICENSE +0 -0
  24. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/README.md +0 -0
  25. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/docs/conf.py +0 -0
  26. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/docs/index.md +0 -0
  27. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/noxfile.py +0 -0
  28. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/scripts/python/idc_index_data_manager.py +0 -0
  29. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/scripts/python/update_idc_index_version.py +0 -0
  30. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/scripts/sql/idc_index.sql +0 -0
  31. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/src/idc_index_data/_version.pyi +0 -0
  32. {idc_index_data-18.0.1 → idc_index_data-18.2.0}/src/idc_index_data/py.typed +0 -0
@@ -0,0 +1,60 @@
1
+ name: CD-external-indices
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ pull_request:
6
+ push:
7
+ branches:
8
+ - main
9
+ release:
10
+ types:
11
+ - published
12
+
13
+ concurrency:
14
+ group: ${{ github.workflow }}-${{ github.ref }}
15
+ cancel-in-progress: true
16
+
17
+ env:
18
+ FORCE_COLOR: 3
19
+
20
+ jobs:
21
+ dist:
22
+ name: CD-external-indices
23
+ runs-on: ubuntu-latest
24
+
25
+ steps:
26
+ - uses: actions/checkout@v4
27
+ with:
28
+ fetch-depth: 0
29
+
30
+ - name: Set up Python
31
+ uses: actions/setup-python@v5
32
+ with:
33
+ python-version: "3.12"
34
+
35
+ - name: Install dependencies
36
+ run: |
37
+ python -m pip install --upgrade pip
38
+ pip install db-dtypes google-cloud-bigquery pandas pyarrow
39
+
40
+ - name: Authorize Google Cloud
41
+ uses: google-github-actions/auth@v2
42
+ with:
43
+ credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
44
+ create_credentials_file: true
45
+ export_environment_variables: true
46
+
47
+ - name: Execute SQL Query and Generate Parquet Files
48
+ run: |
49
+ python scripts/python/external-indices.py
50
+ env:
51
+ PROJECT_ID: ${{ env.GCP_PROJECT }}
52
+
53
+ - name: Create Tagged Release
54
+ id: create_tagged_release
55
+ if: github.event_name == 'release' && github.event.action == 'published'
56
+ uses: ncipollo/release-action@v1
57
+ with:
58
+ artifacts: "*.parquet"
59
+ allowUpdates: true
60
+ omitBodyDuringUpdate: true
@@ -19,6 +19,7 @@ add_custom_command(
19
19
  OUTPUT
20
20
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
21
21
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
22
+ $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
22
23
  COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/scripts/python/idc_index_data_manager.py
23
24
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:--generate-csv-archive>
24
25
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:--generate-parquet>
@@ -28,10 +29,12 @@ add_custom_target(run_idc_index_data_manager ALL
28
29
  DEPENDS
29
30
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
30
31
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
32
+ $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
31
33
  )
32
34
 
33
35
  install(
34
36
  FILES
35
37
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
36
38
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
39
+ $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
37
40
  DESTINATION "idc_index_data")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: idc-index-data
3
- Version: 18.0.1
3
+ Version: 18.2.0
4
4
  Summary: ImagingDataCommons index to query and download data.
5
5
  Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
6
6
  License: Copyright 2024 Andrey Fedorov
@@ -0,0 +1,3 @@
1
+ This folder contains SQL scripts that are used to generate tables that are
2
+ attached to the releases as assets. Initially, those will be generated and
3
+ attached manually, but in the future this process may be automated.
@@ -0,0 +1,132 @@
1
+ -- Note that this query can be replaced with the much simpler one below
2
+ -- once this PR is merged and the changes from
3
+ -- https://github.com/ImagingDataCommons/etl_flow/pull/104 make it into a public release
4
+ --
5
+ -- SELECT
6
+ -- * EXCEPT(Modality)
7
+ -- FROM
8
+ -- `bigquery-public-data.idc_v18.dicom_metadata_curated_series_level`
9
+ -- WHERE
10
+ -- Modality = "SM"
11
+
12
+ WITH
13
+ temp_table AS (
14
+ SELECT
15
+ dicom_all.SeriesInstanceUID,
16
+ ANY_VALUE(Modality) AS Modality,
17
+ STRING_AGG(DISTINCT(collection_id),",") AS collection_id,
18
+ ANY_VALUE(OpticalPathSequence[SAFE_OFFSET(0)].ObjectiveLensPower) AS ObjectiveLensPower,
19
+ MAX(DISTINCT(TotalPixelMatrixColumns)) AS max_TotalPixelMatrixColumns,
20
+ MAX(DISTINCT(TotalPixelMatrixRows)) AS max_TotalPixelMatrixRows,
21
+ MAX(DISTINCT(`Columns`)) AS max_Columns,
22
+ MAX(DISTINCT(`Rows`)) AS max_Rows,
23
+ MIN(DISTINCT(SAFE_CAST(PixelSpacing[SAFE_OFFSET(0)] AS FLOAT64))) AS min_spacing_0,
24
+ MIN(SAFE_CAST(SharedFunctionalGroupsSequence[SAFE_OFFSET(0)].PixelMeasuresSequence[SAFE_OFFSET(0)]. PixelSpacing[SAFE_OFFSET(0)] AS FLOAT64)) AS fg_min_spacing_0,
25
+ ARRAY_AGG(DISTINCT(CONCAT(SpecimenDescriptionSequence[SAFE_OFFSET(0)].PrimaryAnatomicStructureSequence[SAFE_OFFSET(0)].CodingSchemeDesignator,":", SpecimenDescriptionSequence[SAFE_OFFSET(0)].PrimaryAnatomicStructureSequence[SAFE_OFFSET(0)].CodeValue, ":", SpecimenDescriptionSequence[SAFE_OFFSET(0)].PrimaryAnatomicStructureSequence[SAFE_OFFSET(0)].CodeMeaning)) IGNORE NULLS)[SAFE_OFFSET(0)] AS primaryAnatomicStructure_code_str,
26
+ ARRAY_AGG(DISTINCT(CONCAT(SpecimenDescriptionSequence[SAFE_OFFSET(0)].PrimaryAnatomicStructureSequence[SAFE_OFFSET(0)].PrimaryAnatomicStructureModifierSequence[SAFE_OFFSET(0)].CodingSchemeDesignator,":", SpecimenDescriptionSequence[SAFE_OFFSET(0)].PrimaryAnatomicStructureSequence[SAFE_OFFSET(0)].PrimaryAnatomicStructureModifierSequence[SAFE_OFFSET(0)].CodeValue, ":", SpecimenDescriptionSequence[SAFE_OFFSET(0)].PrimaryAnatomicStructureSequence[SAFE_OFFSET(0)].PrimaryAnatomicStructureModifierSequence[SAFE_OFFSET(0)].CodeMeaning)) IGNORE NULLS)[SAFE_OFFSET(0)] AS primaryAnatomicStructureModifier_code_str,
27
+
28
+ ARRAY_AGG(DISTINCT(CONCAT(OpticalPathSequence[SAFE_OFFSET(0)].IlluminationTypeCodeSequence[SAFE_OFFSET(0)].CodingSchemeDesignator,":", OpticalPathSequence[SAFE_OFFSET(0)].IlluminationTypeCodeSequence[SAFE_OFFSET(0)].CodeValue, ":", OpticalPathSequence[SAFE_OFFSET(0)].IlluminationTypeCodeSequence[SAFE_OFFSET(0)].CodeMeaning)) IGNORE NULLS)[SAFE_OFFSET(0)] AS illuminationType_code_str,
29
+ FROM
30
+ `bigquery-public-data.idc_v18.dicom_all` AS dicom_all
31
+ GROUP BY
32
+ SeriesInstanceUID
33
+ ),
34
+
35
+ SpecimenPreparationSequence_unnested AS (
36
+ SELECT
37
+ SeriesInstanceUID,
38
+ concept_name_code_sequence.CodeMeaning AS cnc_cm,
39
+ concept_name_code_sequence.CodingSchemeDesignator AS cnc_csd,
40
+ concept_name_code_sequence.CodeValue AS cnc_val,
41
+ concept_code_sequence.CodeMeaning AS ccs_cm,
42
+ concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
43
+ concept_code_sequence.CodeValue AS ccs_val,
44
+ FROM `bigquery-public-data.idc_v18.dicom_all`,
45
+ UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) as preparation_unnest_step1,
46
+ UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) as preparation_unnest_step2,
47
+ UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) as concept_name_code_sequence,
48
+ UNNEST(preparation_unnest_step2.ConceptCodeSequence) as concept_code_sequence
49
+ ),
50
+
51
+ slide_embedding AS (
52
+ SELECT
53
+ SeriesInstanceUID,
54
+ ARRAY_AGG(DISTINCT(CONCAT(ccs_cm,":",ccs_csd,":",ccs_val))) as embeddingMedium_code_str
55
+ FROM SpecimenPreparationSequence_unnested
56
+ WHERE (cnc_csd = 'SCT' and cnc_val = '430863003') -- CodeMeaning is 'Embedding medium'
57
+ GROUP BY SeriesInstanceUID
58
+ ),
59
+
60
+ slide_fixative AS (
61
+ SELECT
62
+ SeriesInstanceUID,
63
+ ARRAY_AGG(DISTINCT(CONCAT(ccs_cm, ":", ccs_csd,":",ccs_val))) as tissueFixative_code_str
64
+ FROM SpecimenPreparationSequence_unnested
65
+ WHERE (cnc_csd = 'SCT' and cnc_val = '430864009') -- CodeMeaning is 'Tissue Fixative'
66
+ GROUP BY SeriesInstanceUID
67
+ ),
68
+
69
+ slide_staining AS (
70
+ SELECT
71
+ SeriesInstanceUID,
72
+ ARRAY_AGG(DISTINCT(CONCAT(ccs_cm, ":", ccs_csd,":",ccs_val))) as staining_usingSubstance_code_str,
73
+ FROM SpecimenPreparationSequence_unnested
74
+ WHERE (cnc_csd = 'SCT' and cnc_val = '424361007') -- CodeMeaning is 'Using substance'
75
+ GROUP BY SeriesInstanceUID
76
+ )
77
+
78
+ SELECT
79
+ temp_table.SeriesInstanceUID,
80
+ -- Embedding Medium
81
+ ARRAY(
82
+ SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
83
+ FROM UNNEST(embeddingMedium_code_str) AS code
84
+ ) AS embeddingMedium_CodeMeaning,
85
+ ARRAY(
86
+ SELECT IF(code IS NULL, NULL,
87
+ IF(STRPOS(code, ':') = 0, NULL,
88
+ SUBSTR(code, STRPOS(code, ':') + 1)))
89
+ FROM UNNEST(embeddingMedium_code_str) AS code
90
+ ) AS embeddingMedium_code_designator_value_str,
91
+ -- Tissue Fixative
92
+ ARRAY(
93
+ SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
94
+ FROM UNNEST(tissueFixative_code_str) AS code
95
+ ) AS tissueFixative_CodeMeaning,
96
+ ARRAY(
97
+ SELECT IF(code IS NULL, NULL,
98
+ IF(STRPOS(code, ':') = 0, NULL,
99
+ SUBSTR(code, STRPOS(code, ':') + 1)))
100
+ FROM UNNEST(tissueFixative_code_str) AS code
101
+ ) AS tissueFixative_code_designator_value_str,
102
+ -- Staining using substance
103
+ ARRAY(
104
+ SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
105
+ FROM UNNEST(staining_usingSubstance_code_str) AS code
106
+ ) AS staining_usingSubstance_CodeMeaning,
107
+ ARRAY(
108
+ SELECT IF(code IS NULL, NULL,
109
+ IF(STRPOS(code, ':') = 0, NULL,
110
+ SUBSTR(code, STRPOS(code, ':') + 1)))
111
+ FROM UNNEST(staining_usingSubstance_code_str) AS code
112
+ ) AS staining_usingSubstance_code_designator_value_str,
113
+
114
+ if(COALESCE(min_spacing_0, fg_min_spacing_0) = 0, 0,
115
+ round(COALESCE(min_spacing_0, fg_min_spacing_0) ,CAST(2 -1-floor(log10(abs(COALESCE(min_spacing_0, fg_min_spacing_0) ))) AS INT64))) AS min_PixelSpacing_2sf,
116
+ COALESCE(max_TotalPixelMatrixColumns, max_Columns) AS max_TotalPixelMatrixColumns,
117
+ COALESCE(max_TotalPixelMatrixRows, max_Rows) AS max_TotalPixelMatrixRows,
118
+ SAFE_CAST(ObjectiveLensPower as INT) as ObjectiveLensPower,
119
+ CONCAT(SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructure_code_designator_value_str,
120
+ SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructure_CodeMeaning,
121
+ CONCAT(SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructureModifier_code_designator_value_str,
122
+ SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructureModifier_CodeMeaning,
123
+
124
+ CONCAT(SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(1)]) as illuminationType_code_designator_value_str,
125
+ SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(2)] as illuminationType_CodeMeaning,
126
+ FROM
127
+ temp_table
128
+ LEFT JOIN slide_embedding on temp_table.SeriesInstanceUID = slide_embedding.SeriesInstanceUID
129
+ LEFT JOIN slide_fixative on temp_table.SeriesInstanceUID = slide_fixative.SeriesInstanceUID
130
+ LEFT JOIN slide_staining on temp_table.SeriesInstanceUID = slide_staining.SeriesInstanceUID
131
+ WHERE
132
+ Modality = "SM"
@@ -0,0 +1,124 @@
1
+ WITH
2
+ SpecimenPreparationSequence_unnested AS (
3
+ SELECT
4
+ SOPInstanceUID,
5
+ concept_name_code_sequence.CodeMeaning AS cnc_cm,
6
+ concept_name_code_sequence.CodingSchemeDesignator AS cnc_csd,
7
+ concept_name_code_sequence.CodeValue AS cnc_val,
8
+ concept_code_sequence.CodeMeaning AS ccs_cm,
9
+ concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
10
+ concept_code_sequence.CodeValue AS ccs_val,
11
+ FROM
12
+ `bigquery-public-data.idc_v18.dicom_all`,
13
+ UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) AS preparation_unnest_step1,
14
+ UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) AS preparation_unnest_step2,
15
+ UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) AS concept_name_code_sequence,
16
+ UNNEST(preparation_unnest_step2.ConceptCodeSequence) AS concept_code_sequence ),
17
+ slide_embedding AS (
18
+ SELECT
19
+ SOPInstanceUID,
20
+ ARRAY_AGG(DISTINCT(CONCAT(ccs_cm,":",ccs_csd,":",ccs_val))) AS embeddingMedium_code_str
21
+ FROM
22
+ SpecimenPreparationSequence_unnested
23
+ WHERE
24
+ (cnc_csd = 'SCT'
25
+ AND cnc_val = '430863003') -- CodeMeaning is 'Embedding medium'
26
+ GROUP BY
27
+ SOPInstanceUID ),
28
+ slide_fixative AS (
29
+ SELECT
30
+ SOPInstanceUID,
31
+ ARRAY_AGG(DISTINCT(CONCAT(ccs_cm, ":", ccs_csd,":",ccs_val))) AS tissueFixative_code_str
32
+ FROM
33
+ SpecimenPreparationSequence_unnested
34
+ WHERE
35
+ (cnc_csd = 'SCT'
36
+ AND cnc_val = '430864009') -- CodeMeaning is 'Tissue Fixative'
37
+ GROUP BY
38
+ SOPInstanceUID ),
39
+ slide_staining AS (
40
+ SELECT
41
+ SOPInstanceUID,
42
+ ARRAY_AGG(DISTINCT(CONCAT(ccs_cm, ":", ccs_csd,":",ccs_val))) AS staining_usingSubstance_code_str,
43
+ FROM
44
+ SpecimenPreparationSequence_unnested
45
+ WHERE
46
+ (cnc_csd = 'SCT'
47
+ AND cnc_val = '424361007') -- CodeMeaning is 'Using substance'
48
+ GROUP BY
49
+ SOPInstanceUID )
50
+ SELECT
51
+ dicom_all.SOPInstanceUID,
52
+ dicom_all.SeriesInstanceUID,
53
+ -- Embedding Medium
54
+ ARRAY(
55
+ SELECT
56
+ IF
57
+ (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
58
+ FROM
59
+ UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_CodeMeaning,
60
+ ARRAY(
61
+ SELECT
62
+ IF
63
+ (code IS NULL, NULL,
64
+ IF
65
+ (STRPOS(code, ':') = 0, NULL, SUBSTR(code, STRPOS(code, ':') + 1)))
66
+ FROM
67
+ UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_code_designator_value_str,
68
+ -- Tissue Fixative
69
+ ARRAY(
70
+ SELECT
71
+ IF
72
+ (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
73
+ FROM
74
+ UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_CodeMeaning,
75
+ ARRAY(
76
+ SELECT
77
+ IF
78
+ (code IS NULL, NULL,
79
+ IF
80
+ (STRPOS(code, ':') = 0, NULL, SUBSTR(code, STRPOS(code, ':') + 1)))
81
+ FROM
82
+ UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_code_designator_value_str,
83
+ -- Staining using substance
84
+ ARRAY(
85
+ SELECT
86
+ IF
87
+ (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
88
+ FROM
89
+ UNNEST(staining_usingSubstance_code_str) AS code ) AS staining_usingSubstance_CodeMeaning,
90
+ ARRAY(
91
+ SELECT
92
+ IF
93
+ (code IS NULL, NULL,
94
+ IF
95
+ (STRPOS(code, ':') = 0, NULL, SUBSTR(code, STRPOS(code, ':') + 1)))
96
+ FROM
97
+ UNNEST(staining_usingSubstance_code_str) AS code ) AS staining_usingSubstance_code_designator_value_str,
98
+ -- instance-specific image attributes
99
+ -- NB: there is a caveat that I think in general, we expect square pixels, but in htan_wustl and cptac_luad this assumption does not hold,
100
+ -- and in htan_wustl, the difference is rather large (x2) - waiting to hear from David Clunie about this...
101
+ SAFE_CAST(SharedFunctionalGroupsSequence[SAFE_OFFSET(0)].PixelMeasuresSequence[SAFE_OFFSET(0)]. PixelSpacing[SAFE_OFFSET(0)] AS FLOAT64) AS PixelSpacing_0,
102
+ dicom_all.ImageType,
103
+ dicom_all.TransferSyntaxUID,
104
+ dicom_all.instance_size,
105
+ -- attributes needed to retrieve the selected instances/files
106
+ dicom_all.crdc_instance_uuid
107
+ FROM
108
+ `bigquery-public-data.idc_v18.dicom_all` AS dicom_all -- pinned to idc_v18, the same release the CTEs above read; idc_current would drift to a newer release
109
+ LEFT JOIN
110
+ slide_embedding
111
+ ON
112
+ dicom_all.SOPInstanceUID = slide_embedding.SOPInstanceUID
113
+ LEFT JOIN
114
+ slide_fixative
115
+ ON
116
+ dicom_all.SOPInstanceUID = slide_fixative.SOPInstanceUID
117
+ LEFT JOIN
118
+ slide_staining
119
+ ON
120
+ dicom_all.SOPInstanceUID = slide_staining.SOPInstanceUID
121
+ WHERE
122
+ dicom_all.Modality="SM"
123
+ ORDER BY
124
+ SeriesInstanceUID DESC
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
13
13
 
14
14
  [project]
15
15
  name = "idc-index-data"
16
- version = "18.0.1"
16
+ version = "18.2.0"
17
17
  authors = [
18
18
  { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
19
19
  { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
@@ -0,0 +1,26 @@
1
+ # external-indices.py: run every SQL query in assets/ and write each result to a Parquet file
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ from pathlib import Path
6
+
7
+ from idc_index_data_manager import IDCIndexDataManager
8
+
9
+
10
+ def main():
11
+ project_id = os.getenv("PROJECT_ID")
12
+ manager = IDCIndexDataManager(project_id=project_id)
13
+ scripts_dir = Path(__file__).resolve().parent.parent
14
+ assets_dir = scripts_dir.parent / "assets"
15
+
16
+ # Collecting all .sql files from sql_dir and assets_dir
17
+ sql_files = [f for f in os.listdir(assets_dir) if f.endswith(".sql")]
18
+
19
+ for file_name in sql_files:
20
+ file_path = assets_dir / file_name
21
+ index_df, output_basename = manager.execute_sql_query(file_path)
22
+ index_df.to_parquet(f"{output_basename}.parquet")
23
+
24
+
25
+ if __name__ == "__main__":
26
+ main()
@@ -0,0 +1,89 @@
1
+ -- For details on the syntax, see
2
+ -- https://cloud.google.com/bigquery/docs/reference/standard-sql/procedural-language
3
+ --
4
+ -- Step 1: Declare variables
5
+ DECLARE idc_versions ARRAY<INT64>;
6
+ DECLARE latest_idc_version INT64 DEFAULT 18;
7
+ DECLARE union_all_query STRING;
8
+
9
+ -- Step 2 (disabled): derive latest_idc_version from version_metadata instead of hard-coding it
10
+ --SET latest_idc_version = (
11
+ --SELECT max(idc_version)
12
+ --FROM
13
+ --bigquery-public-data.idc_current.version_metadata
14
+ --);
15
+
16
+ -- Step 3: Get all idc_versions
17
+ SET idc_versions = (
18
+ SELECT GENERATE_ARRAY(1, latest_idc_version)
19
+ -- SELECT [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]
20
+ --SELECT ARRAY_AGG(idc_version)
21
+ --FROM
22
+ --`bigquery-public-data.idc_current.version_metadata`
23
+ );
24
+
25
+ -- Step 4: Generate the UNION ALL query dynamically
26
+ SET union_all_query = (
27
+ SELECT STRING_AGG(
28
+ FORMAT("""
29
+ SELECT
30
+ %d AS idc_version,
31
+ collection_id,
32
+ PatientID,
33
+ SeriesInstanceUID,
34
+ StudyInstanceUID,
35
+ Modality,
36
+ regexp_extract(gcs_url, 'gs://([^/]+)/') as gcs_bucket,
37
+ crdc_series_uuid,
38
+ ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
39
+ FROM
40
+ `bigquery-public-data.idc_v%d.dicom_all` AS dicom_all
41
+ where crdc_series_uuid not in (select distinct crdc_series_uuid from `bigquery-public-data.idc_v%d.dicom_all`)
42
+ GROUP BY
43
+ 1,2,3,4,5,6,7,8
44
+
45
+ """,
46
+ version, version, latest_idc_version),
47
+ " UNION ALL "
48
+ )
49
+ FROM UNNEST(idc_versions) AS version
50
+ );
51
+
52
+ -- Step 5: Execute the complete query
53
+ EXECUTE IMMEDIATE FORMAT("""
54
+ WITH all_versions AS (
55
+ %s
56
+ )
57
+ SELECT
58
+ collection_id,
59
+ PatientID,
60
+ SeriesInstanceUID,
61
+ StudyInstanceUID,
62
+ Modality,
63
+ gcs_bucket,
64
+ crdc_series_uuid,
65
+ series_size_MB,
66
+ CASE
67
+
68
+ # map GCS bucket to AWS bucket, since for idc-index we prefer AWS
69
+ # if new buckets are included in IDC, this will need to be updated!
70
+
71
+ WHEN gcs_bucket='public-datasets-idc' THEN CONCAT('s3://','idc-open-data/',crdc_series_uuid, '/*')
72
+ WHEN gcs_bucket='idc-open-idc1' THEN CONCAT('s3://','idc-open-data-two/',crdc_series_uuid, '/*')
73
+ WHEN gcs_bucket='idc-open-cr' THEN CONCAT('s3://','idc-open-data-cr/',crdc_series_uuid, '/*')
74
+ END AS series_aws_url,
75
+ MIN(idc_version) AS min_idc_version,
76
+ MAX(idc_version) AS max_idc_version
77
+ FROM all_versions
78
+
79
+ where gcs_bucket not in ('idc-open-idc')
80
+
81
+ #per @bcli4d:idc-open-idc was our public bucket before we moved most data to the Google owned public-datasets-idc.
82
+ #We decided at the time to not touch BQ. To deal with this and other cases where some metadata can change (Licences),
83
+ #we include the mutable_metadata table which maps crdc_instance_uuid to current gcs_url, aws_url, license, doi.
84
+
85
+ GROUP BY
86
+ 1,2,3,4,5,6,7,8
87
+ """,
88
+ union_all_query
89
+ );
@@ -15,6 +15,7 @@ __all__ = [
15
15
  "__version__",
16
16
  "IDC_INDEX_CSV_ARCHIVE_FILEPATH",
17
17
  "IDC_INDEX_PARQUET_FILEPATH",
18
+ "PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH",
18
19
  ]
19
20
 
20
21
 
@@ -36,3 +37,6 @@ IDC_INDEX_CSV_ARCHIVE_FILEPATH: Path | None = _lookup(
36
37
  "idc_index_data/idc_index.csv.zip", optional=True
37
38
  )
38
39
  IDC_INDEX_PARQUET_FILEPATH: Path | None = _lookup("idc_index_data/idc_index.parquet")
40
+ PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH: Path | None = _lookup(
41
+ "idc_index_data/prior_versions_index.parquet"
42
+ )
@@ -38,3 +38,8 @@ def test_reading_index():
38
38
  assert m.IDC_INDEX_PARQUET_FILEPATH.is_file()
39
39
  df_parquet = pd.read_parquet(m.IDC_INDEX_PARQUET_FILEPATH)
40
40
  assert not df_parquet.empty
41
+
42
+ if m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH is not None:
43
+ assert m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH.is_file()
44
+ df_parquet = pd.read_parquet(m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH)
45
+ assert not df_parquet.empty
File without changes