idc-index-data 22.1.1__tar.gz → 22.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/workflows/external-indices.yml +1 -1
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.pre-commit-config.yaml +1 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/PKG-INFO +2 -1
- idc_index_data-22.1.3/assets/clinical_index.sql +23 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/assets/sm_index.sql +36 -3
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/assets/sm_instance_index.sql +30 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/pyproject.toml +6 -4
- idc_index_data-22.1.3/pytest.ini +2 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/scripts/python/generate-indices.py +14 -4
- idc_index_data-22.1.3/scripts/python/idc_index_data_manager.py +424 -0
- idc_index_data-22.1.3/scripts/sql/collections_index.sql +39 -0
- idc_index_data-22.1.3/scripts/sql/idc_index.sql +88 -0
- idc_index_data-22.1.3/tests/test_column_description_parser.py +218 -0
- idc_index_data-22.1.3/tests/test_real_sql_parsing.py +101 -0
- idc_index_data-22.1.1/assets/clinical_index.sql +0 -11
- idc_index_data-22.1.1/scripts/python/idc_index_data_manager.py +0 -202
- idc_index_data-22.1.1/scripts/sql/collections_index.sql +0 -15
- idc_index_data-22.1.1/scripts/sql/idc_index.sql +0 -38
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.git_archival.txt +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.gitattributes +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/CONTRIBUTING.md +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/copilot-instructions.md +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/dependabot.yml +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/matchers/pylint.json +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/workflows/cd.yml +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/workflows/ci.yml +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.gitignore +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.readthedocs.yaml +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/CMakeLists.txt +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/LICENSE +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/README.md +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/assets/README.md +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/docs/conf.py +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/docs/index.md +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/noxfile.py +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/scripts/python/update_idc_index_version.py +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/scripts/sql/analysis_results_index.sql +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/scripts/sql/prior_versions_index.sql +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/src/idc_index_data/__init__.py +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/src/idc_index_data/_version.pyi +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/src/idc_index_data/py.typed +0 -0
- {idc_index_data-22.1.1 → idc_index_data-22.1.3}/tests/test_package.py +0 -0
|
@@ -55,6 +55,6 @@ jobs:
|
|
|
55
55
|
if: github.event_name == 'release' && github.event.action == 'published'
|
|
56
56
|
uses: ncipollo/release-action@v1
|
|
57
57
|
with:
|
|
58
|
-
artifacts: "release_artifacts/*.parquet,release_artifacts/*.json"
|
|
58
|
+
artifacts: "release_artifacts/*.parquet,release_artifacts/*.json,release_artifacts/*.sql"
|
|
59
59
|
allowUpdates: true
|
|
60
60
|
omitBodyDuringUpdate: true
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: idc-index-data
|
|
3
|
-
Version: 22.1.1
|
|
3
|
+
Version: 22.1.3
|
|
4
4
|
Summary: ImagingDataCommons index to query and download data.
|
|
5
5
|
Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
|
|
6
6
|
License: Copyright 2024 Andrey Fedorov
|
|
@@ -41,6 +41,7 @@ Project-URL: Bug Tracker, https://github.com/ImagingDataCommons/idc-index-data/i
|
|
|
41
41
|
Project-URL: Discussions, https://discourse.canceridc.dev/
|
|
42
42
|
Project-URL: Changelog, https://github.com/ImagingDataCommons/idc-index-data/releases
|
|
43
43
|
Requires-Python: >=3.10
|
|
44
|
+
Requires-Dist: google-cloud-bigquery
|
|
44
45
|
Provides-Extra: test
|
|
45
46
|
Requires-Dist: pandas; extra == "test"
|
|
46
47
|
Requires-Dist: pyarrow; extra == "test"
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
SELECT
|
|
2
|
+
# description:
|
|
3
|
+
# unique identifier of the collection
|
|
4
|
+
collection_id,
|
|
5
|
+
# description:
|
|
6
|
+
# full name of the table in which the column is stored
|
|
7
|
+
table_name,
|
|
8
|
+
# description:
|
|
9
|
+
# short name of the table in which the column is stored
|
|
10
|
+
SPLIT(table_name,'.')[SAFE_OFFSET(2)] AS short_table_name,
|
|
11
|
+
# description:
|
|
12
|
+
# name of the column in which the value is stored
|
|
13
|
+
`column`,
|
|
14
|
+
# description:
|
|
15
|
+
# human readable name of the column
|
|
16
|
+
column_label,
|
|
17
|
+
# description:
|
|
18
|
+
# values encountered in the column
|
|
19
|
+
`values`
|
|
20
|
+
FROM
|
|
21
|
+
`bigquery-public-data.idc_v22_clinical.column_metadata`
|
|
22
|
+
ORDER BY
|
|
23
|
+
collection_id, table_name
|
|
@@ -82,10 +82,14 @@ SpecimenPreparationSequence_unnested AS (
|
|
|
82
82
|
SELECT
|
|
83
83
|
temp_table.SeriesInstanceUID,
|
|
84
84
|
-- Embedding Medium
|
|
85
|
+
# description:
|
|
86
|
+
# embedding medium used for the slide preparation
|
|
85
87
|
ARRAY(
|
|
86
88
|
SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
|
|
87
89
|
FROM UNNEST(embeddingMedium_code_str) AS code
|
|
88
90
|
) AS embeddingMedium_CodeMeaning,
|
|
91
|
+
# description:
|
|
92
|
+
# embedding medium code tuple
|
|
89
93
|
ARRAY(
|
|
90
94
|
SELECT IF(code IS NULL, NULL,
|
|
91
95
|
IF(STRPOS(code, ':') = 0, NULL,
|
|
@@ -93,10 +97,14 @@ SELECT
|
|
|
93
97
|
FROM UNNEST(embeddingMedium_code_str) AS code
|
|
94
98
|
) AS embeddingMedium_code_designator_value_str,
|
|
95
99
|
-- Tissue Fixative
|
|
100
|
+
# description:
|
|
101
|
+
# tissue fixative used for the slide preparation
|
|
96
102
|
ARRAY(
|
|
97
103
|
SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
|
|
98
104
|
FROM UNNEST(tissueFixative_code_str) AS code
|
|
99
105
|
) AS tissueFixative_CodeMeaning,
|
|
106
|
+
# description:
|
|
107
|
+
# tissue fixative code tuple
|
|
100
108
|
ARRAY(
|
|
101
109
|
SELECT IF(code IS NULL, NULL,
|
|
102
110
|
IF(STRPOS(code, ':') = 0, NULL,
|
|
@@ -104,31 +112,56 @@ SELECT
|
|
|
104
112
|
FROM UNNEST(tissueFixative_code_str) AS code
|
|
105
113
|
) AS tissueFixative_code_designator_value_str,
|
|
106
114
|
-- Staining using substance
|
|
115
|
+
# description:
|
|
116
|
+
# staining substances used for the slide preparation
|
|
107
117
|
ARRAY(
|
|
108
118
|
SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
|
|
109
119
|
FROM UNNEST(staining_usingSubstance_code_str) AS code
|
|
110
120
|
) AS staining_usingSubstance_CodeMeaning,
|
|
121
|
+
# description:
|
|
122
|
+
# staining using substance code tuple
|
|
111
123
|
ARRAY(
|
|
112
124
|
SELECT IF(code IS NULL, NULL,
|
|
113
125
|
IF(STRPOS(code, ':') = 0, NULL,
|
|
114
126
|
SUBSTR(code, STRPOS(code, ':') + 1)))
|
|
115
127
|
FROM UNNEST(staining_usingSubstance_code_str) AS code
|
|
116
128
|
) AS staining_usingSubstance_code_designator_value_str,
|
|
117
|
-
|
|
129
|
+
# description:
|
|
130
|
+
# pixel spacing in mm at the maximum resolution layer, rounded to 2 significant figures
|
|
118
131
|
if(COALESCE(min_spacing_0, fg_min_spacing_0) = 0, 0,
|
|
119
132
|
round(COALESCE(min_spacing_0, fg_min_spacing_0) ,CAST(2 -1-floor(log10(abs(COALESCE(min_spacing_0, fg_min_spacing_0) ))) AS INT64))) AS min_PixelSpacing_2sf,
|
|
133
|
+
# description:
|
|
134
|
+
# width of the image at the maximum resolution
|
|
120
135
|
COALESCE(max_TotalPixelMatrixColumns, max_Columns) AS max_TotalPixelMatrixColumns,
|
|
136
|
+
# description:
|
|
137
|
+
# height of the image at the maximum resolution
|
|
121
138
|
COALESCE(max_TotalPixelMatrixRows, max_Rows) AS max_TotalPixelMatrixRows,
|
|
139
|
+
# description:
|
|
140
|
+
# power of the objective lens of the equipment used to digitize the slide
|
|
122
141
|
SAFE_CAST(ObjectiveLensPower as INT) as ObjectiveLensPower,
|
|
142
|
+
# description:
|
|
143
|
+
# code tuple for the anatomic location from where the imaged specimen was collected
|
|
123
144
|
CONCAT(SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructure_code_designator_value_str,
|
|
145
|
+
# description:
|
|
146
|
+
# anatomic location from where the imaged specimen was collected
|
|
124
147
|
SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructure_CodeMeaning,
|
|
148
|
+
# description:
|
|
149
|
+
# code tuple for additional characteristics of the specimen, such as whether it is a tumor or normal tissue (when available)
|
|
125
150
|
CONCAT(SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructureModifier_code_designator_value_str,
|
|
151
|
+
# description:
|
|
152
|
+
# additional characteristics of the specimen, such as whether it is a tumor or normal tissue (when available)
|
|
126
153
|
SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructureModifier_CodeMeaning,
|
|
127
|
-
|
|
154
|
+
# description:
|
|
155
|
+
# code tuple for the illumination type used during slide digitization
|
|
128
156
|
CONCAT(SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(1)]) as illuminationType_code_designator_value_str,
|
|
157
|
+
# description:
|
|
158
|
+
# illumination type used during slide digitization
|
|
129
159
|
SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(2)] as illuminationType_CodeMeaning,
|
|
130
|
-
|
|
160
|
+
# description:
|
|
161
|
+
# code tuple for the admitting diagnosis associated with the specimen imaged on the slide (when available)
|
|
131
162
|
CONCAT(SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(1)]) as admittingDiagnosis_code_designator_value_str,
|
|
163
|
+
# description:
|
|
164
|
+
# admitting diagnosis associated with the specimen imaged on the slide (when available)
|
|
132
165
|
SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(2)] as admittingDiagnosis_CodeMeaning,
|
|
133
166
|
FROM
|
|
134
167
|
temp_table
|
|
@@ -48,15 +48,23 @@ WITH
|
|
|
48
48
|
GROUP BY
|
|
49
49
|
SOPInstanceUID )
|
|
50
50
|
SELECT
|
|
51
|
+
# description:
|
|
52
|
+
# unique identifier of the instance
|
|
51
53
|
dicom_all.SOPInstanceUID,
|
|
54
|
+
# description:
|
|
55
|
+
# unique identifier of the series
|
|
52
56
|
dicom_all.SeriesInstanceUID,
|
|
53
57
|
-- Embedding Medium
|
|
58
|
+
# description:
|
|
59
|
+
# embedding medium used for the slide preparation
|
|
54
60
|
ARRAY(
|
|
55
61
|
SELECT
|
|
56
62
|
IF
|
|
57
63
|
(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
|
|
58
64
|
FROM
|
|
59
65
|
UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_CodeMeaning,
|
|
66
|
+
# description:
|
|
67
|
+
# embedding medium code tuple
|
|
60
68
|
ARRAY(
|
|
61
69
|
SELECT
|
|
62
70
|
IF
|
|
@@ -66,12 +74,16 @@ SELECT
|
|
|
66
74
|
FROM
|
|
67
75
|
UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_code_designator_value_str,
|
|
68
76
|
-- Tissue Fixative
|
|
77
|
+
# description:
|
|
78
|
+
# tissue fixative used for the slide preparation
|
|
69
79
|
ARRAY(
|
|
70
80
|
SELECT
|
|
71
81
|
IF
|
|
72
82
|
(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
|
|
73
83
|
FROM
|
|
74
84
|
UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_CodeMeaning,
|
|
85
|
+
# description:
|
|
86
|
+
# tissue fixative code tuple
|
|
75
87
|
ARRAY(
|
|
76
88
|
SELECT
|
|
77
89
|
IF
|
|
@@ -81,12 +93,16 @@ SELECT
|
|
|
81
93
|
FROM
|
|
82
94
|
UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_code_designator_value_str,
|
|
83
95
|
-- Staining using substance
|
|
96
|
+
# description:
|
|
97
|
+
# staining substances used for the slide preparation
|
|
84
98
|
ARRAY(
|
|
85
99
|
SELECT
|
|
86
100
|
IF
|
|
87
101
|
(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
|
|
88
102
|
FROM
|
|
89
103
|
UNNEST(staining_usingSubstance_code_str) AS code ) AS staining_usingSubstance_CodeMeaning,
|
|
104
|
+
# description:
|
|
105
|
+
# staining using substance code tuple
|
|
90
106
|
ARRAY(
|
|
91
107
|
SELECT
|
|
92
108
|
IF
|
|
@@ -98,13 +114,27 @@ SELECT
|
|
|
98
114
|
-- instance-specific image attributes
|
|
99
115
|
-- NB: there is a caveat that I think in general, we expect square pixels, but in htan_wustl and cptac_luad this assumption does not hold,
|
|
100
116
|
-- and in htan_wustl, the difference is rather large (x2) - waiting to hear from David Clunie about this...
|
|
117
|
+
# description:
|
|
118
|
+
# pixel spacing in mm (first value of the PixelSpacing attribute, not rounded)
|
|
101
119
|
SAFE_CAST(SharedFunctionalGroupsSequence[SAFE_OFFSET(0)].PixelMeasuresSequence[SAFE_OFFSET(0)]. PixelSpacing[SAFE_OFFSET(0)] AS FLOAT64) AS PixelSpacing_0,
|
|
120
|
+
# description:
|
|
121
|
+
# DICOM ImageType attribute
|
|
102
122
|
dicom_all.ImageType,
|
|
123
|
+
# description:
|
|
124
|
+
# DICOM TransferSyntaxUID attribute
|
|
103
125
|
dicom_all.TransferSyntaxUID,
|
|
126
|
+
# description:
|
|
127
|
+
# size of the instance file in bytes
|
|
104
128
|
dicom_all.instance_size,
|
|
129
|
+
# description:
|
|
130
|
+
# number of columns in the image
|
|
105
131
|
dicom_all.TotalPixelMatrixColumns,
|
|
132
|
+
# description:
|
|
133
|
+
# number of rows in the image
|
|
106
134
|
dicom_all.TotalPixelMatrixRows,
|
|
107
135
|
-- attributes needed to retrieve the selected instances/files
|
|
136
|
+
# description:
|
|
137
|
+
# unique identifier of the instance within the IDC
|
|
108
138
|
dicom_all.crdc_instance_uuid
|
|
109
139
|
FROM
|
|
110
140
|
`bigquery-public-data.idc_v22.dicom_all` AS dicom_all
|
|
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
|
|
|
13
13
|
|
|
14
14
|
[project]
|
|
15
15
|
name = "idc-index-data"
|
|
16
|
-
version = "22.1.1"
|
|
16
|
+
version = "22.1.3"
|
|
17
17
|
authors = [
|
|
18
18
|
{ name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
|
|
19
19
|
{ name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
|
|
@@ -38,7 +38,9 @@ classifiers = [
|
|
|
38
38
|
"Topic :: Scientific/Engineering",
|
|
39
39
|
"Typing :: Typed",
|
|
40
40
|
]
|
|
41
|
-
dependencies = []
|
|
41
|
+
dependencies = [
|
|
42
|
+
"google-cloud-bigquery"
|
|
43
|
+
]
|
|
42
44
|
|
|
43
45
|
[project.optional-dependencies]
|
|
44
46
|
test = [
|
|
@@ -102,7 +104,7 @@ report.exclude_also = [
|
|
|
102
104
|
|
|
103
105
|
[tool.mypy]
|
|
104
106
|
files = ["src", "tests"]
|
|
105
|
-
python_version = "3.
|
|
107
|
+
python_version = "3.10"
|
|
106
108
|
warn_unused_configs = true
|
|
107
109
|
strict = true
|
|
108
110
|
enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"]
|
|
@@ -158,7 +160,7 @@ isort.required-imports = ["from __future__ import annotations"]
|
|
|
158
160
|
|
|
159
161
|
|
|
160
162
|
[tool.pylint]
|
|
161
|
-
py-version = "3.
|
|
163
|
+
py-version = "3.10"
|
|
162
164
|
ignore-paths = [".*/_version.py"]
|
|
163
165
|
reports.output-format = "colorized"
|
|
164
166
|
similarities.ignore-imports = "yes"
|
|
@@ -23,10 +23,13 @@ def main():
|
|
|
23
23
|
|
|
24
24
|
for file_name in sql_files:
|
|
25
25
|
file_path = assets_dir / file_name
|
|
26
|
-
index_df, output_basename, schema = manager.execute_sql_query(
|
|
26
|
+
index_df, output_basename, schema, sql_query = manager.execute_sql_query(
|
|
27
|
+
file_path
|
|
28
|
+
)
|
|
27
29
|
parquet_file_path = output_dir / f"{output_basename}.parquet"
|
|
28
30
|
index_df.to_parquet(parquet_file_path)
|
|
29
|
-
manager.save_schema_to_json(schema, output_basename, output_dir)
|
|
31
|
+
manager.save_schema_to_json(schema, output_basename, sql_query, output_dir)
|
|
32
|
+
manager.save_sql_query(sql_query, output_basename, output_dir)
|
|
30
33
|
|
|
31
34
|
core_indices_dir = scripts_dir.parent / "scripts" / "sql"
|
|
32
35
|
|
|
@@ -34,10 +37,17 @@ def main():
|
|
|
34
37
|
|
|
35
38
|
for file_name in sql_files:
|
|
36
39
|
file_path = core_indices_dir / file_name
|
|
37
|
-
index_df, output_basename, schema = manager.execute_sql_query(
|
|
40
|
+
index_df, output_basename, schema, sql_query = manager.execute_sql_query(
|
|
41
|
+
file_path
|
|
42
|
+
)
|
|
38
43
|
parquet_file_path = output_dir / f"{output_basename}.parquet"
|
|
39
44
|
index_df.to_parquet(parquet_file_path)
|
|
40
|
-
|
|
45
|
+
if output_basename == "prior_versions_index":
|
|
46
|
+
# For prior_versions_index, save schema without descriptions
|
|
47
|
+
manager.save_schema_to_json(schema, output_basename, None, output_dir)
|
|
48
|
+
else:
|
|
49
|
+
manager.save_schema_to_json(schema, output_basename, sql_query, output_dir)
|
|
50
|
+
manager.save_sql_query(sql_query, output_basename, output_dir)
|
|
41
51
|
|
|
42
52
|
|
|
43
53
|
if __name__ == "__main__":
|