idc-index-data 22.1.3__tar.gz → 22.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.pre-commit-config.yaml +3 -3
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/PKG-INFO +1 -1
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/assets/clinical_index.sql +7 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/assets/sm_index.sql +6 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/assets/sm_instance_index.sql +15 -8
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/pyproject.toml +1 -1
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/scripts/python/idc_index_data_manager.py +60 -6
- idc_index_data-22.1.5/scripts/sql/analysis_results_index.sql +46 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/scripts/sql/collections_index.sql +4 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/tests/test_real_sql_parsing.py +0 -12
- idc_index_data-22.1.3/scripts/sql/analysis_results_index.sql +0 -16
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.git_archival.txt +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.gitattributes +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.github/CONTRIBUTING.md +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.github/copilot-instructions.md +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.github/dependabot.yml +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.github/matchers/pylint.json +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.github/workflows/cd.yml +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.github/workflows/ci.yml +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.github/workflows/external-indices.yml +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.gitignore +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/.readthedocs.yaml +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/CMakeLists.txt +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/LICENSE +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/README.md +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/assets/README.md +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/docs/conf.py +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/docs/index.md +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/noxfile.py +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/pytest.ini +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/scripts/python/generate-indices.py +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/scripts/python/update_idc_index_version.py +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/scripts/sql/idc_index.sql +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/scripts/sql/prior_versions_index.sql +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/src/idc_index_data/__init__.py +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/src/idc_index_data/_version.pyi +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/src/idc_index_data/py.typed +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/tests/test_column_description_parser.py +0 -0
- {idc_index_data-22.1.3 → idc_index_data-22.1.5}/tests/test_package.py +0 -0
|
@@ -40,7 +40,7 @@ repos:
|
|
|
40
40
|
args: [--prose-wrap=always]
|
|
41
41
|
|
|
42
42
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
43
|
-
rev: "v0.14.
|
|
43
|
+
rev: "v0.14.5"
|
|
44
44
|
hooks:
|
|
45
45
|
- id: ruff-check
|
|
46
46
|
args: ["--fix", "--show-fixes"]
|
|
@@ -76,12 +76,12 @@ repos:
|
|
|
76
76
|
exclude: .pre-commit-config.yaml
|
|
77
77
|
|
|
78
78
|
- repo: https://github.com/henryiii/validate-pyproject-schema-store
|
|
79
|
-
rev: "2025.11.
|
|
79
|
+
rev: "2025.11.14"
|
|
80
80
|
hooks:
|
|
81
81
|
- id: validate-pyproject
|
|
82
82
|
|
|
83
83
|
- repo: https://github.com/python-jsonschema/check-jsonschema
|
|
84
|
-
rev: "0.
|
|
84
|
+
rev: "0.35.0"
|
|
85
85
|
hooks:
|
|
86
86
|
- id: check-dependabot
|
|
87
87
|
- id: check-github-workflows
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: idc-index-data
|
|
3
|
-
Version: 22.1.3
|
|
3
|
+
Version: 22.1.5
|
|
4
4
|
Summary: ImagingDataCommons index to query and download data.
|
|
5
5
|
Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
|
|
6
6
|
License: Copyright 2024 Andrey Fedorov
|
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
# table-description:
|
|
2
|
+
# This table contains metadata about the tabular data, including clinical data, accompanying images that
|
|
3
|
+
# is available in IDC. Think about this table as a dictionary containing information about the columns
|
|
4
|
+
# for all of the tabular data accompanying individual collections in IDC. Each row corresponds to a unique
|
|
5
|
+
# combination of collection, clinical data table that is available for that collection, and a column from that
|
|
6
|
+
# table. Individual tables referenced from this table can be retrieved using idc-index `get_clinical_table()`
|
|
7
|
+
# function.
|
|
1
8
|
SELECT
|
|
2
9
|
# description:
|
|
3
10
|
# unique identifier of the collection
|
|
@@ -9,6 +9,12 @@
|
|
|
9
9
|
-- WHERE
|
|
10
10
|
-- Modality = "SM"
|
|
11
11
|
|
|
12
|
+
# table-description:
|
|
13
|
+
# This table contains metadata about the slide microscopy (SM) series available in IDC. Each row
|
|
14
|
+
# corresponds to a DICOM series, and contains attributes specific to SM series, such as the pixel spacing at the maximum
|
|
15
|
+
# resolution layer, the power of the objective lens used to digitize the slide, and the anatomic location
|
|
16
|
+
# from where the imaged specimen was collected. This table can be joined with the main index table using the
|
|
17
|
+
# `SeriesInstanceUID` column.
|
|
12
18
|
WITH
|
|
13
19
|
temp_table AS (
|
|
14
20
|
SELECT
|
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
# table-description:
|
|
2
|
+
# This table contains metadata about the instances of slide microscopy (SM) series available in IDC. Each row
|
|
3
|
+
# corresponds to an instance from a DICOM Slide Microscopy series available from IDC, identified by
|
|
4
|
+
# `SOPInstanceUID`, and contains attributes specific to SM series, such as the pixel spacing at the maximum
|
|
5
|
+
# resolution layer, the power of the objective lens used to digitize the slide, and the anatomic location
|
|
6
|
+
# from where the imaged specimen was collected. This table can be joined with the main index table
|
|
7
|
+
# and/or with `sm_index` using the `SeriesInstanceUID` column.
|
|
1
8
|
WITH
|
|
2
9
|
SpecimenPreparationSequence_unnested AS (
|
|
3
10
|
SELECT
|
|
@@ -50,10 +57,10 @@ WITH
|
|
|
50
57
|
SELECT
|
|
51
58
|
# description:
|
|
52
59
|
# unique identifier of the instance
|
|
53
|
-
dicom_all.SOPInstanceUID,
|
|
60
|
+
dicom_all.SOPInstanceUID AS SOPInstanceUID,
|
|
54
61
|
# description:
|
|
55
62
|
# unique identifier of the series
|
|
56
|
-
dicom_all.SeriesInstanceUID,
|
|
63
|
+
dicom_all.SeriesInstanceUID AS SeriesInstanceUID,
|
|
57
64
|
-- Embedding Medium
|
|
58
65
|
# description:
|
|
59
66
|
# embedding medium used for the slide preparation
|
|
@@ -119,23 +126,23 @@ SELECT
|
|
|
119
126
|
SAFE_CAST(SharedFunctionalGroupsSequence[SAFE_OFFSET(0)].PixelMeasuresSequence[SAFE_OFFSET(0)]. PixelSpacing[SAFE_OFFSET(0)] AS FLOAT64) AS PixelSpacing_0,
|
|
120
127
|
# description:
|
|
121
128
|
# DICOM ImageType attribute
|
|
122
|
-
dicom_all.ImageType,
|
|
129
|
+
dicom_all.ImageType AS ImageType,
|
|
123
130
|
# description:
|
|
124
131
|
# DICOM TransferSyntaxUID attribute
|
|
125
|
-
dicom_all.TransferSyntaxUID,
|
|
132
|
+
dicom_all.TransferSyntaxUID AS TransferSyntaxUID,
|
|
126
133
|
# description:
|
|
127
134
|
# size of the instance file in bytes
|
|
128
|
-
dicom_all.instance_size,
|
|
135
|
+
dicom_all.instance_size AS instance_size,
|
|
129
136
|
# description:
|
|
130
137
|
# number of columns in the image
|
|
131
|
-
dicom_all.TotalPixelMatrixColumns,
|
|
138
|
+
dicom_all.TotalPixelMatrixColumns AS TotalPixelMatrixColumns,
|
|
132
139
|
# description:
|
|
133
140
|
# number of rows in the image
|
|
134
|
-
dicom_all.TotalPixelMatrixRows,
|
|
141
|
+
dicom_all.TotalPixelMatrixRows AS TotalPixelMatrixRows,
|
|
135
142
|
-- attributes needed to retrieve the selected instances/files
|
|
136
143
|
# description:
|
|
137
144
|
# unique identifier of the instance within the IDC
|
|
138
|
-
dicom_all.crdc_instance_uuid
|
|
145
|
+
dicom_all.crdc_instance_uuid AS crdc_instance_uuid
|
|
139
146
|
FROM
|
|
140
147
|
`bigquery-public-data.idc_v22.dicom_all` AS dicom_all
|
|
141
148
|
LEFT JOIN
|
|
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
|
|
|
13
13
|
|
|
14
14
|
[project]
|
|
15
15
|
name = "idc-index-data"
|
|
16
|
-
version = "22.1.3"
|
|
16
|
+
version = "22.1.5"
|
|
17
17
|
authors = [
|
|
18
18
|
{ name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
|
|
19
19
|
{ name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
|
|
@@ -22,6 +22,51 @@ class IDCIndexDataManager:
|
|
|
22
22
|
self.client = bigquery.Client(project=project_id)
|
|
23
23
|
logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)
|
|
24
24
|
|
|
25
|
+
@staticmethod
|
|
26
|
+
def parse_table_description(sql_query: str) -> str:
|
|
27
|
+
"""
|
|
28
|
+
Parses the table description from SQL query comments.
|
|
29
|
+
|
|
30
|
+
The method looks for comments following the pattern:
|
|
31
|
+
# table-description:
|
|
32
|
+
# description text continues here
|
|
33
|
+
# and can span multiple lines
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
sql_query: The SQL query string containing comments
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
The table description as a string
|
|
40
|
+
"""
|
|
41
|
+
description_lines = []
|
|
42
|
+
logger.debug("Parsing table description from SQL query comments")
|
|
43
|
+
logger.debug(sql_query)
|
|
44
|
+
lines = sql_query.split("\n")
|
|
45
|
+
|
|
46
|
+
for i, line in enumerate(lines):
|
|
47
|
+
stripped = line.strip()
|
|
48
|
+
if stripped == "# table-description:":
|
|
49
|
+
# Collect description lines until we hit a non-comment line
|
|
50
|
+
j = i + 1
|
|
51
|
+
while j < len(lines):
|
|
52
|
+
next_line = lines[j]
|
|
53
|
+
next_stripped = next_line.strip()
|
|
54
|
+
if next_stripped.startswith("#") and next_stripped != "#":
|
|
55
|
+
# Remove the leading # and whitespace
|
|
56
|
+
desc_text = next_stripped[1:].strip()
|
|
57
|
+
if desc_text:
|
|
58
|
+
description_lines.append(desc_text)
|
|
59
|
+
j += 1
|
|
60
|
+
elif next_stripped.startswith("#"):
|
|
61
|
+
# Empty comment line, skip
|
|
62
|
+
j += 1
|
|
63
|
+
else:
|
|
64
|
+
# Non-comment line, stop collecting
|
|
65
|
+
break
|
|
66
|
+
break
|
|
67
|
+
|
|
68
|
+
return " ".join(description_lines)
|
|
69
|
+
|
|
25
70
|
@staticmethod
|
|
26
71
|
def parse_column_descriptions(sql_query: str) -> dict[str, str]:
|
|
27
72
|
"""
|
|
@@ -130,10 +175,16 @@ class IDCIndexDataManager:
|
|
|
130
175
|
logger.debug(
|
|
131
176
|
"Parsed description for column '%s': %s",
|
|
132
177
|
column_name,
|
|
133
|
-
description
|
|
134
|
-
if len(description) > 50
|
|
135
|
-
else description,
|
|
178
|
+
description,
|
|
136
179
|
)
|
|
180
|
+
# throw exception if description is empty
|
|
181
|
+
if not description:
|
|
182
|
+
raise ValueError(
|
|
183
|
+
"Description for column '"
|
|
184
|
+
+ column_name
|
|
185
|
+
+ "' is empty, and empty descriptions are not allowed."
|
|
186
|
+
)
|
|
187
|
+
|
|
137
188
|
else:
|
|
138
189
|
i += 1
|
|
139
190
|
else:
|
|
@@ -226,11 +277,14 @@ class IDCIndexDataManager:
|
|
|
226
277
|
logger.debug("Parsing column descriptions from SQL query comments")
|
|
227
278
|
logger.debug(sql_query)
|
|
228
279
|
if sql_query is not None:
|
|
280
|
+
table_description = self.parse_table_description(sql_query)
|
|
281
|
+
logger.debug("Parsed table description: %s", table_description)
|
|
229
282
|
descriptions = self.parse_column_descriptions(sql_query)
|
|
230
283
|
|
|
231
284
|
# Convert BigQuery schema to JSON-serializable format
|
|
232
285
|
schema_dict = {
|
|
233
|
-
"
|
|
286
|
+
"table_description": table_description,
|
|
287
|
+
"columns": [
|
|
234
288
|
{
|
|
235
289
|
"name": field.name,
|
|
236
290
|
"type": field.field_type,
|
|
@@ -238,12 +292,12 @@ class IDCIndexDataManager:
|
|
|
238
292
|
"description": descriptions.get(field.name, ""),
|
|
239
293
|
}
|
|
240
294
|
for field in schema
|
|
241
|
-
]
|
|
295
|
+
],
|
|
242
296
|
}
|
|
243
297
|
else:
|
|
244
298
|
# If no SQL query provided, save schema without descriptions
|
|
245
299
|
schema_dict = {
|
|
246
|
-
"
|
|
300
|
+
"columns": [
|
|
247
301
|
{
|
|
248
302
|
"name": field.name,
|
|
249
303
|
"type": field.field_type,
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# table-description:
|
|
2
|
+
# This table contains metadata about the analysis results collections available in IDC. Each row corresponds to an
|
|
3
|
+
# analysis results collection, and contains attributes such as the collection name, types of cancer represented,
|
|
4
|
+
# number of subjects, and pointers to the resources to learn more about the content of the collection.
|
|
5
|
+
SELECT
|
|
6
|
+
# description:
|
|
7
|
+
# unique identifier of the analysis results collection
|
|
8
|
+
ID AS analysis_result_id,
|
|
9
|
+
# description:
|
|
10
|
+
# name of the analysis results collection
|
|
11
|
+
Title AS analysis_result_title,
|
|
12
|
+
# description:
|
|
13
|
+
# Digital Object Identifier (DOI) of the analysis results collection
|
|
14
|
+
source_doi,
|
|
15
|
+
# description:
|
|
16
|
+
# URL for the location of additional information about the analysis results collection
|
|
17
|
+
source_url,
|
|
18
|
+
# description:
|
|
19
|
+
# number of subjects analyzed in the analysis results collection
|
|
20
|
+
Subjects,
|
|
21
|
+
# description:
|
|
22
|
+
# collections analyzed in the analysis results collection
|
|
23
|
+
Collections,
|
|
24
|
+
# description:
|
|
25
|
+
# analysis artifacts included in the analysis results collection
|
|
26
|
+
AnalysisArtifacts,
|
|
27
|
+
# description:
|
|
28
|
+
# timestamp of the last update to the analysis results collection
|
|
29
|
+
Updated,
|
|
30
|
+
# description:
|
|
31
|
+
# license URL for the analysis results collection
|
|
32
|
+
license_url,
|
|
33
|
+
# description:
|
|
34
|
+
# license name for the analysis results collection
|
|
35
|
+
license_long_name,
|
|
36
|
+
# description:
|
|
37
|
+
# short name for the license of the analysis results collection
|
|
38
|
+
license_short_name,
|
|
39
|
+
# description:
|
|
40
|
+
# detailed description of the analysis results collection
|
|
41
|
+
Description,
|
|
42
|
+
# description:
|
|
43
|
+
# citation for the analysis results collection that should be used for acknowledgment
|
|
44
|
+
Citation
|
|
45
|
+
FROM
|
|
46
|
+
`bigquery-public-data.idc_v22.analysis_results_metadata`
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
# table-description:
|
|
2
|
+
# This table contains metadata about the collections available in IDC. Each row corresponds to a collection,
|
|
3
|
+
# and contains attributes such as the collection name, types of cancer represented, number of subjects,
|
|
4
|
+
# and pointers to the resources to learn more about the content of the collection.
|
|
1
5
|
SELECT
|
|
2
6
|
# description:
|
|
3
7
|
# name of the collection
|
|
@@ -84,18 +84,6 @@ def test_real_sql_files() -> None:
|
|
|
84
84
|
else:
|
|
85
85
|
print(f"✗ Missing expected column: {col}")
|
|
86
86
|
|
|
87
|
-
# Test analysis_results_index.sql (should have no descriptions)
|
|
88
|
-
analysis_sql_path = sql_dir / "analysis_results_index.sql"
|
|
89
|
-
if analysis_sql_path.exists():
|
|
90
|
-
with analysis_sql_path.open("r") as f:
|
|
91
|
-
sql_query = f.read()
|
|
92
|
-
|
|
93
|
-
descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
|
|
94
|
-
print("\n=== analysis_results_index.sql ===")
|
|
95
|
-
print(f"Found {len(descriptions)} column descriptions (expected 0)")
|
|
96
|
-
assert len(descriptions) == 0, "Expected no descriptions in this file"
|
|
97
|
-
print("✓ Correctly found no descriptions")
|
|
98
|
-
|
|
99
87
|
|
|
100
88
|
if __name__ == "__main__":
|
|
101
89
|
test_real_sql_files()
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
SELECT
|
|
2
|
-
ID AS analysis_result_id,
|
|
3
|
-
Title AS analysis_result_title,
|
|
4
|
-
source_doi,
|
|
5
|
-
source_url,
|
|
6
|
-
Subjects,
|
|
7
|
-
Collections,
|
|
8
|
-
AnalysisArtifacts,
|
|
9
|
-
Updated,
|
|
10
|
-
license_url,
|
|
11
|
-
license_long_name,
|
|
12
|
-
license_short_name,
|
|
13
|
-
Description,
|
|
14
|
-
Citation
|
|
15
|
-
FROM
|
|
16
|
-
`bigquery-public-data.idc_v22.analysis_results_metadata`
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|