idc-index-data 22.1.4__tar.gz → 23.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.pre-commit-config.yaml +3 -3
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/PKG-INFO +1 -1
- idc_index_data-23.0.1/assets/clinical_index.sql +30 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/assets/sm_index.sql +10 -2
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/assets/sm_instance_index.sql +9 -2
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/pyproject.toml +1 -1
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/python/idc_index_data_manager.py +52 -4
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/sql/analysis_results_index.sql +5 -1
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/sql/collections_index.sql +5 -1
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/sql/idc_index.sql +7 -3
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/sql/prior_versions_index.sql +1 -1
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/tests/test_package.py +1 -1
- idc_index_data-22.1.4/assets/clinical_index.sql +0 -23
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.git_archival.txt +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.gitattributes +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/CONTRIBUTING.md +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/copilot-instructions.md +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/dependabot.yml +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/matchers/pylint.json +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/workflows/cd.yml +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/workflows/ci.yml +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/workflows/external-indices.yml +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.gitignore +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.readthedocs.yaml +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/CMakeLists.txt +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/LICENSE +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/README.md +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/assets/README.md +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/docs/conf.py +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/docs/index.md +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/noxfile.py +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/pytest.ini +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/python/generate-indices.py +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/python/update_idc_index_version.py +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/src/idc_index_data/__init__.py +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/src/idc_index_data/_version.pyi +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/src/idc_index_data/py.typed +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/tests/test_column_description_parser.py +0 -0
- {idc_index_data-22.1.4 → idc_index_data-23.0.1}/tests/test_real_sql_parsing.py +0 -0
|
@@ -40,7 +40,7 @@ repos:
|
|
|
40
40
|
args: [--prose-wrap=always]
|
|
41
41
|
|
|
42
42
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
43
|
-
rev: "v0.14.
|
|
43
|
+
rev: "v0.14.5"
|
|
44
44
|
hooks:
|
|
45
45
|
- id: ruff-check
|
|
46
46
|
args: ["--fix", "--show-fixes"]
|
|
@@ -76,12 +76,12 @@ repos:
|
|
|
76
76
|
exclude: .pre-commit-config.yaml
|
|
77
77
|
|
|
78
78
|
- repo: https://github.com/henryiii/validate-pyproject-schema-store
|
|
79
|
-
rev: "2025.11.
|
|
79
|
+
rev: "2025.11.14"
|
|
80
80
|
hooks:
|
|
81
81
|
- id: validate-pyproject
|
|
82
82
|
|
|
83
83
|
- repo: https://github.com/python-jsonschema/check-jsonschema
|
|
84
|
-
rev: "0.
|
|
84
|
+
rev: "0.35.0"
|
|
85
85
|
hooks:
|
|
86
86
|
- id: check-dependabot
|
|
87
87
|
- id: check-github-workflows
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: idc-index-data
|
|
3
|
-
Version: 22.1.4
|
|
3
|
+
Version: 23.0.1
|
|
4
4
|
Summary: ImagingDataCommons index to query and download data.
|
|
5
5
|
Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
|
|
6
6
|
License: Copyright 2024 Andrey Fedorov
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# table-description:
|
|
2
|
+
# This table contains metadata about the tabular data, including clinical data, that accompanies images and
|
|
3
|
+
# is available in IDC. Think of this table as a dictionary containing information about the columns
|
|
4
|
+
# for all of the tabular data accompanying individual collections in IDC. Each row corresponds to a unique
|
|
5
|
+
# combination of collection, clinical data table that is available for that collection, and a column from that
|
|
6
|
+
# table. Individual tables referenced from this table can be retrieved using idc-index `get_clinical_table()`
|
|
7
|
+
# function.
|
|
8
|
+
SELECT
|
|
9
|
+
# description:
|
|
10
|
+
# unique identifier of the collection
|
|
11
|
+
collection_id,
|
|
12
|
+
# description:
|
|
13
|
+
# full name of the table in which the column is stored
|
|
14
|
+
table_name,
|
|
15
|
+
# description:
|
|
16
|
+
# short name of the table in which the column is stored
|
|
17
|
+
SPLIT(table_name,'.')[SAFE_OFFSET(2)] AS short_table_name,
|
|
18
|
+
# description:
|
|
19
|
+
# name of the column in which the value is stored
|
|
20
|
+
`column`,
|
|
21
|
+
# description:
|
|
22
|
+
# human readable name of the column
|
|
23
|
+
column_label,
|
|
24
|
+
# description:
|
|
25
|
+
# values encountered in the column
|
|
26
|
+
`values`
|
|
27
|
+
FROM
|
|
28
|
+
`bigquery-public-data.idc_v23_clinical.column_metadata`
|
|
29
|
+
ORDER BY
|
|
30
|
+
collection_id, table_name
|
|
@@ -9,6 +9,12 @@
|
|
|
9
9
|
-- WHERE
|
|
10
10
|
-- Modality = "SM"
|
|
11
11
|
|
|
12
|
+
# table-description:
|
|
13
|
+
# This table contains metadata about the slide microscopy (SM) series available in IDC. Each row
|
|
14
|
+
# corresponds to a DICOM series, and contains attributes specific to SM series, such as the pixel spacing at the maximum
|
|
15
|
+
# resolution layer, the power of the objective lens used to digitize the slide, and the anatomic location
|
|
16
|
+
# from where the imaged specimen was collected. This table can be joined with the main index table using the
|
|
17
|
+
# `SeriesInstanceUID` column.
|
|
12
18
|
WITH
|
|
13
19
|
temp_table AS (
|
|
14
20
|
SELECT
|
|
@@ -31,7 +37,7 @@ WITH
|
|
|
31
37
|
|
|
32
38
|
|
|
33
39
|
FROM
|
|
34
|
-
`bigquery-public-data.
|
|
40
|
+
`bigquery-public-data.idc_v23.dicom_all` AS dicom_all
|
|
35
41
|
GROUP BY
|
|
36
42
|
SeriesInstanceUID
|
|
37
43
|
),
|
|
@@ -45,7 +51,7 @@ SpecimenPreparationSequence_unnested AS (
|
|
|
45
51
|
concept_code_sequence.CodeMeaning AS ccs_cm,
|
|
46
52
|
concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
|
|
47
53
|
concept_code_sequence.CodeValue AS ccs_val,
|
|
48
|
-
FROM `bigquery-public-data.
|
|
54
|
+
FROM `bigquery-public-data.idc_v23.dicom_all`,
|
|
49
55
|
UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) as preparation_unnest_step1,
|
|
50
56
|
UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) as preparation_unnest_step2,
|
|
51
57
|
UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) as concept_name_code_sequence,
|
|
@@ -80,6 +86,8 @@ SpecimenPreparationSequence_unnested AS (
|
|
|
80
86
|
)
|
|
81
87
|
|
|
82
88
|
SELECT
|
|
89
|
+
# description:
|
|
90
|
+
# DICOM SeriesInstanceUID identifier of the series
|
|
83
91
|
temp_table.SeriesInstanceUID,
|
|
84
92
|
-- Embedding Medium
|
|
85
93
|
# description:
|
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
# table-description:
|
|
2
|
+
# This table contains metadata about the slide microscopy (SM) instances available in IDC. Each row
|
|
3
|
+
# corresponds to an instance from a DICOM Slide Microscopy series available from IDC, identified by
|
|
4
|
+
# `SOPInstanceUID`, and contains attributes specific to SM series, such as the pixel spacing at the maximum
|
|
5
|
+
# resolution layer, the power of the objective lens used to digitize the slide, and the anatomic location
|
|
6
|
+
# from where the imaged specimen was collected. This table can be joined with the main index table
|
|
7
|
+
# and/or with `sm_index` using the `SeriesInstanceUID` column.
|
|
1
8
|
WITH
|
|
2
9
|
SpecimenPreparationSequence_unnested AS (
|
|
3
10
|
SELECT
|
|
@@ -9,7 +16,7 @@ WITH
|
|
|
9
16
|
concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
|
|
10
17
|
concept_code_sequence.CodeValue AS ccs_val,
|
|
11
18
|
FROM
|
|
12
|
-
`bigquery-public-data.
|
|
19
|
+
`bigquery-public-data.idc_v23.dicom_all`,
|
|
13
20
|
UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) AS preparation_unnest_step1,
|
|
14
21
|
UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) AS preparation_unnest_step2,
|
|
15
22
|
UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) AS concept_name_code_sequence,
|
|
@@ -137,7 +144,7 @@ SELECT
|
|
|
137
144
|
# unique identifier of the instance within the IDC
|
|
138
145
|
dicom_all.crdc_instance_uuid AS crdc_instance_uuid
|
|
139
146
|
FROM
|
|
140
|
-
`bigquery-public-data.
|
|
147
|
+
`bigquery-public-data.idc_v23.dicom_all` AS dicom_all
|
|
141
148
|
LEFT JOIN
|
|
142
149
|
slide_embedding
|
|
143
150
|
ON
|
|
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
|
|
|
13
13
|
|
|
14
14
|
[project]
|
|
15
15
|
name = "idc-index-data"
|
|
16
|
-
version = "22.1.4"
|
|
16
|
+
version = "23.0.1"
|
|
17
17
|
authors = [
|
|
18
18
|
{ name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
|
|
19
19
|
{ name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
|
|
@@ -22,6 +22,51 @@ class IDCIndexDataManager:
|
|
|
22
22
|
self.client = bigquery.Client(project=project_id)
|
|
23
23
|
logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)
|
|
24
24
|
|
|
25
|
+
@staticmethod
|
|
26
|
+
def parse_table_description(sql_query: str) -> str:
|
|
27
|
+
"""
|
|
28
|
+
Parses the table description from SQL query comments.
|
|
29
|
+
|
|
30
|
+
The method looks for comments following the pattern:
|
|
31
|
+
# table-description:
|
|
32
|
+
# description text continues here
|
|
33
|
+
# and can span multiple lines
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
sql_query: The SQL query string containing comments
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
The table description as a string
|
|
40
|
+
"""
|
|
41
|
+
description_lines = []
|
|
42
|
+
logger.debug("Parsing table description from SQL query comments")
|
|
43
|
+
logger.debug(sql_query)
|
|
44
|
+
lines = sql_query.split("\n")
|
|
45
|
+
|
|
46
|
+
for i, line in enumerate(lines):
|
|
47
|
+
stripped = line.strip()
|
|
48
|
+
if stripped == "# table-description:":
|
|
49
|
+
# Collect description lines until we hit a non-comment line
|
|
50
|
+
j = i + 1
|
|
51
|
+
while j < len(lines):
|
|
52
|
+
next_line = lines[j]
|
|
53
|
+
next_stripped = next_line.strip()
|
|
54
|
+
if next_stripped.startswith("#") and next_stripped != "#":
|
|
55
|
+
# Remove the leading # and whitespace
|
|
56
|
+
desc_text = next_stripped[1:].strip()
|
|
57
|
+
if desc_text:
|
|
58
|
+
description_lines.append(desc_text)
|
|
59
|
+
j += 1
|
|
60
|
+
elif next_stripped.startswith("#"):
|
|
61
|
+
# Empty comment line, skip
|
|
62
|
+
j += 1
|
|
63
|
+
else:
|
|
64
|
+
# Non-comment line, stop collecting
|
|
65
|
+
break
|
|
66
|
+
break
|
|
67
|
+
|
|
68
|
+
return " ".join(description_lines)
|
|
69
|
+
|
|
25
70
|
@staticmethod
|
|
26
71
|
def parse_column_descriptions(sql_query: str) -> dict[str, str]:
|
|
27
72
|
"""
|
|
@@ -232,11 +277,14 @@ class IDCIndexDataManager:
|
|
|
232
277
|
logger.debug("Parsing column descriptions from SQL query comments")
|
|
233
278
|
logger.debug(sql_query)
|
|
234
279
|
if sql_query is not None:
|
|
280
|
+
table_description = self.parse_table_description(sql_query)
|
|
281
|
+
logger.debug("Parsed table description: %s", table_description)
|
|
235
282
|
descriptions = self.parse_column_descriptions(sql_query)
|
|
236
283
|
|
|
237
284
|
# Convert BigQuery schema to JSON-serializable format
|
|
238
285
|
schema_dict = {
|
|
239
|
-
"
|
|
286
|
+
"table_description": table_description,
|
|
287
|
+
"columns": [
|
|
240
288
|
{
|
|
241
289
|
"name": field.name,
|
|
242
290
|
"type": field.field_type,
|
|
@@ -244,12 +292,12 @@ class IDCIndexDataManager:
|
|
|
244
292
|
"description": descriptions.get(field.name, ""),
|
|
245
293
|
}
|
|
246
294
|
for field in schema
|
|
247
|
-
]
|
|
295
|
+
],
|
|
248
296
|
}
|
|
249
297
|
else:
|
|
250
298
|
# If no SQL query provided, save schema without descriptions
|
|
251
299
|
schema_dict = {
|
|
252
|
-
"
|
|
300
|
+
"columns": [
|
|
253
301
|
{
|
|
254
302
|
"name": field.name,
|
|
255
303
|
"type": field.field_type,
|
|
@@ -375,7 +423,7 @@ class IDCIndexDataManager:
|
|
|
375
423
|
SELECT
|
|
376
424
|
MAX(idc_version) AS latest_idc_release_version
|
|
377
425
|
FROM
|
|
378
|
-
`bigquery-public-data.
|
|
426
|
+
`bigquery-public-data.idc_v23.version_metadata`
|
|
379
427
|
"""
|
|
380
428
|
query_job = self.client.query(query)
|
|
381
429
|
result = query_job.result()
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
# table-description:
|
|
2
|
+
# This table contains metadata about the analysis results collections available in IDC. Each row corresponds to an
|
|
3
|
+
# analysis results collection, and contains attributes such as the collection name, types of cancer represented,
|
|
4
|
+
# number of subjects, and pointers to the resources to learn more about the content of the collection.
|
|
1
5
|
SELECT
|
|
2
6
|
# description:
|
|
3
7
|
# unique identifier of the analysis results collection
|
|
@@ -39,4 +43,4 @@ SELECT
|
|
|
39
43
|
# citation for the analysis results collection that should be used for acknowledgment
|
|
40
44
|
Citation
|
|
41
45
|
FROM
|
|
42
|
-
`bigquery-public-data.
|
|
46
|
+
`bigquery-public-data.idc_v23.analysis_results_metadata`
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
# table-description:
|
|
2
|
+
# This table contains metadata about the collections available in IDC. Each row corresponds to a collection,
|
|
3
|
+
# and contains attributes such as the collection name, types of cancer represented, number of subjects,
|
|
4
|
+
# and pointers to the resources to learn more about the content of the collection.
|
|
1
5
|
SELECT
|
|
2
6
|
# description:
|
|
3
7
|
# name of the collection
|
|
@@ -36,4 +40,4 @@ SELECT
|
|
|
36
40
|
# detailed information about the collection
|
|
37
41
|
Description
|
|
38
42
|
FROM
|
|
39
|
-
`bigquery-public-data.
|
|
43
|
+
`bigquery-public-data.idc_v23.original_collections_metadata`
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
# table-description:
|
|
2
|
+
# This is the main metadata table provided by idc-index. Each row corresponds to a DICOM series, and contains
|
|
3
|
+
# attributes at the collection, patient, study, and series levels. The table also contains download-related
|
|
4
|
+
# attributes, such as the AWS S3 bucket and URL to download the series.
|
|
1
5
|
SELECT
|
|
2
6
|
# collection level attributes
|
|
3
7
|
# description:
|
|
@@ -22,7 +26,7 @@ SELECT
|
|
|
22
26
|
# series; follow this DOI to learn more about the activity that produced
|
|
23
27
|
# this series
|
|
24
28
|
ANY_VALUE(source_DOI) AS source_DOI,
|
|
25
|
-
# patient level attributes
|
|
29
|
+
# patient level attributes:
|
|
26
30
|
# description:
|
|
27
31
|
# age of the subject at the time of imaging (DICOM attribute)
|
|
28
32
|
ANY_VALUE(PatientAge) AS PatientAge,
|
|
@@ -79,9 +83,9 @@ SELECT
|
|
|
79
83
|
# total size of the series in megabytes
|
|
80
84
|
ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
|
|
81
85
|
FROM
|
|
82
|
-
`bigquery-public-data.
|
|
86
|
+
`bigquery-public-data.idc_v23.dicom_all` AS dicom_all
|
|
83
87
|
JOIN
|
|
84
|
-
`bigquery-public-data.
|
|
88
|
+
`bigquery-public-data.idc_v23.dicom_metadata_curated` AS dicom_curated
|
|
85
89
|
ON
|
|
86
90
|
dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
|
|
87
91
|
GROUP BY
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
SELECT
|
|
2
|
-
# description:
|
|
3
|
-
# unique identifier of the collection
|
|
4
|
-
collection_id,
|
|
5
|
-
# description:
|
|
6
|
-
# full name of the table in which the column is stored
|
|
7
|
-
table_name,
|
|
8
|
-
# description:
|
|
9
|
-
# short name of the table in which the column is stored
|
|
10
|
-
SPLIT(table_name,'.')[SAFE_OFFSET(2)] AS short_table_name,
|
|
11
|
-
# description:
|
|
12
|
-
# name of the column in which the value is stored
|
|
13
|
-
`column`,
|
|
14
|
-
# description:
|
|
15
|
-
# human readable name of the column
|
|
16
|
-
column_label,
|
|
17
|
-
# description:
|
|
18
|
-
# values encountered in the column
|
|
19
|
-
`values`
|
|
20
|
-
FROM
|
|
21
|
-
`bigquery-public-data.idc_v22_clinical.column_metadata`
|
|
22
|
-
ORDER BY
|
|
23
|
-
collection_id, table_name
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|