idc-index-data 22.1.0__tar.gz → 22.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idc_index_data-22.1.2/.github/copilot-instructions.md +170 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/workflows/external-indices.yml +1 -1
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.gitignore +3 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/PKG-INFO +2 -4
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/assets/sm_index.sql +36 -3
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/assets/sm_instance_index.sql +30 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/pyproject.toml +2 -4
- idc_index_data-22.1.2/scripts/python/generate-indices.py +50 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/scripts/python/idc_index_data_manager.py +72 -15
- idc_index_data-22.1.2/scripts/sql/collections_index.sql +39 -0
- idc_index_data-22.1.2/scripts/sql/idc_index.sql +88 -0
- idc_index_data-22.1.0/scripts/python/generate-indices.py +0 -38
- idc_index_data-22.1.0/scripts/sql/collections_index.sql +0 -15
- idc_index_data-22.1.0/scripts/sql/idc_index.sql +0 -38
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.git_archival.txt +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.gitattributes +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/CONTRIBUTING.md +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/dependabot.yml +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/matchers/pylint.json +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/workflows/cd.yml +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/workflows/ci.yml +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.pre-commit-config.yaml +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.readthedocs.yaml +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/CMakeLists.txt +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/LICENSE +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/README.md +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/assets/README.md +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/assets/clinical_index.sql +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/docs/conf.py +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/docs/index.md +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/noxfile.py +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/scripts/python/update_idc_index_version.py +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/scripts/sql/analysis_results_index.sql +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/scripts/sql/prior_versions_index.sql +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/src/idc_index_data/__init__.py +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/src/idc_index_data/_version.pyi +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/src/idc_index_data/py.typed +0 -0
- {idc_index_data-22.1.0 → idc_index_data-22.1.2}/tests/test_package.py +0 -0
**idc_index_data-22.1.2/.github/copilot-instructions.md** (new file)

````diff
@@ -0,0 +1,170 @@
+# GitHub Copilot Instructions for idc-index-data
+
+## Project Overview
+
+`idc-index-data` is a Python package that bundles the index data for the NCI
+Imaging Data Commons (IDC). The package provides Parquet files containing
+metadata about imaging data hosted by IDC, intended to be used by the
+`idc-index` Python package.
+
+## Technology Stack
+
+- **Build System**: scikit-build-core with CMake
+- **Package Manager**: pip
+- **Python Versions**: 3.10, 3.11, 3.12
+- **Testing**: pytest with pytest-cov
+- **Task Runner**: nox
+- **Linting**: ruff, pylint, mypy, pre-commit hooks
+- **Documentation**: Sphinx with MyST parser and Furo theme
+- **Data Processing**: pandas, pyarrow, Google Cloud BigQuery
+
+## Development Workflow
+
+### Setting Up Development Environment
+
+```bash
+python3 -m venv .venv
+source ./.venv/bin/activate
+pip install -v -e .[dev]
+pre-commit install
+```
+
+### Common Commands
+
+- **Run all checks**: `nox` (runs lint, pylint, and tests by default)
+- **Lint code**: `nox -s lint`
+- **Run pylint**: `nox -s pylint`
+- **Run tests**: `nox -s tests`
+- **Build docs**: `nox -s docs`
+- **Serve docs**: `nox -s docs -- --serve`
+- **Build package**: `nox -s build`
+- **Update IDC index version**: `nox -s bump -- <version>` (or leave off version
+  for latest)
+- **Tag release**: `nox -s tag_release` (shows instructions)
+
+### Pre-commit Checks
+
+Always run pre-commit before committing:
+
+```bash
+pre-commit run --all-files
+```
+
+## Code Style and Conventions
+
+### Python Code Style
+
+- **Import Statement**: All files must include
+  `from __future__ import annotations` at the top
+- **Type Hints**: Use type hints throughout; strict type checking is enabled for
+  `idc_index_data.*` modules
+- **Linting**: Follow ruff and pylint rules configured in `pyproject.toml`
+- **Formatting**: Code is formatted with ruff formatter
+- **Line Length**: Not strictly enforced but keep reasonable
+- **Docstrings**: Use when appropriate, especially for public APIs
+
+### Key Ruff Rules
+
+The project uses extensive ruff rules including:
+
+- `B` - flake8-bugbear
+- `I` - isort (import sorting)
+- `ARG` - flake8-unused-arguments
+- `UP` - pyupgrade
+- `PTH` - flake8-use-pathlib (prefer pathlib over os.path)
+- `NPY` - NumPy specific rules
+- `PD` - pandas-vet
+
+### Type Checking
+
+- Python 3.8 minimum target
+- Strict mypy checking for package code
+- Use `typing.TYPE_CHECKING` for import cycles
+
+## Project Structure
+
+```
+idc-index-data/
+├── src/idc_index_data/    # Main package source
+│   ├── __init__.py        # Package exports and file path lookups
+│   └── _version.py        # Auto-generated version file
+├── scripts/               # Management scripts
+│   ├── python/            # Python scripts for index management
+│   └── sql/               # SQL queries for BigQuery
+├── tests/                 # Test files
+│   └── test_package.py    # Package tests
+├── docs/                  # Sphinx documentation
+├── pyproject.toml         # Project configuration
+├── noxfile.py             # Nox session definitions
+└── CMakeLists.txt         # Build configuration
+```
+
+## Important Considerations
+
+### Package Purpose
+
+This package is a **data package** - it bundles index files (CSV and Parquet)
+and provides file paths to locate them. It does not contain complex business
+logic but rather serves as a data distribution mechanism.
+
+### Version Management
+
+- Version is defined in `pyproject.toml`
+- Use `nox -s bump` to update to new IDC index versions
+- The version should match the IDC release version
+- Always update both index files and test expectations when bumping version
+
+### Data Files
+
+The package includes:
+
+- `idc_index.csv.zip` - Compressed CSV index (optional)
+- `idc_index.parquet` - Parquet format index
+- `prior_versions_index.parquet` - Historical version index
+
+### Google Cloud Integration
+
+- Some operations require Google Cloud credentials
+- BigQuery is used to fetch latest index data
+- Scripts need `GCP_PROJECT` and `GOOGLE_APPLICATION_CREDENTIALS` environment
+  variables
+
+### Testing
+
+- Tests verify package installation and file accessibility
+- Coverage reporting is configured but codecov upload is currently disabled
+- Tests should work across platforms (Linux, macOS, Windows)
+
+## Release Process
+
+1. Update index version: `nox -s bump -- --commit <version>`
+2. Create PR: `gh pr create --fill`
+3. After merge, tag release: follow instructions from `nox -s tag_release`
+4. Push tag: `git push origin <version>`
+5. GitHub Actions will automatically build and publish to PyPI
+
+## CI/CD
+
+- **Format check**: pre-commit hooks + pylint
+- **Tests**: Run on Python 3.10 and 3.12 across Linux, macOS, and Windows
+- **Publishing**: Automated through GitHub Actions on tagged releases
+
+## Additional Resources
+
+- [Contributing Guide](.github/CONTRIBUTING.md)
+- [Scientific Python Developer Guide](https://learn.scientific-python.org/development/)
+- [IDC Homepage](https://imaging.datacommons.cancer.gov)
+- [IDC Discourse Forum](https://discourse.canceridc.dev/)
+
+## When Making Changes
+
+1. **Always** run tests before and after changes: `nox -s tests`
+2. **Always** run linters: `nox -s lint`
+3. **Never** commit without running pre-commit checks
+4. **Prefer** pathlib over os.path for file operations
+5. **Use** type hints for all new code
+6. **Update** tests if changing package structure or exports
+7. **Follow** existing patterns in the codebase
+8. **Keep** changes minimal and focused
+9. **Document** any new public APIs
+10. **Test** across Python versions when changing core functionality
````
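The instructions above describe `src/idc_index_data/__init__.py` as "package exports and file path lookups". A minimal consumer-side sketch of that role follows; the constant name is an assumption for illustration, since the package's actual exports are not shown in this diff.

```python
# Sketch only: IDC_INDEX_PARQUET_FILEPATH is an assumed export name, based on
# the "file path lookups" role described above; it does not appear in this diff.
import pandas as pd

from idc_index_data import IDC_INDEX_PARQUET_FILEPATH

# The path points at the bundled idc_index.parquet data file.
index_df = pd.read_parquet(IDC_INDEX_PARQUET_FILEPATH)
print(index_df.columns.tolist())
```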
**{idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/workflows/external-indices.yml**

```diff
@@ -55,6 +55,6 @@ jobs:
         if: github.event_name == 'release' && github.event.action == 'published'
         uses: ncipollo/release-action@v1
         with:
-          artifacts: "
+          artifacts: "release_artifacts/*.parquet,release_artifacts/*.json,release_artifacts/*.sql"
           allowUpdates: true
           omitBodyDuringUpdate: true
```
**{idc_index_data-22.1.0 → idc_index_data-22.1.2}/PKG-INFO**

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: idc-index-data
-Version: 22.1.0
+Version: 22.1.2
 Summary: ImagingDataCommons index to query and download data.
 Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
 License: Copyright 2024 Andrey Fedorov
@@ -31,8 +31,6 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
@@ -42,7 +40,7 @@ Project-URL: Homepage, https://github.com/ImagingDataCommons/idc-index-data
 Project-URL: Bug Tracker, https://github.com/ImagingDataCommons/idc-index-data/issues
 Project-URL: Discussions, https://discourse.canceridc.dev/
 Project-URL: Changelog, https://github.com/ImagingDataCommons/idc-index-data/releases
-Requires-Python: >=3.8
+Requires-Python: >=3.10
 Provides-Extra: test
 Requires-Dist: pandas; extra == "test"
 Requires-Dist: pyarrow; extra == "test"
```
**{idc_index_data-22.1.0 → idc_index_data-22.1.2}/assets/sm_index.sql**

```diff
@@ -82,10 +82,14 @@ SpecimenPreparationSequence_unnested AS (
 SELECT
   temp_table.SeriesInstanceUID,
   -- Embedding Medium
+  # description:
+  # embedding medium used for the slide preparation
   ARRAY(
     SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
     FROM UNNEST(embeddingMedium_code_str) AS code
   ) AS embeddingMedium_CodeMeaning,
+  # description:
+  # embedding medium code tuple
   ARRAY(
     SELECT IF(code IS NULL, NULL,
       IF(STRPOS(code, ':') = 0, NULL,
@@ -93,10 +97,14 @@ SELECT
     FROM UNNEST(embeddingMedium_code_str) AS code
   ) AS embeddingMedium_code_designator_value_str,
   -- Tissue Fixative
+  # description:
+  # tissue fixative used for the slide preparation
   ARRAY(
     SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
     FROM UNNEST(tissueFixative_code_str) AS code
   ) AS tissueFixative_CodeMeaning,
+  # description:
+  # tissue fixative code tuple
   ARRAY(
     SELECT IF(code IS NULL, NULL,
       IF(STRPOS(code, ':') = 0, NULL,
@@ -104,31 +112,56 @@ SELECT
     FROM UNNEST(tissueFixative_code_str) AS code
   ) AS tissueFixative_code_designator_value_str,
   -- Staining using substance
+  # description:
+  # staining substances used for the slide preparation
   ARRAY(
     SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
     FROM UNNEST(staining_usingSubstance_code_str) AS code
   ) AS staining_usingSubstance_CodeMeaning,
+  # description:
+  # staining using substance code tuple
   ARRAY(
     SELECT IF(code IS NULL, NULL,
       IF(STRPOS(code, ':') = 0, NULL,
         SUBSTR(code, STRPOS(code, ':') + 1)))
     FROM UNNEST(staining_usingSubstance_code_str) AS code
   ) AS staining_usingSubstance_code_designator_value_str,
-
+  # description:
+  # pixel spacing in mm at the maximum resolution layer, rounded to 2 significant figures
   if(COALESCE(min_spacing_0, fg_min_spacing_0) = 0, 0,
   round(COALESCE(min_spacing_0, fg_min_spacing_0) ,CAST(2 -1-floor(log10(abs(COALESCE(min_spacing_0, fg_min_spacing_0) ))) AS INT64))) AS min_PixelSpacing_2sf,
+  # description:
+  # width of the image at the maximum resolution
   COALESCE(max_TotalPixelMatrixColumns, max_Columns) AS max_TotalPixelMatrixColumns,
+  # description:
+  # height of the image at the maximum resolution
   COALESCE(max_TotalPixelMatrixRows, max_Rows) AS max_TotalPixelMatrixRows,
+  # description:
+  # power of the objective lens of the equipment used to digitize the slide
   SAFE_CAST(ObjectiveLensPower as INT) as ObjectiveLensPower,
+  # description:
+  # anatomic location from where the imaged specimen was collected
   CONCAT(SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructure_code_designator_value_str,
+  # description:
+  # code tuple for the anatomic location from where the imaged specimen was collected
   SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructure_CodeMeaning,
+  # description:
+  # additional characteristics of the specimen, such as whether it is a tumor or normal tissue (when available)
   CONCAT(SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructureModifier_code_designator_value_str,
+  # description:
+  # code tuple for additional characteristics of the specimen, such as whether it is a tumor or normal tissue (when available)
   SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructureModifier_CodeMeaning,
-
+  # description:
+  # illumination type used during slide digitization
   CONCAT(SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(1)]) as illuminationType_code_designator_value_str,
+  # description:
+  # code tuple for the illumination type used during slide digitization
   SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(2)] as illuminationType_CodeMeaning,
-
+  # description:
+  # admitting diagnosis associated with the specimen imaged on the slide (when available)
   CONCAT(SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(1)]) as admittingDiagnosis_code_designator_value_str,
+  # description:
+  # code tuple for the admitting diagnosis associated with the specimen imaged on the slide (when available)
   SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(2)] as admittingDiagnosis_CodeMeaning,
 FROM
   temp_table
```
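The new `min_PixelSpacing_2sf` column rounds the minimum pixel spacing to two significant figures by deriving the rounding digit from `log10`. A small Python mirror of that arithmetic (a sketch for clarity, not part of the package):

```python
import math

def round_sig(x: float, sig: int = 2) -> float:
    # Mirrors the SQL: round(x, CAST(sig - 1 - floor(log10(abs(x))) AS INT64)),
    # with zero passed through, as in the query's IF(... = 0, 0, ...) guard.
    if x == 0:
        return 0.0
    return round(x, sig - 1 - math.floor(math.log10(abs(x))))

# A 0.00025034 mm spacing rounds to 0.00025; 123.45 rounds to 120.0.
assert round_sig(0.00025034) == 0.00025
assert round_sig(123.45) == 120.0
```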
**{idc_index_data-22.1.0 → idc_index_data-22.1.2}/assets/sm_instance_index.sql**

```diff
@@ -48,15 +48,23 @@ WITH
   GROUP BY
     SOPInstanceUID )
 SELECT
+  # description:
+  # unique identifier of the instance
   dicom_all.SOPInstanceUID,
+  # description:
+  # unique identifier of the series
   dicom_all.SeriesInstanceUID,
   -- Embedding Medium
+  # description:
+  # embedding medium used for the slide preparation
   ARRAY(
   SELECT
     IF
     (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
   FROM
     UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_CodeMeaning,
+  # description:
+  # embedding medium code tuple
   ARRAY(
   SELECT
     IF
@@ -66,12 +74,16 @@ SELECT
   FROM
     UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_code_designator_value_str,
   -- Tissue Fixative
+  # description:
+  # tissue fixative used for the slide preparation
   ARRAY(
   SELECT
     IF
     (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
   FROM
     UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_CodeMeaning,
+  # description:
+  # tissue fixative code tuple
   ARRAY(
   SELECT
     IF
@@ -81,12 +93,16 @@ SELECT
   FROM
     UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_code_designator_value_str,
   -- Staining using substance
+  # description:
+  # staining substances used for the slide preparation
   ARRAY(
   SELECT
     IF
     (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
   FROM
     UNNEST(staining_usingSubstance_code_str) AS code ) AS staining_usingSubstance_CodeMeaning,
+  # description:
+  # staining using substance code tuple
   ARRAY(
   SELECT
     IF
@@ -98,13 +114,27 @@ SELECT
   -- instance-specific image attributes
   -- NB: there is a caveat that I think in general, we expect square pixels, but in htan_wustl and cptac_luad this assumption does not hold,
   -- and in htan_wustl, the difference is rather large (x2) - waiting to hear from David Clunie about this...
+  # description:
+  # pixel spacing in mm, rounded to 2 significant figures
   SAFE_CAST(SharedFunctionalGroupsSequence[SAFE_OFFSET(0)].PixelMeasuresSequence[SAFE_OFFSET(0)].PixelSpacing[SAFE_OFFSET(0)] AS FLOAT64) AS PixelSpacing_0,
+  # description:
+  # DICOM ImageType attribute
   dicom_all.ImageType,
+  # description:
+  # DICOM TransferSyntaxUID attribute
   dicom_all.TransferSyntaxUID,
+  # description:
+  # size of the instance file in bytes
   dicom_all.instance_size,
+  # description:
+  # number of columns in the image
   dicom_all.TotalPixelMatrixColumns,
+  # description:
+  # number of rows in the image
   dicom_all.TotalPixelMatrixRows,
   -- attributes needed to retrieve the selected instances/files
+  # description:
+  # unique identifier of the instance within the IDC
   dicom_all.crdc_instance_uuid
 FROM
   `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
```
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
|
|
|
13
13
|
|
|
14
14
|
[project]
|
|
15
15
|
name = "idc-index-data"
|
|
16
|
-
version = "22.1.
|
|
16
|
+
version = "22.1.2"
|
|
17
17
|
authors = [
|
|
18
18
|
{ name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
|
|
19
19
|
{ name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
|
|
@@ -22,7 +22,7 @@ authors = [
|
|
|
22
22
|
description = "ImagingDataCommons index to query and download data."
|
|
23
23
|
readme = "README.md"
|
|
24
24
|
license.file = "LICENSE"
|
|
25
|
-
requires-python = ">=3.
|
|
25
|
+
requires-python = ">=3.10"
|
|
26
26
|
classifiers = [
|
|
27
27
|
"Development Status :: 4 - Beta",
|
|
28
28
|
"Intended Audience :: Science/Research",
|
|
@@ -32,8 +32,6 @@ classifiers = [
|
|
|
32
32
|
"Programming Language :: Python",
|
|
33
33
|
"Programming Language :: Python :: 3",
|
|
34
34
|
"Programming Language :: Python :: 3 :: Only",
|
|
35
|
-
"Programming Language :: Python :: 3.8",
|
|
36
|
-
"Programming Language :: Python :: 3.9",
|
|
37
35
|
"Programming Language :: Python :: 3.10",
|
|
38
36
|
"Programming Language :: Python :: 3.11",
|
|
39
37
|
"Programming Language :: Python :: 3.12",
|
|
**idc_index_data-22.1.2/scripts/python/generate-indices.py** (new file)

```diff
@@ -0,0 +1,50 @@
+# new_script.py
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+from idc_index_data_manager import IDCIndexDataManager
+
+
+def main():
+    project_id = os.getenv("PROJECT_ID")
+    manager = IDCIndexDataManager(project_id=project_id)
+    scripts_dir = Path(__file__).resolve().parent.parent
+
+    # Create dedicated output directory for release artifacts
+    output_dir = scripts_dir.parent / "release_artifacts"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    assets_dir = scripts_dir.parent / "assets"
+
+    # Collecting all .sql files from sql_dir and assets_dir
+    sql_files = [f for f in Path.iterdir(assets_dir) if str(f).endswith(".sql")]
+
+    for file_name in sql_files:
+        file_path = assets_dir / file_name
+        index_df, output_basename, schema, sql_query = manager.execute_sql_query(
+            file_path
+        )
+        parquet_file_path = output_dir / f"{output_basename}.parquet"
+        index_df.to_parquet(parquet_file_path)
+        manager.save_schema_to_json(schema, output_basename, output_dir)
+        manager.save_sql_query(sql_query, output_basename, output_dir)
+
+    core_indices_dir = scripts_dir.parent / "scripts" / "sql"
+
+    sql_files = [f for f in Path.iterdir(core_indices_dir) if str(f).endswith(".sql")]
+
+    for file_name in sql_files:
+        file_path = core_indices_dir / file_name
+        index_df, output_basename, schema, sql_query = manager.execute_sql_query(
+            file_path
+        )
+        parquet_file_path = output_dir / f"{output_basename}.parquet"
+        index_df.to_parquet(parquet_file_path)
+        manager.save_schema_to_json(schema, output_basename, output_dir)
+        manager.save_sql_query(sql_query, output_basename, output_dir)
+
+
+if __name__ == "__main__":
+    main()
```
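The rewritten script now collects every artifact under `release_artifacts/` and leaves authentication to the environment. A hypothetical driver snippet (not part of the package): `PROJECT_ID` is read by the script itself via `os.getenv`, and `GOOGLE_APPLICATION_CREDENTIALS` is the standard google-cloud variable for service-account credentials.

```python
import os
import runpy

# Assumptions for illustration: your own GCP project ID and key file path.
os.environ["PROJECT_ID"] = "my-gcp-project"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json"

# Executes the script as if run with `python scripts/python/generate-indices.py`.
runpy.run_path("scripts/python/generate-indices.py", run_name="__main__")
```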
**{idc_index_data-22.1.0 → idc_index_data-22.1.2}/scripts/python/idc_index_data_manager.py**

```diff
@@ -40,10 +40,13 @@ class IDCIndexDataManager:
         index_df["StudyDate"] = index_df["StudyDate"].astype(str)
         output_basename = Path(file_path).name.split(".")[0]
         logger.debug("Executed SQL query from file: %s", file_path)
-        return index_df, output_basename, schema
+        return index_df, output_basename, schema, sql_query
 
     def save_schema_to_json(
-        self, schema: list[bigquery.SchemaField], output_basename: str
+        self,
+        schema: list[bigquery.SchemaField],
+        output_basename: str,
+        output_dir: Path | None = None,
     ) -> None:
         """
         Saves the BigQuery schema to a JSON file.
@@ -51,6 +54,7 @@ class IDCIndexDataManager:
         Args:
             schema: List of BigQuery SchemaField objects from the query result
             output_basename: The base name for the output file
+            output_dir: Optional directory path for the output file
         """
         # Convert BigQuery schema to JSON-serializable format
         schema_dict = {
@@ -65,13 +69,46 @@ class IDCIndexDataManager:
         }
 
         # Save to JSON file
-
-
+        if output_dir:
+            output_dir.mkdir(parents=True, exist_ok=True)
+            json_file_path = output_dir / f"{output_basename}.json"
+        else:
+            json_file_path = Path(f"{output_basename}.json")
+
+        with json_file_path.open("w") as f:
             json.dump(schema_dict, f, indent=2)
-        logger.debug("Created schema JSON file: %s",
+        logger.debug("Created schema JSON file: %s", json_file_path)
+
+    def save_sql_query(
+        self,
+        sql_query: str,
+        output_basename: str,
+        output_dir: Path | None = None,
+    ) -> None:
+        """
+        Saves the SQL query to a file.
+
+        Args:
+            sql_query: The SQL query string
+            output_basename: The base name for the output file
+            output_dir: Optional directory path for the output file
+        """
+
+        if output_dir:
+            output_dir.mkdir(parents=True, exist_ok=True)
+            query_file_path = output_dir / f"{output_basename}.sql"
+        else:
+            query_file_path = Path(f"{output_basename}.sql")
+
+        with query_file_path.open("w") as f:
+            f.write(sql_query)
+        logger.debug("Created SQL query file: %s", query_file_path)
 
     def generate_index_data_files(
-        self, generate_compressed_csv: bool = True, generate_parquet: bool = False
+        self,
+        generate_compressed_csv: bool = True,
+        generate_parquet: bool = False,
+        output_dir: Path | None = None,
     ) -> None:
         """
         Generates index-data files locally by executing queries against
@@ -80,32 +117,52 @@ class IDCIndexDataManager:
         This method iterates over SQL files in the 'scripts/sql' directory,
         executing each query using :func:`execute_sql_query` and generating a DataFrame,
         'index_df'. The DataFrame is then saved as compressed CSV and/or Parquet file.
+
+        Args:
+            generate_compressed_csv: Whether to generate compressed CSV files
+            generate_parquet: Whether to generate Parquet files
+            output_dir: Optional directory path for the output files
         """
 
         scripts_dir = Path(__file__).parent.parent
         sql_dir = scripts_dir / "sql"
 
+        if output_dir:
+            output_dir.mkdir(parents=True, exist_ok=True)
+
         for file_name in Path.iterdir(sql_dir):
             if str(file_name).endswith(".sql"):
                 file_path = Path(sql_dir) / file_name
-                index_df, output_basename, schema = self.execute_sql_query(file_path)
+                index_df, output_basename, schema, sql_query = self.execute_sql_query(
+                    file_path
+                )
                 logger.debug(
                     "Executed and processed SQL queries from file: %s", file_path
                 )
                 if generate_compressed_csv:
-
+                    csv_file_path = (
+                        output_dir / f"{output_basename}.csv.zip"
+                        if output_dir
+                        else Path(f"{output_basename}.csv.zip")
+                    )
                     index_df.to_csv(
-
+                        csv_file_path, compression={"method": "zip"}, escapechar="\\"
                     )
-                    logger.debug("Created CSV zip file: %s",
+                    logger.debug("Created CSV zip file: %s", csv_file_path)
 
                 if generate_parquet:
-
-
-
+                    parquet_file_path = (
+                        output_dir / f"{output_basename}.parquet"
+                        if output_dir
+                        else Path(f"{output_basename}.parquet")
+                    )
+                    index_df.to_parquet(parquet_file_path, compression="zstd")
+                    logger.debug("Created Parquet file: %s", parquet_file_path)
 
-
-
+                # Save schema to JSON file
+                self.save_schema_to_json(schema, output_basename, output_dir)
+                # Save SQL query to file
+                self.save_sql_query(sql_query, output_basename, output_dir)
 
     def retrieve_latest_idc_release_version(self) -> int:
         """
```
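Taken together, the manager changes thread an optional `output_dir` through every writer and archive the SQL text next to each index. A minimal sketch of driving the updated class directly, assuming BigQuery credentials are configured; the names come from the diff above:

```python
import os
from pathlib import Path

from idc_index_data_manager import IDCIndexDataManager

manager = IDCIndexDataManager(project_id=os.getenv("PROJECT_ID"))
manager.generate_index_data_files(
    generate_compressed_csv=True,          # default, per the new signature
    generate_parquet=True,                 # opt in; parquet is zstd-compressed
    output_dir=Path("release_artifacts"),  # created on demand when passed
)
# Besides .csv.zip/.parquet, each index now also gets <basename>.json (schema)
# and <basename>.sql (the exact query) via save_schema_to_json/save_sql_query.
```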
**idc_index_data-22.1.2/scripts/sql/collections_index.sql** (new file)

```diff
@@ -0,0 +1,39 @@
+SELECT
+  # description:
+  # name of the collection
+  collection_name,
+  # description:
+  # unique identifier of the collection
+  collection_id,
+  # description:
+  # types of cancer represented in the collection
+  CancerTypes,
+  # description:
+  # locations of tumors represented in the collection
+  TumorLocations,
+  # description:
+  # number of subjects in the collection
+  Subjects,
+  # description:
+  # species represented in the collection
+  Species,
+  # description:
+  # sources of data for the collection
+  Sources,
+  # description:
+  # additional data supporting the collection available in IDC
+  SupportingData,
+  # description:
+  # broader initiative/category under which this collection is being shared
+  Program,
+  # description:
+  # status of the collection (Completed or Ongoing)
+  Status,
+  # description:
+  # timestamp of the last update to the collection
+  Updated,
+  # description:
+  # detailed information about the collection
+  Description
+FROM
+  `bigquery-public-data.idc_v22.original_collections_metadata`
```
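Since the table referenced here is public, the query can also be exercised outside the release pipeline. A minimal sketch, assuming `google-cloud-bigquery` is installed, credentials are configured, and `my-gcp-project` is a placeholder for a billing-enabled project:

```python
from pathlib import Path

from google.cloud import bigquery

client = bigquery.Client(project="my-gcp-project")  # placeholder project ID
sql = Path("scripts/sql/collections_index.sql").read_text()

# to_dataframe() needs pyarrow or db-dtypes available in the environment.
collections_df = client.query(sql).to_dataframe()
print(collections_df[["collection_id", "Subjects", "Status"]].head())
```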
**idc_index_data-22.1.2/scripts/sql/idc_index.sql** (new file)

```diff
@@ -0,0 +1,88 @@
+SELECT
+  # collection level attributes
+  # description:
+  # short string with the identifier of the collection the series belongs to
+  ANY_VALUE(collection_id) AS collection_id,
+  # description:
+  # this string is not empty if the specific series is
+  # part of an analysis results collection; analysis results can be added to a
+  # given collection over time
+  ANY_VALUE(analysis_result_id) AS analysis_result_id,
+  # description:
+  # identifier of the patient within the collection (DICOM attribute)
+  ANY_VALUE(PatientID) AS PatientID,
+  # description:
+  # unique identifier of the DICOM series (DICOM attribute)
+  SeriesInstanceUID,
+  # description:
+  # unique identifier of the DICOM study (DICOM attribute)
+  ANY_VALUE(StudyInstanceUID) AS StudyInstanceUID,
+  # description:
+  # Digital Object Identifier of the dataset that contains the given
+  # series; follow this DOI to learn more about the activity that produced
+  # this series
+  ANY_VALUE(source_DOI) AS source_DOI,
+  # patient level attributes
+  # description:
+  # age of the subject at the time of imaging (DICOM attribute)
+  ANY_VALUE(PatientAge) AS PatientAge,
+  # description:
+  # subject sex (DICOM attribute)
+  ANY_VALUE(PatientSex) AS PatientSex,
+  # study level attributes
+  # description:
+  # date of the study (de-identified) (DICOM attribute)
+  ANY_VALUE(StudyDate) AS StudyDate,
+  # description:
+  # textual description of the study content (DICOM attribute)
+  ANY_VALUE(StudyDescription) AS StudyDescription,
+  # description:
+  # body part imaged (not applicable for SM series) (DICOM attribute)
+  ANY_VALUE(dicom_curated.BodyPartExamined) AS BodyPartExamined,
+  # series level attributes
+  # description:
+  # acquisition modality (DICOM attribute)
+  ANY_VALUE(Modality) AS Modality,
+  # description:
+  # manufacturer of the equipment that produced the series (DICOM attribute)
+  ANY_VALUE(Manufacturer) AS Manufacturer,
+  # description:
+  # model name of the equipment that produced the series (DICOM attribute)
+  ANY_VALUE(ManufacturerModelName) AS ManufacturerModelName,
+  # description:
+  # date of the series (de-identified) (DICOM attribute)
+  ANY_VALUE(SAFE_CAST(SeriesDate AS STRING)) AS SeriesDate,
+  # description:
+  # textual description of the series content (DICOM attribute)
+  ANY_VALUE(SeriesDescription) AS SeriesDescription,
+  # description:
+  # series number (DICOM attribute)
+  ANY_VALUE(SeriesNumber) AS SeriesNumber,
+  # description:
+  # number of instances in the series
+  COUNT(dicom_all.SOPInstanceUID) AS instanceCount,
+  # description:
+  # short name of the license that applies to this series
+  ANY_VALUE(license_short_name) as license_short_name,
+  # download related attributes
+  # description:
+  # name of the AWS S3 bucket that contains the series
+  ANY_VALUE(aws_bucket) AS aws_bucket,
+  # description:
+  # unique identifier of the series within the IDC
+  ANY_VALUE(crdc_series_uuid) AS crdc_series_uuid,
+  # series_aws_url will be phased out in favor of constructing URL from bucket+UUID
+  # description:
+  # public AWS S3 URL to download the series in bulk (each instance is a separate file)
+  ANY_VALUE(CONCAT(series_aws_url,"*")) AS series_aws_url,
+  # description:
+  # total size of the series in megabytes
+  ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
+FROM
+  `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
+JOIN
+  `bigquery-public-data.idc_v22.dicom_metadata_curated` AS dicom_curated
+ON
+  dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
+GROUP BY
+  SeriesInstanceUID
```
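Once generate-indices.py materializes this query, the result lands in `release_artifacts/idc_index.parquet` (the basename is derived from the SQL file name). A short consumption sketch using only columns defined in the SELECT list above:

```python
import pandas as pd

df = pd.read_parquet("release_artifacts/idc_index.parquet")

# One row per SeriesInstanceUID; series_size_MB was precomputed in SQL as
# ROUND(SUM(instance_size) / 1000000, 2).
per_collection = (
    df.groupby("collection_id")["series_size_MB"]
    .sum()
    .sort_values(ascending=False)
)
print(per_collection.head())
```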
**idc_index_data-22.1.0/scripts/python/generate-indices.py** (removed)

```diff
@@ -1,38 +0,0 @@
-# new_script.py
-from __future__ import annotations
-
-import os
-from pathlib import Path
-
-from idc_index_data_manager import IDCIndexDataManager
-
-
-def main():
-    project_id = os.getenv("PROJECT_ID")
-    manager = IDCIndexDataManager(project_id=project_id)
-    scripts_dir = Path(__file__).resolve().parent.parent
-
-    assets_dir = scripts_dir.parent / "assets"
-
-    # Collecting all .sql files from sql_dir and assets_dir
-    sql_files = [f for f in Path.iterdir(assets_dir) if str(f).endswith(".sql")]
-
-    for file_name in sql_files:
-        file_path = assets_dir / file_name
-        index_df, output_basename, schema = manager.execute_sql_query(file_path)
-        index_df.to_parquet(f"{output_basename}.parquet")
-        manager.save_schema_to_json(schema, output_basename)
-
-    core_indices_dir = scripts_dir.parent / "scripts" / "sql"
-
-    sql_files = [f for f in Path.iterdir(core_indices_dir) if str(f).endswith(".sql")]
-
-    for file_name in sql_files:
-        file_path = core_indices_dir / file_name
-        index_df, output_basename, schema = manager.execute_sql_query(file_path)
-        index_df.to_parquet(f"{output_basename}.parquet")
-        manager.save_schema_to_json(schema, output_basename)
-
-
-if __name__ == "__main__":
-    main()
```
**idc_index_data-22.1.0/scripts/sql/idc_index.sql** (removed)

```diff
@@ -1,38 +0,0 @@
-SELECT
-  # collection level attributes
-  ANY_VALUE(collection_id) AS collection_id,
-  ANY_VALUE(analysis_result_id) AS analysis_result_id,
-  ANY_VALUE(PatientID) AS PatientID,
-  SeriesInstanceUID,
-  ANY_VALUE(StudyInstanceUID) AS StudyInstanceUID,
-  ANY_VALUE(source_DOI) AS source_DOI,
-  # patient level attributes
-  ANY_VALUE(PatientAge) AS PatientAge,
-  ANY_VALUE(PatientSex) AS PatientSex,
-  # study level attributes
-  ANY_VALUE(StudyDate) AS StudyDate,
-  ANY_VALUE(StudyDescription) AS StudyDescription,
-  ANY_VALUE(dicom_curated.BodyPartExamined) AS BodyPartExamined,
-  # series level attributes
-  ANY_VALUE(Modality) AS Modality,
-  ANY_VALUE(Manufacturer) AS Manufacturer,
-  ANY_VALUE(ManufacturerModelName) AS ManufacturerModelName,
-  ANY_VALUE(SAFE_CAST(SeriesDate AS STRING)) AS SeriesDate,
-  ANY_VALUE(SeriesDescription) AS SeriesDescription,
-  ANY_VALUE(SeriesNumber) AS SeriesNumber,
-  COUNT(dicom_all.SOPInstanceUID) AS instanceCount,
-  ANY_VALUE(license_short_name) as license_short_name,
-  # download related attributes
-  ANY_VALUE(aws_bucket) AS aws_bucket,
-  ANY_VALUE(crdc_series_uuid) AS crdc_series_uuid,
-  # series_aws_url will be phased out in favor of constructing URL from bucket+UUID
-  ANY_VALUE(CONCAT(series_aws_url,"*")) AS series_aws_url,
-  ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
-FROM
-  `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
-JOIN
-  `bigquery-public-data.idc_v22.dicom_metadata_curated` AS dicom_curated
-ON
-  dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
-GROUP BY
-  SeriesInstanceUID
```