idc-index-data 22.1.0__tar.gz → 22.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. idc_index_data-22.1.2/.github/copilot-instructions.md +170 -0
  2. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/workflows/external-indices.yml +1 -1
  3. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.gitignore +3 -0
  4. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/PKG-INFO +2 -4
  5. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/assets/sm_index.sql +36 -3
  6. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/assets/sm_instance_index.sql +30 -0
  7. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/pyproject.toml +2 -4
  8. idc_index_data-22.1.2/scripts/python/generate-indices.py +50 -0
  9. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/scripts/python/idc_index_data_manager.py +72 -15
  10. idc_index_data-22.1.2/scripts/sql/collections_index.sql +39 -0
  11. idc_index_data-22.1.2/scripts/sql/idc_index.sql +88 -0
  12. idc_index_data-22.1.0/scripts/python/generate-indices.py +0 -38
  13. idc_index_data-22.1.0/scripts/sql/collections_index.sql +0 -15
  14. idc_index_data-22.1.0/scripts/sql/idc_index.sql +0 -38
  15. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.git_archival.txt +0 -0
  16. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.gitattributes +0 -0
  17. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/CONTRIBUTING.md +0 -0
  18. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/dependabot.yml +0 -0
  19. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/matchers/pylint.json +0 -0
  20. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/workflows/cd.yml +0 -0
  21. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.github/workflows/ci.yml +0 -0
  22. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.pre-commit-config.yaml +0 -0
  23. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/.readthedocs.yaml +0 -0
  24. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/CMakeLists.txt +0 -0
  25. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/LICENSE +0 -0
  26. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/README.md +0 -0
  27. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/assets/README.md +0 -0
  28. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/assets/clinical_index.sql +0 -0
  29. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/docs/conf.py +0 -0
  30. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/docs/index.md +0 -0
  31. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/noxfile.py +0 -0
  32. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/scripts/python/update_idc_index_version.py +0 -0
  33. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/scripts/sql/analysis_results_index.sql +0 -0
  34. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/scripts/sql/prior_versions_index.sql +0 -0
  35. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/src/idc_index_data/__init__.py +0 -0
  36. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/src/idc_index_data/_version.pyi +0 -0
  37. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/src/idc_index_data/py.typed +0 -0
  38. {idc_index_data-22.1.0 → idc_index_data-22.1.2}/tests/test_package.py +0 -0
.github/copilot-instructions.md (added)
@@ -0,0 +1,170 @@
+ # GitHub Copilot Instructions for idc-index-data
+
+ ## Project Overview
+
+ `idc-index-data` is a Python package that bundles the index data for the NCI
+ Imaging Data Commons (IDC). The package provides Parquet files containing
+ metadata about imaging data hosted by IDC, intended to be used by the
+ `idc-index` Python package.
+
+ ## Technology Stack
+
+ - **Build System**: scikit-build-core with CMake
+ - **Package Manager**: pip
+ - **Python Versions**: 3.10, 3.11, 3.12
+ - **Testing**: pytest with pytest-cov
+ - **Task Runner**: nox
+ - **Linting**: ruff, pylint, mypy, pre-commit hooks
+ - **Documentation**: Sphinx with MyST parser and Furo theme
+ - **Data Processing**: pandas, pyarrow, Google Cloud BigQuery
+
+ ## Development Workflow
+
+ ### Setting Up Development Environment
+
+ ```bash
+ python3 -m venv .venv
+ source ./.venv/bin/activate
+ pip install -v -e .[dev]
+ pre-commit install
+ ```
+
+ ### Common Commands
+
+ - **Run all checks**: `nox` (runs lint, pylint, and tests by default)
+ - **Lint code**: `nox -s lint`
+ - **Run pylint**: `nox -s pylint`
+ - **Run tests**: `nox -s tests`
+ - **Build docs**: `nox -s docs`
+ - **Serve docs**: `nox -s docs -- --serve`
+ - **Build package**: `nox -s build`
+ - **Update IDC index version**: `nox -s bump -- <version>` (or leave off version
+ for latest)
+ - **Tag release**: `nox -s tag_release` (shows instructions)
+
+ ### Pre-commit Checks
+
+ Always run pre-commit before committing:
+
+ ```bash
+ pre-commit run --all-files
+ ```
+
+ ## Code Style and Conventions
+
+ ### Python Code Style
+
+ - **Import Statement**: All files must include
+ `from __future__ import annotations` at the top
+ - **Type Hints**: Use type hints throughout; strict type checking is enabled for
+ `idc_index_data.*` modules
+ - **Linting**: Follow ruff and pylint rules configured in `pyproject.toml`
+ - **Formatting**: Code is formatted with ruff formatter
+ - **Line Length**: Not strictly enforced but keep reasonable
+ - **Docstrings**: Use when appropriate, especially for public APIs
+
+ ### Key Ruff Rules
+
+ The project uses extensive ruff rules including:
+
+ - `B` - flake8-bugbear
+ - `I` - isort (import sorting)
+ - `ARG` - flake8-unused-arguments
+ - `UP` - pyupgrade
+ - `PTH` - flake8-use-pathlib (prefer pathlib over os.path)
+ - `NPY` - NumPy specific rules
+ - `PD` - pandas-vet
+
+ ### Type Checking
+
+ - Python 3.8 minimum target
+ - Strict mypy checking for package code
+ - Use `typing.TYPE_CHECKING` for import cycles
+
+ ## Project Structure
+
+ ```
+ idc-index-data/
+ ├── src/idc_index_data/ # Main package source
+ │ ├── __init__.py # Package exports and file path lookups
+ │ └── _version.py # Auto-generated version file
+ ├── scripts/ # Management scripts
+ │ ├── python/ # Python scripts for index management
+ │ └── sql/ # SQL queries for BigQuery
+ ├── tests/ # Test files
+ │ └── test_package.py # Package tests
+ ├── docs/ # Sphinx documentation
+ ├── pyproject.toml # Project configuration
+ ├── noxfile.py # Nox session definitions
+ └── CMakeLists.txt # Build configuration
+ ```
+
+ ## Important Considerations
+
+ ### Package Purpose
+
+ This package is a **data package** - it bundles index files (CSV and Parquet)
+ and provides file paths to locate them. It does not contain complex business
+ logic but rather serves as a data distribution mechanism.
+
+ ### Version Management
+
+ - Version is defined in `pyproject.toml`
+ - Use `nox -s bump` to update to new IDC index versions
+ - The version should match the IDC release version
+ - Always update both index files and test expectations when bumping version
+
+ ### Data Files
+
+ The package includes:
+
+ - `idc_index.csv.zip` - Compressed CSV index (optional)
+ - `idc_index.parquet` - Parquet format index
+ - `prior_versions_index.parquet` - Historical version index
+
+ ### Google Cloud Integration
+
+ - Some operations require Google Cloud credentials
+ - BigQuery is used to fetch latest index data
+ - Scripts need `GCP_PROJECT` and `GOOGLE_APPLICATION_CREDENTIALS` environment
+ variables
+
+ ### Testing
+
+ - Tests verify package installation and file accessibility
+ - Coverage reporting is configured but codecov upload is currently disabled
+ - Tests should work across platforms (Linux, macOS, Windows)
+
+ ## Release Process
+
+ 1. Update index version: `nox -s bump -- --commit <version>`
+ 2. Create PR: `gh pr create --fill`
+ 3. After merge, tag release: follow instructions from `nox -s tag_release`
+ 4. Push tag: `git push origin <version>`
+ 5. GitHub Actions will automatically build and publish to PyPI
+
+ ## CI/CD
+
+ - **Format check**: pre-commit hooks + pylint
+ - **Tests**: Run on Python 3.10 and 3.12 across Linux, macOS, and Windows
+ - **Publishing**: Automated through GitHub Actions on tagged releases
+
+ ## Additional Resources
+
+ - [Contributing Guide](.github/CONTRIBUTING.md)
+ - [Scientific Python Developer Guide](https://learn.scientific-python.org/development/)
+ - [IDC Homepage](https://imaging.datacommons.cancer.gov)
+ - [IDC Discourse Forum](https://discourse.canceridc.dev/)
+
+ ## When Making Changes
+
+ 1. **Always** run tests before and after changes: `nox -s tests`
+ 2. **Always** run linters: `nox -s lint`
+ 3. **Never** commit without running pre-commit checks
+ 4. **Prefer** pathlib over os.path for file operations
+ 5. **Use** type hints for all new code
+ 6. **Update** tests if changing package structure or exports
+ 7. **Follow** existing patterns in the codebase
+ 8. **Keep** changes minimal and focused
+ 9. **Document** any new public APIs
+ 10. **Test** across Python versions when changing core functionality
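
The new Copilot instructions above describe `idc-index-data` as a thin data package whose job is to expose paths to the bundled index files. For orientation, here is a minimal consumer-side sketch; the constant name `IDC_INDEX_PARQUET_FILEPATH` is an assumption based on that description, since `src/idc_index_data/__init__.py` is not touched by this diff.

```python
# Hypothetical consumer sketch: load the bundled Parquet index with pandas.
# IDC_INDEX_PARQUET_FILEPATH is an assumed export; the actual names live in
# src/idc_index_data/__init__.py, which this release does not modify.
from __future__ import annotations

import pandas as pd

import idc_index_data


def load_idc_index() -> pd.DataFrame:
    """Read the series-level index shipped inside the installed package."""
    return pd.read_parquet(idc_index_data.IDC_INDEX_PARQUET_FILEPATH)


if __name__ == "__main__":
    print(load_idc_index().columns.tolist())
```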
.github/workflows/external-indices.yml
@@ -55,6 +55,6 @@ jobs:
  if: github.event_name == 'release' && github.event.action == 'published'
  uses: ncipollo/release-action@v1
  with:
- artifacts: "*.parquet,*.json"
+ artifacts: "release_artifacts/*.parquet,release_artifacts/*.json,release_artifacts/*.sql"
  allowUpdates: true
  omitBodyDuringUpdate: true
.gitignore
@@ -159,3 +159,6 @@ Thumbs.db

  # gcp service account keys
  gha-creds-**.json
+
+ # Release artifacts directory
+ release_artifacts/
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: idc-index-data
- Version: 22.1.0
+ Version: 22.1.2
  Summary: ImagingDataCommons index to query and download data.
  Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
  License: Copyright 2024 Andrey Fedorov
@@ -31,8 +31,6 @@ Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
@@ -42,7 +40,7 @@ Project-URL: Homepage, https://github.com/ImagingDataCommons/idc-index-data
  Project-URL: Bug Tracker, https://github.com/ImagingDataCommons/idc-index-data/issues
  Project-URL: Discussions, https://discourse.canceridc.dev/
  Project-URL: Changelog, https://github.com/ImagingDataCommons/idc-index-data/releases
- Requires-Python: >=3.8
+ Requires-Python: >=3.10
  Provides-Extra: test
  Requires-Dist: pandas; extra == "test"
  Requires-Dist: pyarrow; extra == "test"
assets/sm_index.sql
@@ -82,10 +82,14 @@ SpecimenPreparationSequence_unnested AS (
  SELECT
  temp_table.SeriesInstanceUID,
  -- Embedding Medium
+ # description:
+ # embedding medium used for the slide preparation
  ARRAY(
  SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
  FROM UNNEST(embeddingMedium_code_str) AS code
  ) AS embeddingMedium_CodeMeaning,
+ # description:
+ # embedding medium code tuple
  ARRAY(
  SELECT IF(code IS NULL, NULL,
  IF(STRPOS(code, ':') = 0, NULL,
@@ -93,10 +97,14 @@ SELECT
  FROM UNNEST(embeddingMedium_code_str) AS code
  ) AS embeddingMedium_code_designator_value_str,
  -- Tissue Fixative
+ # description:
+ # tissue fixative used for the slide preparation
  ARRAY(
  SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
  FROM UNNEST(tissueFixative_code_str) AS code
  ) AS tissueFixative_CodeMeaning,
+ # description:
+ # tissue fixative code tuple
  ARRAY(
  SELECT IF(code IS NULL, NULL,
  IF(STRPOS(code, ':') = 0, NULL,
@@ -104,31 +112,56 @@ SELECT
  FROM UNNEST(tissueFixative_code_str) AS code
  ) AS tissueFixative_code_designator_value_str,
  -- Staining using substance
+ # description:
+ # staining substances used for the slide preparation
  ARRAY(
  SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
  FROM UNNEST(staining_usingSubstance_code_str) AS code
  ) AS staining_usingSubstance_CodeMeaning,
+ # description:
+ # staining using substance code tuple
  ARRAY(
  SELECT IF(code IS NULL, NULL,
  IF(STRPOS(code, ':') = 0, NULL,
  SUBSTR(code, STRPOS(code, ':') + 1)))
  FROM UNNEST(staining_usingSubstance_code_str) AS code
  ) AS staining_usingSubstance_code_designator_value_str,
-
+ # description:
+ # pixel spacing in mm at the maximum resolution layer, rounded to 2 significant figures
  if(COALESCE(min_spacing_0, fg_min_spacing_0) = 0, 0,
  round(COALESCE(min_spacing_0, fg_min_spacing_0) ,CAST(2 -1-floor(log10(abs(COALESCE(min_spacing_0, fg_min_spacing_0) ))) AS INT64))) AS min_PixelSpacing_2sf,
+ # description:
+ # width of the image at the maximum resolution
  COALESCE(max_TotalPixelMatrixColumns, max_Columns) AS max_TotalPixelMatrixColumns,
+ # description:
+ # height of the image at the maximum resolution
  COALESCE(max_TotalPixelMatrixRows, max_Rows) AS max_TotalPixelMatrixRows,
+ # description:
+ # power of the objective lens of the equipment used to digitize the slide
  SAFE_CAST(ObjectiveLensPower as INT) as ObjectiveLensPower,
+ # description:
+ # anatomic location from where the imaged specimen was collected
  CONCAT(SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructure_code_designator_value_str,
+ # description:
+ # code tuple for the anatomic location from where the imaged specimen was collected
  SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructure_CodeMeaning,
+ # description:
+ # additional characteristics of the specimen, such as whether it is a tumor or normal tissue (when available)
  CONCAT(SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructureModifier_code_designator_value_str,
+ # description:
+ # code tuple for additional characteristics of the specimen, such as whether it is a tumor or normal tissue (when available)
  SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructureModifier_CodeMeaning,
-
+ # description:
+ # illumination type used during slide digitization
  CONCAT(SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(1)]) as illuminationType_code_designator_value_str,
+ # description:
+ # code tuple for the illumination type used during slide digitization
  SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(2)] as illuminationType_CodeMeaning,
-
+ # description:
+ # admitting diagnosis associated with the specimen imaged on the slide (when available)
  CONCAT(SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(1)]) as admittingDiagnosis_code_designator_value_str,
+ # description:
+ # code tuple for the admitting diagnosis associated with the specimen imaged on the slide (when available)
  SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(2)] as admittingDiagnosis_CodeMeaning,
  FROM
  temp_table
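
The `min_PixelSpacing_2sf` expression above rounds the smallest pixel spacing to two significant figures with `round(x, CAST(2 - 1 - floor(log10(abs(x))) AS INT64))`. A small Python sketch of the same arithmetic, purely to illustrate what the BigQuery expression computes:

```python
# Illustration of the two-significant-figure rounding used for min_PixelSpacing_2sf.
# round(x, 2 - 1 - floor(log10(abs(x)))) keeps two significant digits;
# a spacing of 0 is passed through unchanged, matching the SQL IF() guard.
import math


def round_2sf(x: float) -> float:
    if x == 0:
        return 0.0
    return round(x, int(2 - 1 - math.floor(math.log10(abs(x)))))


assert round_2sf(0.000246) == 0.00025
assert round_2sf(0.2524) == 0.25
```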
assets/sm_instance_index.sql
@@ -48,15 +48,23 @@ WITH
  GROUP BY
  SOPInstanceUID )
  SELECT
+ # description:
+ # unique identifier of the instance
  dicom_all.SOPInstanceUID,
+ # description:
+ # unique identifier of the series
  dicom_all.SeriesInstanceUID,
  -- Embedding Medium
+ # description:
+ # embedding medium used for the slide preparation
  ARRAY(
  SELECT
  IF
  (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
  FROM
  UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_CodeMeaning,
+ # description:
+ # embedding medium code tuple
  ARRAY(
  SELECT
  IF
@@ -66,12 +74,16 @@ SELECT
  FROM
  UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_code_designator_value_str,
  -- Tissue Fixative
+ # description:
+ # tissue fixative used for the slide preparation
  ARRAY(
  SELECT
  IF
  (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
  FROM
  UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_CodeMeaning,
+ # description:
+ # tissue fixative code tuple
  ARRAY(
  SELECT
  IF
@@ -81,12 +93,16 @@ SELECT
  FROM
  UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_code_designator_value_str,
  -- Staining using substance
+ # description:
+ # staining substances used for the slide preparation
  ARRAY(
  SELECT
  IF
  (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
  FROM
  UNNEST(staining_usingSubstance_code_str) AS code ) AS staining_usingSubstance_CodeMeaning,
+ # description:
+ # staining using substance code tuple
  ARRAY(
  SELECT
  IF
@@ -98,13 +114,27 @@ SELECT
  -- instance-specific image attributes
  -- NB: there is a caveat that I think in general, we expect square pixels, but in htan_wustl and cptac_luad this assumption does not hold,
  -- and in htan_wustl, the difference is rather large (x2) - waiting to hear from David Clunie about this...
+ # description:
+ # pixel spacing in mm, rounded to 2 significant figures
  SAFE_CAST(SharedFunctionalGroupsSequence[SAFE_OFFSET(0)].PixelMeasuresSequence[SAFE_OFFSET(0)]. PixelSpacing[SAFE_OFFSET(0)] AS FLOAT64) AS PixelSpacing_0,
+ # description:
+ # DICOM ImageType attribute
  dicom_all.ImageType,
+ # description:
+ # DICOM TransferSyntaxUID attribute
  dicom_all.TransferSyntaxUID,
+ # description:
+ # size of the instance file in bytes
  dicom_all.instance_size,
+ # description:
+ # number of columns in the image
  dicom_all.TotalPixelMatrixColumns,
+ # description:
+ # number of rows in the image
  dicom_all.TotalPixelMatrixRows,
  -- attributes needed to retrieve the selected instances/files
+ # description:
+ # unique identifier of the instance within the IDC
  dicom_all.crdc_instance_uuid
  FROM
  `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
pyproject.toml
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"

  [project]
  name = "idc-index-data"
- version = "22.1.0"
+ version = "22.1.2"
  authors = [
  { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
  { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
@@ -22,7 +22,7 @@ authors = [
  description = "ImagingDataCommons index to query and download data."
  readme = "README.md"
  license.file = "LICENSE"
- requires-python = ">=3.8"
+ requires-python = ">=3.10"
  classifiers = [
  "Development Status :: 4 - Beta",
  "Intended Audience :: Science/Research",
@@ -32,8 +32,6 @@ classifiers = [
  "Programming Language :: Python",
  "Programming Language :: Python :: 3",
  "Programming Language :: Python :: 3 :: Only",
- "Programming Language :: Python :: 3.8",
- "Programming Language :: Python :: 3.9",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
scripts/python/generate-indices.py (added)
@@ -0,0 +1,50 @@
+ # new_script.py
+ from __future__ import annotations
+
+ import os
+ from pathlib import Path
+
+ from idc_index_data_manager import IDCIndexDataManager
+
+
+ def main():
+     project_id = os.getenv("PROJECT_ID")
+     manager = IDCIndexDataManager(project_id=project_id)
+     scripts_dir = Path(__file__).resolve().parent.parent
+
+     # Create dedicated output directory for release artifacts
+     output_dir = scripts_dir.parent / "release_artifacts"
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     assets_dir = scripts_dir.parent / "assets"
+
+     # Collecting all .sql files from sql_dir and assets_dir
+     sql_files = [f for f in Path.iterdir(assets_dir) if str(f).endswith(".sql")]
+
+     for file_name in sql_files:
+         file_path = assets_dir / file_name
+         index_df, output_basename, schema, sql_query = manager.execute_sql_query(
+             file_path
+         )
+         parquet_file_path = output_dir / f"{output_basename}.parquet"
+         index_df.to_parquet(parquet_file_path)
+         manager.save_schema_to_json(schema, output_basename, output_dir)
+         manager.save_sql_query(sql_query, output_basename, output_dir)
+
+     core_indices_dir = scripts_dir.parent / "scripts" / "sql"
+
+     sql_files = [f for f in Path.iterdir(core_indices_dir) if str(f).endswith(".sql")]
+
+     for file_name in sql_files:
+         file_path = core_indices_dir / file_name
+         index_df, output_basename, schema, sql_query = manager.execute_sql_query(
+             file_path
+         )
+         parquet_file_path = output_dir / f"{output_basename}.parquet"
+         index_df.to_parquet(parquet_file_path)
+         manager.save_schema_to_json(schema, output_basename, output_dir)
+         manager.save_sql_query(sql_query, output_basename, output_dir)
+
+
+ if __name__ == "__main__":
+     main()
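
The new `generate-indices.py` runs every `.sql` file from `assets/` and `scripts/sql/` and writes the Parquet index, schema JSON, and a copy of the query into `release_artifacts/`. The two loops differ only in the directory they scan; a hedged refactor sketch (not code from the repository) showing how they could share a helper:

```python
# Hypothetical refactor sketch (not in the repository): the two per-directory
# loops in generate-indices.py differ only in the directory they scan, so they
# could share a helper like this one.
from __future__ import annotations

from pathlib import Path


def process_sql_dir(manager, sql_dir: Path, output_dir: Path) -> None:
    """Run every .sql file in sql_dir and write its artifacts into output_dir."""
    for file_path in sorted(sql_dir.glob("*.sql")):
        index_df, output_basename, schema, sql_query = manager.execute_sql_query(
            file_path
        )
        index_df.to_parquet(output_dir / f"{output_basename}.parquet")
        manager.save_schema_to_json(schema, output_basename, output_dir)
        manager.save_sql_query(sql_query, output_basename, output_dir)
```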
scripts/python/idc_index_data_manager.py
@@ -40,10 +40,13 @@ class IDCIndexDataManager:
          index_df["StudyDate"] = index_df["StudyDate"].astype(str)
          output_basename = Path(file_path).name.split(".")[0]
          logger.debug("Executed SQL query from file: %s", file_path)
-         return index_df, output_basename, schema
+         return index_df, output_basename, schema, sql_query

      def save_schema_to_json(
-         self, schema: list[bigquery.SchemaField], output_basename: str
+         self,
+         schema: list[bigquery.SchemaField],
+         output_basename: str,
+         output_dir: Path | None = None,
      ) -> None:
          """
          Saves the BigQuery schema to a JSON file.
@@ -51,6 +54,7 @@ class IDCIndexDataManager:
          Args:
              schema: List of BigQuery SchemaField objects from the query result
              output_basename: The base name for the output file
+             output_dir: Optional directory path for the output file
          """
          # Convert BigQuery schema to JSON-serializable format
          schema_dict = {
@@ -65,13 +69,46 @@ class IDCIndexDataManager:
          }

          # Save to JSON file
-         json_file_name = f"{output_basename}.json"
-         with Path(json_file_name).open("w") as f:
+         if output_dir:
+             output_dir.mkdir(parents=True, exist_ok=True)
+             json_file_path = output_dir / f"{output_basename}.json"
+         else:
+             json_file_path = Path(f"{output_basename}.json")
+
+         with json_file_path.open("w") as f:
              json.dump(schema_dict, f, indent=2)
-         logger.debug("Created schema JSON file: %s", json_file_name)
+         logger.debug("Created schema JSON file: %s", json_file_path)
+
+     def save_sql_query(
+         self,
+         sql_query: str,
+         output_basename: str,
+         output_dir: Path | None = None,
+     ) -> None:
+         """
+         Saves the SQL query to a file.
+
+         Args:
+             sql_query: The SQL query string
+             output_basename: The base name for the output file
+             output_dir: Optional directory path for the output file
+         """
+
+         if output_dir:
+             output_dir.mkdir(parents=True, exist_ok=True)
+             query_file_path = output_dir / f"{output_basename}.sql"
+         else:
+             query_file_path = Path(f"{output_basename}.sql")
+
+         with query_file_path.open("w") as f:
+             f.write(sql_query)
+         logger.debug("Created SQL query file: %s", query_file_path)

      def generate_index_data_files(
-         self, generate_compressed_csv: bool = True, generate_parquet: bool = False
+         self,
+         generate_compressed_csv: bool = True,
+         generate_parquet: bool = False,
+         output_dir: Path | None = None,
      ) -> None:
          """
          Generates index-data files locally by executing queries against
@@ -80,32 +117,52 @@ class IDCIndexDataManager:
          This method iterates over SQL files in the 'scripts/sql' directory,
          executing each query using :func:`execute_sql_query` and generating a DataFrame,
          'index_df'. The DataFrame is then saved as compressed CSV and/or Parquet file.
+
+         Args:
+             generate_compressed_csv: Whether to generate compressed CSV files
+             generate_parquet: Whether to generate Parquet files
+             output_dir: Optional directory path for the output files
          """

          scripts_dir = Path(__file__).parent.parent
          sql_dir = scripts_dir / "sql"

+         if output_dir:
+             output_dir.mkdir(parents=True, exist_ok=True)
+
          for file_name in Path.iterdir(sql_dir):
              if str(file_name).endswith(".sql"):
                  file_path = Path(sql_dir) / file_name
-                 index_df, output_basename, schema = self.execute_sql_query(file_path)
+                 index_df, output_basename, schema, sql_query = self.execute_sql_query(
+                     file_path
+                 )
                  logger.debug(
                      "Executed and processed SQL queries from file: %s", file_path
                  )
                  if generate_compressed_csv:
-                     csv_file_name = f"{output_basename}.csv.zip"
+                     csv_file_path = (
+                         output_dir / f"{output_basename}.csv.zip"
+                         if output_dir
+                         else Path(f"{output_basename}.csv.zip")
+                     )
                      index_df.to_csv(
-                         csv_file_name, compression={"method": "zip"}, escapechar="\\"
+                         csv_file_path, compression={"method": "zip"}, escapechar="\\"
                      )
-                     logger.debug("Created CSV zip file: %s", csv_file_name)
+                     logger.debug("Created CSV zip file: %s", csv_file_path)

                  if generate_parquet:
-                     parquet_file_name = f"{output_basename}.parquet"
-                     index_df.to_parquet(parquet_file_name, compression="zstd")
-                     logger.debug("Created Parquet file: %s", parquet_file_name)
+                     parquet_file_path = (
+                         output_dir / f"{output_basename}.parquet"
+                         if output_dir
+                         else Path(f"{output_basename}.parquet")
+                     )
+                     index_df.to_parquet(parquet_file_path, compression="zstd")
+                     logger.debug("Created Parquet file: %s", parquet_file_path)

-                 # Save schema to JSON file
-                 self.save_schema_to_json(schema, output_basename)
+                 # Save schema to JSON file
+                 self.save_schema_to_json(schema, output_basename, output_dir)
+                 # Save SQL query to file
+                 self.save_sql_query(sql_query, output_basename, output_dir)

      def retrieve_latest_idc_release_version(self) -> int:
          """
scripts/sql/collections_index.sql (added)
@@ -0,0 +1,39 @@
+ SELECT
+ # description:
+ # name of the collection
+ collection_name,
+ # description:
+ # unique identifier of the collection
+ collection_id,
+ # description:
+ # types of cancer represented in the collection
+ CancerTypes,
+ # description:
+ # locations of tumors represented in the collection
+ TumorLocations,
+ # description:
+ # number of subjects in the collection
+ Subjects,
+ # description:
+ # species represented in the collection
+ Species,
+ # description:
+ # sources of data for the collection
+ Sources,
+ # description:
+ # additional data supporting the collection available in IDC
+ SupportingData,
+ # description:
+ # broader initiative/category under which this collection is being shared
+ Program,
+ # description:
+ # status of the collection (Completed or Ongoing)
+ Status,
+ # description:
+ # timestamp of the last update to the collection
+ Updated,
+ # description:
+ # detailed information about the collection
+ Description
+ FROM
+ `bigquery-public-data.idc_v22.original_collections_metadata`
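
The rewritten index queries annotate each selected column with a `# description:` comment block placed immediately above it. This diff does not show the code that consumes these annotations, so the following is only a hedged sketch of how such a file could be parsed to pair each description with the column expression that follows it:

```python
# Hedged sketch (not from the repository): pair each "# description:" comment
# block in one of these SQL files with the column expression that follows it.
from __future__ import annotations

from pathlib import Path


def extract_descriptions(sql_path: Path) -> dict[str, str]:
    """Map each column line to the description block that precedes it."""
    descriptions: dict[str, str] = {}
    pending: list[str] = []
    collecting = False
    for line in sql_path.read_text().splitlines():
        stripped = line.strip()
        if stripped.startswith("# description:"):
            collecting, pending = True, []
        elif collecting and stripped.startswith("#"):
            pending.append(stripped.lstrip("#").strip())
        elif collecting and stripped:
            descriptions[stripped.rstrip(",")] = " ".join(pending)
            collecting = False
    return descriptions


# Example (assuming the file is available locally):
# extract_descriptions(Path("scripts/sql/collections_index.sql"))
# -> {"collection_name": "name of the collection", ...}
```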
scripts/sql/idc_index.sql (added)
@@ -0,0 +1,88 @@
+ SELECT
+ # collection level attributes
+ # description:
+ # short string with the identifier of the collection the series belongs to
+ ANY_VALUE(collection_id) AS collection_id,
+ # description:
+ # this string is not empty if the specific series is
+ # part of an analysis results collection; analysis results can be added to a
+ # given collection over time
+ ANY_VALUE(analysis_result_id) AS analysis_result_id,
+ # description:
+ # identifier of the patient within the collection (DICOM attribute)
+ ANY_VALUE(PatientID) AS PatientID,
+ # description:
+ # unique identifier of the DICOM series (DICOM attribute)
+ SeriesInstanceUID,
+ # description:
+ # unique identifier of the DICOM study (DICOM attribute)
+ ANY_VALUE(StudyInstanceUID) AS StudyInstanceUID,
+ # description:
+ # Digital Object Identifier of the dataset that contains the given
+ # series; follow this DOI to learn more about the activity that produced
+ # this series
+ ANY_VALUE(source_DOI) AS source_DOI,
+ # patient level attributes
+ # description:
+ # age of the subject at the time of imaging (DICOM attribute)
+ ANY_VALUE(PatientAge) AS PatientAge,
+ # description:
+ # subject sex (DICOM attribute)
+ ANY_VALUE(PatientSex) AS PatientSex,
+ # study level attributes
+ # description:
+ # date of the study (de-identified) (DICOM attribute)
+ ANY_VALUE(StudyDate) AS StudyDate,
+ # description:
+ # textual description of the study content (DICOM attribute)
+ ANY_VALUE(StudyDescription) AS StudyDescription,
+ # description:
+ # body part imaged (not applicable for SM series) (DICOM attribute)
+ ANY_VALUE(dicom_curated.BodyPartExamined) AS BodyPartExamined,
+ # series level attributes
+ # description:
+ # acquisition modality (DICOM attribute)
+ ANY_VALUE(Modality) AS Modality,
+ # description:
+ # manufacturer of the equipment that produced the series (DICOM attribute)
+ ANY_VALUE(Manufacturer) AS Manufacturer,
+ # description:
+ # model name of the equipment that produced the series (DICOM attribute)
+ ANY_VALUE(ManufacturerModelName) AS ManufacturerModelName,
+ # description:
+ # date of the series (de-identified) (DICOM attribute)
+ ANY_VALUE(SAFE_CAST(SeriesDate AS STRING)) AS SeriesDate,
+ # description:
+ # textual description of the series content (DICOM attribute)
+ ANY_VALUE(SeriesDescription) AS SeriesDescription,
+ # description:
+ # series number (DICOM attribute)
+ ANY_VALUE(SeriesNumber) AS SeriesNumber,
+ # description:
+ # number of instances in the series
+ COUNT(dicom_all.SOPInstanceUID) AS instanceCount,
+ # description:
+ # short name of the license that applies to this series
+ ANY_VALUE(license_short_name) as license_short_name,
+ # download related attributes
+ # description:
+ # name of the AWS S3 bucket that contains the series
+ ANY_VALUE(aws_bucket) AS aws_bucket,
+ # description:
+ # unique identifier of the series within the IDC
+ ANY_VALUE(crdc_series_uuid) AS crdc_series_uuid,
+ # series_aws_url will be phased out in favor of constructing URL from bucket+UUID
+ # description:
+ # public AWS S3 URL to download the series in bulk (each instance is a separate file)
+ ANY_VALUE(CONCAT(series_aws_url,"*")) AS series_aws_url,
+ # description:
+ # total size of the series in megabytes
+ ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
+ FROM
+ `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
+ JOIN
+ `bigquery-public-data.idc_v22.dicom_metadata_curated` AS dicom_curated
+ ON
+ dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
+ GROUP BY
+ SeriesInstanceUID
scripts/python/generate-indices.py (22.1.0 version, removed)
@@ -1,38 +0,0 @@
- # new_script.py
- from __future__ import annotations
-
- import os
- from pathlib import Path
-
- from idc_index_data_manager import IDCIndexDataManager
-
-
- def main():
-     project_id = os.getenv("PROJECT_ID")
-     manager = IDCIndexDataManager(project_id=project_id)
-     scripts_dir = Path(__file__).resolve().parent.parent
-
-     assets_dir = scripts_dir.parent / "assets"
-
-     # Collecting all .sql files from sql_dir and assets_dir
-     sql_files = [f for f in Path.iterdir(assets_dir) if str(f).endswith(".sql")]
-
-     for file_name in sql_files:
-         file_path = assets_dir / file_name
-         index_df, output_basename, schema = manager.execute_sql_query(file_path)
-         index_df.to_parquet(f"{output_basename}.parquet")
-         manager.save_schema_to_json(schema, output_basename)
-
-     core_indices_dir = scripts_dir.parent / "scripts" / "sql"
-
-     sql_files = [f for f in Path.iterdir(core_indices_dir) if str(f).endswith(".sql")]
-
-     for file_name in sql_files:
-         file_path = core_indices_dir / file_name
-         index_df, output_basename, schema = manager.execute_sql_query(file_path)
-         index_df.to_parquet(f"{output_basename}.parquet")
-         manager.save_schema_to_json(schema, output_basename)
-
-
- if __name__ == "__main__":
-     main()
scripts/sql/collections_index.sql (22.1.0 version, removed)
@@ -1,15 +0,0 @@
- SELECT
- collection_name,
- collection_id,
- CancerTypes,
- TumorLocations,
- Subjects,
- Species,
- Sources,
- SupportingData,
- Program,
- Status,
- Updated,
- Description
- FROM
- `bigquery-public-data.idc_v22.original_collections_metadata`
scripts/sql/idc_index.sql (22.1.0 version, removed)
@@ -1,38 +0,0 @@
- SELECT
- # collection level attributes
- ANY_VALUE(collection_id) AS collection_id,
- ANY_VALUE(analysis_result_id) AS analysis_result_id,
- ANY_VALUE(PatientID) AS PatientID,
- SeriesInstanceUID,
- ANY_VALUE(StudyInstanceUID) AS StudyInstanceUID,
- ANY_VALUE(source_DOI) AS source_DOI,
- # patient level attributes
- ANY_VALUE(PatientAge) AS PatientAge,
- ANY_VALUE(PatientSex) AS PatientSex,
- # study level attributes
- ANY_VALUE(StudyDate) AS StudyDate,
- ANY_VALUE(StudyDescription) AS StudyDescription,
- ANY_VALUE(dicom_curated.BodyPartExamined) AS BodyPartExamined,
- # series level attributes
- ANY_VALUE(Modality) AS Modality,
- ANY_VALUE(Manufacturer) AS Manufacturer,
- ANY_VALUE(ManufacturerModelName) AS ManufacturerModelName,
- ANY_VALUE(SAFE_CAST(SeriesDate AS STRING)) AS SeriesDate,
- ANY_VALUE(SeriesDescription) AS SeriesDescription,
- ANY_VALUE(SeriesNumber) AS SeriesNumber,
- COUNT(dicom_all.SOPInstanceUID) AS instanceCount,
- ANY_VALUE(license_short_name) as license_short_name,
- # download related attributes
- ANY_VALUE(aws_bucket) AS aws_bucket,
- ANY_VALUE(crdc_series_uuid) AS crdc_series_uuid,
- # series_aws_url will be phased out in favor of constructing URL from bucket+UUID
- ANY_VALUE(CONCAT(series_aws_url,"*")) AS series_aws_url,
- ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
- FROM
- `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
- JOIN
- `bigquery-public-data.idc_v22.dicom_metadata_curated` AS dicom_curated
- ON
- dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
- GROUP BY
- SeriesInstanceUID