idc-index-data 0.1.0.tar.gz → 17.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of idc-index-data was flagged as potentially problematic by the registry.
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/.github/CONTRIBUTING.md +21 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/.github/workflows/cd.yml +7 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/.github/workflows/ci.yml +21 -3
- idc_index_data-17.0.0/.github/workflows/keep-alive.yml +18 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/.gitignore +3 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/.pre-commit-config.yaml +0 -6
- idc_index_data-17.0.0/CMakeLists.txt +37 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/PKG-INFO +4 -5
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/noxfile.py +89 -1
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/pyproject.toml +17 -8
- idc_index_data-17.0.0/scripts/python/idc_index_data_manager.py +136 -0
- idc_index_data-17.0.0/scripts/python/update_idc_index_version.py +91 -0
- idc_index_data-17.0.0/scripts/sql/idc_index.sql +34 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/src/idc_index_data/__init__.py +16 -9
- idc_index_data-17.0.0/src/idc_index_data/_version.pyi +3 -0
- idc_index_data-17.0.0/tests/test_package.py +27 -0
- idc_index_data-0.1.0/CMakeLists.txt +0 -20
- idc_index_data-0.1.0/src/idc_index_data/_version.py +0 -16
- idc_index_data-0.1.0/src/idc_index_data/_version.pyi +0 -4
- idc_index_data-0.1.0/tests/test_package.py +0 -14
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/.git_archival.txt +0 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/.gitattributes +0 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/.github/dependabot.yml +0 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/.github/matchers/pylint.json +0 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/.readthedocs.yaml +0 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/LICENSE +0 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/README.md +0 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/docs/conf.py +0 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/docs/index.md +0 -0
- {idc_index_data-0.1.0 → idc_index_data-17.0.0}/src/idc_index_data/py.typed +0 -0

.github/CONTRIBUTING.md

@@ -99,3 +99,24 @@ pre-commit run -a
 ```
 
 to check all files.
+
+# Updating the IDC index version
+
+You can update the version using:
+
+```bash
+export GCP_PROJECT=idc-external-025
+export GOOGLE_APPLICATION_CREDENTIALS=/path/to/keyfile.json
+nox -s bump -- <version>
+```
+
+And follow the instructions it gives you. Leave off the version to bump to the
+latest version. Add `--commit` to run the commit procedure.
+
+# Tagging a release
+
+You can print the instructions for tagging a release using:
+
+```bash
+nox -s tag_release
+```
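
For context, the `bump` session documented above takes the IDC release number as an optional positional argument plus a `--commit` flag (both parsed from the nox posargs in the new noxfile code further down). A minimal sketch of a full update run, using `18` purely as a hypothetical release number:

```bash
# Hypothetical walkthrough of the documented flow; "18" is a placeholder release number.
export GCP_PROJECT=idc-external-025
export GOOGLE_APPLICATION_CREDENTIALS=/path/to/keyfile.json

# Rewrite pyproject.toml, scripts/sql/idc_index.sql and tests/test_package.py,
# then create a branch and commit the change:
nox -s bump -- 18 --commit

# Print the git commands for tagging and pushing the corresponding release:
nox -s tag_release
```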

.github/workflows/cd.yml

@@ -27,6 +27,13 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: Authorize Google Cloud
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
+          create_credentials_file: true
+          export_environment_variables: true
+
       - uses: hynek/build-and-inspect-python-package@v2
 
   publish:

.github/workflows/ci.yml

@@ -22,12 +22,22 @@ jobs:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
+
+      - name: Authorize Google Cloud
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
+          create_credentials_file: true
+          export_environment_variables: true
+
       - uses: actions/setup-python@v5
         with:
           python-version: "3.x"
+
       - uses: pre-commit/action@v3.0.1
         with:
           extra_args: --hook-stage manual --all-files
+
       - name: Run PyLint
         run: |
           echo "::add-matcher::$GITHUB_WORKSPACE/.github/matchers/pylint.json"
@@ -43,15 +53,23 @@ jobs:
         python-version: ["3.8", "3.12"]
         runs-on: [ubuntu-latest, macos-latest, windows-latest]
 
-
-
-
+        #currently not working on pypi-3.10
+        # include:
+        # - python-version: pypy-3.10
+        # runs-on: ubuntu-latest
 
     steps:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
+      - name: Authorize Google Cloud
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
+          create_credentials_file: true
+          export_environment_variables: true
+
       - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}

.github/workflows/keep-alive.yml (new file)

@@ -0,0 +1,18 @@
+name: keep-github-actions-alive
+
+on:
+  schedule:
+    - cron: "0 0 * * *"
+
+permissions:
+  actions: write
+
+jobs:
+  keep-alive:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: gautamkrishnar/keepalive-workflow@v2
+        with:
+          time_elapsed: 50
+          use_api: true

.pre-commit-config.yaml

@@ -54,12 +54,6 @@ repos:
         args: []
         additional_dependencies:
          - pytest
-          # Since the "python_version" set in the "tool.mypy" section of "pyproject.toml" is "3.8",
-          # we ensure type checking also works when running the hook from Python versions above 3.8 by always
-          # installing "importlib_metadata". Note that because the "importlib.metadata.distribution"
-          # module was added in Python version 3.10 and later, this line can be removed when only supporting
-          # Python versions 3.10 and above.
-          - importlib_metadata>=2.0
 
   - repo: https://github.com/codespell-project/codespell
     rev: "v2.2.6"

CMakeLists.txt (new file)

@@ -0,0 +1,37 @@
+cmake_minimum_required(VERSION 3.15...3.26)
+project(${SKBUILD_PROJECT_NAME} LANGUAGES NONE)
+
+find_package(
+  Python
+  COMPONENTS Interpreter
+  REQUIRED)
+
+if(NOT DEFINED ENV{GCP_PROJECT})
+  message(FATAL_ERROR "GCP_PROJECT env. variable is not set")
+endif()
+
+option(IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE "Generate idc_index.csv.zip file" ON)
+option(IDC_INDEX_DATA_GENERATE_PARQUET "Generate idc_index.parquet file" OFF)
+
+set(download_dir "${PROJECT_BINARY_DIR}")
+
+add_custom_command(
+  OUTPUT
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
+  COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/scripts/python/idc_index_data_manager.py
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:--generate-csv-archive>
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:--generate-parquet>
+  )
+
+add_custom_target(run_idc_index_data_manager ALL
+  DEPENDS
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
+  )
+
+install(
+  FILES
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
+  DESTINATION "idc_index_data")
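
This CMake project is driven by scikit-build-core when the wheel is built, so the BigQuery export runs (and fails fast without `GCP_PROJECT`) at build time. A rough sketch of a local build, assuming the pinned scikit-build-core version exposes its `cmake.define` config-settings mechanism for flipping the Parquet option:

```bash
# Build a wheel locally; CMake stops with FATAL_ERROR if GCP_PROJECT is not exported.
export GCP_PROJECT=idc-external-025
export GOOGLE_APPLICATION_CREDENTIALS=/path/to/keyfile.json

# Also generate idc_index.parquet (the option defaults to OFF in CMakeLists.txt).
pip wheel . --config-settings=cmake.define.IDC_INDEX_DATA_GENERATE_PARQUET=ON
```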

PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: idc-index-data
-Version: 0.1.0
+Version: 17.0.0
 Summary: ImagingDataCommons index to query and download data.
 Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
 License: Copyright 2024 Andrey Fedorov
@@ -42,7 +42,9 @@ Project-URL: Bug tracker, https://github.com/ImagingDataCommons/idc-index-data/i
 Project-URL: Discussions, https://discourse.canceridc.dev/
 Project-URL: Changelog, https://github.com/ImagingDataCommons/idc-index-data/releases
 Requires-Python: >=3.8
-
+Provides-Extra: test
+Provides-Extra: dev
+Provides-Extra: docs
 Requires-Dist: pytest>=6; extra == "test"
 Requires-Dist: pytest-cov>=3; extra == "test"
 Requires-Dist: pytest>=6; extra == "dev"
@@ -52,9 +54,6 @@ Requires-Dist: myst_parser>=0.13; extra == "docs"
 Requires-Dist: sphinx_copybutton; extra == "docs"
 Requires-Dist: sphinx_autodoc_typehints; extra == "docs"
 Requires-Dist: furo>=2023.08.17; extra == "docs"
-Provides-Extra: test
-Provides-Extra: dev
-Provides-Extra: docs
 Description-Content-Type: text/markdown
 
 # idc-index-data

noxfile.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import argparse
+import re
 import shutil
 from pathlib import Path
 
@@ -8,7 +9,7 @@ import nox
 
 DIR = Path(__file__).parent.resolve()
 
-nox.options.sessions = ["lint", "pylint", "tests"]
+nox.options.sessions = ["lint", "pylint", "tests"]  # Session run by default
 
 
 @nox.session
@@ -115,3 +116,90 @@ def build(session: nox.Session) -> None:
 
     session.install("build")
     session.run("python", "-m", "build")
+
+
+def _bump(session: nox.Session, name: str, script: str, files) -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--commit", action="store_true", help="Make a branch and commit."
+    )
+    parser.add_argument(
+        "version", nargs="?", help="The version to process - leave off for latest."
+    )
+    args = parser.parse_args(session.posargs)
+
+    session.install("db-dtypes")
+    session.install("google-cloud-bigquery")
+    session.install("pandas")
+    session.install("pyarrow")
+
+    if args.version is None:
+        gcp_project = "idc-external-025"
+        idc_index_version = session.run(
+            "python",
+            "scripts/python/idc_index_data_manager.py",
+            "--project",
+            gcp_project,
+            "--retrieve-latest-idc-release-version",
+            external=True,
+            silent=True,
+        ).strip()
+
+    else:
+        idc_index_version = args.version
+
+    extra = ["--quiet"] if args.commit else []
+    session.run("python", script, idc_index_version, *extra)
+
+    if args.commit:
+        session.run(
+            "git",
+            "switch",
+            "-c",
+            f"update-to-{name.replace(' ', '-').lower()}-{idc_index_version}",
+            external=True,
+        )
+        session.run("git", "add", "-u", *files, external=True)
+        session.run(
+            "git",
+            "commit",
+            "-m",
+            f"Update to {name} {idc_index_version}",
+            external=True,
+        )
+        session.log(
+            f'Complete! Now run: gh pr create --fill --body "Created by running `nox -s {session.name} -- --commit`"'
+        )
+
+
+@nox.session
+def bump(session: nox.Session) -> None:
+    """
+    Set to a new IDC index version, use -- <version>, otherwise will use the latest version.
+    """
+    files = (
+        "pyproject.toml",
+        "scripts/sql/idc_index.sql",
+        "tests/test_package.py",
+    )
+    _bump(
+        session,
+        "IDC index",
+        "scripts/python/update_idc_index_version.py",
+        files,
+    )
+
+
+@nox.session(venv_backend="none")
+def tag_release(session: nox.Session) -> None:
+    """
+    Print instructions for tagging a release and pushing it to GitHub.
+    """
+
+    session.log("Run the following commands to make a release:")
+    txt = Path("pyproject.toml").read_text()
+    current_version = next(iter(re.finditer(r'^version = "([\d\.]+)$"', txt))).group(1)
+    print(
+        f"git tag --sign -m 'idc-index-data {current_version}' {current_version} main"
+    )
+    print(f"git push origin {current_version}")
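
For reference, only `lint`, `pylint` and `tests` run when nox is invoked without arguments; the new `bump` and `tag_release` sessions have to be selected explicitly. A brief sketch (the version-less `bump` call needs working Google Cloud credentials, as it queries BigQuery for the latest release):

```bash
nox           # run the default sessions: lint, pylint, tests
nox --list    # show all sessions, including bump and tag_release
nox -s bump   # no version given: looks up the latest IDC release in BigQuery
```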

pyproject.toml

@@ -1,10 +1,19 @@
 [build-system]
-requires = [
+requires = [
+  "scikit-build-core",
+  "db-dtypes",
+  "google-cloud-bigquery",
+  "pandas",
+  "pyarrow",
+  "pygithub",
+  "requests"
+]
 build-backend = "scikit_build_core.build"
 
 
 [project]
 name = "idc-index-data"
+version = "17.0.0"
 authors = [
   { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
   { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
@@ -31,8 +40,7 @@ classifiers = [
   "Topic :: Scientific/Engineering",
   "Typing :: Typed",
 ]
-
-dependencies = ["importlib_metadata>=2.0; python_version<'3.10'"]
+dependencies = []
 
 [project.optional-dependencies]
 test = [
@@ -61,14 +69,15 @@ Changelog = "https://github.com/ImagingDataCommons/idc-index-data/releases"
 [tool.scikit-build]
 minimum-version = "0.8.2"
 build-dir = "build/{wheel_tag}"
-metadata.version.provider = "scikit_build_core.metadata.setuptools_scm"
-sdist.include = ["src/idc_index_data/_version.py"]
 wheel.platlib = false
 wheel.py-api = "py3"
 
 
-[tool.
-
+[[tool.scikit-build.generate]]
+path = "idc_index_data/_version.py"
+template = '''
+version = "${version}"
+'''
 
 
 [tool.pytest.ini_options]
@@ -108,7 +117,7 @@ disallow_incomplete_defs = true
 
 
 [tool.ruff]
-src = ["src"]
+src = ["src", "scripts"]
 
 [tool.ruff.lint]
 extend-select = [

scripts/python/idc_index_data_manager.py (new file)

@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+from google.cloud import bigquery
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+
+class IDCIndexDataManager:
+    def __init__(self, project_id: str):
+        """
+        Initializes the IDCIndexDataManager using the Google Cloud Platform project ID.
+        """
+        self.project_id = project_id
+        self.client = bigquery.Client(project=project_id)
+        logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)
+
+    def execute_sql_query(self, file_path: str) -> tuple[pd.DataFrame, str]:
+        """
+        Executes the SQL query in the specified file.
+
+        Returns:
+            Tuple[pd.DataFrame, str]: A tuple containing the DataFrame with query results,
+            the output basename.
+        """
+        with Path(file_path).open("r") as file:
+            sql_query = file.read()
+        index_df = self.client.query(sql_query).to_dataframe()
+        output_basename = Path(file_path).name.split(".")[0]
+        logger.debug("Executed SQL query from file: %s", file_path)
+        return index_df, output_basename
+
+    def generate_index_data_files(
+        self, generate_compressed_csv: bool = True, generate_parquet: bool = False
+    ) -> None:
+        """
+        Generates index-data files locally by executing queries against
+        the Google Cloud Platform IDC project tables.
+
+        This method iterates over SQL files in the 'scripts/sql' directory,
+        executing each query using :func:`execute_sql_query` and generating a DataFrame,
+        'index_df'. The DataFrame is then saved as compressed CSV and/or Parquet file.
+        """
+
+        scripts_dir = Path(__file__).parent.parent
+        sql_dir = scripts_dir / "sql"
+
+        for file_name in os.listdir(sql_dir):
+            if file_name.endswith(".sql"):
+                file_path = Path(sql_dir) / file_name
+                index_df, output_basename = self.execute_sql_query(file_path)
+                logger.debug(
+                    "Executed and processed SQL queries from file: %s", file_path
+                )
+                if generate_compressed_csv:
+                    csv_file_name = f"{output_basename}.csv.zip"
+                    index_df.to_csv(
+                        csv_file_name, compression={"method": "zip"}, escapechar="\\"
+                    )
+                    logger.debug("Created CSV zip file: %s", csv_file_name)
+
+                if generate_parquet:
+                    parquet_file_name = f"{output_basename}.parquet"
+                    index_df.to_parquet(parquet_file_name)
+                    logger.debug("Created Parquet file: %s", parquet_file_name)
+
+    def retrieve_latest_idc_release_version(self) -> int:
+        """
+        Retrieves the latest IDC release version.
+
+        This function executes a SQL query on the `version_metadata` table in the
+        `idc_current` dataset of the BigQuery client. It retrieves the maximum
+        `idc_version` and returns it as an integer.
+        """
+        query = """
+        SELECT
+            MAX(idc_version) AS latest_idc_release_version
+        FROM
+            `bigquery-public-data.idc_current.version_metadata`
+        """
+        query_job = self.client.query(query)
+        result = query_job.result()
+        return int(next(result).latest_idc_release_version)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--project",
+        default=os.environ.get("GCP_PROJECT", None),
+        help="Google Cloud Platform Project ID (default from GCP_PROJECT env. variable)",
+    )
+    parser.add_argument(
+        "--generate-csv-archive",
+        action="store_true",
+        help="Generate idc_index.csv.zip file",
+    )
+    parser.add_argument(
+        "--generate-parquet",
+        action="store_true",
+        help="Generate idc_index.parquet file",
+    )
+    parser.add_argument(
+        "--retrieve-latest-idc-release-version",
+        action="store_true",
+        help="Retrieve and display the latest IDC release version",
+    )
+
+    args = parser.parse_args()
+
+    if not args.project:
+        parser.error(
+            "Set GCP_PROJECT environment variable or specify --project argument"
+        )
+
+    if any([args.generate_csv_archive, args.generate_parquet]):
+        IDCIndexDataManager(args.project).generate_index_data_files(
+            generate_compressed_csv=args.generate_csv_archive,
+            generate_parquet=args.generate_parquet,
+        )
+    elif args.retrieve_latest_idc_release_version:
+        logging.basicConfig(level=logging.ERROR, force=True)
+        logger.setLevel(logging.ERROR)
+        version = IDCIndexDataManager(
+            args.project
+        ).retrieve_latest_idc_release_version()
+        print(f"{version}")  # noqa: T201
+    else:
+        parser.print_help()
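
The manager script doubles as the CMake build step and a standalone CLI. A usage sketch based on the argparse flags above (paths relative to the repository root; BigQuery credentials are assumed to be configured, e.g. via GOOGLE_APPLICATION_CREDENTIALS):

```bash
export GCP_PROJECT=idc-external-025

# Print the latest IDC release version (what the nox "bump" session calls):
python scripts/python/idc_index_data_manager.py --retrieve-latest-idc-release-version

# Run every query in scripts/sql/ and write idc_index.csv.zip / idc_index.parquet
# into the current working directory:
python scripts/python/idc_index_data_manager.py \
  --generate-csv-archive \
  --generate-parquet
```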

scripts/python/update_idc_index_version.py (new file)

@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""
+Command line executable allowing to update source files given a IDC index version.
+"""
+
+from __future__ import annotations
+
+import argparse
+import contextlib
+import os
+import re
+import textwrap
+from pathlib import Path
+
+ROOT_DIR = Path(__file__).parent / "../.."
+
+
+@contextlib.contextmanager
+def _log(txt, verbose=True):
+    if verbose:
+        print(txt)  # noqa: T201
+    yield
+    if verbose:
+        print(f"{txt} - done")  # noqa: T201
+
+
+def _update_file(filepath, regex, replacement):
+    msg = "Updating %s" % os.path.relpath(str(filepath), ROOT_DIR)
+    with _log(msg):
+        pattern = re.compile(regex)
+        with filepath.open() as doc_file:
+            lines = doc_file.readlines()
+            updated_content = []
+            for line in lines:
+                updated_content.append(re.sub(pattern, replacement, line))
+        with filepath.open("w") as doc_file:
+            doc_file.writelines(updated_content)
+
+
+def update_pyproject_toml(idc_index_version):
+    pattern = re.compile(r'^version = "[\w\.]+"$')
+    replacement = f'version = "{idc_index_version}.0.0"'
+    _update_file(ROOT_DIR / "pyproject.toml", pattern, replacement)
+
+
+def update_sql_scripts(idc_index_version):
+    pattern = re.compile(r"idc_v\d+")
+    replacement = f"idc_v{idc_index_version}"
+    _update_file(ROOT_DIR / "scripts/sql/idc_index.sql", pattern, replacement)
+
+
+def update_tests(idc_index_version):
+    pattern = re.compile(r"EXPECTED_IDC_INDEX_VERSION = \d+")
+    replacement = f"EXPECTED_IDC_INDEX_VERSION = {idc_index_version}"
+    _update_file(ROOT_DIR / "tests/test_package.py", pattern, replacement)
+
+
+def main():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "idc_index_version",
+        metavar="IDC_INDEX_VERSION",
+        type=int,
+        help="IDC index version of the form NN",
+    )
+    parser.add_argument(
+        "--quiet",
+        action="store_true",
+        help="Hide the output",
+    )
+
+    args = parser.parse_args()
+
+    update_pyproject_toml(args.idc_index_version)
+    update_sql_scripts(args.idc_index_version)
+    update_tests(args.idc_index_version)
+
+    if not args.quiet:
+        msg = """\
+        Complete! Now run:
+
+        git switch -c update-to-idc-index-{release}
+        git add -u pyproject.toml scripts/sql/idc_index.sql tests/test_package.py
+        git commit -m "Update to IDC index {release}"
+        gh pr create --fill --body "Created by update_idc_index_version.py"
+        """
+        print(textwrap.dedent(msg.format(release=args.idc_index_version)))  # noqa: T201
+
+
+if __name__ == "__main__":
+    main()
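
The version updater is a plain argparse script; the `bump` nox session invokes it, passing `--quiet` when `--commit` is requested. A sketch with a hypothetical release number:

```bash
# "18" is a placeholder IDC release number; the script rewrites pyproject.toml,
# scripts/sql/idc_index.sql and tests/test_package.py, then prints follow-up git commands.
python scripts/python/update_idc_index_version.py 18

# Same, but without the printed instructions:
python scripts/python/update_idc_index_version.py 18 --quiet
```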

scripts/sql/idc_index.sql (new file)

@@ -0,0 +1,34 @@
+SELECT
+  # collection level attributes
+  ANY_VALUE(collection_id) AS collection_id,
+  ANY_VALUE(PatientID) AS PatientID,
+  SeriesInstanceUID,
+  ANY_VALUE(StudyInstanceUID) AS StudyInstanceUID,
+  ANY_VALUE(source_DOI) AS source_DOI,
+  # patient level attributes
+  ANY_VALUE(PatientAge) AS PatientAge,
+  ANY_VALUE(PatientSex) AS PatientSex,
+  # study level attributes
+  ANY_VALUE(StudyDate) AS StudyDate,
+  ANY_VALUE(StudyDescription) AS StudyDescription,
+  ANY_VALUE(dicom_curated.BodyPartExamined) AS BodyPartExamined,
+  # series level attributes
+  ANY_VALUE(Modality) AS Modality,
+  ANY_VALUE(Manufacturer) AS Manufacturer,
+  ANY_VALUE(ManufacturerModelName) AS ManufacturerModelName,
+  ANY_VALUE(SAFE_CAST(SeriesDate AS STRING)) AS SeriesDate,
+  ANY_VALUE(SeriesDescription) AS SeriesDescription,
+  ANY_VALUE(SeriesNumber) AS SeriesNumber,
+  COUNT(dicom_all.SOPInstanceUID) AS instanceCount,
+  ANY_VALUE(license_short_name) as license_short_name,
+  # download related attributes
+  ANY_VALUE(CONCAT("s3://", SPLIT(aws_url,"/")[SAFE_OFFSET(2)], "/", crdc_series_uuid, "/*")) AS series_aws_url,
+  ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
+FROM
+  `bigquery-public-data.idc_v17.dicom_all` AS dicom_all
+JOIN
+  `bigquery-public-data.idc_v17.dicom_metadata_curated` AS dicom_curated
+ON
+  dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
+GROUP BY
+  SeriesInstanceUID
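
The query collapses `dicom_all` joined with `dicom_metadata_curated` into one row per `SeriesInstanceUID`; the `idc_v17` dataset names are exactly what `update_idc_index_version.py` rewrites on each bump. A rough sketch of running it by hand, assuming the Google Cloud SDK's `bq` CLI is installed and authenticated (flag spelling per that tool, not defined by this package):

```bash
# Feed the standard-SQL query to bq; the query is billed to the configured project.
bq --project_id="$GCP_PROJECT" query --use_legacy_sql=false \
  "$(cat scripts/sql/idc_index.sql)"
```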

src/idc_index_data/__init__.py

@@ -6,28 +6,35 @@ idc-index-data: ImagingDataCommons index to query and download data.
 
 from __future__ import annotations
 
-import sys
+from importlib.metadata import distribution
 from pathlib import Path
 
-if sys.version_info >= (3, 10):
-    from importlib.metadata import distribution
-else:
-    from importlib_metadata import distribution
-
 from ._version import version as __version__
 
-__all__ = [
+__all__ = [
+    "__version__",
+    "IDC_INDEX_CSV_ARCHIVE_FILEPATH",
+    "IDC_INDEX_PARQUET_FILEPATH",
+]
 
 
-def _lookup(path: str) -> Path:
+def _lookup(path: str, optional: bool = False) -> Path | None:
     """Support editable installation by looking up path using distribution API."""
     files = distribution("idc_index_data").files
     if files is not None:
         for _file in files:
             if str(_file) == path:
                 return Path(str(_file.locate())).resolve(strict=True)
+    if optional:
+        return None
+
     msg = f"Failed to lookup '{path}`."
     raise FileNotFoundError(msg)
 
 
-IDC_INDEX_CSV_ARCHIVE_FILEPATH: Path = _lookup(
+IDC_INDEX_CSV_ARCHIVE_FILEPATH: Path | None = _lookup(
+    "idc_index_data/idc_index.csv.zip"
+)
+IDC_INDEX_PARQUET_FILEPATH: Path | None = _lookup(
+    "idc_index_data/idc_index.parquet", optional=True
+)
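
With the reworked module, `IDC_INDEX_PARQUET_FILEPATH` may be `None` when the wheel was built without the Parquet option, so callers are expected to check it. A small sketch of consuming the shipped index (pandas is assumed to be available; it is not a declared dependency of this package):

```bash
python -c '
import idc_index_data
import pandas as pd

# The CSV archive is always shipped; the Parquet file is optional.
df = pd.read_csv(idc_index_data.IDC_INDEX_CSV_ARCHIVE_FILEPATH)
print(idc_index_data.__version__, df.shape)
if idc_index_data.IDC_INDEX_PARQUET_FILEPATH is not None:
    print(idc_index_data.IDC_INDEX_PARQUET_FILEPATH)
'
```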

tests/test_package.py (new file)

@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+import importlib.metadata
+
+from packaging.version import Version
+
+import idc_index_data as m
+
+EXPECTED_IDC_INDEX_VERSION = 17
+
+
+def test_version():
+    assert importlib.metadata.version("idc_index_data") == m.__version__
+
+
+def test_idc_index_version():
+    assert Version(m.__version__).major == EXPECTED_IDC_INDEX_VERSION
+
+
+def test_filepath():
+    if m.IDC_INDEX_CSV_ARCHIVE_FILEPATH is not None:
+        assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.is_file()
+        assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.name == "idc_index.csv.zip"
+
+    if m.IDC_INDEX_PARQUET_FILEPATH is not None:
+        assert m.IDC_INDEX_PARQUET_FILEPATH.is_file()
+        assert m.IDC_INDEX_PARQUET_FILEPATH.name == "idc_index.parquet"
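
The new tests pin the expected major version (`EXPECTED_IDC_INDEX_VERSION = 17`), which is the field the bump script rewrites. A sketch of running them; note that any local build or editable install still needs `GCP_PROJECT` set for the CMake step, and `packaging` must be importable for the `Version` check:

```bash
# Through nox, which creates the virtualenv for you:
nox -s tests

# Or against an existing environment; the "test" extra supplies pytest.
pip install -e ".[test]" && pytest
```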

idc_index_data-0.1.0/CMakeLists.txt (removed)

@@ -1,20 +0,0 @@
-cmake_minimum_required(VERSION 3.15...3.26)
-project(${SKBUILD_PROJECT_NAME} LANGUAGES NONE)
-
-
-set(idc_index_release_version "0.3.2")
-set(idc_index_data_url "https://github.com/ImagingDataCommons/idc-index/releases/download/${idc_index_release_version}/idc_index.csv.zip")
-set(idc_index_data_sha256 "70ec9f915686a27bee3098163b8695c69c8696c05bfb7bd76943a24024cdeeb9")
-
-#
-# Download and install index
-#
-set(download_dir "${PROJECT_BINARY_DIR}")
-include(FetchContent)
-FetchContent_Populate(s5cmd
-  URL ${idc_index_data_url}
-  URL_HASH SHA256=${idc_index_data_sha256}
-  DOWNLOAD_DIR ${download_dir}
-  DOWNLOAD_NO_EXTRACT TRUE
-  )
-install(FILES "${download_dir}/idc_index.csv.zip" DESTINATION "idc_index_data")

idc_index_data-0.1.0/src/idc_index_data/_version.py (removed)

@@ -1,16 +0,0 @@
-# file generated by setuptools_scm
-# don't change, don't track in version control
-TYPE_CHECKING = False
-if TYPE_CHECKING:
-    from typing import Tuple, Union
-    VERSION_TUPLE = Tuple[Union[int, str], ...]
-else:
-    VERSION_TUPLE = object
-
-version: str
-__version__: str
-__version_tuple__: VERSION_TUPLE
-version_tuple: VERSION_TUPLE
-
-__version__ = version = '0.1.0'
-__version_tuple__ = version_tuple = (0, 1, 0)

idc_index_data-0.1.0/tests/test_package.py (removed)

@@ -1,14 +0,0 @@
-from __future__ import annotations
-
-import importlib.metadata
-
-import idc_index_data as m
-
-
-def test_version():
-    assert importlib.metadata.version("idc_index_data") == m.__version__
-
-
-def test_filepath():
-    assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.is_file()
-    assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.name == "idc_index.csv.zip"