idc-index-data 22.0.2.tar.gz → 22.1.0.tar.gz
This diff represents the content of publicly available package versions as released to their respective public registries, and is provided for informational purposes only.
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/workflows/ci.yml +1 -1
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/workflows/external-indices.yml +1 -1
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.pre-commit-config.yaml +12 -13
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/PKG-INFO +1 -1
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/pyproject.toml +1 -1
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/scripts/python/generate-indices.py +6 -4
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/scripts/python/idc_index_data_manager.py +55 -19
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/scripts/python/update_idc_index_version.py +2 -1
- idc_index_data-22.1.0/scripts/sql/analysis_results_index.sql +16 -0
- idc_index_data-22.1.0/scripts/sql/collections_index.sql +15 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.git_archival.txt +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.gitattributes +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/CONTRIBUTING.md +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/dependabot.yml +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/matchers/pylint.json +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/workflows/cd.yml +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.gitignore +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.readthedocs.yaml +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/CMakeLists.txt +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/LICENSE +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/README.md +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/assets/README.md +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/assets/clinical_index.sql +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/assets/sm_index.sql +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/assets/sm_instance_index.sql +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/docs/conf.py +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/docs/index.md +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/noxfile.py +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/scripts/sql/idc_index.sql +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/scripts/sql/prior_versions_index.sql +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/src/idc_index_data/__init__.py +1 -1
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/src/idc_index_data/_version.pyi +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/src/idc_index_data/py.typed +0 -0
- {idc_index_data-22.0.2 → idc_index_data-22.1.0}/tests/test_package.py +0 -0
--- idc_index_data-22.0.2/.pre-commit-config.yaml
+++ idc_index_data-22.1.0/.pre-commit-config.yaml
@@ -4,13 +4,13 @@ ci:

 repos:
   - repo: https://github.com/adamchainz/blacken-docs
-    rev: "1.
+    rev: "1.20.0"
     hooks:
       - id: blacken-docs
         additional_dependencies: [black==24.*]

   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: "
+    rev: "v6.0.0"
     hooks:
       - id: check-added-large-files
       - id: check-case-conflict
@@ -32,22 +32,22 @@ repos:
       - id: rst-directive-colons
       - id: rst-inline-touching-normal

-  - repo: https://github.com/
-    rev: "v3.
+  - repo: https://github.com/rbubley/mirrors-prettier
+    rev: "v3.6.2"
     hooks:
       - id: prettier
         types_or: [yaml, markdown, html, css, scss, javascript, json]
         args: [--prose-wrap=always]

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.
+    rev: "v0.14.4"
     hooks:
-      - id: ruff
+      - id: ruff-check
         args: ["--fix", "--show-fixes"]
       - id: ruff-format

   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: "v1.
+    rev: "v1.18.2"
     hooks:
       - id: mypy
         files: src|tests
@@ -57,12 +57,12 @@ repos:
           - pandas-stubs

   - repo: https://github.com/codespell-project/codespell
-    rev: "v2.
+    rev: "v2.4.1"
     hooks:
       - id: codespell

   - repo: https://github.com/shellcheck-py/shellcheck-py
-    rev: "v0.
+    rev: "v0.11.0.1"
     hooks:
       - id: shellcheck

@@ -74,14 +74,13 @@ repos:
         entry: PyBind|Numpy|Cmake|CCache|Github|PyTest
         exclude: .pre-commit-config.yaml

-  - repo: https://github.com/
-    rev: "
+  - repo: https://github.com/henryiii/validate-pyproject-schema-store
+    rev: "2025.11.04"
     hooks:
       - id: validate-pyproject
-        additional_dependencies: ["validate-pyproject-schema-store[all]"]

   - repo: https://github.com/python-jsonschema/check-jsonschema
-    rev: "0.
+    rev: "0.34.1"
     hooks:
       - id: check-dependabot
       - id: check-github-workflows
--- idc_index_data-22.0.2/PKG-INFO
+++ idc_index_data-22.1.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: idc-index-data
-Version: 22.0.2
+Version: 22.1.0
 Summary: ImagingDataCommons index to query and download data.
 Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
 License: Copyright 2024 Andrey Fedorov
--- idc_index_data-22.0.2/pyproject.toml
+++ idc_index_data-22.1.0/pyproject.toml
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "idc-index-data"
-version = "22.0.2"
+version = "22.1.0"
 authors = [
   { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
   { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
--- idc_index_data-22.0.2/scripts/python/generate-indices.py
+++ idc_index_data-22.1.0/scripts/python/generate-indices.py
@@ -15,21 +15,23 @@ def main():
     assets_dir = scripts_dir.parent / "assets"

     # Collecting all .sql files from sql_dir and assets_dir
-    sql_files = [f for f in
+    sql_files = [f for f in Path.iterdir(assets_dir) if str(f).endswith(".sql")]

     for file_name in sql_files:
         file_path = assets_dir / file_name
-        index_df, output_basename = manager.execute_sql_query(file_path)
+        index_df, output_basename, schema = manager.execute_sql_query(file_path)
         index_df.to_parquet(f"{output_basename}.parquet")
+        manager.save_schema_to_json(schema, output_basename)

     core_indices_dir = scripts_dir.parent / "scripts" / "sql"

-    sql_files = [f for f in
+    sql_files = [f for f in Path.iterdir(core_indices_dir) if str(f).endswith(".sql")]

     for file_name in sql_files:
         file_path = core_indices_dir / file_name
-        index_df, output_basename = manager.execute_sql_query(file_path)
+        index_df, output_basename, schema = manager.execute_sql_query(file_path)
         index_df.to_parquet(f"{output_basename}.parquet")
+        manager.save_schema_to_json(schema, output_basename)


 if __name__ == "__main__":
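A note on the listing pattern above: Path.iterdir is called as an unbound method, so it yields pathlib.Path objects rather than strings, which is why the filter converts with str(f) before calling endswith. A minimal standalone sketch of the same pattern (the directory name is a placeholder, not a path from this package):

from pathlib import Path

sql_dir = Path("scripts/sql")  # placeholder directory

# Path.iterdir(sql_dir) behaves like sql_dir.iterdir() and yields Path objects,
# hence the str() conversion before endswith().
sql_files = [f for f in Path.iterdir(sql_dir) if str(f).endswith(".sql")]

# Equivalent filter using Path attributes instead of string matching:
sql_files = [f for f in sql_dir.iterdir() if f.suffix == ".sql"]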
--- idc_index_data-22.0.2/scripts/python/idc_index_data_manager.py
+++ idc_index_data-22.1.0/scripts/python/idc_index_data_manager.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import json
 import logging
 import os
 from pathlib import Path
@@ -20,22 +21,54 @@ class IDCIndexDataManager:
         self.client = bigquery.Client(project=project_id)
         logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)

-    def execute_sql_query(
+    def execute_sql_query(
+        self, file_path: str
+    ) -> tuple[pd.DataFrame, str, list[bigquery.SchemaField]]:
         """
         Executes the SQL query in the specified file.

         Returns:
-            Tuple[pd.DataFrame, str]: A tuple containing
-            the output basename.
+            Tuple[pd.DataFrame, str, List[bigquery.SchemaField]]: A tuple containing
+            the DataFrame with query results, the output basename, and the BigQuery schema.
         """
         with Path(file_path).open("r") as file:
             sql_query = file.read()
-
+        query_job_result = self.client.query(sql_query).result()
+        schema = query_job_result.schema  # Get schema from BigQuery QueryJob
+        index_df = query_job_result.to_dataframe()
         if "StudyDate" in index_df.columns:
             index_df["StudyDate"] = index_df["StudyDate"].astype(str)
         output_basename = Path(file_path).name.split(".")[0]
         logger.debug("Executed SQL query from file: %s", file_path)
-        return index_df, output_basename
+        return index_df, output_basename, schema
+
+    def save_schema_to_json(
+        self, schema: list[bigquery.SchemaField], output_basename: str
+    ) -> None:
+        """
+        Saves the BigQuery schema to a JSON file.
+
+        Args:
+            schema: List of BigQuery SchemaField objects from the query result
+            output_basename: The base name for the output file
+        """
+        # Convert BigQuery schema to JSON-serializable format
+        schema_dict = {
+            "fields": [
+                {
+                    "name": field.name,
+                    "type": field.field_type,
+                    "mode": field.mode,
+                }
+                for field in schema
+            ]
+        }
+
+        # Save to JSON file
+        json_file_name = f"{output_basename}.json"
+        with Path(json_file_name).open("w") as f:
+            json.dump(schema_dict, f, indent=2)
+        logger.debug("Created schema JSON file: %s", json_file_name)

     def generate_index_data_files(
         self, generate_compressed_csv: bool = True, generate_parquet: bool = False
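The sidecar file written by save_schema_to_json is plain JSON with a top-level "fields" list of name/type/mode objects. A hypothetical consumer sketch, assuming only that layout (the helper name and the example basename are illustrative, not part of the package API):

import json
from pathlib import Path

def load_index_schema(output_basename: str) -> dict[str, str]:
    # Map each column name to its BigQuery type string from the sidecar JSON.
    with Path(f"{output_basename}.json").open() as f:
        schema_dict = json.load(f)
    return {field["name"]: field["type"] for field in schema_dict["fields"]}

# e.g. load_index_schema("idc_index") might return
# {"SeriesInstanceUID": "STRING", "StudyDate": "STRING", ...}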
@@ -52,24 +85,27 @@ class IDCIndexDataManager:
         scripts_dir = Path(__file__).parent.parent
         sql_dir = scripts_dir / "sql"

-        for file_name in
-            if file_name.endswith(".sql"):
+        for file_name in Path.iterdir(sql_dir):
+            if str(file_name).endswith(".sql"):
                 file_path = Path(sql_dir) / file_name
-                index_df, output_basename = self.execute_sql_query(file_path)
+                index_df, output_basename, schema = self.execute_sql_query(file_path)
                 logger.debug(
                     "Executed and processed SQL queries from file: %s", file_path
                 )
-
-
-
-
-
-
-
-
-
-
-
+                if generate_compressed_csv:
+                    csv_file_name = f"{output_basename}.csv.zip"
+                    index_df.to_csv(
+                        csv_file_name, compression={"method": "zip"}, escapechar="\\"
+                    )
+                    logger.debug("Created CSV zip file: %s", csv_file_name)
+
+                if generate_parquet:
+                    parquet_file_name = f"{output_basename}.parquet"
+                    index_df.to_parquet(parquet_file_name, compression="zstd")
+                    logger.debug("Created Parquet file: %s", parquet_file_name)
+
+                # Save schema to JSON file
+                self.save_schema_to_json(schema, output_basename)

     def retrieve_latest_idc_release_version(self) -> int:
         """
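Both artifacts read back directly with pandas, which infers zip compression from the .csv.zip extension and picks up zstd from the Parquet metadata; a quick sketch (the idc_index basename is assumed for illustration):

import pandas as pd

# No compression arguments are needed for either artifact.
df_csv = pd.read_csv("idc_index.csv.zip")
df_parquet = pd.read_parquet("idc_index.parquet")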
--- idc_index_data-22.0.2/scripts/python/update_idc_index_version.py
+++ idc_index_data-22.1.0/scripts/python/update_idc_index_version.py
@@ -25,7 +25,8 @@ def _log(txt, verbose=True):


 def _update_file(filepath, regex, replacement):
-
+    rel_path = os.path.relpath(str(filepath), ROOT_DIR)
+    msg = f"Updating {rel_path}"
     with _log(msg):
         pattern = re.compile(regex)
         with filepath.open() as doc_file:
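The two added lines only make the logged path repository-relative; for example (the checkout location is hypothetical):

import os

ROOT_DIR = "/home/user/idc-index-data"  # hypothetical checkout location
print(os.path.relpath(ROOT_DIR + "/pyproject.toml", ROOT_DIR))  # -> pyproject.toml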
--- /dev/null
+++ idc_index_data-22.1.0/scripts/sql/analysis_results_index.sql
@@ -0,0 +1,16 @@
+SELECT
+  ID AS analysis_result_id,
+  Title AS analysis_result_title,
+  source_doi,
+  source_url,
+  Subjects,
+  Collections,
+  AnalysisArtifacts,
+  Updated,
+  license_url,
+  license_long_name,
+  license_short_name,
+  Description,
+  Citation
+FROM
+  `bigquery-public-data.idc_v22.analysis_results_metadata`
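Once generate-indices.py runs this query, the result is materialized as analysis_results_index.parquet alongside its schema JSON. A hypothetical local query against that artifact (column aliases follow the SELECT above; the collection id is only an example):

import pandas as pd

df = pd.read_parquet("analysis_results_index.parquet")

# Find analysis results whose Collections field mentions a given collection.
mask = df["Collections"].astype(str).str.contains("nsclc_radiomics", na=False)
print(df.loc[mask, ["analysis_result_id", "analysis_result_title", "source_doi"]])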
--- idc_index_data-22.0.2/src/idc_index_data/__init__.py
+++ idc_index_data-22.1.0/src/idc_index_data/__init__.py
@@ -12,10 +12,10 @@ from pathlib import Path
 from ._version import version as __version__

 __all__ = [
-    "__version__",
     "IDC_INDEX_CSV_ARCHIVE_FILEPATH",
     "IDC_INDEX_PARQUET_FILEPATH",
     "PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH",
+    "__version__",
 ]
