idc-index-data 22.0.2__tar.gz → 22.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/workflows/ci.yml +1 -1
  2. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/workflows/external-indices.yml +1 -1
  3. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.pre-commit-config.yaml +12 -13
  4. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/PKG-INFO +1 -1
  5. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/pyproject.toml +1 -1
  6. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/scripts/python/generate-indices.py +6 -4
  7. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/scripts/python/idc_index_data_manager.py +55 -19
  8. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/scripts/python/update_idc_index_version.py +2 -1
  9. idc_index_data-22.1.0/scripts/sql/analysis_results_index.sql +16 -0
  10. idc_index_data-22.1.0/scripts/sql/collections_index.sql +15 -0
  11. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.git_archival.txt +0 -0
  12. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.gitattributes +0 -0
  13. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/CONTRIBUTING.md +0 -0
  14. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/dependabot.yml +0 -0
  15. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/matchers/pylint.json +0 -0
  16. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.github/workflows/cd.yml +0 -0
  17. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.gitignore +0 -0
  18. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/.readthedocs.yaml +0 -0
  19. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/CMakeLists.txt +0 -0
  20. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/LICENSE +0 -0
  21. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/README.md +0 -0
  22. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/assets/README.md +0 -0
  23. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/assets/clinical_index.sql +0 -0
  24. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/assets/sm_index.sql +0 -0
  25. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/assets/sm_instance_index.sql +0 -0
  26. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/docs/conf.py +0 -0
  27. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/docs/index.md +0 -0
  28. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/noxfile.py +0 -0
  29. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/scripts/sql/idc_index.sql +0 -0
  30. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/scripts/sql/prior_versions_index.sql +0 -0
  31. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/src/idc_index_data/__init__.py +1 -1
  32. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/src/idc_index_data/_version.pyi +0 -0
  33. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/src/idc_index_data/py.typed +0 -0
  34. {idc_index_data-22.0.2 → idc_index_data-22.1.0}/tests/test_package.py +0 -0
@@ -50,7 +50,7 @@ jobs:
50
50
  strategy:
51
51
  fail-fast: false
52
52
  matrix:
53
- python-version: ["3.8", "3.12"]
53
+ python-version: ["3.10", "3.12"]
54
54
  runs-on: [ubuntu-latest, macos-latest, windows-latest]
55
55
 
56
56
  #currently not working on pypi-3.10
@@ -55,6 +55,6 @@ jobs:
55
55
  if: github.event_name == 'release' && github.event.action == 'published'
56
56
  uses: ncipollo/release-action@v1
57
57
  with:
58
- artifacts: "*.parquet"
58
+ artifacts: "*.parquet,*.json"
59
59
  allowUpdates: true
60
60
  omitBodyDuringUpdate: true
@@ -4,13 +4,13 @@ ci:
4
4
 
5
5
  repos:
6
6
  - repo: https://github.com/adamchainz/blacken-docs
7
- rev: "1.16.0"
7
+ rev: "1.20.0"
8
8
  hooks:
9
9
  - id: blacken-docs
10
10
  additional_dependencies: [black==24.*]
11
11
 
12
12
  - repo: https://github.com/pre-commit/pre-commit-hooks
13
- rev: "v4.5.0"
13
+ rev: "v6.0.0"
14
14
  hooks:
15
15
  - id: check-added-large-files
16
16
  - id: check-case-conflict
@@ -32,22 +32,22 @@ repos:
32
32
  - id: rst-directive-colons
33
33
  - id: rst-inline-touching-normal
34
34
 
35
- - repo: https://github.com/pre-commit/mirrors-prettier
36
- rev: "v3.1.0"
35
+ - repo: https://github.com/rbubley/mirrors-prettier
36
+ rev: "v3.6.2"
37
37
  hooks:
38
38
  - id: prettier
39
39
  types_or: [yaml, markdown, html, css, scss, javascript, json]
40
40
  args: [--prose-wrap=always]
41
41
 
42
42
  - repo: https://github.com/astral-sh/ruff-pre-commit
43
- rev: "v0.3.0"
43
+ rev: "v0.14.4"
44
44
  hooks:
45
- - id: ruff
45
+ - id: ruff-check
46
46
  args: ["--fix", "--show-fixes"]
47
47
  - id: ruff-format
48
48
 
49
49
  - repo: https://github.com/pre-commit/mirrors-mypy
50
- rev: "v1.8.0"
50
+ rev: "v1.18.2"
51
51
  hooks:
52
52
  - id: mypy
53
53
  files: src|tests
@@ -57,12 +57,12 @@ repos:
57
57
  - pandas-stubs
58
58
 
59
59
  - repo: https://github.com/codespell-project/codespell
60
- rev: "v2.2.6"
60
+ rev: "v2.4.1"
61
61
  hooks:
62
62
  - id: codespell
63
63
 
64
64
  - repo: https://github.com/shellcheck-py/shellcheck-py
65
- rev: "v0.9.0.6"
65
+ rev: "v0.11.0.1"
66
66
  hooks:
67
67
  - id: shellcheck
68
68
 
@@ -74,14 +74,13 @@ repos:
74
74
  entry: PyBind|Numpy|Cmake|CCache|Github|PyTest
75
75
  exclude: .pre-commit-config.yaml
76
76
 
77
- - repo: https://github.com/abravalheri/validate-pyproject
78
- rev: "v0.16"
77
+ - repo: https://github.com/henryiii/validate-pyproject-schema-store
78
+ rev: "2025.11.04"
79
79
  hooks:
80
80
  - id: validate-pyproject
81
- additional_dependencies: ["validate-pyproject-schema-store[all]"]
82
81
 
83
82
  - repo: https://github.com/python-jsonschema/check-jsonschema
84
- rev: "0.28.0"
83
+ rev: "0.34.1"
85
84
  hooks:
86
85
  - id: check-dependabot
87
86
  - id: check-github-workflows
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: idc-index-data
3
- Version: 22.0.2
3
+ Version: 22.1.0
4
4
  Summary: ImagingDataCommons index to query and download data.
5
5
  Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
6
6
  License: Copyright 2024 Andrey Fedorov
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
13
13
 
14
14
  [project]
15
15
  name = "idc-index-data"
16
- version = "22.0.2"
16
+ version = "22.1.0"
17
17
  authors = [
18
18
  { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
19
19
  { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
@@ -15,21 +15,23 @@ def main():
15
15
  assets_dir = scripts_dir.parent / "assets"
16
16
 
17
17
  # Collecting all .sql files from sql_dir and assets_dir
18
- sql_files = [f for f in os.listdir(assets_dir) if f.endswith(".sql")]
18
+ sql_files = [f for f in Path.iterdir(assets_dir) if str(f).endswith(".sql")]
19
19
 
20
20
  for file_name in sql_files:
21
21
  file_path = assets_dir / file_name
22
- index_df, output_basename = manager.execute_sql_query(file_path)
22
+ index_df, output_basename, schema = manager.execute_sql_query(file_path)
23
23
  index_df.to_parquet(f"{output_basename}.parquet")
24
+ manager.save_schema_to_json(schema, output_basename)
24
25
 
25
26
  core_indices_dir = scripts_dir.parent / "scripts" / "sql"
26
27
 
27
- sql_files = [f for f in os.listdir(core_indices_dir) if f.endswith(".sql")]
28
+ sql_files = [f for f in Path.iterdir(core_indices_dir) if str(f).endswith(".sql")]
28
29
 
29
30
  for file_name in sql_files:
30
31
  file_path = core_indices_dir / file_name
31
- index_df, output_basename = manager.execute_sql_query(file_path)
32
+ index_df, output_basename, schema = manager.execute_sql_query(file_path)
32
33
  index_df.to_parquet(f"{output_basename}.parquet")
34
+ manager.save_schema_to_json(schema, output_basename)
33
35
 
34
36
 
35
37
  if __name__ == "__main__":
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import json
3
4
  import logging
4
5
  import os
5
6
  from pathlib import Path
@@ -20,22 +21,54 @@ class IDCIndexDataManager:
20
21
  self.client = bigquery.Client(project=project_id)
21
22
  logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)
22
23
 
23
- def execute_sql_query(self, file_path: str) -> tuple[pd.DataFrame, str]:
24
+ def execute_sql_query(
25
+ self, file_path: str
26
+ ) -> tuple[pd.DataFrame, str, list[bigquery.SchemaField]]:
24
27
  """
25
28
  Executes the SQL query in the specified file.
26
29
 
27
30
  Returns:
28
- Tuple[pd.DataFrame, str]: A tuple containing the DataFrame with query results,
29
- the output basename.
31
+ Tuple[pd.DataFrame, str, List[bigquery.SchemaField]]: A tuple containing
32
+ the DataFrame with query results, the output basename, and the BigQuery schema.
30
33
  """
31
34
  with Path(file_path).open("r") as file:
32
35
  sql_query = file.read()
33
- index_df = self.client.query(sql_query).to_dataframe()
36
+ query_job_result = self.client.query(sql_query).result()
37
+ schema = query_job_result.schema # Get schema from BigQuery QueryJob
38
+ index_df = query_job_result.to_dataframe()
34
39
  if "StudyDate" in index_df.columns:
35
40
  index_df["StudyDate"] = index_df["StudyDate"].astype(str)
36
41
  output_basename = Path(file_path).name.split(".")[0]
37
42
  logger.debug("Executed SQL query from file: %s", file_path)
38
- return index_df, output_basename
43
+ return index_df, output_basename, schema
44
+
45
+ def save_schema_to_json(
46
+ self, schema: list[bigquery.SchemaField], output_basename: str
47
+ ) -> None:
48
+ """
49
+ Saves the BigQuery schema to a JSON file.
50
+
51
+ Args:
52
+ schema: List of BigQuery SchemaField objects from the query result
53
+ output_basename: The base name for the output file
54
+ """
55
+ # Convert BigQuery schema to JSON-serializable format
56
+ schema_dict = {
57
+ "fields": [
58
+ {
59
+ "name": field.name,
60
+ "type": field.field_type,
61
+ "mode": field.mode,
62
+ }
63
+ for field in schema
64
+ ]
65
+ }
66
+
67
+ # Save to JSON file
68
+ json_file_name = f"{output_basename}.json"
69
+ with Path(json_file_name).open("w") as f:
70
+ json.dump(schema_dict, f, indent=2)
71
+ logger.debug("Created schema JSON file: %s", json_file_name)
39
72
 
40
73
  def generate_index_data_files(
41
74
  self, generate_compressed_csv: bool = True, generate_parquet: bool = False
@@ -52,24 +85,27 @@ class IDCIndexDataManager:
52
85
  scripts_dir = Path(__file__).parent.parent
53
86
  sql_dir = scripts_dir / "sql"
54
87
 
55
- for file_name in os.listdir(sql_dir):
56
- if file_name.endswith(".sql"):
88
+ for file_name in Path.iterdir(sql_dir):
89
+ if str(file_name).endswith(".sql"):
57
90
  file_path = Path(sql_dir) / file_name
58
- index_df, output_basename = self.execute_sql_query(file_path)
91
+ index_df, output_basename, schema = self.execute_sql_query(file_path)
59
92
  logger.debug(
60
93
  "Executed and processed SQL queries from file: %s", file_path
61
94
  )
62
- if generate_compressed_csv:
63
- csv_file_name = f"{output_basename}.csv.zip"
64
- index_df.to_csv(
65
- csv_file_name, compression={"method": "zip"}, escapechar="\\"
66
- )
67
- logger.debug("Created CSV zip file: %s", csv_file_name)
68
-
69
- if generate_parquet:
70
- parquet_file_name = f"{output_basename}.parquet"
71
- index_df.to_parquet(parquet_file_name, compression="zstd")
72
- logger.debug("Created Parquet file: %s", parquet_file_name)
95
+ if generate_compressed_csv:
96
+ csv_file_name = f"{output_basename}.csv.zip"
97
+ index_df.to_csv(
98
+ csv_file_name, compression={"method": "zip"}, escapechar="\\"
99
+ )
100
+ logger.debug("Created CSV zip file: %s", csv_file_name)
101
+
102
+ if generate_parquet:
103
+ parquet_file_name = f"{output_basename}.parquet"
104
+ index_df.to_parquet(parquet_file_name, compression="zstd")
105
+ logger.debug("Created Parquet file: %s", parquet_file_name)
106
+
107
+ # Save schema to JSON file
108
+ self.save_schema_to_json(schema, output_basename)
73
109
 
74
110
  def retrieve_latest_idc_release_version(self) -> int:
75
111
  """
@@ -25,7 +25,8 @@ def _log(txt, verbose=True):
25
25
 
26
26
 
27
27
  def _update_file(filepath, regex, replacement):
28
- msg = "Updating %s" % os.path.relpath(str(filepath), ROOT_DIR)
28
+ rel_path = os.path.relpath(str(filepath), ROOT_DIR)
29
+ msg = f"Updating {rel_path}"
29
30
  with _log(msg):
30
31
  pattern = re.compile(regex)
31
32
  with filepath.open() as doc_file:
@@ -0,0 +1,16 @@
1
+ SELECT
2
+ ID AS analysis_result_id,
3
+ Title AS analysis_result_title,
4
+ source_doi,
5
+ source_url,
6
+ Subjects,
7
+ Collections,
8
+ AnalysisArtifacts,
9
+ Updated,
10
+ license_url,
11
+ license_long_name,
12
+ license_short_name,
13
+ Description,
14
+ Citation
15
+ FROM
16
+ `bigquery-public-data.idc_v22.analysis_results_metadata`
@@ -0,0 +1,15 @@
1
+ SELECT
2
+ collection_name,
3
+ collection_id,
4
+ CancerTypes,
5
+ TumorLocations,
6
+ Subjects,
7
+ Species,
8
+ Sources,
9
+ SupportingData,
10
+ Program,
11
+ Status,
12
+ Updated,
13
+ Description
14
+ FROM
15
+ `bigquery-public-data.idc_v22.original_collections_metadata`
File without changes
@@ -12,10 +12,10 @@ from pathlib import Path
12
12
  from ._version import version as __version__
13
13
 
14
14
  __all__ = [
15
- "__version__",
16
15
  "IDC_INDEX_CSV_ARCHIVE_FILEPATH",
17
16
  "IDC_INDEX_PARQUET_FILEPATH",
18
17
  "PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH",
18
+ "__version__",
19
19
  ]
20
20
 
21
21