idc-index-data 22.0.3__tar.gz → 22.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/.github/workflows/external-indices.yml +1 -1
  2. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/PKG-INFO +1 -1
  3. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/pyproject.toml +1 -1
  4. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/scripts/python/generate-indices.py +4 -2
  5. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/scripts/python/idc_index_data_manager.py +53 -17
  6. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/.git_archival.txt +0 -0
  7. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/.gitattributes +0 -0
  8. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/.github/CONTRIBUTING.md +0 -0
  9. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/.github/dependabot.yml +0 -0
  10. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/.github/matchers/pylint.json +0 -0
  11. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/.github/workflows/cd.yml +0 -0
  12. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/.github/workflows/ci.yml +0 -0
  13. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/.gitignore +0 -0
  14. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/.pre-commit-config.yaml +0 -0
  15. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/.readthedocs.yaml +0 -0
  16. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/CMakeLists.txt +0 -0
  17. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/LICENSE +0 -0
  18. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/README.md +0 -0
  19. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/assets/README.md +0 -0
  20. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/assets/clinical_index.sql +0 -0
  21. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/assets/sm_index.sql +0 -0
  22. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/assets/sm_instance_index.sql +0 -0
  23. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/docs/conf.py +0 -0
  24. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/docs/index.md +0 -0
  25. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/noxfile.py +0 -0
  26. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/scripts/python/update_idc_index_version.py +0 -0
  27. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/scripts/sql/analysis_results_index.sql +0 -0
  28. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/scripts/sql/collections_index.sql +0 -0
  29. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/scripts/sql/idc_index.sql +0 -0
  30. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/scripts/sql/prior_versions_index.sql +0 -0
  31. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/src/idc_index_data/__init__.py +0 -0
  32. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/src/idc_index_data/_version.pyi +0 -0
  33. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/src/idc_index_data/py.typed +0 -0
  34. {idc_index_data-22.0.3 → idc_index_data-22.1.0}/tests/test_package.py +0 -0
@@ -55,6 +55,6 @@ jobs:
55
55
  if: github.event_name == 'release' && github.event.action == 'published'
56
56
  uses: ncipollo/release-action@v1
57
57
  with:
58
- artifacts: "*.parquet"
58
+ artifacts: "*.parquet,*.json"
59
59
  allowUpdates: true
60
60
  omitBodyDuringUpdate: true
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: idc-index-data
3
- Version: 22.0.3
3
+ Version: 22.1.0
4
4
  Summary: ImagingDataCommons index to query and download data.
5
5
  Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
6
6
  License: Copyright 2024 Andrey Fedorov
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
13
13
 
14
14
  [project]
15
15
  name = "idc-index-data"
16
- version = "22.0.3"
16
+ version = "22.1.0"
17
17
  authors = [
18
18
  { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
19
19
  { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
@@ -19,8 +19,9 @@ def main():
19
19
 
20
20
  for file_name in sql_files:
21
21
  file_path = assets_dir / file_name
22
- index_df, output_basename = manager.execute_sql_query(file_path)
22
+ index_df, output_basename, schema = manager.execute_sql_query(file_path)
23
23
  index_df.to_parquet(f"{output_basename}.parquet")
24
+ manager.save_schema_to_json(schema, output_basename)
24
25
 
25
26
  core_indices_dir = scripts_dir.parent / "scripts" / "sql"
26
27
 
@@ -28,8 +29,9 @@ def main():
28
29
 
29
30
  for file_name in sql_files:
30
31
  file_path = core_indices_dir / file_name
31
- index_df, output_basename = manager.execute_sql_query(file_path)
32
+ index_df, output_basename, schema = manager.execute_sql_query(file_path)
32
33
  index_df.to_parquet(f"{output_basename}.parquet")
34
+ manager.save_schema_to_json(schema, output_basename)
33
35
 
34
36
 
35
37
  if __name__ == "__main__":
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import json
3
4
  import logging
4
5
  import os
5
6
  from pathlib import Path
@@ -20,22 +21,54 @@ class IDCIndexDataManager:
20
21
  self.client = bigquery.Client(project=project_id)
21
22
  logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)
22
23
 
23
- def execute_sql_query(self, file_path: str) -> tuple[pd.DataFrame, str]:
24
+ def execute_sql_query(
25
+ self, file_path: str
26
+ ) -> tuple[pd.DataFrame, str, list[bigquery.SchemaField]]:
24
27
  """
25
28
  Executes the SQL query in the specified file.
26
29
 
27
30
  Returns:
28
- Tuple[pd.DataFrame, str]: A tuple containing the DataFrame with query results,
29
- the output basename.
31
+ Tuple[pd.DataFrame, str, List[bigquery.SchemaField]]: A tuple containing
32
+ the DataFrame with query results, the output basename, and the BigQuery schema.
30
33
  """
31
34
  with Path(file_path).open("r") as file:
32
35
  sql_query = file.read()
33
- index_df = self.client.query(sql_query).to_dataframe()
36
+ query_job_result = self.client.query(sql_query).result()
37
+ schema = query_job_result.schema # Get schema from BigQuery QueryJob
38
+ index_df = query_job_result.to_dataframe()
34
39
  if "StudyDate" in index_df.columns:
35
40
  index_df["StudyDate"] = index_df["StudyDate"].astype(str)
36
41
  output_basename = Path(file_path).name.split(".")[0]
37
42
  logger.debug("Executed SQL query from file: %s", file_path)
38
- return index_df, output_basename
43
+ return index_df, output_basename, schema
44
+
45
+ def save_schema_to_json(
46
+ self, schema: list[bigquery.SchemaField], output_basename: str
47
+ ) -> None:
48
+ """
49
+ Saves the BigQuery schema to a JSON file.
50
+
51
+ Args:
52
+ schema: List of BigQuery SchemaField objects from the query result
53
+ output_basename: The base name for the output file
54
+ """
55
+ # Convert BigQuery schema to JSON-serializable format
56
+ schema_dict = {
57
+ "fields": [
58
+ {
59
+ "name": field.name,
60
+ "type": field.field_type,
61
+ "mode": field.mode,
62
+ }
63
+ for field in schema
64
+ ]
65
+ }
66
+
67
+ # Save to JSON file
68
+ json_file_name = f"{output_basename}.json"
69
+ with Path(json_file_name).open("w") as f:
70
+ json.dump(schema_dict, f, indent=2)
71
+ logger.debug("Created schema JSON file: %s", json_file_name)
39
72
 
40
73
  def generate_index_data_files(
41
74
  self, generate_compressed_csv: bool = True, generate_parquet: bool = False
@@ -55,21 +88,24 @@ class IDCIndexDataManager:
55
88
  for file_name in Path.iterdir(sql_dir):
56
89
  if str(file_name).endswith(".sql"):
57
90
  file_path = Path(sql_dir) / file_name
58
- index_df, output_basename = self.execute_sql_query(file_path)
91
+ index_df, output_basename, schema = self.execute_sql_query(file_path)
59
92
  logger.debug(
60
93
  "Executed and processed SQL queries from file: %s", file_path
61
94
  )
62
- if generate_compressed_csv:
63
- csv_file_name = f"{output_basename}.csv.zip"
64
- index_df.to_csv(
65
- csv_file_name, compression={"method": "zip"}, escapechar="\\"
66
- )
67
- logger.debug("Created CSV zip file: %s", csv_file_name)
68
-
69
- if generate_parquet:
70
- parquet_file_name = f"{output_basename}.parquet"
71
- index_df.to_parquet(parquet_file_name, compression="zstd")
72
- logger.debug("Created Parquet file: %s", parquet_file_name)
95
+ if generate_compressed_csv:
96
+ csv_file_name = f"{output_basename}.csv.zip"
97
+ index_df.to_csv(
98
+ csv_file_name, compression={"method": "zip"}, escapechar="\\"
99
+ )
100
+ logger.debug("Created CSV zip file: %s", csv_file_name)
101
+
102
+ if generate_parquet:
103
+ parquet_file_name = f"{output_basename}.parquet"
104
+ index_df.to_parquet(parquet_file_name, compression="zstd")
105
+ logger.debug("Created Parquet file: %s", parquet_file_name)
106
+
107
+ # Save schema to JSON file
108
+ self.save_schema_to_json(schema, output_basename)
73
109
 
74
110
  def retrieve_latest_idc_release_version(self) -> int:
75
111
  """
File without changes