idc-index-data 22.0.3__tar.gz → 22.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. idc_index_data-22.1.1/.github/copilot-instructions.md +170 -0
  2. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/.github/workflows/external-indices.yml +1 -1
  3. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/.gitignore +3 -0
  4. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/PKG-INFO +2 -4
  5. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/pyproject.toml +2 -4
  6. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/scripts/python/generate-indices.py +12 -4
  7. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/scripts/python/idc_index_data_manager.py +82 -18
  8. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/.git_archival.txt +0 -0
  9. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/.gitattributes +0 -0
  10. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/.github/CONTRIBUTING.md +0 -0
  11. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/.github/dependabot.yml +0 -0
  12. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/.github/matchers/pylint.json +0 -0
  13. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/.github/workflows/cd.yml +0 -0
  14. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/.github/workflows/ci.yml +0 -0
  15. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/.pre-commit-config.yaml +0 -0
  16. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/.readthedocs.yaml +0 -0
  17. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/CMakeLists.txt +0 -0
  18. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/LICENSE +0 -0
  19. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/README.md +0 -0
  20. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/assets/README.md +0 -0
  21. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/assets/clinical_index.sql +0 -0
  22. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/assets/sm_index.sql +0 -0
  23. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/assets/sm_instance_index.sql +0 -0
  24. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/docs/conf.py +0 -0
  25. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/docs/index.md +0 -0
  26. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/noxfile.py +0 -0
  27. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/scripts/python/update_idc_index_version.py +0 -0
  28. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/scripts/sql/analysis_results_index.sql +0 -0
  29. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/scripts/sql/collections_index.sql +0 -0
  30. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/scripts/sql/idc_index.sql +0 -0
  31. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/scripts/sql/prior_versions_index.sql +0 -0
  32. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/src/idc_index_data/__init__.py +0 -0
  33. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/src/idc_index_data/_version.pyi +0 -0
  34. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/src/idc_index_data/py.typed +0 -0
  35. {idc_index_data-22.0.3 → idc_index_data-22.1.1}/tests/test_package.py +0 -0
@@ -0,0 +1,170 @@
+ # GitHub Copilot Instructions for idc-index-data
+
+ ## Project Overview
+
+ `idc-index-data` is a Python package that bundles the index data for the NCI
+ Imaging Data Commons (IDC). The package provides Parquet files containing
+ metadata about imaging data hosted by IDC, intended to be used by the
+ `idc-index` Python package.
+
+ ## Technology Stack
+
+ - **Build System**: scikit-build-core with CMake
+ - **Package Manager**: pip
+ - **Python Versions**: 3.10, 3.11, 3.12
+ - **Testing**: pytest with pytest-cov
+ - **Task Runner**: nox
+ - **Linting**: ruff, pylint, mypy, pre-commit hooks
+ - **Documentation**: Sphinx with MyST parser and Furo theme
+ - **Data Processing**: pandas, pyarrow, Google Cloud BigQuery
+
+ ## Development Workflow
+
+ ### Setting Up Development Environment
+
+ ```bash
+ python3 -m venv .venv
+ source ./.venv/bin/activate
+ pip install -v -e .[dev]
+ pre-commit install
+ ```
+
+ ### Common Commands
+
+ - **Run all checks**: `nox` (runs lint, pylint, and tests by default)
+ - **Lint code**: `nox -s lint`
+ - **Run pylint**: `nox -s pylint`
+ - **Run tests**: `nox -s tests`
+ - **Build docs**: `nox -s docs`
+ - **Serve docs**: `nox -s docs -- --serve`
+ - **Build package**: `nox -s build`
+ - **Update IDC index version**: `nox -s bump -- <version>` (or leave off version
+   for latest)
+ - **Tag release**: `nox -s tag_release` (shows instructions)
+
+ ### Pre-commit Checks
+
+ Always run pre-commit before committing:
+
+ ```bash
+ pre-commit run --all-files
+ ```
+
+ ## Code Style and Conventions
+
+ ### Python Code Style
+
+ - **Import Statement**: All files must include
+   `from __future__ import annotations` at the top
+ - **Type Hints**: Use type hints throughout; strict type checking is enabled for
+   `idc_index_data.*` modules
+ - **Linting**: Follow ruff and pylint rules configured in `pyproject.toml`
+ - **Formatting**: Code is formatted with ruff formatter
+ - **Line Length**: Not strictly enforced but keep reasonable
+ - **Docstrings**: Use when appropriate, especially for public APIs
+
+ ### Key Ruff Rules
+
+ The project uses extensive ruff rules including:
+
+ - `B` - flake8-bugbear
+ - `I` - isort (import sorting)
+ - `ARG` - flake8-unused-arguments
+ - `UP` - pyupgrade
+ - `PTH` - flake8-use-pathlib (prefer pathlib over os.path)
+ - `NPY` - NumPy specific rules
+ - `PD` - pandas-vet
+
+ ### Type Checking
+
+ - Python 3.8 minimum target
+ - Strict mypy checking for package code
+ - Use `typing.TYPE_CHECKING` for import cycles
+
+ ## Project Structure
+
+ ```
+ idc-index-data/
+ ├── src/idc_index_data/     # Main package source
+ │   ├── __init__.py         # Package exports and file path lookups
+ │   └── _version.py         # Auto-generated version file
+ ├── scripts/                # Management scripts
+ │   ├── python/             # Python scripts for index management
+ │   └── sql/                # SQL queries for BigQuery
+ ├── tests/                  # Test files
+ │   └── test_package.py     # Package tests
+ ├── docs/                   # Sphinx documentation
+ ├── pyproject.toml          # Project configuration
+ ├── noxfile.py              # Nox session definitions
+ └── CMakeLists.txt          # Build configuration
+ ```
+
+ ## Important Considerations
+
+ ### Package Purpose
+
+ This package is a **data package** - it bundles index files (CSV and Parquet)
+ and provides file paths to locate them. It does not contain complex business
+ logic but rather serves as a data distribution mechanism.
+
+ ### Version Management
+
+ - Version is defined in `pyproject.toml`
+ - Use `nox -s bump` to update to new IDC index versions
+ - The version should match the IDC release version
+ - Always update both index files and test expectations when bumping version
+
+ ### Data Files
+
+ The package includes:
+
+ - `idc_index.csv.zip` - Compressed CSV index (optional)
+ - `idc_index.parquet` - Parquet format index
+ - `prior_versions_index.parquet` - Historical version index
+
+ ### Google Cloud Integration
+
+ - Some operations require Google Cloud credentials
+ - BigQuery is used to fetch latest index data
+ - Scripts need `GCP_PROJECT` and `GOOGLE_APPLICATION_CREDENTIALS` environment
+   variables
+
+ ### Testing
+
+ - Tests verify package installation and file accessibility
+ - Coverage reporting is configured but codecov upload is currently disabled
+ - Tests should work across platforms (Linux, macOS, Windows)
+
+ ## Release Process
+
+ 1. Update index version: `nox -s bump -- --commit <version>`
+ 2. Create PR: `gh pr create --fill`
+ 3. After merge, tag release: follow instructions from `nox -s tag_release`
+ 4. Push tag: `git push origin <version>`
+ 5. GitHub Actions will automatically build and publish to PyPI
+
+ ## CI/CD
+
+ - **Format check**: pre-commit hooks + pylint
+ - **Tests**: Run on Python 3.10 and 3.12 across Linux, macOS, and Windows
+ - **Publishing**: Automated through GitHub Actions on tagged releases
+
+ ## Additional Resources
+
+ - [Contributing Guide](.github/CONTRIBUTING.md)
+ - [Scientific Python Developer Guide](https://learn.scientific-python.org/development/)
+ - [IDC Homepage](https://imaging.datacommons.cancer.gov)
+ - [IDC Discourse Forum](https://discourse.canceridc.dev/)
+
+ ## When Making Changes
+
+ 1. **Always** run tests before and after changes: `nox -s tests`
+ 2. **Always** run linters: `nox -s lint`
+ 3. **Never** commit without running pre-commit checks
+ 4. **Prefer** pathlib over os.path for file operations
+ 5. **Use** type hints for all new code
+ 6. **Update** tests if changing package structure or exports
+ 7. **Follow** existing patterns in the codebase
+ 8. **Keep** changes minimal and focused
+ 9. **Document** any new public APIs
+ 10. **Test** across Python versions when changing core functionality
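The instructions file above describes `idc-index-data` as a thin data package whose `__init__.py` exposes file path lookups. As a rough sketch of what downstream consumption looks like under that description (the exported attribute name below is an assumption for illustration, not something confirmed by this diff):

```python
from __future__ import annotations  # required at module top per the style rules above

import pandas as pd

import idc_index_data

# Hypothetical attribute: __init__.py is documented above as exposing
# "file path lookups"; the exact exported name is an assumption here.
index_df = pd.read_parquet(idc_index_data.IDC_INDEX_PARQUET_FILEPATH)
print(len(index_df), "rows in the bundled IDC index")
```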
@@ -55,6 +55,6 @@ jobs:
          if: github.event_name == 'release' && github.event.action == 'published'
          uses: ncipollo/release-action@v1
          with:
-           artifacts: "*.parquet"
+           artifacts: "release_artifacts/*.parquet,release_artifacts/*.json"
            allowUpdates: true
            omitBodyDuringUpdate: true
@@ -159,3 +159,6 @@ Thumbs.db
 
  # gcp service account keys
  gha-creds-**.json
+
+ # Release artifacts directory
+ release_artifacts/
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: idc-index-data
- Version: 22.0.3
+ Version: 22.1.1
  Summary: ImagingDataCommons index to query and download data.
  Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
  License: Copyright 2024 Andrey Fedorov
@@ -31,8 +31,6 @@ Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
@@ -42,7 +40,7 @@ Project-URL: Homepage, https://github.com/ImagingDataCommons/idc-index-data
  Project-URL: Bug Tracker, https://github.com/ImagingDataCommons/idc-index-data/issues
  Project-URL: Discussions, https://discourse.canceridc.dev/
  Project-URL: Changelog, https://github.com/ImagingDataCommons/idc-index-data/releases
- Requires-Python: >=3.8
+ Requires-Python: >=3.10
  Provides-Extra: test
  Requires-Dist: pandas; extra == "test"
  Requires-Dist: pyarrow; extra == "test"
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
 
  [project]
  name = "idc-index-data"
- version = "22.0.3"
+ version = "22.1.1"
  authors = [
    { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
    { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
@@ -22,7 +22,7 @@ authors = [
  description = "ImagingDataCommons index to query and download data."
  readme = "README.md"
  license.file = "LICENSE"
- requires-python = ">=3.8"
+ requires-python = ">=3.10"
  classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Science/Research",
@@ -32,8 +32,6 @@ classifiers = [
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
-   "Programming Language :: Python :: 3.8",
-   "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
@@ -12,6 +12,10 @@ def main():
      manager = IDCIndexDataManager(project_id=project_id)
      scripts_dir = Path(__file__).resolve().parent.parent
 
+     # Create dedicated output directory for release artifacts
+     output_dir = scripts_dir.parent / "release_artifacts"
+     output_dir.mkdir(parents=True, exist_ok=True)
+
      assets_dir = scripts_dir.parent / "assets"
 
      # Collecting all .sql files from sql_dir and assets_dir
@@ -19,8 +23,10 @@ def main():
 
      for file_name in sql_files:
          file_path = assets_dir / file_name
-         index_df, output_basename = manager.execute_sql_query(file_path)
-         index_df.to_parquet(f"{output_basename}.parquet")
+         index_df, output_basename, schema = manager.execute_sql_query(file_path)
+         parquet_file_path = output_dir / f"{output_basename}.parquet"
+         index_df.to_parquet(parquet_file_path)
+         manager.save_schema_to_json(schema, output_basename, output_dir)
 
      core_indices_dir = scripts_dir.parent / "scripts" / "sql"
 
@@ -28,8 +34,10 @@ def main():
 
      for file_name in sql_files:
          file_path = core_indices_dir / file_name
-         index_df, output_basename = manager.execute_sql_query(file_path)
-         index_df.to_parquet(f"{output_basename}.parquet")
+         index_df, output_basename, schema = manager.execute_sql_query(file_path)
+         parquet_file_path = output_dir / f"{output_basename}.parquet"
+         index_df.to_parquet(parquet_file_path)
+         manager.save_schema_to_json(schema, output_basename, output_dir)
 
 
  if __name__ == "__main__":
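With both loops updated, every SQL file processed by `generate-indices.py` now yields a `<basename>.parquet` and a `<basename>.json` pair under `release_artifacts/`, which is what the widened `release_artifacts/*.parquet,release_artifacts/*.json` glob in `external-indices.yml` picks up. A minimal sketch of pairing the two artifacts back up after a run (illustrative only; just the directory name and the `"fields"` key come from this diff):

```python
from __future__ import annotations

import json
from pathlib import Path

import pandas as pd

output_dir = Path("release_artifacts")
for parquet_path in sorted(output_dir.glob("*.parquet")):
    # Each index is written alongside a JSON schema with the same basename.
    schema_path = parquet_path.with_suffix(".json")
    index_df = pd.read_parquet(parquet_path)
    schema = json.loads(schema_path.read_text())
    print(parquet_path.stem, len(index_df), "rows,", len(schema["fields"]), "fields")
```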
@@ -1,5 +1,6 @@
  from __future__ import annotations
 
+ import json
  import logging
  import os
  from pathlib import Path
@@ -20,25 +21,69 @@ class IDCIndexDataManager:
          self.client = bigquery.Client(project=project_id)
          logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)
 
-     def execute_sql_query(self, file_path: str) -> tuple[pd.DataFrame, str]:
+     def execute_sql_query(
+         self, file_path: str
+     ) -> tuple[pd.DataFrame, str, list[bigquery.SchemaField]]:
          """
          Executes the SQL query in the specified file.
 
          Returns:
-             Tuple[pd.DataFrame, str]: A tuple containing the DataFrame with query results,
-             the output basename.
+             Tuple[pd.DataFrame, str, List[bigquery.SchemaField]]: A tuple containing
+             the DataFrame with query results, the output basename, and the BigQuery schema.
          """
          with Path(file_path).open("r") as file:
              sql_query = file.read()
-         index_df = self.client.query(sql_query).to_dataframe()
+         query_job_result = self.client.query(sql_query).result()
+         schema = query_job_result.schema  # Get schema from BigQuery QueryJob
+         index_df = query_job_result.to_dataframe()
          if "StudyDate" in index_df.columns:
              index_df["StudyDate"] = index_df["StudyDate"].astype(str)
          output_basename = Path(file_path).name.split(".")[0]
          logger.debug("Executed SQL query from file: %s", file_path)
-         return index_df, output_basename
+         return index_df, output_basename, schema
+
+     def save_schema_to_json(
+         self,
+         schema: list[bigquery.SchemaField],
+         output_basename: str,
+         output_dir: Path | None = None,
+     ) -> None:
+         """
+         Saves the BigQuery schema to a JSON file.
+
+         Args:
+             schema: List of BigQuery SchemaField objects from the query result
+             output_basename: The base name for the output file
+             output_dir: Optional directory path for the output file
+         """
+         # Convert BigQuery schema to JSON-serializable format
+         schema_dict = {
+             "fields": [
+                 {
+                     "name": field.name,
+                     "type": field.field_type,
+                     "mode": field.mode,
+                 }
+                 for field in schema
+             ]
+         }
+
+         # Save to JSON file
+         if output_dir:
+             output_dir.mkdir(parents=True, exist_ok=True)
+             json_file_path = output_dir / f"{output_basename}.json"
+         else:
+             json_file_path = Path(f"{output_basename}.json")
+
+         with json_file_path.open("w") as f:
+             json.dump(schema_dict, f, indent=2)
+         logger.debug("Created schema JSON file: %s", json_file_path)
 
      def generate_index_data_files(
-         self, generate_compressed_csv: bool = True, generate_parquet: bool = False
+         self,
+         generate_compressed_csv: bool = True,
+         generate_parquet: bool = False,
+         output_dir: Path | None = None,
      ) -> None:
          """
          Generates index-data files locally by executing queries against
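For reference, the document written by `save_schema_to_json` above is a single `fields` array with one `{name, type, mode}` entry per column. Expressed as a Python literal (the two example columns are hypothetical placeholders; the real contents depend on each query's BigQuery result schema):

```python
# Shape of <output_basename>.json; the field entries here are invented examples.
example_schema = {
    "fields": [
        {"name": "SeriesInstanceUID", "type": "STRING", "mode": "NULLABLE"},
        {"name": "StudyDate", "type": "DATE", "mode": "NULLABLE"},
    ]
}
```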
@@ -47,29 +92,48 @@ class IDCIndexDataManager:
          This method iterates over SQL files in the 'scripts/sql' directory,
          executing each query using :func:`execute_sql_query` and generating a DataFrame,
          'index_df'. The DataFrame is then saved as compressed CSV and/or Parquet file.
+
+         Args:
+             generate_compressed_csv: Whether to generate compressed CSV files
+             generate_parquet: Whether to generate Parquet files
+             output_dir: Optional directory path for the output files
          """
 
          scripts_dir = Path(__file__).parent.parent
          sql_dir = scripts_dir / "sql"
 
+         if output_dir:
+             output_dir.mkdir(parents=True, exist_ok=True)
+
          for file_name in Path.iterdir(sql_dir):
              if str(file_name).endswith(".sql"):
                  file_path = Path(sql_dir) / file_name
-                 index_df, output_basename = self.execute_sql_query(file_path)
+                 index_df, output_basename, schema = self.execute_sql_query(file_path)
                  logger.debug(
                      "Executed and processed SQL queries from file: %s", file_path
                  )
-                 if generate_compressed_csv:
-                     csv_file_name = f"{output_basename}.csv.zip"
-                     index_df.to_csv(
-                         csv_file_name, compression={"method": "zip"}, escapechar="\\"
-                     )
-                     logger.debug("Created CSV zip file: %s", csv_file_name)
-
-                 if generate_parquet:
-                     parquet_file_name = f"{output_basename}.parquet"
-                     index_df.to_parquet(parquet_file_name, compression="zstd")
-                     logger.debug("Created Parquet file: %s", parquet_file_name)
+                 if generate_compressed_csv:
+                     csv_file_path = (
+                         output_dir / f"{output_basename}.csv.zip"
+                         if output_dir
+                         else Path(f"{output_basename}.csv.zip")
+                     )
+                     index_df.to_csv(
+                         csv_file_path, compression={"method": "zip"}, escapechar="\\"
+                     )
+                     logger.debug("Created CSV zip file: %s", csv_file_path)
+
+                 if generate_parquet:
+                     parquet_file_path = (
+                         output_dir / f"{output_basename}.parquet"
+                         if output_dir
+                         else Path(f"{output_basename}.parquet")
+                     )
+                     index_df.to_parquet(parquet_file_path, compression="zstd")
+                     logger.debug("Created Parquet file: %s", parquet_file_path)
+
+                 # Save schema to JSON file
+                 self.save_schema_to_json(schema, output_basename, output_dir)
 
      def retrieve_latest_idc_release_version(self) -> int:
          """