idc-index-data 22.1.2.tar.gz → 22.1.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.pre-commit-config.yaml +1 -0
  2. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/PKG-INFO +2 -1
  3. idc_index_data-22.1.3/assets/clinical_index.sql +23 -0
  4. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/pyproject.toml +6 -4
  5. idc_index_data-22.1.3/pytest.ini +2 -0
  6. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/python/generate-indices.py +6 -2
  7. idc_index_data-22.1.3/scripts/python/idc_index_data_manager.py +424 -0
  8. idc_index_data-22.1.3/tests/test_column_description_parser.py +218 -0
  9. idc_index_data-22.1.3/tests/test_real_sql_parsing.py +101 -0
  10. idc_index_data-22.1.2/assets/clinical_index.sql +0 -11
  11. idc_index_data-22.1.2/scripts/python/idc_index_data_manager.py +0 -231
  12. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.git_archival.txt +0 -0
  13. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.gitattributes +0 -0
  14. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/CONTRIBUTING.md +0 -0
  15. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/copilot-instructions.md +0 -0
  16. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/dependabot.yml +0 -0
  17. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/matchers/pylint.json +0 -0
  18. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/workflows/cd.yml +0 -0
  19. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/workflows/ci.yml +0 -0
  20. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/workflows/external-indices.yml +0 -0
  21. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.gitignore +0 -0
  22. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.readthedocs.yaml +0 -0
  23. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/CMakeLists.txt +0 -0
  24. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/LICENSE +0 -0
  25. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/README.md +0 -0
  26. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/assets/README.md +0 -0
  27. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/assets/sm_index.sql +0 -0
  28. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/assets/sm_instance_index.sql +0 -0
  29. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/docs/conf.py +0 -0
  30. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/docs/index.md +0 -0
  31. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/noxfile.py +0 -0
  32. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/python/update_idc_index_version.py +0 -0
  33. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/sql/analysis_results_index.sql +0 -0
  34. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/sql/collections_index.sql +0 -0
  35. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/sql/idc_index.sql +0 -0
  36. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/sql/prior_versions_index.sql +0 -0
  37. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/src/idc_index_data/__init__.py +0 -0
  38. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/src/idc_index_data/_version.pyi +0 -0
  39. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/src/idc_index_data/py.typed +0 -0
  40. {idc_index_data-22.1.2 → idc_index_data-22.1.3}/tests/test_package.py +0 -0
@@ -55,6 +55,7 @@ repos:
55
55
  additional_dependencies:
56
56
  - pytest
57
57
  - pandas-stubs
58
+ - google-cloud-bigquery
58
59
 
59
60
  - repo: https://github.com/codespell-project/codespell
60
61
  rev: "v2.4.1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: idc-index-data
3
- Version: 22.1.2
3
+ Version: 22.1.3
4
4
  Summary: ImagingDataCommons index to query and download data.
5
5
  Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
6
6
  License: Copyright 2024 Andrey Fedorov
@@ -41,6 +41,7 @@ Project-URL: Bug Tracker, https://github.com/ImagingDataCommons/idc-index-data/i
41
41
  Project-URL: Discussions, https://discourse.canceridc.dev/
42
42
  Project-URL: Changelog, https://github.com/ImagingDataCommons/idc-index-data/releases
43
43
  Requires-Python: >=3.10
44
+ Requires-Dist: google-cloud-bigquery
44
45
  Provides-Extra: test
45
46
  Requires-Dist: pandas; extra == "test"
46
47
  Requires-Dist: pyarrow; extra == "test"
@@ -0,0 +1,23 @@
1
+ SELECT
2
+ # description:
3
+ # unique identifier of the collection
4
+ collection_id,
5
+ # description:
6
+ # full name of the table in which the column is stored
7
+ table_name,
8
+ # description:
9
+ # short name of the table in which the column is stored
10
+ SPLIT(table_name,'.')[SAFE_OFFSET(2)] AS short_table_name,
11
+ # description:
12
+ # name of the column in which the value is stored
13
+ `column`,
14
+ # description:
15
+ # human readable name of the column
16
+ column_label,
17
+ # description:
18
+ # values encountered in the column
19
+ `values`
20
+ FROM
21
+ `bigquery-public-data.idc_v22_clinical.column_metadata`
22
+ ORDER BY
23
+ collection_id, table_name
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
13
13
 
14
14
  [project]
15
15
  name = "idc-index-data"
16
- version = "22.1.2"
16
+ version = "22.1.3"
17
17
  authors = [
18
18
  { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
19
19
  { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
@@ -38,7 +38,9 @@ classifiers = [
38
38
  "Topic :: Scientific/Engineering",
39
39
  "Typing :: Typed",
40
40
  ]
41
- dependencies = []
41
+ dependencies = [
42
+ "google-cloud-bigquery"
43
+ ]
42
44
 
43
45
  [project.optional-dependencies]
44
46
  test = [
@@ -102,7 +104,7 @@ report.exclude_also = [
102
104
 
103
105
  [tool.mypy]
104
106
  files = ["src", "tests"]
105
- python_version = "3.8"
107
+ python_version = "3.10"
106
108
  warn_unused_configs = true
107
109
  strict = true
108
110
  enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"]
@@ -158,7 +160,7 @@ isort.required-imports = ["from __future__ import annotations"]
158
160
 
159
161
 
160
162
  [tool.pylint]
161
- py-version = "3.8"
163
+ py-version = "3.10"
162
164
  ignore-paths = [".*/_version.py"]
163
165
  reports.output-format = "colorized"
164
166
  similarities.ignore-imports = "yes"
@@ -0,0 +1,2 @@
1
+ [pytest]
2
+ filterwarnings = ignore::FutureWarning:google.api_core
@@ -28,7 +28,7 @@ def main():
28
28
  )
29
29
  parquet_file_path = output_dir / f"{output_basename}.parquet"
30
30
  index_df.to_parquet(parquet_file_path)
31
- manager.save_schema_to_json(schema, output_basename, output_dir)
31
+ manager.save_schema_to_json(schema, output_basename, sql_query, output_dir)
32
32
  manager.save_sql_query(sql_query, output_basename, output_dir)
33
33
 
34
34
  core_indices_dir = scripts_dir.parent / "scripts" / "sql"
@@ -42,7 +42,11 @@ def main():
42
42
  )
43
43
  parquet_file_path = output_dir / f"{output_basename}.parquet"
44
44
  index_df.to_parquet(parquet_file_path)
45
- manager.save_schema_to_json(schema, output_basename, output_dir)
45
+ if output_basename == "prior_versions_index":
46
+ # For prior_versions_index, save schema without descriptions
47
+ manager.save_schema_to_json(schema, output_basename, None, output_dir)
48
+ else:
49
+ manager.save_schema_to_json(schema, output_basename, sql_query, output_dir)
46
50
  manager.save_sql_query(sql_query, output_basename, output_dir)
47
51
 
48
52
 
@@ -0,0 +1,424 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ import re
7
+ from pathlib import Path
8
+
9
+ import pandas as pd
10
+ from google.cloud import bigquery
11
+
12
+ logging.basicConfig(level=logging.DEBUG)
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class IDCIndexDataManager:
17
+ def __init__(self, project_id: str):
18
+ """
19
+ Initializes the IDCIndexDataManager using the Google Cloud Platform project ID.
20
+ """
21
+ self.project_id = project_id
22
+ self.client = bigquery.Client(project=project_id)
23
+ logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)
24
+
25
+ @staticmethod
26
+ def parse_column_descriptions(sql_query: str) -> dict[str, str]:
27
+ """
28
+ Parses column descriptions from SQL query comments.
29
+
30
+ The method looks for comments following the pattern:
31
+ # description:
32
+ # description text continues here
33
+ # and can span multiple lines
34
+ column_name or expression AS column_name,
35
+
36
+ Args:
37
+ sql_query: The SQL query string containing comments
38
+
39
+ Returns:
40
+ Dictionary mapping column names to their descriptions
41
+ """
42
+ descriptions: dict[str, str] = {}
43
+ logger.debug("Parsing column descriptions from SQL query comments")
44
+ logger.debug(sql_query)
45
+ lines = sql_query.split("\n")
46
+
47
+ i = 0
48
+ while i < len(lines):
49
+ line = lines[i]
50
+ stripped = line.strip()
51
+
52
+ # Check if this line starts a description comment
53
+ if stripped == "# description:":
54
+ # Collect description lines until we hit a non-comment line
55
+ description_lines = []
56
+ i += 1
57
+
58
+ while i < len(lines):
59
+ next_line = lines[i]
60
+ next_stripped = next_line.strip()
61
+
62
+ # If it's a description comment line (starts with #)
63
+ if next_stripped.startswith("#") and next_stripped != "#":
64
+ # Remove the leading # and whitespace
65
+ desc_text = next_stripped[1:].strip()
66
+ if desc_text:
67
+ description_lines.append(desc_text)
68
+ i += 1
69
+ elif next_stripped.startswith("#"):
70
+ # Empty comment line, skip
71
+ i += 1
72
+ else:
73
+ # Non-comment line - this should contain the column definition
74
+ break
75
+
76
+ # Now parse the column definition
77
+ if i < len(lines) and description_lines:
78
+ # Join the description lines
79
+ description = " ".join(description_lines)
80
+
81
+ # Find the column name by parsing the SELECT clause
82
+ # We need to handle multi-line column definitions with nested structures
83
+ column_def = ""
84
+ paren_depth = (
85
+ 0 # Track parentheses depth to handle nested SELECT/FROM
86
+ )
87
+
88
+ while i < len(lines):
89
+ current_line = lines[i]
90
+ current_stripped = current_line.strip()
91
+
92
+ # Count parentheses to track nesting depth
93
+ paren_depth += current_line.count("(") - current_line.count(")")
94
+
95
+ # Only check for top-level SQL keywords when not inside nested structures
96
+ if paren_depth == 0 and any(
97
+ current_stripped.upper().startswith(keyword)
98
+ for keyword in [
99
+ "FROM",
100
+ "WHERE",
101
+ "GROUP BY",
102
+ "ORDER BY",
103
+ "JOIN",
104
+ "LEFT",
105
+ "RIGHT",
106
+ "INNER",
107
+ "OUTER",
108
+ ]
109
+ ):
110
+ # Don't include this line in column_def
111
+ # Don't increment i here - let outer loop handle it
112
+ break
113
+
114
+ column_def += " " + current_stripped
115
+ i += 1
116
+
117
+ # Check if we've found a complete column definition
118
+ # (has a comma at depth 0)
119
+ if paren_depth == 0 and "," in current_line:
120
+ break
121
+
122
+ # Safety check: if we've gone too deep, break
123
+ if paren_depth < 0:
124
+ break
125
+
126
+ # Extract column name from the definition
127
+ column_name = IDCIndexDataManager._extract_column_name(column_def)
128
+ if column_name:
129
+ descriptions[column_name] = description
130
+ logger.debug(
131
+ "Parsed description for column '%s': %s",
132
+ column_name,
133
+ description[:50] + "..."
134
+ if len(description) > 50
135
+ else description,
136
+ )
137
+ else:
138
+ i += 1
139
+ else:
140
+ i += 1
141
+
142
+ return descriptions
143
+
144
+ @staticmethod
145
+ def _extract_column_name(column_def: str) -> str | None:
146
+ """
147
+ Extracts the column name from a column definition.
148
+
149
+ Handles various formats:
150
+ - column_name,
151
+ - expression AS column_name,
152
+ - ANY_VALUE(column) AS column_name,
153
+ - Complex expressions with nested parentheses
154
+
155
+ Args:
156
+ column_def: The column definition string
157
+
158
+ Returns:
159
+ The column name or None if not found
160
+ """
161
+ # Remove trailing comma and whitespace
162
+ column_def = column_def.strip().rstrip(",").strip()
163
+
164
+ # Look for the last AS clause (to handle nested AS in CAST expressions)
165
+ # Use a regex that finds the rightmost AS followed by a word
166
+ as_matches = list(re.finditer(r"\bAS\b\s+(\w+)", column_def, re.IGNORECASE))
167
+ if as_matches:
168
+ # Return the last match (rightmost AS clause)
169
+ return as_matches[-1].group(1)
170
+
171
+ # If no AS clause, try to get the column name
172
+ # Remove function calls and get the last word before comma
173
+ # Handle cases like: column_name, or just column_name
174
+ parts = column_def.split()
175
+ if parts:
176
+ # Get the last word that looks like an identifier
177
+ for original_part in reversed(parts):
178
+ # Remove trailing punctuation
179
+ part = original_part.rstrip(",").strip()
180
+ # Check if it's a valid identifier (word characters only)
181
+ if re.match(r"^\w+$", part):
182
+ return part
183
+
184
+ return None
185
+
186
+ def execute_sql_query(
187
+ self, file_path: str
188
+ ) -> tuple[pd.DataFrame, str, list[bigquery.SchemaField], str]:
189
+ """
190
+ Executes the SQL query in the specified file.
191
+
192
+ Returns:
193
+ Tuple[pd.DataFrame, str, List[bigquery.SchemaField], str]: A tuple containing
194
+ the DataFrame with query results, the output basename, the BigQuery schema, and
195
+ the SQL query string.
196
+ """
197
+ with Path(file_path).open("r") as file:
198
+ sql_query = file.read()
199
+ query_job_result = self.client.query(sql_query).result()
200
+ schema = query_job_result.schema # Get schema from BigQuery QueryJob
201
+ index_df = query_job_result.to_dataframe()
202
+ if "StudyDate" in index_df.columns:
203
+ index_df["StudyDate"] = index_df["StudyDate"].astype(str)
204
+ output_basename = Path(file_path).name.split(".")[0]
205
+ logger.debug("Executed SQL query from file: %s", file_path)
206
+ return index_df, output_basename, schema, sql_query
207
+
208
+ def save_schema_to_json(
209
+ self,
210
+ schema: list[bigquery.SchemaField],
211
+ output_basename: str,
212
+ sql_query: str | None = None,
213
+ output_dir: Path | None = None,
214
+ ) -> None:
215
+ """
216
+ Saves the BigQuery schema to a JSON file, including column descriptions
217
+ parsed from SQL query comments.
218
+
219
+ Args:
220
+ schema: List of BigQuery SchemaField objects from the query result
221
+ output_basename: The base name for the output file
222
+ sql_query: The SQL query string to parse for column descriptions
223
+ output_dir: Optional directory path for the output file
224
+ """
225
+ # Parse column descriptions from SQL comments
226
+ logger.debug("Parsing column descriptions from SQL query comments")
227
+ logger.debug(sql_query)
228
+ if sql_query is not None:
229
+ descriptions = self.parse_column_descriptions(sql_query)
230
+
231
+ # Convert BigQuery schema to JSON-serializable format
232
+ schema_dict = {
233
+ "fields": [
234
+ {
235
+ "name": field.name,
236
+ "type": field.field_type,
237
+ "mode": field.mode,
238
+ "description": descriptions.get(field.name, ""),
239
+ }
240
+ for field in schema
241
+ ]
242
+ }
243
+ else:
244
+ # If no SQL query provided, save schema without descriptions
245
+ schema_dict = {
246
+ "fields": [
247
+ {
248
+ "name": field.name,
249
+ "type": field.field_type,
250
+ "mode": field.mode,
251
+ "description": "",
252
+ }
253
+ for field in schema
254
+ ]
255
+ }
256
+
257
+ # Save to JSON file
258
+ if output_dir:
259
+ output_dir.mkdir(parents=True, exist_ok=True)
260
+ json_file_path = output_dir / f"{output_basename}.json"
261
+ else:
262
+ json_file_path = Path(f"{output_basename}.json")
263
+
264
+ with json_file_path.open("w") as f:
265
+ json.dump(schema_dict, f, indent=2)
266
+ logger.debug("Created schema JSON file: %s", json_file_path)
267
+
268
+ def save_sql_query(
269
+ self,
270
+ sql_query: str,
271
+ output_basename: str,
272
+ output_dir: Path | None = None,
273
+ ) -> None:
274
+ """
275
+ Saves the SQL query to a file.
276
+
277
+ Args:
278
+ sql_query: The SQL query string
279
+ output_basename: The base name for the output file
280
+ output_dir: Optional directory path for the output file
281
+ """
282
+
283
+ if output_dir:
284
+ output_dir.mkdir(parents=True, exist_ok=True)
285
+ query_file_path = output_dir / f"{output_basename}.sql"
286
+ else:
287
+ query_file_path = Path(f"{output_basename}.sql")
288
+
289
+ with query_file_path.open("w") as f:
290
+ f.write(sql_query)
291
+ logger.debug("Created SQL query file: %s", query_file_path)
292
+
293
+ def generate_index_data_files(
294
+ self,
295
+ generate_compressed_csv: bool = True,
296
+ generate_parquet: bool = False,
297
+ output_dir: Path | None = None,
298
+ ) -> None:
299
+ """
300
+ Generates index-data files locally by executing queries against
301
+ the Google Cloud Platform IDC project tables.
302
+
303
+ This method iterates over SQL files in the 'scripts/sql' directory,
304
+ executing each query using :func:`execute_sql_query` and generating a DataFrame,
305
+ 'index_df'. The DataFrame is then saved as compressed CSV and/or Parquet file.
306
+
307
+ Args:
308
+ generate_compressed_csv: Whether to generate compressed CSV files
309
+ generate_parquet: Whether to generate Parquet files
310
+ output_dir: Optional directory path for the output files
311
+ """
312
+
313
+ scripts_dir = Path(__file__).parent.parent
314
+ sql_dir = scripts_dir / "sql"
315
+
316
+ if output_dir:
317
+ output_dir.mkdir(parents=True, exist_ok=True)
318
+
319
+ for file_name in Path.iterdir(sql_dir):
320
+ if str(file_name).endswith(".sql"):
321
+ file_path = Path(sql_dir) / file_name
322
+ index_df, output_basename, schema, sql_query = self.execute_sql_query(
323
+ str(file_path)
324
+ )
325
+ logger.debug(
326
+ "Executed and processed SQL queries from file: %s", file_path
327
+ )
328
+ if generate_compressed_csv:
329
+ csv_file_path = (
330
+ output_dir / f"{output_basename}.csv.zip"
331
+ if output_dir
332
+ else Path(f"{output_basename}.csv.zip")
333
+ )
334
+ index_df.to_csv(
335
+ csv_file_path, compression={"method": "zip"}, escapechar="\\"
336
+ )
337
+ logger.debug("Created CSV zip file: %s", csv_file_path)
338
+
339
+ if generate_parquet:
340
+ parquet_file_path = (
341
+ output_dir / f"{output_basename}.parquet"
342
+ if output_dir
343
+ else Path(f"{output_basename}.parquet")
344
+ )
345
+ index_df.to_parquet(parquet_file_path, compression="zstd")
346
+ logger.debug("Created Parquet file: %s", parquet_file_path)
347
+
348
+ # Save schema to JSON file
349
+ # Skip parsing descriptions for prior_versions_index as it has dynamic SQL
350
+ if output_basename != "prior_versions_index":
351
+ self.save_schema_to_json(
352
+ schema, output_basename, sql_query, output_dir
353
+ )
354
+ else:
355
+ # For prior_versions_index, save schema without descriptions
356
+ self.save_schema_to_json(schema, output_basename, None, output_dir)
357
+ # Save SQL query to file
358
+ self.save_sql_query(sql_query, output_basename, output_dir)
359
+
360
+ def retrieve_latest_idc_release_version(self) -> int:
361
+ """
362
+ Retrieves the latest IDC release version.
363
+
364
+ This function executes a SQL query on the `version_metadata` table in the
365
+ `idc_current` dataset of the BigQuery client. It retrieves the maximum
366
+ `idc_version` and returns it as an integer.
367
+ """
368
+ query = """
369
+ SELECT
370
+ MAX(idc_version) AS latest_idc_release_version
371
+ FROM
372
+ `bigquery-public-data.idc_current.version_metadata`
373
+ """
374
+ query_job = self.client.query(query)
375
+ result = query_job.result()
376
+ return int(next(result).latest_idc_release_version)
377
+
378
+
379
+ if __name__ == "__main__":
380
+ import argparse
381
+
382
+ parser = argparse.ArgumentParser()
383
+ parser.add_argument(
384
+ "--project",
385
+ default=os.environ.get("GCP_PROJECT", None),
386
+ help="Google Cloud Platform Project ID (default from GCP_PROJECT env. variable)",
387
+ )
388
+ parser.add_argument(
389
+ "--generate-csv-archive",
390
+ action="store_true",
391
+ help="Generate idc_index.csv.zip file",
392
+ )
393
+ parser.add_argument(
394
+ "--generate-parquet",
395
+ action="store_true",
396
+ help="Generate idc_index.parquet file",
397
+ )
398
+ parser.add_argument(
399
+ "--retrieve-latest-idc-release-version",
400
+ action="store_true",
401
+ help="Retrieve and display the latest IDC release version",
402
+ )
403
+
404
+ args = parser.parse_args()
405
+
406
+ if not args.project:
407
+ parser.error(
408
+ "Set GCP_PROJECT environment variable or specify --project argument"
409
+ )
410
+
411
+ if any([args.generate_csv_archive, args.generate_parquet]):
412
+ IDCIndexDataManager(args.project).generate_index_data_files(
413
+ generate_compressed_csv=args.generate_csv_archive,
414
+ generate_parquet=args.generate_parquet,
415
+ )
416
+ elif args.retrieve_latest_idc_release_version:
417
+ logging.basicConfig(level=logging.ERROR, force=True)
418
+ logger.setLevel(logging.ERROR)
419
+ version = IDCIndexDataManager(
420
+ args.project
421
+ ).retrieve_latest_idc_release_version()
422
+ print(f"{version}") # noqa: T201
423
+ else:
424
+ parser.print_help()
@@ -0,0 +1,218 @@
1
+ from __future__ import annotations
2
+
3
+ from scripts.python.idc_index_data_manager import IDCIndexDataManager
4
+
5
+
6
+ class TestColumnDescriptionParser:
7
+ """Tests for parsing column descriptions from SQL comments."""
8
+
9
+ def test_simple_column_description(self):
10
+ """Test parsing a simple column with description."""
11
+ sql_query = """
12
+ SELECT
13
+ # description:
14
+ # name of the collection
15
+ collection_name,
16
+ FROM table
17
+ """
18
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
19
+ assert "collection_name" in descriptions
20
+ assert descriptions["collection_name"] == "name of the collection"
21
+
22
+ def test_multiline_description(self):
23
+ """Test parsing a column with multi-line description."""
24
+ sql_query = """
25
+ SELECT
26
+ # description:
27
+ # this string is not empty if the specific series is
28
+ # part of an analysis results collection; analysis results can be added to a
29
+ # given collection over time
30
+ ANY_VALUE(analysis_result_id) AS analysis_result_id,
31
+ FROM table
32
+ """
33
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
34
+ assert "analysis_result_id" in descriptions
35
+ expected = (
36
+ "this string is not empty if the specific series is "
37
+ "part of an analysis results collection; analysis results can be added to a "
38
+ "given collection over time"
39
+ )
40
+ assert descriptions["analysis_result_id"] == expected
41
+
42
+ def test_column_with_as_clause(self):
43
+ """Test parsing a column with AS clause."""
44
+ sql_query = """
45
+ SELECT
46
+ # description:
47
+ # unique identifier of the DICOM series
48
+ SeriesInstanceUID AS series_id,
49
+ FROM table
50
+ """
51
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
52
+ assert "series_id" in descriptions
53
+ assert descriptions["series_id"] == "unique identifier of the DICOM series"
54
+
55
+ def test_column_with_function(self):
56
+ """Test parsing a column with function call."""
57
+ sql_query = """
58
+ SELECT
59
+ # description:
60
+ # age of the subject at the time of imaging
61
+ ANY_VALUE(PatientAge) AS PatientAge,
62
+ FROM table
63
+ """
64
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
65
+ assert "PatientAge" in descriptions
66
+ assert descriptions["PatientAge"] == "age of the subject at the time of imaging"
67
+
68
+ def test_multiple_columns(self):
69
+ """Test parsing multiple columns with descriptions."""
70
+ sql_query = """
71
+ SELECT
72
+ # description:
73
+ # name of the collection
74
+ collection_name,
75
+ # description:
76
+ # unique identifier of the collection
77
+ collection_id,
78
+ # description:
79
+ # types of cancer represented in the collection
80
+ CancerTypes,
81
+ FROM table
82
+ """
83
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
84
+ assert len(descriptions) == 3
85
+ assert descriptions["collection_name"] == "name of the collection"
86
+ assert descriptions["collection_id"] == "unique identifier of the collection"
87
+ assert (
88
+ descriptions["CancerTypes"]
89
+ == "types of cancer represented in the collection"
90
+ )
91
+
92
+ def test_column_without_description(self):
93
+ """Test that columns without descriptions are not in the result."""
94
+ sql_query = """
95
+ SELECT
96
+ # description:
97
+ # name of the collection
98
+ collection_name,
99
+ collection_id,
100
+ FROM table
101
+ """
102
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
103
+ assert "collection_name" in descriptions
104
+ assert "collection_id" not in descriptions
105
+
106
+ def test_extract_column_name_simple(self):
107
+ """Test extracting column name from simple definition."""
108
+ assert (
109
+ IDCIndexDataManager._extract_column_name("collection_name,")
110
+ == "collection_name"
111
+ )
112
+ assert (
113
+ IDCIndexDataManager._extract_column_name("collection_name")
114
+ == "collection_name"
115
+ )
116
+
117
+ def test_extract_column_name_with_as(self):
118
+ """Test extracting column name with AS clause."""
119
+ assert (
120
+ IDCIndexDataManager._extract_column_name(
121
+ "ANY_VALUE(collection_id) AS collection_id,"
122
+ )
123
+ == "collection_id"
124
+ )
125
+ assert IDCIndexDataManager._extract_column_name("column AS alias,") == "alias"
126
+
127
+ def test_extract_column_name_complex(self):
128
+ """Test extracting column name from complex expressions."""
129
+ assert (
130
+ IDCIndexDataManager._extract_column_name(
131
+ "ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,"
132
+ )
133
+ == "series_size_MB"
134
+ )
135
+ assert (
136
+ IDCIndexDataManager._extract_column_name("COUNT(SOPInstanceUID) AS count,")
137
+ == "count"
138
+ )
139
+
140
+ def test_complex_multiline_select(self):
141
+ """Test parsing a complex multi-line SELECT statement."""
142
+ sql_query = """
143
+ SELECT
144
+ # description:
145
+ # total size of the series in megabytes
146
+ ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
147
+ FROM table
148
+ """
149
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
150
+ assert "series_size_MB" in descriptions
151
+ assert descriptions["series_size_MB"] == "total size of the series in megabytes"
152
+
153
+ def test_no_descriptions(self):
154
+ """Test SQL query with no descriptions."""
155
+ sql_query = """
156
+ SELECT
157
+ collection_name,
158
+ collection_id,
159
+ CancerTypes
160
+ FROM table
161
+ """
162
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
163
+ assert len(descriptions) == 0
164
+
165
+ def test_empty_description_lines(self):
166
+ """Test handling of empty comment lines in descriptions."""
167
+ sql_query = """
168
+ SELECT
169
+ # description:
170
+ # name of the collection
171
+ #
172
+ # additional info
173
+ collection_name,
174
+ FROM table
175
+ """
176
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
177
+ assert "collection_name" in descriptions
178
+ # Empty comment lines should be skipped
179
+ assert (
180
+ descriptions["collection_name"] == "name of the collection additional info"
181
+ )
182
+
183
+ def test_nested_array_select_with_if(self):
184
+ """Test parsing complex nested ARRAY/SELECT/IF statements."""
185
+ sql_query = """
186
+ SELECT
187
+ # description:
188
+ # embedding medium used for the slide preparation
189
+ ARRAY(
190
+ SELECT
191
+ IF
192
+ (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
193
+ FROM
194
+ UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_CodeMeaning,
195
+ # description:
196
+ # embedding medium code tuple
197
+ ARRAY(
198
+ SELECT
199
+ IF
200
+ (code IS NULL, NULL,
201
+ IF
202
+ (STRPOS(code, ':') = 0, NULL, SUBSTR(code, STRPOS(code, ':') + 1)))
203
+ FROM
204
+ UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_code_designator_value_str,
205
+ FROM table
206
+ """
207
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
208
+ assert len(descriptions) == 2
209
+ assert "embeddingMedium_CodeMeaning" in descriptions
210
+ assert (
211
+ descriptions["embeddingMedium_CodeMeaning"]
212
+ == "embedding medium used for the slide preparation"
213
+ )
214
+ assert "embeddingMedium_code_designator_value_str" in descriptions
215
+ assert (
216
+ descriptions["embeddingMedium_code_designator_value_str"]
217
+ == "embedding medium code tuple"
218
+ )
@@ -0,0 +1,101 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from scripts.python.idc_index_data_manager import IDCIndexDataManager
6
+
7
+ """Test script to verify column description parsing with real SQL files."""
8
+
9
+
10
+ def test_real_sql_files() -> None:
11
+ """Test parsing descriptions from actual SQL files in the repository."""
12
+ scripts_dir = Path(__file__).parent.parent / "scripts"
13
+ sql_dir = scripts_dir / "sql"
14
+
15
+ # Test collections_index.sql
16
+ collections_sql_path = sql_dir / "collections_index.sql"
17
+ if collections_sql_path.exists():
18
+ with collections_sql_path.open("r") as f:
19
+ sql_query = f.read()
20
+
21
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
22
+ print("\n=== collections_index.sql ===")
23
+ print(f"Found {len(descriptions)} column descriptions:")
24
+ for col_name, desc in descriptions.items():
25
+ print(f" {col_name}: {desc[:60]}...")
26
+
27
+ # Validate expected columns
28
+ expected_columns = [
29
+ "collection_name",
30
+ "collection_id",
31
+ "CancerTypes",
32
+ "TumorLocations",
33
+ "Subjects",
34
+ "Species",
35
+ "Sources",
36
+ "SupportingData",
37
+ "Program",
38
+ "Status",
39
+ "Updated",
40
+ "Description",
41
+ ]
42
+ for col in expected_columns:
43
+ assert col in descriptions, (
44
+ f"Expected column '{col}' not found in descriptions"
45
+ )
46
+ print("✓ All expected columns found")
47
+
48
+ # Test idc_index.sql
49
+ idc_index_sql_path = sql_dir / "idc_index.sql"
50
+ if idc_index_sql_path.exists():
51
+ with idc_index_sql_path.open("r") as f:
52
+ sql_query = f.read()
53
+
54
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
55
+ print("\n=== idc_index.sql ===")
56
+ print(f"Found {len(descriptions)} column descriptions:")
57
+
58
+ # Show first 10 descriptions
59
+ for i, (col_name, desc) in enumerate(descriptions.items()):
60
+ if i < 10:
61
+ print(f" {col_name}: {desc[:60]}...")
62
+ else:
63
+ break
64
+
65
+ if len(descriptions) > 10:
66
+ print(f" ... and {len(descriptions) - 10} more")
67
+
68
+ # Validate some expected columns
69
+ expected_columns = [
70
+ "collection_id",
71
+ "analysis_result_id",
72
+ "PatientID",
73
+ "SeriesInstanceUID",
74
+ "StudyInstanceUID",
75
+ "source_DOI",
76
+ "PatientAge",
77
+ "PatientSex",
78
+ "StudyDate",
79
+ "series_size_MB",
80
+ ]
81
+ for col in expected_columns:
82
+ if col in descriptions:
83
+ print(f"✓ Found expected column: {col}")
84
+ else:
85
+ print(f"✗ Missing expected column: {col}")
86
+
87
+ # Test analysis_results_index.sql (should have no descriptions)
88
+ analysis_sql_path = sql_dir / "analysis_results_index.sql"
89
+ if analysis_sql_path.exists():
90
+ with analysis_sql_path.open("r") as f:
91
+ sql_query = f.read()
92
+
93
+ descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
94
+ print("\n=== analysis_results_index.sql ===")
95
+ print(f"Found {len(descriptions)} column descriptions (expected 0)")
96
+ assert len(descriptions) == 0, "Expected no descriptions in this file"
97
+ print("✓ Correctly found no descriptions")
98
+
99
+
100
+ if __name__ == "__main__":
101
+ test_real_sql_files()
@@ -1,11 +0,0 @@
1
- SELECT
2
- collection_id,
3
- table_name,
4
- SPLIT(table_name,'.')[SAFE_OFFSET(2)] AS short_table_name,
5
- `column`,
6
- column_label,
7
- `values`
8
- FROM
9
- `bigquery-public-data.idc_v22_clinical.column_metadata`
10
- ORDER BY
11
- collection_id, table_name
@@ -1,231 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import json
4
- import logging
5
- import os
6
- from pathlib import Path
7
-
8
- import pandas as pd
9
- from google.cloud import bigquery
10
-
11
- logging.basicConfig(level=logging.DEBUG)
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
- class IDCIndexDataManager:
16
- def __init__(self, project_id: str):
17
- """
18
- Initializes the IDCIndexDataManager using the Google Cloud Platform project ID.
19
- """
20
- self.project_id = project_id
21
- self.client = bigquery.Client(project=project_id)
22
- logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)
23
-
24
- def execute_sql_query(
25
- self, file_path: str
26
- ) -> tuple[pd.DataFrame, str, list[bigquery.SchemaField]]:
27
- """
28
- Executes the SQL query in the specified file.
29
-
30
- Returns:
31
- Tuple[pd.DataFrame, str, List[bigquery.SchemaField]]: A tuple containing
32
- the DataFrame with query results, the output basename, and the BigQuery schema.
33
- """
34
- with Path(file_path).open("r") as file:
35
- sql_query = file.read()
36
- query_job_result = self.client.query(sql_query).result()
37
- schema = query_job_result.schema # Get schema from BigQuery QueryJob
38
- index_df = query_job_result.to_dataframe()
39
- if "StudyDate" in index_df.columns:
40
- index_df["StudyDate"] = index_df["StudyDate"].astype(str)
41
- output_basename = Path(file_path).name.split(".")[0]
42
- logger.debug("Executed SQL query from file: %s", file_path)
43
- return index_df, output_basename, schema, sql_query
44
-
45
- def save_schema_to_json(
46
- self,
47
- schema: list[bigquery.SchemaField],
48
- output_basename: str,
49
- output_dir: Path | None = None,
50
- ) -> None:
51
- """
52
- Saves the BigQuery schema to a JSON file.
53
-
54
- Args:
55
- schema: List of BigQuery SchemaField objects from the query result
56
- output_basename: The base name for the output file
57
- output_dir: Optional directory path for the output file
58
- """
59
- # Convert BigQuery schema to JSON-serializable format
60
- schema_dict = {
61
- "fields": [
62
- {
63
- "name": field.name,
64
- "type": field.field_type,
65
- "mode": field.mode,
66
- }
67
- for field in schema
68
- ]
69
- }
70
-
71
- # Save to JSON file
72
- if output_dir:
73
- output_dir.mkdir(parents=True, exist_ok=True)
74
- json_file_path = output_dir / f"{output_basename}.json"
75
- else:
76
- json_file_path = Path(f"{output_basename}.json")
77
-
78
- with json_file_path.open("w") as f:
79
- json.dump(schema_dict, f, indent=2)
80
- logger.debug("Created schema JSON file: %s", json_file_path)
81
-
82
- def save_sql_query(
83
- self,
84
- sql_query: str,
85
- output_basename: str,
86
- output_dir: Path | None = None,
87
- ) -> None:
88
- """
89
- Saves the SQL query to a file.
90
-
91
- Args:
92
- sql_query: The SQL query string
93
- output_basename: The base name for the output file
94
- output_dir: Optional directory path for the output file
95
- """
96
-
97
- if output_dir:
98
- output_dir.mkdir(parents=True, exist_ok=True)
99
- query_file_path = output_dir / f"{output_basename}.sql"
100
- else:
101
- query_file_path = Path(f"{output_basename}.sql")
102
-
103
- with query_file_path.open("w") as f:
104
- f.write(sql_query)
105
- logger.debug("Created SQL query file: %s", query_file_path)
106
-
107
- def generate_index_data_files(
108
- self,
109
- generate_compressed_csv: bool = True,
110
- generate_parquet: bool = False,
111
- output_dir: Path | None = None,
112
- ) -> None:
113
- """
114
- Generates index-data files locally by executing queries against
115
- the Google Cloud Platform IDC project tables.
116
-
117
- This method iterates over SQL files in the 'scripts/sql' directory,
118
- executing each query using :func:`execute_sql_query` and generating a DataFrame,
119
- 'index_df'. The DataFrame is then saved as compressed CSV and/or Parquet file.
120
-
121
- Args:
122
- generate_compressed_csv: Whether to generate compressed CSV files
123
- generate_parquet: Whether to generate Parquet files
124
- output_dir: Optional directory path for the output files
125
- """
126
-
127
- scripts_dir = Path(__file__).parent.parent
128
- sql_dir = scripts_dir / "sql"
129
-
130
- if output_dir:
131
- output_dir.mkdir(parents=True, exist_ok=True)
132
-
133
- for file_name in Path.iterdir(sql_dir):
134
- if str(file_name).endswith(".sql"):
135
- file_path = Path(sql_dir) / file_name
136
- index_df, output_basename, schema, sql_query = self.execute_sql_query(
137
- file_path
138
- )
139
- logger.debug(
140
- "Executed and processed SQL queries from file: %s", file_path
141
- )
142
- if generate_compressed_csv:
143
- csv_file_path = (
144
- output_dir / f"{output_basename}.csv.zip"
145
- if output_dir
146
- else Path(f"{output_basename}.csv.zip")
147
- )
148
- index_df.to_csv(
149
- csv_file_path, compression={"method": "zip"}, escapechar="\\"
150
- )
151
- logger.debug("Created CSV zip file: %s", csv_file_path)
152
-
153
- if generate_parquet:
154
- parquet_file_path = (
155
- output_dir / f"{output_basename}.parquet"
156
- if output_dir
157
- else Path(f"{output_basename}.parquet")
158
- )
159
- index_df.to_parquet(parquet_file_path, compression="zstd")
160
- logger.debug("Created Parquet file: %s", parquet_file_path)
161
-
162
- # Save schema to JSON file
163
- self.save_schema_to_json(schema, output_basename, output_dir)
164
- # Save SQL query to file
165
- self.save_sql_query(sql_query, output_basename, output_dir)
166
-
167
- def retrieve_latest_idc_release_version(self) -> int:
168
- """
169
- Retrieves the latest IDC release version.
170
-
171
- This function executes a SQL query on the `version_metadata` table in the
172
- `idc_current` dataset of the BigQuery client. It retrieves the maximum
173
- `idc_version` and returns it as an integer.
174
- """
175
- query = """
176
- SELECT
177
- MAX(idc_version) AS latest_idc_release_version
178
- FROM
179
- `bigquery-public-data.idc_current.version_metadata`
180
- """
181
- query_job = self.client.query(query)
182
- result = query_job.result()
183
- return int(next(result).latest_idc_release_version)
184
-
185
-
186
- if __name__ == "__main__":
187
- import argparse
188
-
189
- parser = argparse.ArgumentParser()
190
- parser.add_argument(
191
- "--project",
192
- default=os.environ.get("GCP_PROJECT", None),
193
- help="Google Cloud Platform Project ID (default from GCP_PROJECT env. variable)",
194
- )
195
- parser.add_argument(
196
- "--generate-csv-archive",
197
- action="store_true",
198
- help="Generate idc_index.csv.zip file",
199
- )
200
- parser.add_argument(
201
- "--generate-parquet",
202
- action="store_true",
203
- help="Generate idc_index.parquet file",
204
- )
205
- parser.add_argument(
206
- "--retrieve-latest-idc-release-version",
207
- action="store_true",
208
- help="Retrieve and display the latest IDC release version",
209
- )
210
-
211
- args = parser.parse_args()
212
-
213
- if not args.project:
214
- parser.error(
215
- "Set GCP_PROJECT environment variable or specify --project argument"
216
- )
217
-
218
- if any([args.generate_csv_archive, args.generate_parquet]):
219
- IDCIndexDataManager(args.project).generate_index_data_files(
220
- generate_compressed_csv=args.generate_csv_archive,
221
- generate_parquet=args.generate_parquet,
222
- )
223
- elif args.retrieve_latest_idc_release_version:
224
- logging.basicConfig(level=logging.ERROR, force=True)
225
- logger.setLevel(logging.ERROR)
226
- version = IDCIndexDataManager(
227
- args.project
228
- ).retrieve_latest_idc_release_version()
229
- print(f"{version}") # noqa: T201
230
- else:
231
- parser.print_help()
File without changes