idc-index-data 22.1.2.tar.gz → 22.1.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.pre-commit-config.yaml +1 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/PKG-INFO +2 -1
- idc_index_data-22.1.3/assets/clinical_index.sql +23 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/pyproject.toml +6 -4
- idc_index_data-22.1.3/pytest.ini +2 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/python/generate-indices.py +6 -2
- idc_index_data-22.1.3/scripts/python/idc_index_data_manager.py +424 -0
- idc_index_data-22.1.3/tests/test_column_description_parser.py +218 -0
- idc_index_data-22.1.3/tests/test_real_sql_parsing.py +101 -0
- idc_index_data-22.1.2/assets/clinical_index.sql +0 -11
- idc_index_data-22.1.2/scripts/python/idc_index_data_manager.py +0 -231
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.git_archival.txt +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.gitattributes +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/CONTRIBUTING.md +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/copilot-instructions.md +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/dependabot.yml +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/matchers/pylint.json +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/workflows/cd.yml +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/workflows/ci.yml +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.github/workflows/external-indices.yml +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.gitignore +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/.readthedocs.yaml +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/CMakeLists.txt +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/LICENSE +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/README.md +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/assets/README.md +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/assets/sm_index.sql +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/assets/sm_instance_index.sql +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/docs/conf.py +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/docs/index.md +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/noxfile.py +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/python/update_idc_index_version.py +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/sql/analysis_results_index.sql +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/sql/collections_index.sql +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/sql/idc_index.sql +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/sql/prior_versions_index.sql +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/src/idc_index_data/__init__.py +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/src/idc_index_data/_version.pyi +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/src/idc_index_data/py.typed +0 -0
- {idc_index_data-22.1.2 → idc_index_data-22.1.3}/tests/test_package.py +0 -0
{idc_index_data-22.1.2 → idc_index_data-22.1.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: idc-index-data
-Version: 22.1.2
+Version: 22.1.3
 Summary: ImagingDataCommons index to query and download data.
 Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
 License: Copyright 2024 Andrey Fedorov

@@ -41,6 +41,7 @@ Project-URL: Bug Tracker, https://github.com/ImagingDataCommons/idc-index-data/i
 Project-URL: Discussions, https://discourse.canceridc.dev/
 Project-URL: Changelog, https://github.com/ImagingDataCommons/idc-index-data/releases
 Requires-Python: >=3.10
+Requires-Dist: google-cloud-bigquery
 Provides-Extra: test
 Requires-Dist: pandas; extra == "test"
 Requires-Dist: pyarrow; extra == "test"
idc_index_data-22.1.3/assets/clinical_index.sql (new file)

@@ -0,0 +1,23 @@
+SELECT
+  # description:
+  # unique identifier of the collection
+  collection_id,
+  # description:
+  # full name of the table in which the column is stored
+  table_name,
+  # description:
+  # short name of the table in which the column is stored
+  SPLIT(table_name,'.')[SAFE_OFFSET(2)] AS short_table_name,
+  # description:
+  # name of the column in which the value is stored
+  `column`,
+  # description:
+  # human readable name of the column
+  column_label,
+  # description:
+  # values encountered in the column
+  `values`
+FROM
+  `bigquery-public-data.idc_v22_clinical.column_metadata`
+ORDER BY
+  collection_id, table_name
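The query above is run with the google-cloud-bigquery client that this release promotes to a runtime dependency (see the pyproject.toml and PKG-INFO changes), and its `# description:` comments are what the new schema parser below turns into per-column descriptions. A minimal sketch of running it standalone, assuming a GCP project with BigQuery access; "my-gcp-project" is a placeholder, not part of the package:

```python
# Minimal sketch, not part of the package: run the new clinical_index query
# directly with google-cloud-bigquery. "my-gcp-project" is a placeholder.
from pathlib import Path

from google.cloud import bigquery

client = bigquery.Client(project="my-gcp-project")
sql = Path("assets/clinical_index.sql").read_text()
clinical_df = client.query(sql).result().to_dataframe()  # needs pandas/db-dtypes
print(clinical_df.head())
```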
{idc_index_data-22.1.2 → idc_index_data-22.1.3}/pyproject.toml

@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "idc-index-data"
-version = "22.1.2"
+version = "22.1.3"
 authors = [
   { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
   { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },

@@ -38,7 +38,9 @@ classifiers = [
   "Topic :: Scientific/Engineering",
   "Typing :: Typed",
 ]
-dependencies = []
+dependencies = [
+  "google-cloud-bigquery"
+]
 
 [project.optional-dependencies]
 test = [

@@ -102,7 +104,7 @@ report.exclude_also = [
 
 [tool.mypy]
 files = ["src", "tests"]
-python_version = "3.
+python_version = "3.10"
 warn_unused_configs = true
 strict = true
 enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"]

@@ -158,7 +160,7 @@ isort.required-imports = ["from __future__ import annotations"]
 
 
 [tool.pylint]
-py-version = "3.
+py-version = "3.10"
 ignore-paths = [".*/_version.py"]
 reports.output-format = "colorized"
 similarities.ignore-imports = "yes"
{idc_index_data-22.1.2 → idc_index_data-22.1.3}/scripts/python/generate-indices.py

@@ -28,7 +28,7 @@ def main():
     )
     parquet_file_path = output_dir / f"{output_basename}.parquet"
     index_df.to_parquet(parquet_file_path)
-    manager.save_schema_to_json(schema, output_basename, output_dir)
+    manager.save_schema_to_json(schema, output_basename, sql_query, output_dir)
     manager.save_sql_query(sql_query, output_basename, output_dir)
 
     core_indices_dir = scripts_dir.parent / "scripts" / "sql"

@@ -42,7 +42,11 @@ def main():
         )
         parquet_file_path = output_dir / f"{output_basename}.parquet"
         index_df.to_parquet(parquet_file_path)
-
+        if output_basename == "prior_versions_index":
+            # For prior_versions_index, save schema without descriptions
+            manager.save_schema_to_json(schema, output_basename, None, output_dir)
+        else:
+            manager.save_schema_to_json(schema, output_basename, sql_query, output_dir)
         manager.save_sql_query(sql_query, output_basename, output_dir)
 
 
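The practical effect of threading `sql_query` into `save_schema_to_json` is that each generated `<output_basename>.json` now carries a per-field `description` parsed from the `# description:` comments, while `prior_versions_index` (whose SQL is assembled dynamically) keeps empty descriptions. An illustrative sketch of one schema entry follows; "STRING"/"NULLABLE" are assumed typical values, and the description text is taken from the clinical_index.sql comments above rather than from a real run:

```python
# Illustrative only: one entry of the "fields" list in a generated
# <output_basename>.json after this change. "STRING"/"NULLABLE" are assumed
# typical values; the description comes from the "# description:" comment.
example_field = {
    "name": "collection_id",
    "type": "STRING",
    "mode": "NULLABLE",
    "description": "unique identifier of the collection",
}
```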
idc_index_data-22.1.3/scripts/python/idc_index_data_manager.py (new file)

@@ -0,0 +1,424 @@
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+from pathlib import Path
+
+import pandas as pd
+from google.cloud import bigquery
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+
+class IDCIndexDataManager:
+    def __init__(self, project_id: str):
+        """
+        Initializes the IDCIndexDataManager using the Google Cloud Platform project ID.
+        """
+        self.project_id = project_id
+        self.client = bigquery.Client(project=project_id)
+        logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)
+
+    @staticmethod
+    def parse_column_descriptions(sql_query: str) -> dict[str, str]:
+        """
+        Parses column descriptions from SQL query comments.
+
+        The method looks for comments following the pattern:
+            # description:
+            # description text continues here
+            # and can span multiple lines
+            column_name or expression AS column_name,
+
+        Args:
+            sql_query: The SQL query string containing comments
+
+        Returns:
+            Dictionary mapping column names to their descriptions
+        """
+        descriptions: dict[str, str] = {}
+        logger.debug("Parsing column descriptions from SQL query comments")
+        logger.debug(sql_query)
+        lines = sql_query.split("\n")
+
+        i = 0
+        while i < len(lines):
+            line = lines[i]
+            stripped = line.strip()
+
+            # Check if this line starts a description comment
+            if stripped == "# description:":
+                # Collect description lines until we hit a non-comment line
+                description_lines = []
+                i += 1
+
+                while i < len(lines):
+                    next_line = lines[i]
+                    next_stripped = next_line.strip()
+
+                    # If it's a description comment line (starts with #)
+                    if next_stripped.startswith("#") and next_stripped != "#":
+                        # Remove the leading # and whitespace
+                        desc_text = next_stripped[1:].strip()
+                        if desc_text:
+                            description_lines.append(desc_text)
+                        i += 1
+                    elif next_stripped.startswith("#"):
+                        # Empty comment line, skip
+                        i += 1
+                    else:
+                        # Non-comment line - this should contain the column definition
+                        break
+
+                # Now parse the column definition
+                if i < len(lines) and description_lines:
+                    # Join the description lines
+                    description = " ".join(description_lines)
+
+                    # Find the column name by parsing the SELECT clause
+                    # We need to handle multi-line column definitions with nested structures
+                    column_def = ""
+                    paren_depth = (
+                        0  # Track parentheses depth to handle nested SELECT/FROM
+                    )
+
+                    while i < len(lines):
+                        current_line = lines[i]
+                        current_stripped = current_line.strip()
+
+                        # Count parentheses to track nesting depth
+                        paren_depth += current_line.count("(") - current_line.count(")")
+
+                        # Only check for top-level SQL keywords when not inside nested structures
+                        if paren_depth == 0 and any(
+                            current_stripped.upper().startswith(keyword)
+                            for keyword in [
+                                "FROM",
+                                "WHERE",
+                                "GROUP BY",
+                                "ORDER BY",
+                                "JOIN",
+                                "LEFT",
+                                "RIGHT",
+                                "INNER",
+                                "OUTER",
+                            ]
+                        ):
+                            # Don't include this line in column_def
+                            # Don't increment i here - let outer loop handle it
+                            break
+
+                        column_def += " " + current_stripped
+                        i += 1
+
+                        # Check if we've found a complete column definition
+                        # (has a comma at depth 0)
+                        if paren_depth == 0 and "," in current_line:
+                            break
+
+                        # Safety check: if we've gone too deep, break
+                        if paren_depth < 0:
+                            break
+
+                    # Extract column name from the definition
+                    column_name = IDCIndexDataManager._extract_column_name(column_def)
+                    if column_name:
+                        descriptions[column_name] = description
+                        logger.debug(
+                            "Parsed description for column '%s': %s",
+                            column_name,
+                            description[:50] + "..."
+                            if len(description) > 50
+                            else description,
+                        )
+                else:
+                    i += 1
+            else:
+                i += 1
+
+        return descriptions
+
+    @staticmethod
+    def _extract_column_name(column_def: str) -> str | None:
+        """
+        Extracts the column name from a column definition.
+
+        Handles various formats:
+        - column_name,
+        - expression AS column_name,
+        - ANY_VALUE(column) AS column_name,
+        - Complex expressions with nested parentheses
+
+        Args:
+            column_def: The column definition string
+
+        Returns:
+            The column name or None if not found
+        """
+        # Remove trailing comma and whitespace
+        column_def = column_def.strip().rstrip(",").strip()
+
+        # Look for the last AS clause (to handle nested AS in CAST expressions)
+        # Use a regex that finds the rightmost AS followed by a word
+        as_matches = list(re.finditer(r"\bAS\b\s+(\w+)", column_def, re.IGNORECASE))
+        if as_matches:
+            # Return the last match (rightmost AS clause)
+            return as_matches[-1].group(1)
+
+        # If no AS clause, try to get the column name
+        # Remove function calls and get the last word before comma
+        # Handle cases like: column_name, or just column_name
+        parts = column_def.split()
+        if parts:
+            # Get the last word that looks like an identifier
+            for original_part in reversed(parts):
+                # Remove trailing punctuation
+                part = original_part.rstrip(",").strip()
+                # Check if it's a valid identifier (word characters only)
+                if re.match(r"^\w+$", part):
+                    return part
+
+        return None
+
+    def execute_sql_query(
+        self, file_path: str
+    ) -> tuple[pd.DataFrame, str, list[bigquery.SchemaField], str]:
+        """
+        Executes the SQL query in the specified file.
+
+        Returns:
+            Tuple[pd.DataFrame, str, List[bigquery.SchemaField], str]: A tuple containing
+            the DataFrame with query results, the output basename, the BigQuery schema, and
+            the SQL query string.
+        """
+        with Path(file_path).open("r") as file:
+            sql_query = file.read()
+        query_job_result = self.client.query(sql_query).result()
+        schema = query_job_result.schema  # Get schema from BigQuery QueryJob
+        index_df = query_job_result.to_dataframe()
+        if "StudyDate" in index_df.columns:
+            index_df["StudyDate"] = index_df["StudyDate"].astype(str)
+        output_basename = Path(file_path).name.split(".")[0]
+        logger.debug("Executed SQL query from file: %s", file_path)
+        return index_df, output_basename, schema, sql_query
+
+    def save_schema_to_json(
+        self,
+        schema: list[bigquery.SchemaField],
+        output_basename: str,
+        sql_query: str | None = None,
+        output_dir: Path | None = None,
+    ) -> None:
+        """
+        Saves the BigQuery schema to a JSON file, including column descriptions
+        parsed from SQL query comments.
+
+        Args:
+            schema: List of BigQuery SchemaField objects from the query result
+            output_basename: The base name for the output file
+            sql_query: The SQL query string to parse for column descriptions
+            output_dir: Optional directory path for the output file
+        """
+        # Parse column descriptions from SQL comments
+        logger.debug("Parsing column descriptions from SQL query comments")
+        logger.debug(sql_query)
+        if sql_query is not None:
+            descriptions = self.parse_column_descriptions(sql_query)
+
+            # Convert BigQuery schema to JSON-serializable format
+            schema_dict = {
+                "fields": [
+                    {
+                        "name": field.name,
+                        "type": field.field_type,
+                        "mode": field.mode,
+                        "description": descriptions.get(field.name, ""),
+                    }
+                    for field in schema
+                ]
+            }
+        else:
+            # If no SQL query provided, save schema without descriptions
+            schema_dict = {
+                "fields": [
+                    {
+                        "name": field.name,
+                        "type": field.field_type,
+                        "mode": field.mode,
+                        "description": "",
+                    }
+                    for field in schema
+                ]
+            }
+
+        # Save to JSON file
+        if output_dir:
+            output_dir.mkdir(parents=True, exist_ok=True)
+            json_file_path = output_dir / f"{output_basename}.json"
+        else:
+            json_file_path = Path(f"{output_basename}.json")
+
+        with json_file_path.open("w") as f:
+            json.dump(schema_dict, f, indent=2)
+        logger.debug("Created schema JSON file: %s", json_file_path)
+
+    def save_sql_query(
+        self,
+        sql_query: str,
+        output_basename: str,
+        output_dir: Path | None = None,
+    ) -> None:
+        """
+        Saves the SQL query to a file.
+
+        Args:
+            sql_query: The SQL query string
+            output_basename: The base name for the output file
+            output_dir: Optional directory path for the output file
+        """
+
+        if output_dir:
+            output_dir.mkdir(parents=True, exist_ok=True)
+            query_file_path = output_dir / f"{output_basename}.sql"
+        else:
+            query_file_path = Path(f"{output_basename}.sql")
+
+        with query_file_path.open("w") as f:
+            f.write(sql_query)
+        logger.debug("Created SQL query file: %s", query_file_path)
+
+    def generate_index_data_files(
+        self,
+        generate_compressed_csv: bool = True,
+        generate_parquet: bool = False,
+        output_dir: Path | None = None,
+    ) -> None:
+        """
+        Generates index-data files locally by executing queries against
+        the Google Cloud Platform IDC project tables.
+
+        This method iterates over SQL files in the 'scripts/sql' directory,
+        executing each query using :func:`execute_sql_query` and generating a DataFrame,
+        'index_df'. The DataFrame is then saved as compressed CSV and/or Parquet file.
+
+        Args:
+            generate_compressed_csv: Whether to generate compressed CSV files
+            generate_parquet: Whether to generate Parquet files
+            output_dir: Optional directory path for the output files
+        """
+
+        scripts_dir = Path(__file__).parent.parent
+        sql_dir = scripts_dir / "sql"
+
+        if output_dir:
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+        for file_name in Path.iterdir(sql_dir):
+            if str(file_name).endswith(".sql"):
+                file_path = Path(sql_dir) / file_name
+                index_df, output_basename, schema, sql_query = self.execute_sql_query(
+                    str(file_path)
+                )
+                logger.debug(
+                    "Executed and processed SQL queries from file: %s", file_path
+                )
+                if generate_compressed_csv:
+                    csv_file_path = (
+                        output_dir / f"{output_basename}.csv.zip"
+                        if output_dir
+                        else Path(f"{output_basename}.csv.zip")
+                    )
+                    index_df.to_csv(
+                        csv_file_path, compression={"method": "zip"}, escapechar="\\"
+                    )
+                    logger.debug("Created CSV zip file: %s", csv_file_path)
+
+                if generate_parquet:
+                    parquet_file_path = (
+                        output_dir / f"{output_basename}.parquet"
+                        if output_dir
+                        else Path(f"{output_basename}.parquet")
+                    )
+                    index_df.to_parquet(parquet_file_path, compression="zstd")
+                    logger.debug("Created Parquet file: %s", parquet_file_path)
+
+                # Save schema to JSON file
+                # Skip parsing descriptions for prior_versions_index as it has dynamic SQL
+                if output_basename != "prior_versions_index":
+                    self.save_schema_to_json(
+                        schema, output_basename, sql_query, output_dir
+                    )
+                else:
+                    # For prior_versions_index, save schema without descriptions
+                    self.save_schema_to_json(schema, output_basename, None, output_dir)
+                # Save SQL query to file
+                self.save_sql_query(sql_query, output_basename, output_dir)
+
+    def retrieve_latest_idc_release_version(self) -> int:
+        """
+        Retrieves the latest IDC release version.
+
+        This function executes a SQL query on the `version_metadata` table in the
+        `idc_current` dataset of the BigQuery client. It retrieves the maximum
+        `idc_version` and returns it as an integer.
+        """
+        query = """
+        SELECT
+            MAX(idc_version) AS latest_idc_release_version
+        FROM
+            `bigquery-public-data.idc_current.version_metadata`
+        """
+        query_job = self.client.query(query)
+        result = query_job.result()
+        return int(next(result).latest_idc_release_version)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--project",
+        default=os.environ.get("GCP_PROJECT", None),
+        help="Google Cloud Platform Project ID (default from GCP_PROJECT env. variable)",
+    )
+    parser.add_argument(
+        "--generate-csv-archive",
+        action="store_true",
+        help="Generate idc_index.csv.zip file",
+    )
+    parser.add_argument(
+        "--generate-parquet",
+        action="store_true",
+        help="Generate idc_index.parquet file",
+    )
+    parser.add_argument(
+        "--retrieve-latest-idc-release-version",
+        action="store_true",
+        help="Retrieve and display the latest IDC release version",
+    )
+
+    args = parser.parse_args()
+
+    if not args.project:
+        parser.error(
+            "Set GCP_PROJECT environment variable or specify --project argument"
+        )
+
+    if any([args.generate_csv_archive, args.generate_parquet]):
+        IDCIndexDataManager(args.project).generate_index_data_files(
+            generate_compressed_csv=args.generate_csv_archive,
+            generate_parquet=args.generate_parquet,
+        )
+    elif args.retrieve_latest_idc_release_version:
+        logging.basicConfig(level=logging.ERROR, force=True)
+        logger.setLevel(logging.ERROR)
+        version = IDCIndexDataManager(
+            args.project
+        ).retrieve_latest_idc_release_version()
+        print(f"{version}")  # noqa: T201
+    else:
+        parser.print_help()
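Because `parse_column_descriptions` is a staticmethod, the comment parser can be exercised without a BigQuery client or GCP credentials. A quick sketch, mirroring how the new tests below import it (it does assume the google-cloud-bigquery dependency is installed, since the module imports it at load time):

```python
# Quick sketch: the parser in isolation; no GCP credentials are needed.
from scripts.python.idc_index_data_manager import IDCIndexDataManager

sql = """
SELECT
  # description:
  # unique identifier of the collection
  collection_id,
FROM `bigquery-public-data.idc_v22_clinical.column_metadata`
"""
descriptions = IDCIndexDataManager.parse_column_descriptions(sql)
assert descriptions == {"collection_id": "unique identifier of the collection"}
```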
idc_index_data-22.1.3/tests/test_column_description_parser.py (new file)

@@ -0,0 +1,218 @@
+from __future__ import annotations
+
+from scripts.python.idc_index_data_manager import IDCIndexDataManager
+
+
+class TestColumnDescriptionParser:
+    """Tests for parsing column descriptions from SQL comments."""
+
+    def test_simple_column_description(self):
+        """Test parsing a simple column with description."""
+        sql_query = """
+        SELECT
+            # description:
+            # name of the collection
+            collection_name,
+        FROM table
+        """
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        assert "collection_name" in descriptions
+        assert descriptions["collection_name"] == "name of the collection"
+
+    def test_multiline_description(self):
+        """Test parsing a column with multi-line description."""
+        sql_query = """
+        SELECT
+            # description:
+            # this string is not empty if the specific series is
+            # part of an analysis results collection; analysis results can be added to a
+            # given collection over time
+            ANY_VALUE(analysis_result_id) AS analysis_result_id,
+        FROM table
+        """
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        assert "analysis_result_id" in descriptions
+        expected = (
+            "this string is not empty if the specific series is "
+            "part of an analysis results collection; analysis results can be added to a "
+            "given collection over time"
+        )
+        assert descriptions["analysis_result_id"] == expected
+
+    def test_column_with_as_clause(self):
+        """Test parsing a column with AS clause."""
+        sql_query = """
+        SELECT
+            # description:
+            # unique identifier of the DICOM series
+            SeriesInstanceUID AS series_id,
+        FROM table
+        """
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        assert "series_id" in descriptions
+        assert descriptions["series_id"] == "unique identifier of the DICOM series"
+
+    def test_column_with_function(self):
+        """Test parsing a column with function call."""
+        sql_query = """
+        SELECT
+            # description:
+            # age of the subject at the time of imaging
+            ANY_VALUE(PatientAge) AS PatientAge,
+        FROM table
+        """
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        assert "PatientAge" in descriptions
+        assert descriptions["PatientAge"] == "age of the subject at the time of imaging"
+
+    def test_multiple_columns(self):
+        """Test parsing multiple columns with descriptions."""
+        sql_query = """
+        SELECT
+            # description:
+            # name of the collection
+            collection_name,
+            # description:
+            # unique identifier of the collection
+            collection_id,
+            # description:
+            # types of cancer represented in the collection
+            CancerTypes,
+        FROM table
+        """
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        assert len(descriptions) == 3
+        assert descriptions["collection_name"] == "name of the collection"
+        assert descriptions["collection_id"] == "unique identifier of the collection"
+        assert (
+            descriptions["CancerTypes"]
+            == "types of cancer represented in the collection"
+        )
+
+    def test_column_without_description(self):
+        """Test that columns without descriptions are not in the result."""
+        sql_query = """
+        SELECT
+            # description:
+            # name of the collection
+            collection_name,
+            collection_id,
+        FROM table
+        """
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        assert "collection_name" in descriptions
+        assert "collection_id" not in descriptions
+
+    def test_extract_column_name_simple(self):
+        """Test extracting column name from simple definition."""
+        assert (
+            IDCIndexDataManager._extract_column_name("collection_name,")
+            == "collection_name"
+        )
+        assert (
+            IDCIndexDataManager._extract_column_name("collection_name")
+            == "collection_name"
+        )
+
+    def test_extract_column_name_with_as(self):
+        """Test extracting column name with AS clause."""
+        assert (
+            IDCIndexDataManager._extract_column_name(
+                "ANY_VALUE(collection_id) AS collection_id,"
+            )
+            == "collection_id"
+        )
+        assert IDCIndexDataManager._extract_column_name("column AS alias,") == "alias"
+
+    def test_extract_column_name_complex(self):
+        """Test extracting column name from complex expressions."""
+        assert (
+            IDCIndexDataManager._extract_column_name(
+                "ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,"
+            )
+            == "series_size_MB"
+        )
+        assert (
+            IDCIndexDataManager._extract_column_name("COUNT(SOPInstanceUID) AS count,")
+            == "count"
+        )
+
+    def test_complex_multiline_select(self):
+        """Test parsing a complex multi-line SELECT statement."""
+        sql_query = """
+        SELECT
+            # description:
+            # total size of the series in megabytes
+            ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
+        FROM table
+        """
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        assert "series_size_MB" in descriptions
+        assert descriptions["series_size_MB"] == "total size of the series in megabytes"
+
+    def test_no_descriptions(self):
+        """Test SQL query with no descriptions."""
+        sql_query = """
+        SELECT
+            collection_name,
+            collection_id,
+            CancerTypes
+        FROM table
+        """
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        assert len(descriptions) == 0
+
+    def test_empty_description_lines(self):
+        """Test handling of empty comment lines in descriptions."""
+        sql_query = """
+        SELECT
+            # description:
+            # name of the collection
+            #
+            # additional info
+            collection_name,
+        FROM table
+        """
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        assert "collection_name" in descriptions
+        # Empty comment lines should be skipped
+        assert (
+            descriptions["collection_name"] == "name of the collection additional info"
+        )
+
+    def test_nested_array_select_with_if(self):
+        """Test parsing complex nested ARRAY/SELECT/IF statements."""
+        sql_query = """
+        SELECT
+            # description:
+            # embedding medium used for the slide preparation
+            ARRAY(
+            SELECT
+                IF
+                (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
+            FROM
+                UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_CodeMeaning,
+            # description:
+            # embedding medium code tuple
+            ARRAY(
+            SELECT
+                IF
+                (code IS NULL, NULL,
+                IF
+                (STRPOS(code, ':') = 0, NULL, SUBSTR(code, STRPOS(code, ':') + 1)))
+            FROM
+                UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_code_designator_value_str,
+        FROM table
+        """
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        assert len(descriptions) == 2
+        assert "embeddingMedium_CodeMeaning" in descriptions
+        assert (
+            descriptions["embeddingMedium_CodeMeaning"]
+            == "embedding medium used for the slide preparation"
+        )
+        assert "embeddingMedium_code_designator_value_str" in descriptions
+        assert (
+            descriptions["embeddingMedium_code_designator_value_str"]
+            == "embedding medium code tuple"
+        )
idc_index_data-22.1.3/tests/test_real_sql_parsing.py (new file)

@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from scripts.python.idc_index_data_manager import IDCIndexDataManager
+
+"""Test script to verify column description parsing with real SQL files."""
+
+
+def test_real_sql_files() -> None:
+    """Test parsing descriptions from actual SQL files in the repository."""
+    scripts_dir = Path(__file__).parent.parent / "scripts"
+    sql_dir = scripts_dir / "sql"
+
+    # Test collections_index.sql
+    collections_sql_path = sql_dir / "collections_index.sql"
+    if collections_sql_path.exists():
+        with collections_sql_path.open("r") as f:
+            sql_query = f.read()
+
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        print("\n=== collections_index.sql ===")
+        print(f"Found {len(descriptions)} column descriptions:")
+        for col_name, desc in descriptions.items():
+            print(f"  {col_name}: {desc[:60]}...")
+
+        # Validate expected columns
+        expected_columns = [
+            "collection_name",
+            "collection_id",
+            "CancerTypes",
+            "TumorLocations",
+            "Subjects",
+            "Species",
+            "Sources",
+            "SupportingData",
+            "Program",
+            "Status",
+            "Updated",
+            "Description",
+        ]
+        for col in expected_columns:
+            assert col in descriptions, (
+                f"Expected column '{col}' not found in descriptions"
+            )
+        print("✓ All expected columns found")
+
+    # Test idc_index.sql
+    idc_index_sql_path = sql_dir / "idc_index.sql"
+    if idc_index_sql_path.exists():
+        with idc_index_sql_path.open("r") as f:
+            sql_query = f.read()
+
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        print("\n=== idc_index.sql ===")
+        print(f"Found {len(descriptions)} column descriptions:")
+
+        # Show first 10 descriptions
+        for i, (col_name, desc) in enumerate(descriptions.items()):
+            if i < 10:
+                print(f"  {col_name}: {desc[:60]}...")
+            else:
+                break
+
+        if len(descriptions) > 10:
+            print(f"  ... and {len(descriptions) - 10} more")
+
+        # Validate some expected columns
+        expected_columns = [
+            "collection_id",
+            "analysis_result_id",
+            "PatientID",
+            "SeriesInstanceUID",
+            "StudyInstanceUID",
+            "source_DOI",
+            "PatientAge",
+            "PatientSex",
+            "StudyDate",
+            "series_size_MB",
+        ]
+        for col in expected_columns:
+            if col in descriptions:
+                print(f"✓ Found expected column: {col}")
+            else:
+                print(f"✗ Missing expected column: {col}")
+
+    # Test analysis_results_index.sql (should have no descriptions)
+    analysis_sql_path = sql_dir / "analysis_results_index.sql"
+    if analysis_sql_path.exists():
+        with analysis_sql_path.open("r") as f:
+            sql_query = f.read()
+
+        descriptions = IDCIndexDataManager.parse_column_descriptions(sql_query)
+        print("\n=== analysis_results_index.sql ===")
+        print(f"Found {len(descriptions)} column descriptions (expected 0)")
+        assert len(descriptions) == 0, "Expected no descriptions in this file"
+        print("✓ Correctly found no descriptions")
+
+
+if __name__ == "__main__":
+    test_real_sql_files()
idc_index_data-22.1.2/scripts/python/idc_index_data_manager.py (removed)

@@ -1,231 +0,0 @@
-from __future__ import annotations
-
-import json
-import logging
-import os
-from pathlib import Path
-
-import pandas as pd
-from google.cloud import bigquery
-
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger(__name__)
-
-
-class IDCIndexDataManager:
-    def __init__(self, project_id: str):
-        """
-        Initializes the IDCIndexDataManager using the Google Cloud Platform project ID.
-        """
-        self.project_id = project_id
-        self.client = bigquery.Client(project=project_id)
-        logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)
-
-    def execute_sql_query(
-        self, file_path: str
-    ) -> tuple[pd.DataFrame, str, list[bigquery.SchemaField]]:
-        """
-        Executes the SQL query in the specified file.
-
-        Returns:
-            Tuple[pd.DataFrame, str, List[bigquery.SchemaField]]: A tuple containing
-            the DataFrame with query results, the output basename, and the BigQuery schema.
-        """
-        with Path(file_path).open("r") as file:
-            sql_query = file.read()
-        query_job_result = self.client.query(sql_query).result()
-        schema = query_job_result.schema  # Get schema from BigQuery QueryJob
-        index_df = query_job_result.to_dataframe()
-        if "StudyDate" in index_df.columns:
-            index_df["StudyDate"] = index_df["StudyDate"].astype(str)
-        output_basename = Path(file_path).name.split(".")[0]
-        logger.debug("Executed SQL query from file: %s", file_path)
-        return index_df, output_basename, schema, sql_query
-
-    def save_schema_to_json(
-        self,
-        schema: list[bigquery.SchemaField],
-        output_basename: str,
-        output_dir: Path | None = None,
-    ) -> None:
-        """
-        Saves the BigQuery schema to a JSON file.
-
-        Args:
-            schema: List of BigQuery SchemaField objects from the query result
-            output_basename: The base name for the output file
-            output_dir: Optional directory path for the output file
-        """
-        # Convert BigQuery schema to JSON-serializable format
-        schema_dict = {
-            "fields": [
-                {
-                    "name": field.name,
-                    "type": field.field_type,
-                    "mode": field.mode,
-                }
-                for field in schema
-            ]
-        }
-
-        # Save to JSON file
-        if output_dir:
-            output_dir.mkdir(parents=True, exist_ok=True)
-            json_file_path = output_dir / f"{output_basename}.json"
-        else:
-            json_file_path = Path(f"{output_basename}.json")
-
-        with json_file_path.open("w") as f:
-            json.dump(schema_dict, f, indent=2)
-        logger.debug("Created schema JSON file: %s", json_file_path)
-
-    def save_sql_query(
-        self,
-        sql_query: str,
-        output_basename: str,
-        output_dir: Path | None = None,
-    ) -> None:
-        """
-        Saves the SQL query to a file.
-
-        Args:
-            sql_query: The SQL query string
-            output_basename: The base name for the output file
-            output_dir: Optional directory path for the output file
-        """
-
-        if output_dir:
-            output_dir.mkdir(parents=True, exist_ok=True)
-            query_file_path = output_dir / f"{output_basename}.sql"
-        else:
-            query_file_path = Path(f"{output_basename}.sql")
-
-        with query_file_path.open("w") as f:
-            f.write(sql_query)
-        logger.debug("Created SQL query file: %s", query_file_path)
-
-    def generate_index_data_files(
-        self,
-        generate_compressed_csv: bool = True,
-        generate_parquet: bool = False,
-        output_dir: Path | None = None,
-    ) -> None:
-        """
-        Generates index-data files locally by executing queries against
-        the Google Cloud Platform IDC project tables.
-
-        This method iterates over SQL files in the 'scripts/sql' directory,
-        executing each query using :func:`execute_sql_query` and generating a DataFrame,
-        'index_df'. The DataFrame is then saved as compressed CSV and/or Parquet file.
-
-        Args:
-            generate_compressed_csv: Whether to generate compressed CSV files
-            generate_parquet: Whether to generate Parquet files
-            output_dir: Optional directory path for the output files
-        """
-
-        scripts_dir = Path(__file__).parent.parent
-        sql_dir = scripts_dir / "sql"
-
-        if output_dir:
-            output_dir.mkdir(parents=True, exist_ok=True)
-
-        for file_name in Path.iterdir(sql_dir):
-            if str(file_name).endswith(".sql"):
-                file_path = Path(sql_dir) / file_name
-                index_df, output_basename, schema, sql_query = self.execute_sql_query(
-                    file_path
-                )
-                logger.debug(
-                    "Executed and processed SQL queries from file: %s", file_path
-                )
-                if generate_compressed_csv:
-                    csv_file_path = (
-                        output_dir / f"{output_basename}.csv.zip"
-                        if output_dir
-                        else Path(f"{output_basename}.csv.zip")
-                    )
-                    index_df.to_csv(
-                        csv_file_path, compression={"method": "zip"}, escapechar="\\"
-                    )
-                    logger.debug("Created CSV zip file: %s", csv_file_path)
-
-                if generate_parquet:
-                    parquet_file_path = (
-                        output_dir / f"{output_basename}.parquet"
-                        if output_dir
-                        else Path(f"{output_basename}.parquet")
-                    )
-                    index_df.to_parquet(parquet_file_path, compression="zstd")
-                    logger.debug("Created Parquet file: %s", parquet_file_path)
-
-                # Save schema to JSON file
-                self.save_schema_to_json(schema, output_basename, output_dir)
-                # Save SQL query to file
-                self.save_sql_query(sql_query, output_basename, output_dir)
-
-    def retrieve_latest_idc_release_version(self) -> int:
-        """
-        Retrieves the latest IDC release version.
-
-        This function executes a SQL query on the `version_metadata` table in the
-        `idc_current` dataset of the BigQuery client. It retrieves the maximum
-        `idc_version` and returns it as an integer.
-        """
-        query = """
-        SELECT
-            MAX(idc_version) AS latest_idc_release_version
-        FROM
-            `bigquery-public-data.idc_current.version_metadata`
-        """
-        query_job = self.client.query(query)
-        result = query_job.result()
-        return int(next(result).latest_idc_release_version)
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--project",
-        default=os.environ.get("GCP_PROJECT", None),
-        help="Google Cloud Platform Project ID (default from GCP_PROJECT env. variable)",
-    )
-    parser.add_argument(
-        "--generate-csv-archive",
-        action="store_true",
-        help="Generate idc_index.csv.zip file",
-    )
-    parser.add_argument(
-        "--generate-parquet",
-        action="store_true",
-        help="Generate idc_index.parquet file",
-    )
-    parser.add_argument(
-        "--retrieve-latest-idc-release-version",
-        action="store_true",
-        help="Retrieve and display the latest IDC release version",
-    )
-
-    args = parser.parse_args()
-
-    if not args.project:
-        parser.error(
-            "Set GCP_PROJECT environment variable or specify --project argument"
-        )
-
-    if any([args.generate_csv_archive, args.generate_parquet]):
-        IDCIndexDataManager(args.project).generate_index_data_files(
-            generate_compressed_csv=args.generate_csv_archive,
-            generate_parquet=args.generate_parquet,
-        )
-    elif args.retrieve_latest_idc_release_version:
-        logging.basicConfig(level=logging.ERROR, force=True)
-        logger.setLevel(logging.ERROR)
-        version = IDCIndexDataManager(
-            args.project
-        ).retrieve_latest_idc_release_version()
-        print(f"{version}")  # noqa: T201
-    else:
-        parser.print_help()
The remaining 29 files listed above with +0 -0 are identical in both versions.