idc-index-data 22.1.1.tar.gz → 22.1.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.github/workflows/external-indices.yml +1 -1
  2. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/PKG-INFO +1 -1
  3. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/assets/sm_index.sql +36 -3
  4. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/assets/sm_instance_index.sql +30 -0
  5. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/pyproject.toml +1 -1
  6. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/scripts/python/generate-indices.py +8 -2
  7. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/scripts/python/idc_index_data_manager.py +33 -4
  8. idc_index_data-22.1.2/scripts/sql/collections_index.sql +39 -0
  9. idc_index_data-22.1.2/scripts/sql/idc_index.sql +88 -0
  10. idc_index_data-22.1.1/scripts/sql/collections_index.sql +0 -15
  11. idc_index_data-22.1.1/scripts/sql/idc_index.sql +0 -38
  12. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.git_archival.txt +0 -0
  13. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.gitattributes +0 -0
  14. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.github/CONTRIBUTING.md +0 -0
  15. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.github/copilot-instructions.md +0 -0
  16. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.github/dependabot.yml +0 -0
  17. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.github/matchers/pylint.json +0 -0
  18. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.github/workflows/cd.yml +0 -0
  19. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.github/workflows/ci.yml +0 -0
  20. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.gitignore +0 -0
  21. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.pre-commit-config.yaml +0 -0
  22. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/.readthedocs.yaml +0 -0
  23. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/CMakeLists.txt +0 -0
  24. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/LICENSE +0 -0
  25. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/README.md +0 -0
  26. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/assets/README.md +0 -0
  27. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/assets/clinical_index.sql +0 -0
  28. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/docs/conf.py +0 -0
  29. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/docs/index.md +0 -0
  30. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/noxfile.py +0 -0
  31. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/scripts/python/update_idc_index_version.py +0 -0
  32. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/scripts/sql/analysis_results_index.sql +0 -0
  33. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/scripts/sql/prior_versions_index.sql +0 -0
  34. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/src/idc_index_data/__init__.py +0 -0
  35. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/src/idc_index_data/_version.pyi +0 -0
  36. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/src/idc_index_data/py.typed +0 -0
  37. {idc_index_data-22.1.1 → idc_index_data-22.1.2}/tests/test_package.py +0 -0
@@ -55,6 +55,6 @@ jobs:
55
55
  if: github.event_name == 'release' && github.event.action == 'published'
56
56
  uses: ncipollo/release-action@v1
57
57
  with:
58
- artifacts: "release_artifacts/*.parquet,release_artifacts/*.json"
58
+ artifacts: "release_artifacts/*.parquet,release_artifacts/*.json,release_artifacts/*.sql"
59
59
  allowUpdates: true
60
60
  omitBodyDuringUpdate: true
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: idc-index-data
3
- Version: 22.1.1
3
+ Version: 22.1.2
4
4
  Summary: ImagingDataCommons index to query and download data.
5
5
  Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
6
6
  License: Copyright 2024 Andrey Fedorov
@@ -82,10 +82,14 @@ SpecimenPreparationSequence_unnested AS (
82
82
  SELECT
83
83
  temp_table.SeriesInstanceUID,
84
84
  -- Embedding Medium
85
+ # description:
86
+ # embedding medium used for the slide preparation
85
87
  ARRAY(
86
88
  SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
87
89
  FROM UNNEST(embeddingMedium_code_str) AS code
88
90
  ) AS embeddingMedium_CodeMeaning,
91
+ # description:
92
+ # embedding medium code tuple
89
93
  ARRAY(
90
94
  SELECT IF(code IS NULL, NULL,
91
95
  IF(STRPOS(code, ':') = 0, NULL,
@@ -93,10 +97,14 @@ SELECT
93
97
  FROM UNNEST(embeddingMedium_code_str) AS code
94
98
  ) AS embeddingMedium_code_designator_value_str,
95
99
  -- Tissue Fixative
100
+ # description:
101
+ # tissue fixative used for the slide preparation
96
102
  ARRAY(
97
103
  SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
98
104
  FROM UNNEST(tissueFixative_code_str) AS code
99
105
  ) AS tissueFixative_CodeMeaning,
106
+ # description:
107
+ # tissue fixative code tuple
100
108
  ARRAY(
101
109
  SELECT IF(code IS NULL, NULL,
102
110
  IF(STRPOS(code, ':') = 0, NULL,
@@ -104,31 +112,56 @@ SELECT
104
112
  FROM UNNEST(tissueFixative_code_str) AS code
105
113
  ) AS tissueFixative_code_designator_value_str,
106
114
  -- Staining using substance
115
+ # description:
116
+ # staining substances used for the slide preparation
107
117
  ARRAY(
108
118
  SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
109
119
  FROM UNNEST(staining_usingSubstance_code_str) AS code
110
120
  ) AS staining_usingSubstance_CodeMeaning,
121
+ # description:
122
+ # staining using substance code tuple
111
123
  ARRAY(
112
124
  SELECT IF(code IS NULL, NULL,
113
125
  IF(STRPOS(code, ':') = 0, NULL,
114
126
  SUBSTR(code, STRPOS(code, ':') + 1)))
115
127
  FROM UNNEST(staining_usingSubstance_code_str) AS code
116
128
  ) AS staining_usingSubstance_code_designator_value_str,
117
-
129
+ # description:
130
+ # pixel spacing in mm at the maximum resolution layer, rounded to 2 significant figures
118
131
  if(COALESCE(min_spacing_0, fg_min_spacing_0) = 0, 0,
119
132
  round(COALESCE(min_spacing_0, fg_min_spacing_0) ,CAST(2 -1-floor(log10(abs(COALESCE(min_spacing_0, fg_min_spacing_0) ))) AS INT64))) AS min_PixelSpacing_2sf,
133
+ # description:
134
+ # width of the image at the maximum resolution
120
135
  COALESCE(max_TotalPixelMatrixColumns, max_Columns) AS max_TotalPixelMatrixColumns,
136
+ # description:
137
+ # height of the image at the maximum resolution
121
138
  COALESCE(max_TotalPixelMatrixRows, max_Rows) AS max_TotalPixelMatrixRows,
139
+ # description:
140
+ # power of the objective lens of the equipment used to digitize the slide
122
141
  SAFE_CAST(ObjectiveLensPower as INT) as ObjectiveLensPower,
142
+ # description:
143
+ # anatomic location from where the imaged specimen was collected
123
144
  CONCAT(SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructure_code_designator_value_str,
145
+ # description:
146
+ # code tuple for the anatomic location from where the imaged specimen was collected
124
147
  SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructure_CodeMeaning,
148
+ # description:
149
+ # additional characteristics of the specimen, such as whether it is a tumor or normal tissue (when available)
125
150
  CONCAT(SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructureModifier_code_designator_value_str,
151
+ # description:
152
+ # code tuple for additional characteristics of the specimen, such as whether it is a tumor or normal tissue (when available)
126
153
  SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructureModifier_CodeMeaning,
127
-
154
+ # description:
155
+ # illumination type used during slide digitization
128
156
  CONCAT(SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(1)]) as illuminationType_code_designator_value_str,
157
+ # description:
158
+ # code tuple for the illumination type used during slide digitization
129
159
  SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(2)] as illuminationType_CodeMeaning,
130
-
160
+ # description:
161
+ # admitting diagnosis associated with the specimen imaged on the slide (when available)
131
162
  CONCAT(SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(1)]) as admittingDiagnosis_code_designator_value_str,
163
+ # description:
164
+ # code tuple for the admitting diagnosis associated with the specimen imaged on the slide (when available)
132
165
  SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(2)] as admittingDiagnosis_CodeMeaning,
133
166
  FROM
134
167
  temp_table
@@ -48,15 +48,23 @@ WITH
48
48
  GROUP BY
49
49
  SOPInstanceUID )
50
50
  SELECT
51
+ # description:
52
+ # unique identifier of the instance
51
53
  dicom_all.SOPInstanceUID,
54
+ # description:
55
+ # unique identifier of the series
52
56
  dicom_all.SeriesInstanceUID,
53
57
  -- Embedding Medium
58
+ # description:
59
+ # embedding medium used for the slide preparation
54
60
  ARRAY(
55
61
  SELECT
56
62
  IF
57
63
  (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
58
64
  FROM
59
65
  UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_CodeMeaning,
66
+ # description:
67
+ # embedding medium code tuple
60
68
  ARRAY(
61
69
  SELECT
62
70
  IF
@@ -66,12 +74,16 @@ SELECT
66
74
  FROM
67
75
  UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_code_designator_value_str,
68
76
  -- Tissue Fixative
77
+ # description:
78
+ # tissue fixative used for the slide preparation
69
79
  ARRAY(
70
80
  SELECT
71
81
  IF
72
82
  (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
73
83
  FROM
74
84
  UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_CodeMeaning,
85
+ # description:
86
+ # tissue fixative code tuple
75
87
  ARRAY(
76
88
  SELECT
77
89
  IF
@@ -81,12 +93,16 @@ SELECT
81
93
  FROM
82
94
  UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_code_designator_value_str,
83
95
  -- Staining using substance
96
+ # description:
97
+ # staining substances used for the slide preparation
84
98
  ARRAY(
85
99
  SELECT
86
100
  IF
87
101
  (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
88
102
  FROM
89
103
  UNNEST(staining_usingSubstance_code_str) AS code ) AS staining_usingSubstance_CodeMeaning,
104
+ # description:
105
+ # staining using substance code tuple
90
106
  ARRAY(
91
107
  SELECT
92
108
  IF
@@ -98,13 +114,27 @@ SELECT
98
114
  -- instance-specific image attributes
99
115
  -- NB: there is a caveat that I think in general, we expect square pixels, but in htan_wustl and cptac_luad this assumption does not hold,
100
116
  -- and in htan_wustl, the difference is rather large (x2) - waiting to hear from David Clunie about this...
117
+ # description:
118
+ # pixel spacing in mm, rounded to 2 significant figures
101
119
  SAFE_CAST(SharedFunctionalGroupsSequence[SAFE_OFFSET(0)].PixelMeasuresSequence[SAFE_OFFSET(0)]. PixelSpacing[SAFE_OFFSET(0)] AS FLOAT64) AS PixelSpacing_0,
120
+ # description:
121
+ # DICOM ImageType attribute
102
122
  dicom_all.ImageType,
123
+ # description:
124
+ # DICOM TransferSyntaxUID attribute
103
125
  dicom_all.TransferSyntaxUID,
126
+ # description:
127
+ # size of the instance file in bytes
104
128
  dicom_all.instance_size,
129
+ # description:
130
+ # number of columns in the image
105
131
  dicom_all.TotalPixelMatrixColumns,
132
+ # description:
133
+ # number of rows in the image
106
134
  dicom_all.TotalPixelMatrixRows,
107
135
  -- attributes needed to retrieve the selected instances/files
136
+ # description:
137
+ # unique identifier of the instance within the IDC
108
138
  dicom_all.crdc_instance_uuid
109
139
  FROM
110
140
  `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
13
13
 
14
14
  [project]
15
15
  name = "idc-index-data"
16
- version = "22.1.1"
16
+ version = "22.1.2"
17
17
  authors = [
18
18
  { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
19
19
  { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
@@ -23,10 +23,13 @@ def main():
23
23
 
24
24
  for file_name in sql_files:
25
25
  file_path = assets_dir / file_name
26
- index_df, output_basename, schema = manager.execute_sql_query(file_path)
26
+ index_df, output_basename, schema, sql_query = manager.execute_sql_query(
27
+ file_path
28
+ )
27
29
  parquet_file_path = output_dir / f"{output_basename}.parquet"
28
30
  index_df.to_parquet(parquet_file_path)
29
31
  manager.save_schema_to_json(schema, output_basename, output_dir)
32
+ manager.save_sql_query(sql_query, output_basename, output_dir)
30
33
 
31
34
  core_indices_dir = scripts_dir.parent / "scripts" / "sql"
32
35
 
@@ -34,10 +37,13 @@ def main():
34
37
 
35
38
  for file_name in sql_files:
36
39
  file_path = core_indices_dir / file_name
37
- index_df, output_basename, schema = manager.execute_sql_query(file_path)
40
+ index_df, output_basename, schema, sql_query = manager.execute_sql_query(
41
+ file_path
42
+ )
38
43
  parquet_file_path = output_dir / f"{output_basename}.parquet"
39
44
  index_df.to_parquet(parquet_file_path)
40
45
  manager.save_schema_to_json(schema, output_basename, output_dir)
46
+ manager.save_sql_query(sql_query, output_basename, output_dir)
41
47
 
42
48
 
43
49
  if __name__ == "__main__":
@@ -40,7 +40,7 @@ class IDCIndexDataManager:
40
40
  index_df["StudyDate"] = index_df["StudyDate"].astype(str)
41
41
  output_basename = Path(file_path).name.split(".")[0]
42
42
  logger.debug("Executed SQL query from file: %s", file_path)
43
- return index_df, output_basename, schema
43
+ return index_df, output_basename, schema, sql_query
44
44
 
45
45
  def save_schema_to_json(
46
46
  self,
@@ -79,6 +79,31 @@ class IDCIndexDataManager:
79
79
  json.dump(schema_dict, f, indent=2)
80
80
  logger.debug("Created schema JSON file: %s", json_file_path)
81
81
 
82
+ def save_sql_query(
83
+ self,
84
+ sql_query: str,
85
+ output_basename: str,
86
+ output_dir: Path | None = None,
87
+ ) -> None:
88
+ """
89
+ Saves the SQL query to a file.
90
+
91
+ Args:
92
+ sql_query: The SQL query string
93
+ output_basename: The base name for the output file
94
+ output_dir: Optional directory path for the output file
95
+ """
96
+
97
+ if output_dir:
98
+ output_dir.mkdir(parents=True, exist_ok=True)
99
+ query_file_path = output_dir / f"{output_basename}.sql"
100
+ else:
101
+ query_file_path = Path(f"{output_basename}.sql")
102
+
103
+ with query_file_path.open("w") as f:
104
+ f.write(sql_query)
105
+ logger.debug("Created SQL query file: %s", query_file_path)
106
+
82
107
  def generate_index_data_files(
83
108
  self,
84
109
  generate_compressed_csv: bool = True,
@@ -108,7 +133,9 @@ class IDCIndexDataManager:
108
133
  for file_name in Path.iterdir(sql_dir):
109
134
  if str(file_name).endswith(".sql"):
110
135
  file_path = Path(sql_dir) / file_name
111
- index_df, output_basename, schema = self.execute_sql_query(file_path)
136
+ index_df, output_basename, schema, sql_query = self.execute_sql_query(
137
+ file_path
138
+ )
112
139
  logger.debug(
113
140
  "Executed and processed SQL queries from file: %s", file_path
114
141
  )
@@ -132,8 +159,10 @@ class IDCIndexDataManager:
132
159
  index_df.to_parquet(parquet_file_path, compression="zstd")
133
160
  logger.debug("Created Parquet file: %s", parquet_file_path)
134
161
 
135
- # Save schema to JSON file
136
- self.save_schema_to_json(schema, output_basename, output_dir)
162
+ # Save schema to JSON file
163
+ self.save_schema_to_json(schema, output_basename, output_dir)
164
+ # Save SQL query to file
165
+ self.save_sql_query(sql_query, output_basename, output_dir)
137
166
 
138
167
  def retrieve_latest_idc_release_version(self) -> int:
139
168
  """
@@ -0,0 +1,39 @@
1
+ SELECT
2
+ # description:
3
+ # name of the collection
4
+ collection_name,
5
+ # description:
6
+ # unique identifier of the collection
7
+ collection_id,
8
+ # description:
9
+ # types of cancer represented in the collection
10
+ CancerTypes,
11
+ # description:
12
+ # locations of tumors represented in the collection
13
+ TumorLocations,
14
+ # description:
15
+ # number of subjects in the collection
16
+ Subjects,
17
+ # description:
18
+ # species represented in the collection
19
+ Species,
20
+ # description:
21
+ # sources of data for the collection
22
+ Sources,
23
+ # description:
24
+ # additional data supporting the collection available in IDC
25
+ SupportingData,
26
+ # description:
27
+ # broader initiative/category under which this collection is being shared
28
+ Program,
29
+ # description:
30
+ # status of the collection (Completed or Ongoing)
31
+ Status,
32
+ # description:
33
+ # timestamp of the last update to the collection
34
+ Updated,
35
+ # description:
36
+ # detailed information about the collection
37
+ Description
38
+ FROM
39
+ `bigquery-public-data.idc_v22.original_collections_metadata`
@@ -0,0 +1,88 @@
1
+ SELECT
2
+ # collection level attributes
3
+ # description:
4
+ # short string with the identifier of the collection the series belongs to
5
+ ANY_VALUE(collection_id) AS collection_id,
6
+ # description:
7
+ # this string is not empty if the specific series is
8
+ # part of an analysis results collection; analysis results can be added to a
9
+ # given collection over time
10
+ ANY_VALUE(analysis_result_id) AS analysis_result_id,
11
+ # description:
12
+ # identifier of the patient within the collection (DICOM attribute)
13
+ ANY_VALUE(PatientID) AS PatientID,
14
+ # description:
15
+ # unique identifier of the DICOM series (DICOM attribute)
16
+ SeriesInstanceUID,
17
+ # description:
18
+ # unique identifier of the DICOM study (DICOM attribute)
19
+ ANY_VALUE(StudyInstanceUID) AS StudyInstanceUID,
20
+ # description:
21
+ # Digital Object Identifier of the dataset that contains the given
22
+ # series; follow this DOI to learn more about the activity that produced
23
+ # this series
24
+ ANY_VALUE(source_DOI) AS source_DOI,
25
+ # patient level attributes
26
+ # description:
27
+ # age of the subject at the time of imaging (DICOM attribute)
28
+ ANY_VALUE(PatientAge) AS PatientAge,
29
+ # description:
30
+ # subject sex (DICOM attribute)
31
+ ANY_VALUE(PatientSex) AS PatientSex,
32
+ # study level attributes
33
+ # description:
34
+ # date of the study (de-identified) (DICOM attribute)
35
+ ANY_VALUE(StudyDate) AS StudyDate,
36
+ # description:
37
+ # textual description of the study content (DICOM attribute)
38
+ ANY_VALUE(StudyDescription) AS StudyDescription,
39
+ # description:
40
+ # body part imaged (not applicable for SM series) (DICOM attribute)
41
+ ANY_VALUE(dicom_curated.BodyPartExamined) AS BodyPartExamined,
42
+ # series level attributes
43
+ # description:
44
+ # acquisition modality (DICOM attribute)
45
+ ANY_VALUE(Modality) AS Modality,
46
+ # description:
47
+ # manufacturer of the equipment that produced the series (DICOM attribute)
48
+ ANY_VALUE(Manufacturer) AS Manufacturer,
49
+ # description:
50
+ # model name of the equipment that produced the series (DICOM attribute)
51
+ ANY_VALUE(ManufacturerModelName) AS ManufacturerModelName,
52
+ # description:
53
+ # date of the series (de-identified) (DICOM attribute)
54
+ ANY_VALUE(SAFE_CAST(SeriesDate AS STRING)) AS SeriesDate,
55
+ # description:
56
+ # textual description of the series content (DICOM attribute)
57
+ ANY_VALUE(SeriesDescription) AS SeriesDescription,
58
+ # description:
59
+ # series number (DICOM attribute)
60
+ ANY_VALUE(SeriesNumber) AS SeriesNumber,
61
+ # description:
62
+ # number of instances in the series
63
+ COUNT(dicom_all.SOPInstanceUID) AS instanceCount,
64
+ # description:
65
+ # short name of the license that applies to this series
66
+ ANY_VALUE(license_short_name) as license_short_name,
67
+ # download related attributes
68
+ # description:
69
+ # name of the AWS S3 bucket that contains the series
70
+ ANY_VALUE(aws_bucket) AS aws_bucket,
71
+ # description:
72
+ # unique identifier of the series within the IDC
73
+ ANY_VALUE(crdc_series_uuid) AS crdc_series_uuid,
74
+ # series_aws_url will be phased out in favor of constructing URL from bucket+UUID
75
+ # description:
76
+ # public AWS S3 URL to download the series in bulk (each instance is a separate file)
77
+ ANY_VALUE(CONCAT(series_aws_url,"*")) AS series_aws_url,
78
+ # description:
79
+ # total size of the series in megabytes
80
+ ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
81
+ FROM
82
+ `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
83
+ JOIN
84
+ `bigquery-public-data.idc_v22.dicom_metadata_curated` AS dicom_curated
85
+ ON
86
+ dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
87
+ GROUP BY
88
+ SeriesInstanceUID
@@ -1,15 +0,0 @@
1
- SELECT
2
- collection_name,
3
- collection_id,
4
- CancerTypes,
5
- TumorLocations,
6
- Subjects,
7
- Species,
8
- Sources,
9
- SupportingData,
10
- Program,
11
- Status,
12
- Updated,
13
- Description
14
- FROM
15
- `bigquery-public-data.idc_v22.original_collections_metadata`
@@ -1,38 +0,0 @@
1
- SELECT
2
- # collection level attributes
3
- ANY_VALUE(collection_id) AS collection_id,
4
- ANY_VALUE(analysis_result_id) AS analysis_result_id,
5
- ANY_VALUE(PatientID) AS PatientID,
6
- SeriesInstanceUID,
7
- ANY_VALUE(StudyInstanceUID) AS StudyInstanceUID,
8
- ANY_VALUE(source_DOI) AS source_DOI,
9
- # patient level attributes
10
- ANY_VALUE(PatientAge) AS PatientAge,
11
- ANY_VALUE(PatientSex) AS PatientSex,
12
- # study level attributes
13
- ANY_VALUE(StudyDate) AS StudyDate,
14
- ANY_VALUE(StudyDescription) AS StudyDescription,
15
- ANY_VALUE(dicom_curated.BodyPartExamined) AS BodyPartExamined,
16
- # series level attributes
17
- ANY_VALUE(Modality) AS Modality,
18
- ANY_VALUE(Manufacturer) AS Manufacturer,
19
- ANY_VALUE(ManufacturerModelName) AS ManufacturerModelName,
20
- ANY_VALUE(SAFE_CAST(SeriesDate AS STRING)) AS SeriesDate,
21
- ANY_VALUE(SeriesDescription) AS SeriesDescription,
22
- ANY_VALUE(SeriesNumber) AS SeriesNumber,
23
- COUNT(dicom_all.SOPInstanceUID) AS instanceCount,
24
- ANY_VALUE(license_short_name) as license_short_name,
25
- # download related attributes
26
- ANY_VALUE(aws_bucket) AS aws_bucket,
27
- ANY_VALUE(crdc_series_uuid) AS crdc_series_uuid,
28
- # series_aws_url will be phased out in favor of constructing URL from bucket+UUID
29
- ANY_VALUE(CONCAT(series_aws_url,"*")) AS series_aws_url,
30
- ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
31
- FROM
32
- `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
33
- JOIN
34
- `bigquery-public-data.idc_v22.dicom_metadata_curated` AS dicom_curated
35
- ON
36
- dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
37
- GROUP BY
38
- SeriesInstanceUID
File without changes