idc-index-data 22.1.1__tar.gz → 22.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/workflows/external-indices.yml +1 -1
  2. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.pre-commit-config.yaml +1 -0
  3. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/PKG-INFO +2 -1
  4. idc_index_data-22.1.3/assets/clinical_index.sql +23 -0
  5. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/assets/sm_index.sql +36 -3
  6. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/assets/sm_instance_index.sql +30 -0
  7. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/pyproject.toml +6 -4
  8. idc_index_data-22.1.3/pytest.ini +2 -0
  9. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/scripts/python/generate-indices.py +14 -4
  10. idc_index_data-22.1.3/scripts/python/idc_index_data_manager.py +424 -0
  11. idc_index_data-22.1.3/scripts/sql/collections_index.sql +39 -0
  12. idc_index_data-22.1.3/scripts/sql/idc_index.sql +88 -0
  13. idc_index_data-22.1.3/tests/test_column_description_parser.py +218 -0
  14. idc_index_data-22.1.3/tests/test_real_sql_parsing.py +101 -0
  15. idc_index_data-22.1.1/assets/clinical_index.sql +0 -11
  16. idc_index_data-22.1.1/scripts/python/idc_index_data_manager.py +0 -202
  17. idc_index_data-22.1.1/scripts/sql/collections_index.sql +0 -15
  18. idc_index_data-22.1.1/scripts/sql/idc_index.sql +0 -38
  19. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.git_archival.txt +0 -0
  20. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.gitattributes +0 -0
  21. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/CONTRIBUTING.md +0 -0
  22. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/copilot-instructions.md +0 -0
  23. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/dependabot.yml +0 -0
  24. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/matchers/pylint.json +0 -0
  25. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/workflows/cd.yml +0 -0
  26. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.github/workflows/ci.yml +0 -0
  27. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.gitignore +0 -0
  28. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/.readthedocs.yaml +0 -0
  29. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/CMakeLists.txt +0 -0
  30. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/LICENSE +0 -0
  31. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/README.md +0 -0
  32. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/assets/README.md +0 -0
  33. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/docs/conf.py +0 -0
  34. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/docs/index.md +0 -0
  35. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/noxfile.py +0 -0
  36. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/scripts/python/update_idc_index_version.py +0 -0
  37. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/scripts/sql/analysis_results_index.sql +0 -0
  38. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/scripts/sql/prior_versions_index.sql +0 -0
  39. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/src/idc_index_data/__init__.py +0 -0
  40. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/src/idc_index_data/_version.pyi +0 -0
  41. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/src/idc_index_data/py.typed +0 -0
  42. {idc_index_data-22.1.1 → idc_index_data-22.1.3}/tests/test_package.py +0 -0
@@ -55,6 +55,6 @@ jobs:
55
55
  if: github.event_name == 'release' && github.event.action == 'published'
56
56
  uses: ncipollo/release-action@v1
57
57
  with:
58
- artifacts: "release_artifacts/*.parquet,release_artifacts/*.json"
58
+ artifacts: "release_artifacts/*.parquet,release_artifacts/*.json,release_artifacts/*.sql"
59
59
  allowUpdates: true
60
60
  omitBodyDuringUpdate: true
@@ -55,6 +55,7 @@ repos:
55
55
  additional_dependencies:
56
56
  - pytest
57
57
  - pandas-stubs
58
+ - google-cloud-bigquery
58
59
 
59
60
  - repo: https://github.com/codespell-project/codespell
60
61
  rev: "v2.4.1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: idc-index-data
3
- Version: 22.1.1
3
+ Version: 22.1.3
4
4
  Summary: ImagingDataCommons index to query and download data.
5
5
  Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
6
6
  License: Copyright 2024 Andrey Fedorov
@@ -41,6 +41,7 @@ Project-URL: Bug Tracker, https://github.com/ImagingDataCommons/idc-index-data/i
41
41
  Project-URL: Discussions, https://discourse.canceridc.dev/
42
42
  Project-URL: Changelog, https://github.com/ImagingDataCommons/idc-index-data/releases
43
43
  Requires-Python: >=3.10
44
+ Requires-Dist: google-cloud-bigquery
44
45
  Provides-Extra: test
45
46
  Requires-Dist: pandas; extra == "test"
46
47
  Requires-Dist: pyarrow; extra == "test"
@@ -0,0 +1,23 @@
1
+ SELECT
2
+ # description:
3
+ # unique identifier of the collection
4
+ collection_id,
5
+ # description:
6
+ # full name of the table in which the column is stored
7
+ table_name,
8
+ # description:
9
+ # short name of the table in which the column is stored
10
+ SPLIT(table_name,'.')[SAFE_OFFSET(2)] AS short_table_name,
11
+ # description:
12
+ # name of the column in which the value is stored
13
+ `column`,
14
+ # description:
15
+ # human readable name of the column
16
+ column_label,
17
+ # description:
18
+ # values encountered in the column
19
+ `values`
20
+ FROM
21
+ `bigquery-public-data.idc_v22_clinical.column_metadata`
22
+ ORDER BY
23
+ collection_id, table_name
@@ -82,10 +82,14 @@ SpecimenPreparationSequence_unnested AS (
82
82
  SELECT
83
83
  temp_table.SeriesInstanceUID,
84
84
  -- Embedding Medium
85
+ # description:
86
+ # embedding medium used for the slide preparation
85
87
  ARRAY(
86
88
  SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
87
89
  FROM UNNEST(embeddingMedium_code_str) AS code
88
90
  ) AS embeddingMedium_CodeMeaning,
91
+ # description:
92
+ # embedding medium code tuple
89
93
  ARRAY(
90
94
  SELECT IF(code IS NULL, NULL,
91
95
  IF(STRPOS(code, ':') = 0, NULL,
@@ -93,10 +97,14 @@ SELECT
93
97
  FROM UNNEST(embeddingMedium_code_str) AS code
94
98
  ) AS embeddingMedium_code_designator_value_str,
95
99
  -- Tissue Fixative
100
+ # description:
101
+ # tissue fixative used for the slide preparation
96
102
  ARRAY(
97
103
  SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
98
104
  FROM UNNEST(tissueFixative_code_str) AS code
99
105
  ) AS tissueFixative_CodeMeaning,
106
+ # description:
107
+ # tissue fixative code tuple
100
108
  ARRAY(
101
109
  SELECT IF(code IS NULL, NULL,
102
110
  IF(STRPOS(code, ':') = 0, NULL,
@@ -104,31 +112,56 @@ SELECT
104
112
  FROM UNNEST(tissueFixative_code_str) AS code
105
113
  ) AS tissueFixative_code_designator_value_str,
106
114
  -- Staining using substance
115
+ # description:
116
+ # staining substances used for the slide preparation
107
117
  ARRAY(
108
118
  SELECT IF(code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
109
119
  FROM UNNEST(staining_usingSubstance_code_str) AS code
110
120
  ) AS staining_usingSubstance_CodeMeaning,
121
+ # description:
122
+ # staining using substance code tuple
111
123
  ARRAY(
112
124
  SELECT IF(code IS NULL, NULL,
113
125
  IF(STRPOS(code, ':') = 0, NULL,
114
126
  SUBSTR(code, STRPOS(code, ':') + 1)))
115
127
  FROM UNNEST(staining_usingSubstance_code_str) AS code
116
128
  ) AS staining_usingSubstance_code_designator_value_str,
117
-
129
+ # description:
130
+ # pixel spacing in mm at the maximum resolution layer, rounded to 2 significant figures
118
131
  if(COALESCE(min_spacing_0, fg_min_spacing_0) = 0, 0,
119
132
  round(COALESCE(min_spacing_0, fg_min_spacing_0) ,CAST(2 -1-floor(log10(abs(COALESCE(min_spacing_0, fg_min_spacing_0) ))) AS INT64))) AS min_PixelSpacing_2sf,
133
+ # description:
134
+ # width of the image at the maximum resolution
120
135
  COALESCE(max_TotalPixelMatrixColumns, max_Columns) AS max_TotalPixelMatrixColumns,
136
+ # description:
137
+ # height of the image at the maximum resolution
121
138
  COALESCE(max_TotalPixelMatrixRows, max_Rows) AS max_TotalPixelMatrixRows,
139
+ # description:
140
+ # power of the objective lens of the equipment used to digitize the slide
122
141
  SAFE_CAST(ObjectiveLensPower as INT) as ObjectiveLensPower,
142
+ # description:
143
+ # code tuple for the anatomic location from where the imaged specimen was collected
123
144
  CONCAT(SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructure_code_designator_value_str,
145
+ # description:
146
+ # anatomic location from where the imaged specimen was collected
124
147
  SPLIT(primaryAnatomicStructure_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructure_CodeMeaning,
148
+ # description:
149
+ # code tuple for additional characteristics of the specimen, such as whether it is a tumor or normal tissue (when available)
125
150
  CONCAT(SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(1)]) as primaryAnatomicStructureModifier_code_designator_value_str,
151
+ # description:
152
+ # additional characteristics of the specimen, such as whether it is a tumor or normal tissue (when available)
126
153
  SPLIT(primaryAnatomicStructureModifier_code_str,":")[SAFE_OFFSET(2)] as primaryAnatomicStructureModifier_CodeMeaning,
127
-
154
+ # description:
155
+ # code tuple for the illumination type used during slide digitization
128
156
  CONCAT(SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(1)]) as illuminationType_code_designator_value_str,
157
+ # description:
158
+ # illumination type used during slide digitization
129
159
  SPLIT(illuminationType_code_str,":")[SAFE_OFFSET(2)] as illuminationType_CodeMeaning,
130
-
160
+ # description:
161
+ # code tuple for the admitting diagnosis associated with the specimen imaged on the slide (when available)
131
162
  CONCAT(SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(0)],":",SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(1)]) as admittingDiagnosis_code_designator_value_str,
163
+ # description:
164
+ # admitting diagnosis associated with the specimen imaged on the slide (when available)
132
165
  SPLIT(admittingDiagnosis_code_str,":")[SAFE_OFFSET(2)] as admittingDiagnosis_CodeMeaning,
133
166
  FROM
134
167
  temp_table
@@ -48,15 +48,23 @@ WITH
48
48
  GROUP BY
49
49
  SOPInstanceUID )
50
50
  SELECT
51
+ # description:
52
+ # unique identifier of the instance
51
53
  dicom_all.SOPInstanceUID,
54
+ # description:
55
+ # unique identifier of the series
52
56
  dicom_all.SeriesInstanceUID,
53
57
  -- Embedding Medium
58
+ # description:
59
+ # embedding medium used for the slide preparation
54
60
  ARRAY(
55
61
  SELECT
56
62
  IF
57
63
  (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
58
64
  FROM
59
65
  UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_CodeMeaning,
66
+ # description:
67
+ # embedding medium code tuple
60
68
  ARRAY(
61
69
  SELECT
62
70
  IF
@@ -66,12 +74,16 @@ SELECT
66
74
  FROM
67
75
  UNNEST(embeddingMedium_code_str) AS code ) AS embeddingMedium_code_designator_value_str,
68
76
  -- Tissue Fixative
77
+ # description:
78
+ # tissue fixative used for the slide preparation
69
79
  ARRAY(
70
80
  SELECT
71
81
  IF
72
82
  (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
73
83
  FROM
74
84
  UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_CodeMeaning,
85
+ # description:
86
+ # tissue fixative code tuple
75
87
  ARRAY(
76
88
  SELECT
77
89
  IF
@@ -81,12 +93,16 @@ SELECT
81
93
  FROM
82
94
  UNNEST(tissueFixative_code_str) AS code ) AS tissueFixative_code_designator_value_str,
83
95
  -- Staining using substance
96
+ # description:
97
+ # staining substances used for the slide preparation
84
98
  ARRAY(
85
99
  SELECT
86
100
  IF
87
101
  (code IS NULL, NULL, SPLIT(code, ':')[SAFE_OFFSET(0)])
88
102
  FROM
89
103
  UNNEST(staining_usingSubstance_code_str) AS code ) AS staining_usingSubstance_CodeMeaning,
104
+ # description:
105
+ # staining using substance code tuple
90
106
  ARRAY(
91
107
  SELECT
92
108
  IF
@@ -98,13 +114,27 @@ SELECT
98
114
  -- instance-specific image attributes
99
115
  -- NB: there is a caveat that I think in general, we expect square pixels, but in htan_wustl and cptac_luad this assumption does not hold,
100
116
  -- and in htan_wustl, the difference is rather large (x2) - waiting to hear from David Clunie about this...
117
+ # description:
118
+ # pixel spacing in mm, taken from the first value of the DICOM PixelSpacing attribute (not rounded)
101
119
  SAFE_CAST(SharedFunctionalGroupsSequence[SAFE_OFFSET(0)].PixelMeasuresSequence[SAFE_OFFSET(0)]. PixelSpacing[SAFE_OFFSET(0)] AS FLOAT64) AS PixelSpacing_0,
120
+ # description:
121
+ # DICOM ImageType attribute
102
122
  dicom_all.ImageType,
123
+ # description:
124
+ # DICOM TransferSyntaxUID attribute
103
125
  dicom_all.TransferSyntaxUID,
126
+ # description:
127
+ # size of the instance file in bytes
104
128
  dicom_all.instance_size,
129
+ # description:
130
+ # number of columns in the image
105
131
  dicom_all.TotalPixelMatrixColumns,
132
+ # description:
133
+ # number of rows in the image
106
134
  dicom_all.TotalPixelMatrixRows,
107
135
  -- attributes needed to retrieve the selected instances/files
136
+ # description:
137
+ # unique identifier of the instance within the IDC
108
138
  dicom_all.crdc_instance_uuid
109
139
  FROM
110
140
  `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
13
13
 
14
14
  [project]
15
15
  name = "idc-index-data"
16
- version = "22.1.1"
16
+ version = "22.1.3"
17
17
  authors = [
18
18
  { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
19
19
  { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
@@ -38,7 +38,9 @@ classifiers = [
38
38
  "Topic :: Scientific/Engineering",
39
39
  "Typing :: Typed",
40
40
  ]
41
- dependencies = []
41
+ dependencies = [
42
+ "google-cloud-bigquery"
43
+ ]
42
44
 
43
45
  [project.optional-dependencies]
44
46
  test = [
@@ -102,7 +104,7 @@ report.exclude_also = [
102
104
 
103
105
  [tool.mypy]
104
106
  files = ["src", "tests"]
105
- python_version = "3.8"
107
+ python_version = "3.10"
106
108
  warn_unused_configs = true
107
109
  strict = true
108
110
  enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"]
@@ -158,7 +160,7 @@ isort.required-imports = ["from __future__ import annotations"]
158
160
 
159
161
 
160
162
  [tool.pylint]
161
- py-version = "3.8"
163
+ py-version = "3.10"
162
164
  ignore-paths = [".*/_version.py"]
163
165
  reports.output-format = "colorized"
164
166
  similarities.ignore-imports = "yes"
@@ -0,0 +1,2 @@
1
+ [pytest]
2
+ filterwarnings = ignore::FutureWarning:google.api_core
@@ -23,10 +23,13 @@ def main():
23
23
 
24
24
  for file_name in sql_files:
25
25
  file_path = assets_dir / file_name
26
- index_df, output_basename, schema = manager.execute_sql_query(file_path)
26
+ index_df, output_basename, schema, sql_query = manager.execute_sql_query(
27
+ file_path
28
+ )
27
29
  parquet_file_path = output_dir / f"{output_basename}.parquet"
28
30
  index_df.to_parquet(parquet_file_path)
29
- manager.save_schema_to_json(schema, output_basename, output_dir)
31
+ manager.save_schema_to_json(schema, output_basename, sql_query, output_dir)
32
+ manager.save_sql_query(sql_query, output_basename, output_dir)
30
33
 
31
34
  core_indices_dir = scripts_dir.parent / "scripts" / "sql"
32
35
 
@@ -34,10 +37,17 @@ def main():
34
37
 
35
38
  for file_name in sql_files:
36
39
  file_path = core_indices_dir / file_name
37
- index_df, output_basename, schema = manager.execute_sql_query(file_path)
40
+ index_df, output_basename, schema, sql_query = manager.execute_sql_query(
41
+ file_path
42
+ )
38
43
  parquet_file_path = output_dir / f"{output_basename}.parquet"
39
44
  index_df.to_parquet(parquet_file_path)
40
- manager.save_schema_to_json(schema, output_basename, output_dir)
45
+ if output_basename == "prior_versions_index":
46
+ # For prior_versions_index, save schema without descriptions
47
+ manager.save_schema_to_json(schema, output_basename, None, output_dir)
48
+ else:
49
+ manager.save_schema_to_json(schema, output_basename, sql_query, output_dir)
50
+ manager.save_sql_query(sql_query, output_basename, output_dir)
41
51
 
42
52
 
43
53
  if __name__ == "__main__":