idc-index-data 22.1.4__tar.gz → 23.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.pre-commit-config.yaml +3 -3
  2. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/PKG-INFO +1 -1
  3. idc_index_data-23.0.1/assets/clinical_index.sql +30 -0
  4. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/assets/sm_index.sql +10 -2
  5. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/assets/sm_instance_index.sql +9 -2
  6. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/pyproject.toml +1 -1
  7. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/python/idc_index_data_manager.py +52 -4
  8. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/sql/analysis_results_index.sql +5 -1
  9. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/sql/collections_index.sql +5 -1
  10. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/sql/idc_index.sql +7 -3
  11. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/sql/prior_versions_index.sql +1 -1
  12. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/tests/test_package.py +1 -1
  13. idc_index_data-22.1.4/assets/clinical_index.sql +0 -23
  14. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.git_archival.txt +0 -0
  15. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.gitattributes +0 -0
  16. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/CONTRIBUTING.md +0 -0
  17. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/copilot-instructions.md +0 -0
  18. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/dependabot.yml +0 -0
  19. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/matchers/pylint.json +0 -0
  20. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/workflows/cd.yml +0 -0
  21. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/workflows/ci.yml +0 -0
  22. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.github/workflows/external-indices.yml +0 -0
  23. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.gitignore +0 -0
  24. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/.readthedocs.yaml +0 -0
  25. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/CMakeLists.txt +0 -0
  26. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/LICENSE +0 -0
  27. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/README.md +0 -0
  28. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/assets/README.md +0 -0
  29. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/docs/conf.py +0 -0
  30. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/docs/index.md +0 -0
  31. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/noxfile.py +0 -0
  32. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/pytest.ini +0 -0
  33. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/python/generate-indices.py +0 -0
  34. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/scripts/python/update_idc_index_version.py +0 -0
  35. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/src/idc_index_data/__init__.py +0 -0
  36. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/src/idc_index_data/_version.pyi +0 -0
  37. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/src/idc_index_data/py.typed +0 -0
  38. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/tests/test_column_description_parser.py +0 -0
  39. {idc_index_data-22.1.4 → idc_index_data-23.0.1}/tests/test_real_sql_parsing.py +0 -0
@@ -40,7 +40,7 @@ repos:
40
40
  args: [--prose-wrap=always]
41
41
 
42
42
  - repo: https://github.com/astral-sh/ruff-pre-commit
43
- rev: "v0.14.4"
43
+ rev: "v0.14.5"
44
44
  hooks:
45
45
  - id: ruff-check
46
46
  args: ["--fix", "--show-fixes"]
@@ -76,12 +76,12 @@ repos:
76
76
  exclude: .pre-commit-config.yaml
77
77
 
78
78
  - repo: https://github.com/henryiii/validate-pyproject-schema-store
79
- rev: "2025.11.04"
79
+ rev: "2025.11.14"
80
80
  hooks:
81
81
  - id: validate-pyproject
82
82
 
83
83
  - repo: https://github.com/python-jsonschema/check-jsonschema
84
- rev: "0.34.1"
84
+ rev: "0.35.0"
85
85
  hooks:
86
86
  - id: check-dependabot
87
87
  - id: check-github-workflows
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: idc-index-data
3
- Version: 22.1.4
3
+ Version: 23.0.1
4
4
  Summary: ImagingDataCommons index to query and download data.
5
5
  Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
6
6
  License: Copyright 2024 Andrey Fedorov
@@ -0,0 +1,30 @@
1
+ # table-description:
2
+ # This table contains metadata about the tabular data, including clinical data, that accompanies images and
3
+ # is available in IDC. Think of this table as a dictionary containing information about the columns
4
+ # for all of the tabular data accompanying individual collections in IDC. Each row corresponds to a unique
5
+ # combination of collection, clinical data table that is available for that collection, and a column from that
6
+ # table. Individual tables referenced from this table can be retrieved using the idc-index `get_clinical_table()`
7
+ # function.
8
+ SELECT
9
+ # description:
10
+ # unique identifier of the collection
11
+ collection_id,
12
+ # description:
13
+ # full name of the table in which the column is stored
14
+ table_name,
15
+ # description:
16
+ # short name of the table in which the column is stored
17
+ SPLIT(table_name,'.')[SAFE_OFFSET(2)] AS short_table_name,
18
+ # description:
19
+ # name of the column in which the value is stored
20
+ `column`,
21
+ # description:
22
+ # human readable name of the column
23
+ column_label,
24
+ # description:
25
+ # values encountered in the column
26
+ `values`
27
+ FROM
28
+ `bigquery-public-data.idc_v23_clinical.column_metadata`
29
+ ORDER BY
30
+ collection_id, table_name
@@ -9,6 +9,12 @@
9
9
  -- WHERE
10
10
  -- Modality = "SM"
11
11
 
12
+ # table-description:
13
+ # This table contains metadata about the slide microscopy (SM) series available in IDC. Each row
14
+ # corresponds to a DICOM series, and contains attributes specific to SM series, such as the pixel spacing at the maximum
15
+ # resolution layer, the power of the objective lens used to digitize the slide, and the anatomic location
16
+ # from where the imaged specimen was collected. This table can be joined with the main index table using the
17
+ # `SeriesInstanceUID` column.
12
18
  WITH
13
19
  temp_table AS (
14
20
  SELECT
@@ -31,7 +37,7 @@ WITH
31
37
 
32
38
 
33
39
  FROM
34
- `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
40
+ `bigquery-public-data.idc_v23.dicom_all` AS dicom_all
35
41
  GROUP BY
36
42
  SeriesInstanceUID
37
43
  ),
@@ -45,7 +51,7 @@ SpecimenPreparationSequence_unnested AS (
45
51
  concept_code_sequence.CodeMeaning AS ccs_cm,
46
52
  concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
47
53
  concept_code_sequence.CodeValue AS ccs_val,
48
- FROM `bigquery-public-data.idc_v22.dicom_all`,
54
+ FROM `bigquery-public-data.idc_v23.dicom_all`,
49
55
  UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) as preparation_unnest_step1,
50
56
  UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) as preparation_unnest_step2,
51
57
  UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) as concept_name_code_sequence,
@@ -80,6 +86,8 @@ SpecimenPreparationSequence_unnested AS (
80
86
  )
81
87
 
82
88
  SELECT
89
+ # description:
90
+ # DICOM SeriesInstanceUID identifier of the series
83
91
  temp_table.SeriesInstanceUID,
84
92
  -- Embedding Medium
85
93
  # description:
@@ -1,3 +1,10 @@
1
+ # table-description:
2
+ # This table contains metadata about the instances of slide microscopy (SM) series available in IDC. Each row
3
+ # corresponds to an instance from a DICOM Slide Microscopy series available from IDC, identified by
4
+ # `SOPInstanceUID`, and contains attributes specific to SM series, such as the pixel spacing at the maximum
5
+ # resolution layer, the power of the objective lens used to digitize the slide, and the anatomic location
6
+ # from where the imaged specimen was collected. This table can be joined with the main index table
7
+ # and/or with `sm_index` using the `SeriesInstanceUID` column.
1
8
  WITH
2
9
  SpecimenPreparationSequence_unnested AS (
3
10
  SELECT
@@ -9,7 +16,7 @@ WITH
9
16
  concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
10
17
  concept_code_sequence.CodeValue AS ccs_val,
11
18
  FROM
12
- `bigquery-public-data.idc_v22.dicom_all`,
19
+ `bigquery-public-data.idc_v23.dicom_all`,
13
20
  UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) AS preparation_unnest_step1,
14
21
  UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) AS preparation_unnest_step2,
15
22
  UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) AS concept_name_code_sequence,
@@ -137,7 +144,7 @@ SELECT
137
144
  # unique identifier of the instance within the IDC
138
145
  dicom_all.crdc_instance_uuid AS crdc_instance_uuid
139
146
  FROM
140
- `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
147
+ `bigquery-public-data.idc_v23.dicom_all` AS dicom_all
141
148
  LEFT JOIN
142
149
  slide_embedding
143
150
  ON
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
13
13
 
14
14
  [project]
15
15
  name = "idc-index-data"
16
- version = "22.1.4"
16
+ version = "23.0.1"
17
17
  authors = [
18
18
  { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
19
19
  { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
@@ -22,6 +22,51 @@ class IDCIndexDataManager:
22
22
  self.client = bigquery.Client(project=project_id)
23
23
  logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)
24
24
 
25
+ @staticmethod
26
+ def parse_table_description(sql_query: str) -> str:
27
+ """
28
+ Parses the table description from SQL query comments.
29
+
30
+ The method looks for comments following the pattern:
31
+ # table-description:
32
+ # description text continues here
33
+ # and can span multiple lines
34
+
35
+ Args:
36
+ sql_query: The SQL query string containing comments
37
+
38
+ Returns:
39
+ The table description as a string
40
+ """
41
+ description_lines = []
42
+ logger.debug("Parsing table description from SQL query comments")
43
+ logger.debug(sql_query)
44
+ lines = sql_query.split("\n")
45
+
46
+ for i, line in enumerate(lines):
47
+ stripped = line.strip()
48
+ if stripped == "# table-description:":
49
+ # Collect description lines until we hit a non-comment line
50
+ j = i + 1
51
+ while j < len(lines):
52
+ next_line = lines[j]
53
+ next_stripped = next_line.strip()
54
+ if next_stripped.startswith("#") and next_stripped != "#":
55
+ # Remove the leading # and whitespace
56
+ desc_text = next_stripped[1:].strip()
57
+ if desc_text:
58
+ description_lines.append(desc_text)
59
+ j += 1
60
+ elif next_stripped.startswith("#"):
61
+ # Empty comment line, skip
62
+ j += 1
63
+ else:
64
+ # Non-comment line, stop collecting
65
+ break
66
+ break
67
+
68
+ return " ".join(description_lines)
69
+
25
70
  @staticmethod
26
71
  def parse_column_descriptions(sql_query: str) -> dict[str, str]:
27
72
  """
@@ -232,11 +277,14 @@ class IDCIndexDataManager:
232
277
  logger.debug("Parsing column descriptions from SQL query comments")
233
278
  logger.debug(sql_query)
234
279
  if sql_query is not None:
280
+ table_description = self.parse_table_description(sql_query)
281
+ logger.debug("Parsed table description: %s", table_description)
235
282
  descriptions = self.parse_column_descriptions(sql_query)
236
283
 
237
284
  # Convert BigQuery schema to JSON-serializable format
238
285
  schema_dict = {
239
- "fields": [
286
+ "table_description": table_description,
287
+ "columns": [
240
288
  {
241
289
  "name": field.name,
242
290
  "type": field.field_type,
@@ -244,12 +292,12 @@ class IDCIndexDataManager:
244
292
  "description": descriptions.get(field.name, ""),
245
293
  }
246
294
  for field in schema
247
- ]
295
+ ],
248
296
  }
249
297
  else:
250
298
  # If no SQL query provided, save schema without descriptions
251
299
  schema_dict = {
252
- "fields": [
300
+ "columns": [
253
301
  {
254
302
  "name": field.name,
255
303
  "type": field.field_type,
@@ -375,7 +423,7 @@ class IDCIndexDataManager:
375
423
  SELECT
376
424
  MAX(idc_version) AS latest_idc_release_version
377
425
  FROM
378
- `bigquery-public-data.idc_current.version_metadata`
426
+ `bigquery-public-data.idc_v23.version_metadata`
379
427
  """
380
428
  query_job = self.client.query(query)
381
429
  result = query_job.result()
@@ -1,3 +1,7 @@
1
+ # table-description:
2
+ # This table contains metadata about the analysis results collections available in IDC. Each row corresponds to an
3
+ # analysis results collection, and contains attributes such as the collection name, types of cancer represented,
4
+ # number of subjects, and pointers to the resources to learn more about the content of the collection.
1
5
  SELECT
2
6
  # description:
3
7
  # unique identifier of the analysis results collection
@@ -39,4 +43,4 @@ SELECT
39
43
  # citation for the analysis results collection that should be used for acknowledgment
40
44
  Citation
41
45
  FROM
42
- `bigquery-public-data.idc_v22.analysis_results_metadata`
46
+ `bigquery-public-data.idc_v23.analysis_results_metadata`
@@ -1,3 +1,7 @@
1
+ # table-description:
2
+ # This table contains metadata about the collections available in IDC. Each row corresponds to a collection,
3
+ # and contains attributes such as the collection name, types of cancer represented, number of subjects,
4
+ # and pointers to the resources to learn more about the content of the collection.
1
5
  SELECT
2
6
  # description:
3
7
  # name of the collection
@@ -36,4 +40,4 @@ SELECT
36
40
  # detailed information about the collection
37
41
  Description
38
42
  FROM
39
- `bigquery-public-data.idc_v22.original_collections_metadata`
43
+ `bigquery-public-data.idc_v23.original_collections_metadata`
@@ -1,3 +1,7 @@
1
+ # table-description:
2
+ # This is the main metadata table provided by idc-index. Each row corresponds to a DICOM series, and contains
3
+ # attributes at the collection, patient, study, and series levels. The table also contains download-related
4
+ # attributes, such as the AWS S3 bucket and URL to download the series.
1
5
  SELECT
2
6
  # collection level attributes
3
7
  # description:
@@ -22,7 +26,7 @@ SELECT
22
26
  # series; follow this DOI to learn more about the activity that produced
23
27
  # this series
24
28
  ANY_VALUE(source_DOI) AS source_DOI,
25
- # patient level attributes
29
+ # patient level attributes:
26
30
  # description:
27
31
  # age of the subject at the time of imaging (DICOM attribute)
28
32
  ANY_VALUE(PatientAge) AS PatientAge,
@@ -79,9 +83,9 @@ SELECT
79
83
  # total size of the series in megabytes
80
84
  ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
81
85
  FROM
82
- `bigquery-public-data.idc_v22.dicom_all` AS dicom_all
86
+ `bigquery-public-data.idc_v23.dicom_all` AS dicom_all
83
87
  JOIN
84
- `bigquery-public-data.idc_v22.dicom_metadata_curated` AS dicom_curated
88
+ `bigquery-public-data.idc_v23.dicom_metadata_curated` AS dicom_curated
85
89
  ON
86
90
  dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
87
91
  GROUP BY
@@ -3,7 +3,7 @@
3
3
  --
4
4
  -- Step 1: Declare variables
5
5
  DECLARE idc_versions ARRAY<INT64>;
6
- DECLARE latest_idc_version INT64 DEFAULT 22;
6
+ DECLARE latest_idc_version INT64 DEFAULT 23;
7
7
  DECLARE union_all_query STRING;
8
8
 
9
9
  --Step 2
@@ -7,7 +7,7 @@ from packaging.version import Version
7
7
 
8
8
  import idc_index_data as m
9
9
 
10
- EXPECTED_IDC_INDEX_VERSION = 22
10
+ EXPECTED_IDC_INDEX_VERSION = 23
11
11
 
12
12
 
13
13
  def test_version():
@@ -1,23 +0,0 @@
1
- SELECT
2
- # description:
3
- # unique identifier of the collection
4
- collection_id,
5
- # description:
6
- # full name of the table in which the column is stored
7
- table_name,
8
- # description:
9
- # short name of the table in which the column is stored
10
- SPLIT(table_name,'.')[SAFE_OFFSET(2)] AS short_table_name,
11
- # description:
12
- # name of the column in which the value is stored
13
- `column`,
14
- # description:
15
- # human readable name of the column
16
- column_label,
17
- # description:
18
- # values encountered in the column
19
- `values`
20
- FROM
21
- `bigquery-public-data.idc_v22_clinical.column_metadata`
22
- ORDER BY
23
- collection_id, table_name
File without changes