idc-index-data 18.1.0.tar.gz → 19.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of idc-index-data might be problematic. Click here for more details.

Files changed (32)
  1. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/CMakeLists.txt +3 -0
  2. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/PKG-INFO +1 -1
  3. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/assets/sm_index.sql +3 -3
  4. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/assets/sm_instance_index.sql +1 -1
  5. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/pyproject.toml +1 -1
  6. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/scripts/sql/idc_index.sql +2 -2
  7. idc_index_data-19.0.0/scripts/sql/prior_versions_index.sql +89 -0
  8. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/src/idc_index_data/__init__.py +4 -0
  9. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/tests/test_package.py +6 -1
  10. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.git_archival.txt +0 -0
  11. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.gitattributes +0 -0
  12. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.github/CONTRIBUTING.md +0 -0
  13. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.github/dependabot.yml +0 -0
  14. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.github/matchers/pylint.json +0 -0
  15. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.github/workflows/cd.yml +0 -0
  16. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.github/workflows/ci.yml +0 -0
  17. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.github/workflows/external-indices.yml +0 -0
  18. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.github/workflows/keep-alive.yml +0 -0
  19. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.gitignore +0 -0
  20. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.pre-commit-config.yaml +0 -0
  21. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/.readthedocs.yaml +0 -0
  22. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/LICENSE +0 -0
  23. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/README.md +0 -0
  24. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/assets/README.md +0 -0
  25. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/docs/conf.py +0 -0
  26. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/docs/index.md +0 -0
  27. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/noxfile.py +0 -0
  28. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/scripts/python/external-indices.py +0 -0
  29. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/scripts/python/idc_index_data_manager.py +0 -0
  30. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/scripts/python/update_idc_index_version.py +0 -0
  31. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/src/idc_index_data/_version.pyi +0 -0
  32. {idc_index_data-18.1.0 → idc_index_data-19.0.0}/src/idc_index_data/py.typed +0 -0
@@ -19,6 +19,7 @@ add_custom_command(
19
19
  OUTPUT
20
20
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
21
21
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
22
+ $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
22
23
  COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/scripts/python/idc_index_data_manager.py
23
24
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:--generate-csv-archive>
24
25
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:--generate-parquet>
@@ -28,10 +29,12 @@ add_custom_target(run_idc_index_data_manager ALL
28
29
  DEPENDS
29
30
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
30
31
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
32
+ $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
31
33
  )
32
34
 
33
35
  install(
34
36
  FILES
35
37
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
36
38
  $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
39
+ $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
37
40
  DESTINATION "idc_index_data")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: idc-index-data
3
- Version: 18.1.0
3
+ Version: 19.0.0
4
4
  Summary: ImagingDataCommons index to query and download data.
5
5
  Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
6
6
  License: Copyright 2024 Andrey Fedorov
@@ -5,7 +5,7 @@
5
5
  -- SELECT
6
6
  -- * EXCEPT(Modality)
7
7
  -- FROM
8
- -- `bigquery-public-data.idc_v18.dicom_metadata_curated_series_level`
8
+ -- `bigquery-public-data.idc_v19.dicom_metadata_curated_series_level`
9
9
  -- WHERE
10
10
  -- Modality = "SM"
11
11
 
@@ -27,7 +27,7 @@ WITH
27
27
 
28
28
  ARRAY_AGG(DISTINCT(CONCAT(OpticalPathSequence[SAFE_OFFSET(0)].IlluminationTypeCodeSequence[SAFE_OFFSET(0)].CodingSchemeDesignator,":", OpticalPathSequence[SAFE_OFFSET(0)].IlluminationTypeCodeSequence[SAFE_OFFSET(0)].CodeValue, ":", OpticalPathSequence[SAFE_OFFSET(0)].IlluminationTypeCodeSequence[SAFE_OFFSET(0)].CodeMeaning)) IGNORE NULLS)[SAFE_OFFSET(0)] AS illuminationType_code_str,
29
29
  FROM
30
- `bigquery-public-data.idc_v18.dicom_all` AS dicom_all
30
+ `bigquery-public-data.idc_v19.dicom_all` AS dicom_all
31
31
  GROUP BY
32
32
  SeriesInstanceUID
33
33
  ),
@@ -41,7 +41,7 @@ SpecimenPreparationSequence_unnested AS (
41
41
  concept_code_sequence.CodeMeaning AS ccs_cm,
42
42
  concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
43
43
  concept_code_sequence.CodeValue AS ccs_val,
44
- FROM `bigquery-public-data.idc_v18.dicom_all`,
44
+ FROM `bigquery-public-data.idc_v19.dicom_all`,
45
45
  UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) as preparation_unnest_step1,
46
46
  UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) as preparation_unnest_step2,
47
47
  UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) as concept_name_code_sequence,
@@ -9,7 +9,7 @@ WITH
9
9
  concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
10
10
  concept_code_sequence.CodeValue AS ccs_val,
11
11
  FROM
12
- `bigquery-public-data.idc_v18.dicom_all`,
12
+ `bigquery-public-data.idc_v19.dicom_all`,
13
13
  UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) AS preparation_unnest_step1,
14
14
  UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) AS preparation_unnest_step2,
15
15
  UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) AS concept_name_code_sequence,
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
13
13
 
14
14
  [project]
15
15
  name = "idc-index-data"
16
- version = "18.1.0"
16
+ version = "19.0.0"
17
17
  authors = [
18
18
  { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
19
19
  { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
@@ -26,9 +26,9 @@ SELECT
26
26
  ANY_VALUE(CONCAT(series_aws_url,"*")) AS series_aws_url,
27
27
  ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
28
28
  FROM
29
- `bigquery-public-data.idc_v18.dicom_all` AS dicom_all
29
+ `bigquery-public-data.idc_v19.dicom_all` AS dicom_all
30
30
  JOIN
31
- `bigquery-public-data.idc_v18.dicom_metadata_curated` AS dicom_curated
31
+ `bigquery-public-data.idc_v19.dicom_metadata_curated` AS dicom_curated
32
32
  ON
33
33
  dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
34
34
  GROUP BY
@@ -0,0 +1,89 @@
1
+ -- For details on the syntax, see
2
+ -- https://cloud.google.com/bigquery/docs/reference/standard-sql/procedural-language
3
+ --
4
+ -- Step 1: Declare variables
5
+ DECLARE idc_versions ARRAY<INT64>;
6
+ DECLARE latest_idc_version INT64 DEFAULT 19;
7
+ DECLARE union_all_query STRING;
8
+
9
+ --Step 2
10
+ --SET latest_idc_version = (
11
+ --SELECT max(idc_version)
12
+ --FROM
13
+ --bigquery-public-data.idc_current.version_metadata
14
+ --);
15
+
16
+ -- Step 3: Get all idc_versions
17
+ SET idc_versions = (
18
+ SELECT GENERATE_ARRAY(1, latest_idc_version)
19
+ -- SELECT [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]
20
+ --SELECT ARRAY_AGG(idc_version)
21
+ --FROM
22
+ --`bigquery-public-data.idc_current.version_metadata`
23
+ );
24
+
25
+ -- Step 4: Generate the UNION ALL query dynamically
26
+ SET union_all_query = (
27
+ SELECT STRING_AGG(
28
+ FORMAT("""
29
+ SELECT
30
+ %d AS idc_version,
31
+ collection_id,
32
+ PatientID,
33
+ SeriesInstanceUID,
34
+ StudyInstanceUID,
35
+ Modality,
36
+ regexp_extract(gcs_url, 'gs://([^/]+)/') as gcs_bucket,
37
+ crdc_series_uuid,
38
+ ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
39
+ FROM
40
+ `bigquery-public-data.idc_v%d.dicom_all` AS dicom_all
41
+ where crdc_series_uuid not in (select distinct crdc_series_uuid from `bigquery-public-data.idc_v%d.dicom_all`)
42
+ GROUP BY
43
+ 1,2,3,4,5,6,7,8
44
+
45
+ """,
46
+ version, version, latest_idc_version),
47
+ " UNION ALL "
48
+ )
49
+ FROM UNNEST(idc_versions) AS version
50
+ );
51
+
52
+ -- Step 5: Execute the complete query
53
+ EXECUTE IMMEDIATE FORMAT("""
54
+ WITH all_versions AS (
55
+ %s
56
+ )
57
+ SELECT
58
+ collection_id,
59
+ PatientID,
60
+ SeriesInstanceUID,
61
+ StudyInstanceUID,
62
+ Modality,
63
+ gcs_bucket,
64
+ crdc_series_uuid,
65
+ series_size_MB,
66
+ CASE
67
+
68
+ # map GCS bucket to AWS bucket, since for idc-index we prefer AWS
69
+ # if new buckets are included in IDC, this will need to be updated!
70
+
71
+ WHEN gcs_bucket='public-datasets-idc' THEN CONCAT('s3://','idc-open-data/',crdc_series_uuid, '/*')
72
+ WHEN gcs_bucket='idc-open-idc1' THEN CONCAT('s3://','idc-open-data-two/',crdc_series_uuid, '/*')
73
+ WHEN gcs_bucket='idc-open-cr' THEN CONCAT('s3://','idc-open-data-cr/',crdc_series_uuid, '/*')
74
+ END AS series_aws_url,
75
+ MIN(idc_version) AS min_idc_version,
76
+ MAX(idc_version) AS max_idc_version
77
+ FROM all_versions
78
+
79
+ where gcs_bucket not in ('idc-open-idc')
80
+
81
+ #per @bcli4d:idc-open-idc was our public bucket before we moved most data to the Google owned public-datasets-idc.
82
+ #We decided at the time to not touch BQ. To deal with this and other cases where some metadata can change (Licences),
83
+ #we include the mutable_metadata table which maps crdc_instance_uuid to current gcs_url, aws_url, license, doi.
84
+
85
+ GROUP BY
86
+ 1,2,3,4,5,6,7,8
87
+ """,
88
+ union_all_query
89
+ );
@@ -15,6 +15,7 @@ __all__ = [
15
15
  "__version__",
16
16
  "IDC_INDEX_CSV_ARCHIVE_FILEPATH",
17
17
  "IDC_INDEX_PARQUET_FILEPATH",
18
+ "PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH",
18
19
  ]
19
20
 
20
21
 
@@ -36,3 +37,6 @@ IDC_INDEX_CSV_ARCHIVE_FILEPATH: Path | None = _lookup(
36
37
  "idc_index_data/idc_index.csv.zip", optional=True
37
38
  )
38
39
  IDC_INDEX_PARQUET_FILEPATH: Path | None = _lookup("idc_index_data/idc_index.parquet")
40
+ PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH: Path | None = _lookup(
41
+ "idc_index_data/prior_versions_index.parquet"
42
+ )
@@ -7,7 +7,7 @@ from packaging.version import Version
7
7
 
8
8
  import idc_index_data as m
9
9
 
10
- EXPECTED_IDC_INDEX_VERSION = 18
10
+ EXPECTED_IDC_INDEX_VERSION = 19
11
11
 
12
12
 
13
13
  def test_version():
@@ -38,3 +38,8 @@ def test_reading_index():
38
38
  assert m.IDC_INDEX_PARQUET_FILEPATH.is_file()
39
39
  df_parquet = pd.read_parquet(m.IDC_INDEX_PARQUET_FILEPATH)
40
40
  assert not df_parquet.empty
41
+
42
+ if m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH is not None:
43
+ assert m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH.is_file()
44
+ df_parquet = pd.read_parquet(m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH)
45
+ assert not df_parquet.empty
File without changes