PyPI - idc-index-data - Versions diffs - 18.1.0__tar.gz → 19.0.0__tar.gz - Mend

idc-index-data 18.1.0 (tar.gz) → 19.0.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of idc-index-data might be problematic. Click here for more details.

Files changed (32) hide show

{idc_index_data-18.1.0 → idc_index_data-19.0.0}/CMakeLists.txt RENAMED Viewed

@@ -19,6 +19,7 @@ add_custom_command(
   OUTPUT
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
   COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/scripts/python/idc_index_data_manager.py
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:--generate-csv-archive>
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:--generate-parquet>
@@ -28,10 +29,12 @@ add_custom_target(run_idc_index_data_manager ALL
   DEPENDS
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
 )
 install(
   FILES
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
     $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
+    $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/prior_versions_index.parquet>
   DESTINATION "idc_index_data")

{idc_index_data-18.1.0 → idc_index_data-19.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: idc-index-data
-Version: 18.1.0
+Version: 19.0.0
 Summary: ImagingDataCommons index to query and download data.
 Author-Email: Andrey Fedorov <andrey.fedorov@gmail.com>, Vamsi Thiriveedhi <vthiriveedhi@mgh.harvard.edu>, Jean-Christophe Fillion-Robin <jchris.fillionr@kitware.com>
 License: Copyright 2024 Andrey Fedorov

{idc_index_data-18.1.0 → idc_index_data-19.0.0}/assets/sm_index.sql RENAMED Viewed

@@ -5,7 +5,7 @@
 -- SELECT
 --   * EXCEPT(Modality)
 -- FROM
---   `bigquery-public-data.idc_v18.dicom_metadata_curated_series_level`
+--   `bigquery-public-data.idc_v19.dicom_metadata_curated_series_level`
 -- WHERE
 --   Modality = "SM"
@@ -27,7 +27,7 @@ WITH
     ARRAY_AGG(DISTINCT(CONCAT(OpticalPathSequence[SAFE_OFFSET(0)].IlluminationTypeCodeSequence[SAFE_OFFSET(0)].CodingSchemeDesignator,":", OpticalPathSequence[SAFE_OFFSET(0)].IlluminationTypeCodeSequence[SAFE_OFFSET(0)].CodeValue, ":", OpticalPathSequence[SAFE_OFFSET(0)].IlluminationTypeCodeSequence[SAFE_OFFSET(0)].CodeMeaning)) IGNORE NULLS)[SAFE_OFFSET(0)] AS illuminationType_code_str,
   FROM
-    `bigquery-public-data.idc_v18.dicom_all` AS dicom_all
+    `bigquery-public-data.idc_v19.dicom_all` AS dicom_all
   GROUP BY
     SeriesInstanceUID
   ),
@@ -41,7 +41,7 @@ SpecimenPreparationSequence_unnested AS (
         concept_code_sequence.CodeMeaning AS ccs_cm,
         concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
         concept_code_sequence.CodeValue AS ccs_val,
-      FROM `bigquery-public-data.idc_v18.dicom_all`,
+      FROM `bigquery-public-data.idc_v19.dicom_all`,
       UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) as preparation_unnest_step1,
       UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) as preparation_unnest_step2,
       UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) as concept_name_code_sequence,

{idc_index_data-18.1.0 → idc_index_data-19.0.0}/assets/sm_instance_index.sql RENAMED Viewed

@@ -9,7 +9,7 @@ WITH
     concept_code_sequence.CodingSchemeDesignator AS ccs_csd,
     concept_code_sequence.CodeValue AS ccs_val,
   FROM
-    `bigquery-public-data.idc_v18.dicom_all`,
+    `bigquery-public-data.idc_v19.dicom_all`,
     UNNEST(SpecimenDescriptionSequence[SAFE_OFFSET(0)].SpecimenPreparationSequence) AS preparation_unnest_step1,
     UNNEST(preparation_unnest_step1.SpecimenPreparationStepContentItemSequence) AS preparation_unnest_step2,
     UNNEST(preparation_unnest_step2.ConceptNameCodeSequence) AS concept_name_code_sequence,

{idc_index_data-18.1.0 → idc_index_data-19.0.0}/pyproject.toml RENAMED Viewed

@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
 [project]
 name = "idc-index-data"
-version = "18.1.0"
+version = "19.0.0"
 authors = [
   { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
   { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },

{idc_index_data-18.1.0 → idc_index_data-19.0.0}/scripts/sql/idc_index.sql RENAMED Viewed

@@ -26,9 +26,9 @@ SELECT
   ANY_VALUE(CONCAT(series_aws_url,"*")) AS series_aws_url,
   ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
 FROM
-  `bigquery-public-data.idc_v18.dicom_all` AS dicom_all
+  `bigquery-public-data.idc_v19.dicom_all` AS dicom_all
 JOIN
-  `bigquery-public-data.idc_v18.dicom_metadata_curated` AS dicom_curated
+  `bigquery-public-data.idc_v19.dicom_metadata_curated` AS dicom_curated
 ON
   dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
 GROUP BY

idc_index_data-19.0.0/scripts/sql/prior_versions_index.sql ADDED Viewed

@@ -0,0 +1,89 @@
+-- For details on the syntax, see
+-- https://cloud.google.com/bigquery/docs/reference/standard-sql/procedural-language
+--
+-- Step 1: Declare variables
+DECLARE idc_versions ARRAY<INT64>;
+DECLARE latest_idc_version INT64 DEFAULT 19;
+DECLARE union_all_query STRING;
+-- Step 2 (disabled): derive latest_idc_version from the version_metadata table instead of hard-coding it
+--SET latest_idc_version = (
+--SELECT max(idc_version)
+--FROM
+--bigquery-public-data.idc_current.version_metadata
+--);
+-- Step 3: Get all idc_versions
+SET idc_versions = (
+  SELECT GENERATE_ARRAY(1, latest_idc_version)
+  -- SELECT [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]
+  --SELECT ARRAY_AGG(idc_version)
+  --FROM
+  --`bigquery-public-data.idc_current.version_metadata`
+);
+-- Step 4: Generate the UNION ALL query dynamically
+SET union_all_query = (
+  SELECT STRING_AGG(
+    FORMAT("""
+    SELECT
+    %d AS idc_version,
+    collection_id,
+    PatientID,
+    SeriesInstanceUID,
+    StudyInstanceUID,
+    Modality,
+    regexp_extract(gcs_url, 'gs://([^/]+)/') as gcs_bucket,
+    crdc_series_uuid,
+    ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
+  FROM
+  `bigquery-public-data.idc_v%d.dicom_all` AS dicom_all
+  where crdc_series_uuid not in (select distinct crdc_series_uuid from `bigquery-public-data.idc_v%d.dicom_all`)
+  GROUP BY
+  1,2,3,4,5,6,7,8
+  """,
+  version, version, latest_idc_version),
+    " UNION ALL "
+  )
+  FROM UNNEST(idc_versions) AS version
+);
+-- Step 5: Execute the complete query
+EXECUTE IMMEDIATE FORMAT("""
+WITH all_versions AS (
+  %s
+)
+SELECT
+  collection_id,
+  PatientID,
+  SeriesInstanceUID,
+  StudyInstanceUID,
+  Modality,
+  gcs_bucket,
+  crdc_series_uuid,
+  series_size_MB,
+  CASE
+  # map GCS bucket to AWS bucket, since for idc-index we prefer AWS
+  # if new buckets are included in IDC, this will need to be updated!
+  WHEN gcs_bucket='public-datasets-idc' THEN CONCAT('s3://','idc-open-data/',crdc_series_uuid, '/*')
+  WHEN gcs_bucket='idc-open-idc1' THEN CONCAT('s3://','idc-open-data-two/',crdc_series_uuid, '/*')
+  WHEN gcs_bucket='idc-open-cr' THEN CONCAT('s3://','idc-open-data-cr/',crdc_series_uuid, '/*')
+    END AS series_aws_url,
+  MIN(idc_version) AS min_idc_version,
+  MAX(idc_version) AS max_idc_version
+FROM all_versions
+where gcs_bucket not in ('idc-open-idc')
+#per @bcli4d:idc-open-idc was our public bucket before we moved most data to the Google owned public-datasets-idc.
+#We decided at the time to not touch BQ. To deal with this and other cases where some metadata can change (Licences),
+#we include the mutable_metadata table which maps crdc_instance_uuid to current gcs_url, aws_url, license, doi.
+GROUP BY
+ 1,2,3,4,5,6,7,8
+  """,
+  union_all_query
+);

{idc_index_data-18.1.0 → idc_index_data-19.0.0}/src/idc_index_data/__init__.py RENAMED Viewed

@@ -15,6 +15,7 @@ __all__ = [
     "__version__",
     "IDC_INDEX_CSV_ARCHIVE_FILEPATH",
     "IDC_INDEX_PARQUET_FILEPATH",
+    "PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH",
 ]
@@ -36,3 +37,6 @@ IDC_INDEX_CSV_ARCHIVE_FILEPATH: Path | None = _lookup(
     "idc_index_data/idc_index.csv.zip", optional=True
 )
 IDC_INDEX_PARQUET_FILEPATH: Path | None = _lookup("idc_index_data/idc_index.parquet")
+PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH: Path | None = _lookup(
+    "idc_index_data/prior_versions_index.parquet"
+)

{idc_index_data-18.1.0 → idc_index_data-19.0.0}/tests/test_package.py RENAMED Viewed

@@ -7,7 +7,7 @@ from packaging.version import Version
 import idc_index_data as m
-EXPECTED_IDC_INDEX_VERSION = 18
+EXPECTED_IDC_INDEX_VERSION = 19
 def test_version():
@@ -38,3 +38,8 @@ def test_reading_index():
         assert m.IDC_INDEX_PARQUET_FILEPATH.is_file()
         df_parquet = pd.read_parquet(m.IDC_INDEX_PARQUET_FILEPATH)
         assert not df_parquet.empty
+    if m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH is not None:
+        assert m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH.is_file()
+        df_parquet = pd.read_parquet(m.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH)
+        assert not df_parquet.empty