dist-s1-enumerator 1.0.3__tar.gz → 1.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/CHANGELOG.md +17 -0
  2. {dist_s1_enumerator-1.0.3/src/dist_s1_enumerator.egg-info → dist_s1_enumerator-1.0.5}/PKG-INFO +1 -1
  3. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/asf.py +41 -18
  4. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/dist_enum.py +33 -13
  5. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5/src/dist_s1_enumerator.egg-info}/PKG-INFO +1 -1
  6. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/test_asf.py +22 -1
  7. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/test_dist_enum.py +111 -0
  8. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/.github/dependabot.yml +0 -0
  9. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/.github/workflows/bump-tag-for-release.yml +0 -0
  10. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/.github/workflows/changelog-updated.yml +0 -0
  11. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/.github/workflows/github-release.yml +0 -0
  12. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/.github/workflows/labeled-pr.yml +0 -0
  13. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/.github/workflows/nb_tests.yml +0 -0
  14. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/.github/workflows/pypi-release.yml +0 -0
  15. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/.github/workflows/static_analysis.yml +0 -0
  16. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/.github/workflows/tests.yml +0 -0
  17. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/.gitignore +0 -0
  18. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/LICENSE +0 -0
  19. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/README.md +0 -0
  20. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/environment.yml +0 -0
  21. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/notebooks/A__Staging_Inputs_for_One_MGRS_Tile.ipynb +0 -0
  22. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/notebooks/B__Enumerate_MGRS_tile.ipynb +0 -0
  23. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/pyproject.toml +0 -0
  24. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/setup.cfg +0 -0
  25. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/__init__.py +0 -0
  26. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/data/jpl_burst_geo.parquet +0 -0
  27. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/data/mgrs.parquet +0 -0
  28. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/data/mgrs_burst_lookup_table.parquet +0 -0
  29. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/dist_enum_inputs.py +0 -0
  30. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/exceptions.py +0 -0
  31. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/mgrs_burst_data.py +0 -0
  32. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/param_models.py +0 -0
  33. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/py.typed +0 -0
  34. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/rtc_s1_io.py +0 -0
  35. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator/tabular_models.py +0 -0
  36. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator.egg-info/SOURCES.txt +0 -0
  37. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator.egg-info/dependency_links.txt +0 -0
  38. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator.egg-info/not-zip-safe +0 -0
  39. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator.egg-info/requires.txt +0 -0
  40. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/src/dist_s1_enumerator.egg-info/top_level.txt +0 -0
  41. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/conftest.py +0 -0
  42. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/data/mock_ts_data_for_enum.ipynb +0 -0
  43. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/data/rtc_s1_ts_metadata/chile_19HBD.parquet +0 -0
  44. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/data/rtc_s1_ts_metadata/mgrs01UBT.parquet +0 -0
  45. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/data/rtc_s1_ts_metadata/mgrs11SLT_11SLU_11SMT.parquet +0 -0
  46. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/data/rtc_s1_ts_metadata/mgrs15RXN__track63.parquet +0 -0
  47. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/data/rtc_s1_ts_metadata/mgrs22WFD.parquet +0 -0
  48. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/data/ts_data_for_workflow_inputs.ipynb +0 -0
  49. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/test_dist_enum_inputs.py +0 -0
  50. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/test_mgrs_burst_data.py +0 -0
  51. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/test_notebooks.py +0 -0
  52. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/test_package.py +0 -0
  53. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/test_param_model.py +0 -0
  54. {dist_s1_enumerator-1.0.3 → dist_s1_enumerator-1.0.5}/tests/test_rtc_s1_io.py +0 -0
@@ -6,6 +6,23 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [PEP 440](https://www.python.org/dev/peps/pep-0440/)
7
7
  and uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
8
8
 
9
+ ## [1.0.5] - 2025-09-29
10
+
11
+ ### Fixed
12
+ * CMR metadata does not have correctly migrated urls from ASF datapool to ASF cumulus - see https://github.com/opera-adt/dist-s1/issues/158.
13
+
14
+
15
+ ## [1.0.4] - 2025-09-29
16
+
17
+ ### Added
18
+ * Update time-series enumeration for multiple polarizations within an MGRS tile.
19
+ - We now ensure that for each MGRS tile, a single fixed spatial burst creates a baseline (set of pre-images) for a given RTC-S1 burst product. That is, if the recent data was VV+VH in a burst, then the baseline for that burst is VV+VH. Multiple dual polarization (i.e. both VV+VH and HH+HV) data can be used within a single MGRS tile.
20
+ * We now ensure that single polarization data is excluded from baselines and not used in the creation of the post-image set.
21
+
22
+ ### Fixed
23
+ * Bug in enumerating 1 product - we did not ensure spatial bursts were consistent between pre-/post-image sets.
24
+
25
+
9
26
  ## [1.0.3] - 2025-09-09
10
27
 
11
28
  ### Fixed
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dist-s1-enumerator
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: Enumeration and ops library for the OPERA DIST-S1 project
5
5
  Author-email: "Richard West, Charlie Marshak, Talib Oliver-Cabrera, and Jungkyo Jung" <charlie.z.marshak@jpl.nasa.gov>
6
6
  License: Apache-2.0
@@ -12,6 +12,27 @@ from dist_s1_enumerator.mgrs_burst_data import get_burst_ids_in_mgrs_tiles, get_
12
12
  from dist_s1_enumerator.tabular_models import reorder_columns, rtc_s1_resp_schema, rtc_s1_schema
13
13
 
14
14
 
15
+ def convert_asf_url_to_cumulus(url: str) -> str:
16
+ asf_base = 'https://datapool.asf.alaska.edu/RTC/OPERA-S1/'
17
+ cumulus_base = 'https://cumulus.asf.earthdatacloud.nasa.gov/OPERA/OPERA_L2_RTC-S1/'
18
+
19
+ if not (url.startswith(cumulus_base) or url.startswith(asf_base)):
20
+ warn(f'URL {url} is not a valid ASF datapool or cumulus earthdatacloud URL.')
21
+ return url
22
+
23
+ if not url.startswith(asf_base):
24
+ return url
25
+
26
+ filename = url.split('/')[-1]
27
+ granule_pol_parts = filename.rsplit('_', 1)
28
+ if len(granule_pol_parts) != 2:
29
+ raise ValueError(f'Could not extract granule name from filename: {filename}')
30
+
31
+ granule_name = granule_pol_parts[0]
32
+ new_url = f'{cumulus_base}{granule_name}/{filename}'
33
+ return new_url
34
+
35
+
15
36
  def format_polarization(pol: list | str) -> str:
16
37
  if isinstance(pol, list):
17
38
  if ('VV' in pol) and len(pol) == 2:
@@ -72,6 +93,7 @@ def get_rtc_s1_ts_metadata_by_burst_ids(
72
93
  start_acq_dt: str | datetime | None | pd.Timestamp = None,
73
94
  stop_acq_dt: str | datetime | None | pd.Timestamp = None,
74
95
  polarizations: str | None = None,
96
+ include_single_polarization: bool = False,
75
97
  ) -> gpd.GeoDataFrame:
76
98
  """Wrap/format the ASF search API for RTC-S1 metadata search. All searches go through this function.
77
99
 
@@ -138,37 +160,36 @@ def get_rtc_s1_ts_metadata_by_burst_ids(
138
160
  df_rtc['polarizations'] = df_rtc['polarizations'].map(format_polarization)
139
161
  if polarizations is not None:
140
162
  ind_pol = df_rtc['polarizations'] == polarizations
141
- else:
163
+ elif not include_single_polarization:
142
164
  ind_pol = df_rtc['polarizations'].isin(['HH+HV', 'VV+VH'])
165
+ else:
166
+ ind_pol = df_rtc['polarizations'].isin(['HH+HV', 'VV+VH', 'HH', 'HV', 'VV', 'VH'])
143
167
  if not ind_pol.any():
144
- raise ValueError(f'No valid dual polarization images found for {burst_ids}.')
168
+ warn(f'No valid dual polarization images found for {burst_ids}.')
145
169
  # First get all the dual-polarizations images
146
170
  df_rtc = df_rtc[ind_pol].reset_index(drop=True)
147
- # Then check all the dual-polarizations are the same (either HH+HV or VV+VH)
148
- # TODO: if there are mixtures, can DIST-S1 still be generated assuming they look the same?
149
- polarizations_unique = df_rtc['polarizations'].unique().tolist()
150
- if len(polarizations_unique) > 1:
151
- raise ValueError(
152
- f'Mixed dual polarizations found for {burst_ids}. That is, some images are HH+HV and others are VV+HV.'
153
- )
154
- else:
155
- # Either HH+HV or VV+VH
156
- copol, crosspol = polarizations_unique[0].split('+')
157
171
 
158
172
  def get_url_by_polarization(prod_urls: list[str], polarization_token: str) -> list[str]:
159
- possible_urls = [url for url in prod_urls if f'_{polarization_token}.tif' == url[-7:]]
173
+ if polarization_token == 'copol':
174
+ polarizations_allowed = ['VV', 'HH']
175
+ elif polarization_token == 'crosspol':
176
+ polarizations_allowed = ['HV', 'VH']
177
+ else:
178
+ raise ValueError(f'Invalid polarization token: {polarization_token}. Must be one of: copol, crosspol.')
179
+ possible_urls = [url for pol in polarizations_allowed for url in prod_urls if f'_{pol}.tif' == url[-7:]]
160
180
  if len(possible_urls) == 0:
161
- raise ValueError(f'No {polarization_token} urls found')
181
+ raise ValueError(f'No {polarizations_allowed} urls found')
162
182
  if len(possible_urls) > 1:
163
- breakpoint()
164
- raise ValueError(f'Multiple {polarization_token} urls found')
183
+ raise ValueError(f'Multiple {polarization_token} urls found: {", ".join(possible_urls)}')
165
184
  return possible_urls[0]
166
185
 
167
- url_copol = df_rtc.all_urls.map(lambda urls_for_prod: get_url_by_polarization(urls_for_prod, copol))
168
- url_crosspol = df_rtc.all_urls.map(lambda urls_for_prod: get_url_by_polarization(urls_for_prod, crosspol))
186
+ url_copol = df_rtc.all_urls.map(lambda urls_for_prod: get_url_by_polarization(urls_for_prod, 'copol'))
187
+ url_crosspol = df_rtc.all_urls.map(lambda urls_for_prod: get_url_by_polarization(urls_for_prod, 'crosspol'))
169
188
 
170
189
  df_rtc['url_copol'] = url_copol
171
190
  df_rtc['url_crosspol'] = url_crosspol
191
+ df_rtc['url_copol'] = df_rtc['url_copol'].map(convert_asf_url_to_cumulus)
192
+ df_rtc['url_crosspol'] = df_rtc['url_crosspol'].map(convert_asf_url_to_cumulus)
172
193
  df_rtc = df_rtc.drop(columns=['all_urls'])
173
194
 
174
195
  # Ensure the data is sorted by jpl_burst_id and acq_dt
@@ -187,6 +208,7 @@ def get_rtc_s1_metadata_from_acq_group(
187
208
  start_acq_dt: datetime | str | None = None,
188
209
  stop_acq_dt: datetime | str | None = None,
189
210
  max_variation_seconds: float | None = None,
211
+ polarizations: str | None = None,
190
212
  ) -> gpd.GeoDataFrame:
191
213
  """
192
214
  Meant for acquiring a pre-image or post-image set from MGRS tiles for a given S1 pass.
@@ -241,6 +263,7 @@ def get_rtc_s1_metadata_from_acq_group(
241
263
  burst_ids,
242
264
  start_acq_dt=start_acq_dt,
243
265
  stop_acq_dt=stop_acq_dt,
266
+ polarizations=polarizations,
244
267
  )
245
268
  # Assumes that each group is ordered by date (earliest first and most recent last)
246
269
  columns = df_rtc.columns
@@ -117,7 +117,6 @@ def enumerate_one_dist_s1_product(
117
117
  max_variation_seconds=300,
118
118
  n_images_per_burst=1,
119
119
  )
120
-
121
120
  if df_rtc_post.empty:
122
121
  raise ValueError(f'No RTC-S1 post-images found for track {track_number} in MGRS tile {mgrs_tile_id}.')
123
122
 
@@ -137,6 +136,11 @@ def enumerate_one_dist_s1_product(
137
136
  stop_acq_dt=stop_acq_dt,
138
137
  n_images_per_burst=max_pre_imgs_per_burst,
139
138
  )
139
+ df_unique_keys = df_rtc_post[['jpl_burst_id', 'polarizations']].drop_duplicates()
140
+
141
+ df_rtc_pre = pd.merge(df_rtc_pre, df_unique_keys, on=['jpl_burst_id', 'polarizations'], how='inner')
142
+
143
+ df_rtc_pre['input_category'] = 'pre'
140
144
 
141
145
  elif lookback_strategy == 'multi_window':
142
146
  df_rtc_pre_list = []
@@ -155,16 +159,22 @@ def enumerate_one_dist_s1_product(
155
159
  latest_lookback = delta_lookback_day
156
160
  start_acq_dt = post_date_min - timedelta(days=latest_lookback)
157
161
  stop_acq_dt = post_date_min - timedelta(days=earliest_lookback)
158
- df_rtc_pre = get_rtc_s1_metadata_from_acq_group(
162
+ df_rtc_pre_window = get_rtc_s1_metadata_from_acq_group(
159
163
  [mgrs_tile_id],
160
164
  track_numbers=track_numbers,
161
165
  start_acq_dt=start_acq_dt,
162
166
  stop_acq_dt=stop_acq_dt,
163
167
  n_images_per_burst=max_pre_img_per_burst,
168
+ polarizations=None,
164
169
  )
170
+ df_unique_keys = df_rtc_post[['jpl_burst_id', 'polarizations']].drop_duplicates()
165
171
 
166
- if not df_rtc_pre.empty:
167
- df_rtc_pre_list.append(df_rtc_pre)
172
+ df_rtc_pre_window = pd.merge(
173
+ df_rtc_pre_window, df_unique_keys, on=['jpl_burst_id', 'polarizations'], how='inner'
174
+ )
175
+
176
+ if not df_rtc_pre_window.empty:
177
+ df_rtc_pre_list.append(df_rtc_pre_window)
168
178
 
169
179
  df_rtc_pre = pd.concat(df_rtc_pre_list, ignore_index=True) if df_rtc_pre_list else pd.DataFrame()
170
180
 
@@ -179,7 +189,7 @@ def enumerate_one_dist_s1_product(
179
189
  df_rtc_pre = df_rtc_pre[df_rtc_pre.jpl_burst_id.isin(burst_ids_with_min_pre_images)].reset_index(drop=True)
180
190
 
181
191
  post_burst_ids = df_rtc_post.jpl_burst_id.unique().tolist()
182
- pre_burst_ids = df_rtc_post.jpl_burst_id.unique().tolist()
192
+ pre_burst_ids = df_rtc_pre.jpl_burst_id.unique().tolist()
183
193
 
184
194
  final_burst_ids = list(set(post_burst_ids) & set(pre_burst_ids))
185
195
  df_rtc_pre = df_rtc_pre[df_rtc_pre.jpl_burst_id.isin(final_burst_ids)].reset_index(drop=True)
@@ -308,10 +318,15 @@ def enumerate_dist_s1_products(
308
318
  ind_time = (df_rtc_ts_tile_track.acq_dt < window_stop) & (
309
319
  df_rtc_ts_tile_track.acq_dt >= window_start
310
320
  )
321
+ df_rtc_ts_tile_track_filtered = df_rtc_ts_tile_track[ind_time].reset_index(drop=True)
311
322
  # Select images that are present in the post-image
312
- ind_burst = df_rtc_ts_tile_track.jpl_burst_id.isin(df_rtc_post.jpl_burst_id)
313
- ind = ind_time & ind_burst
314
- df_rtc_pre = df_rtc_ts_tile_track[ind].reset_index(drop=True)
323
+ df_unique_keys = df_rtc_post[['jpl_burst_id', 'polarizations']].drop_duplicates()
324
+ df_rtc_pre = pd.merge(
325
+ df_rtc_ts_tile_track_filtered,
326
+ df_unique_keys,
327
+ on=['jpl_burst_id', 'polarizations'],
328
+ how='inner',
329
+ )
315
330
  df_rtc_pre['input_category'] = 'pre'
316
331
 
317
332
  # It is unclear how merging when multiple MGRS tiles are provided will impact order so this
@@ -343,10 +358,15 @@ def enumerate_dist_s1_products(
343
358
  ind_time = (df_rtc_ts_tile_track.acq_dt < window_stop) & (
344
359
  df_rtc_ts_tile_track.acq_dt >= window_start
345
360
  )
346
- # Select images that are present in the post-image
347
- ind_burst = df_rtc_ts_tile_track.jpl_burst_id.isin(df_rtc_post.jpl_burst_id)
348
- ind = ind_time & ind_burst
349
- df_rtc_pre = df_rtc_ts_tile_track[ind].reset_index(drop=True)
361
+ df_rtc_ts_tile_track_filtered = df_rtc_ts_tile_track[ind_time].reset_index(drop=True)
362
+
363
+ df_unique_keys = df_rtc_post[['jpl_burst_id', 'polarizations']].drop_duplicates()
364
+ df_rtc_pre = pd.merge(
365
+ df_rtc_ts_tile_track_filtered,
366
+ df_unique_keys,
367
+ on=['jpl_burst_id', 'polarizations'],
368
+ how='inner',
369
+ )
350
370
  df_rtc_pre['input_category'] = 'pre'
351
371
 
352
372
  # It is unclear how merging when multiple MGRS tiles are provided will impact order so this
@@ -361,7 +381,7 @@ def enumerate_dist_s1_products(
361
381
  continue
362
382
 
363
383
  if not df_rtc_pre.empty:
364
- df_rtc_pre_list.append(df_rtc_pre) # Store each df_rtc_pre
384
+ df_rtc_pre_list.append(df_rtc_pre)
365
385
 
366
386
  # Concatenate all df_rtc_pre into a single DataFrame
367
387
  df_rtc_pre_final = (
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dist-s1-enumerator
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: Enumeration and ops library for the OPERA DIST-S1 project
5
5
  Author-email: "Richard West, Charlie Marshak, Talib Oliver-Cabrera, and Jungkyo Jung" <charlie.z.marshak@jpl.nasa.gov>
6
6
  License: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  import pytest
2
2
 
3
- from dist_s1_enumerator.asf import append_pass_data, get_rtc_s1_ts_metadata_by_burst_ids
3
+ from dist_s1_enumerator.asf import append_pass_data, convert_asf_url_to_cumulus, get_rtc_s1_ts_metadata_by_burst_ids
4
4
 
5
5
 
6
6
  @pytest.mark.integration
@@ -35,3 +35,24 @@ def test_appending_mgrs_tiles() -> None:
35
35
 
36
36
  df_rtc_formatted_no_rows = append_pass_data(df_rtc_resp, ['22NFF'])
37
37
  assert df_rtc_formatted_no_rows.empty
38
+
39
+
40
+ @pytest.mark.parametrize('pol_token', ['VV', 'VH', 'HH', 'HV'])
41
+ def test_convert_asf_url_to_cumulus_from_datapool(pol_token: str) -> None:
42
+ """Test converting ASF datapool URL to cumulus earthdatacloud URL."""
43
+ asf_url = f'https://datapool.asf.alaska.edu/RTC/OPERA-S1/OPERA_L2_RTC-S1_T001-000189-IW2_20211028T180924Z_20250703T015334Z_S1A_30_v1.0_{pol_token}.tif'
44
+ expected_cumulus_url = f'https://cumulus.asf.earthdatacloud.nasa.gov/OPERA/OPERA_L2_RTC-S1/OPERA_L2_RTC-S1_T001-000189-IW2_20211028T180924Z_20250703T015334Z_S1A_30_v1.0/OPERA_L2_RTC-S1_T001-000189-IW2_20211028T180924Z_20250703T015334Z_S1A_30_v1.0_{pol_token}.tif'
45
+
46
+ result = convert_asf_url_to_cumulus(asf_url)
47
+
48
+ assert result == expected_cumulus_url
49
+
50
+
51
+ @pytest.mark.parametrize('pol_token', ['VV', 'VH', 'HH', 'HV'])
52
+ def test_convert_asf_url_to_cumulus_already_cumulus(pol_token: str) -> None:
53
+ """Test that cumulus URLs are returned unchanged."""
54
+ cumulus_url = f'https://cumulus.asf.earthdatacloud.nasa.gov/OPERA/OPERA_L2_RTC-S1/OPERA_L2_RTC-S1_T001-000189-IW2_20211028T180924Z_20250703T015334Z_S1A_30_v1.0/OPERA_L2_RTC-S1_T001-000189-IW2_20211028T180924Z_20250703T015334Z_S1A_30_v1.0_{pol_token}.tif'
55
+
56
+ result = convert_asf_url_to_cumulus(cumulus_url)
57
+
58
+ assert result == cumulus_url
@@ -247,3 +247,114 @@ def test_burst_ids_consistent_between_pre_and_post(mgrs_tile_ids: list[str], tra
247
247
  df_pre = df_product[df_product['input_category'] == 'pre'].reset_index(drop=True)
248
248
  df_post = df_product[df_product['input_category'] == 'post'].reset_index(drop=True)
249
249
  assert sorted(df_pre['jpl_burst_id'].unique().tolist()) == sorted(df_post['jpl_burst_id'].unique().tolist())
250
+
251
+
252
+ @pytest.mark.integration
253
+ def test_dist_enum_one_with_multi_window_with_multiple_polarizations_and_asf_daac() -> None:
254
+ """Test enumeration of 1 product with multi_window strategy with multiple dual polarization data.
255
+
256
+ Context: MGRS Tile 20TLP: https://search.asf.alaska.edu/#/?polygon=
257
+ POLYGON((-65.5041%2044.226,-65.4632%2043.2383,-64.1113%2043.2594,-64.1298%2044.2478,-65.5041%2044.226))
258
+ &start=2025-09-18T07:00:00Z&end=2025-09-20T06:59:59Z&resultsLoaded=true&zoom=8.078
259
+ &center=-63.112,42.844&dataset=OPERA-S1&productTypes=RTC
260
+ &granule=OPERA_L2_RTC-S1_T171-365960-IW2_20250919T102314Z_20250919T135744Z_S1C_30_v1.0
261
+ """
262
+ df_product = enumerate_one_dist_s1_product(
263
+ '20TLP',
264
+ track_number=171,
265
+ post_date='2025-09-19',
266
+ lookback_strategy='multi_window',
267
+ # Need to look back further for valid VV+VH data
268
+ delta_lookback_days=(1460, 1095, 730, 365),
269
+ max_pre_imgs_per_burst=(3, 3, 3, 4),
270
+ )
271
+
272
+ assert sorted(df_product.polarizations.unique().tolist()) == ['HH+HV', 'VV+VH']
273
+
274
+ df_sample_vvvh_burst = df_product[df_product.jpl_burst_id == 'T171-365965-IW3'].reset_index(drop=True)
275
+ dates_for_sample_vvvh_burst = sorted(df_sample_vvvh_burst['acq_date_for_mgrs_pass'].unique().tolist())
276
+ # Note the last date is the post date
277
+ expected_dates = ['2020-09-21', '2021-05-19', '2021-05-31', '2025-09-19']
278
+ assert dates_for_sample_vvvh_burst == expected_dates
279
+
280
+ # Check baseline data
281
+ # The post image is VV+VH
282
+ # Ref: https://search.asf.alaska.edu/#/?dataset=OPERA-S1&productTypes=RTC&operaBurstID=T171_365965_IW3&zoom=3.000
283
+ # &center=-74.108,31.979
284
+ # &resultsLoaded=true&granule=OPERA_L2_RTC-S1_T171-365965-IW3_20250919T102329Z_20250919T145901Z_S1C_30_v1.0
285
+ opera_ids = df_sample_vvvh_burst.opera_id.unique().tolist()
286
+ opera_ids_trunc = ['_'.join(op_id.split('_')[:5]) for op_id in opera_ids]
287
+ # another VV+VH image
288
+ assert 'OPERA_L2_RTC-S1_T171-365965-IW3_20200921T102347Z' in opera_ids_trunc
289
+ # a HH+HV image in the time series - there is only one image from 2024 so should be in if it weren't 2024
290
+ assert 'OPERA_L2_RTC-S1_T171-365965-IW3_20240427T102443Z' not in opera_ids_trunc
291
+
292
+
293
+ @pytest.mark.integration
294
+ def test_dist_enum_one_with_multi_window_with_asf_daac() -> None:
295
+ df_product = enumerate_one_dist_s1_product(
296
+ '11SLT',
297
+ track_number=144,
298
+ post_date='2025-06-19',
299
+ lookback_strategy='multi_window',
300
+ delta_lookback_days=(1095, 730, 365),
301
+ max_pre_imgs_per_burst=(3, 3, 4),
302
+ )
303
+ burst_ids_expected = [
304
+ 'T144-308024-IW1',
305
+ 'T144-308025-IW1',
306
+ 'T144-308026-IW1',
307
+ 'T144-308027-IW1',
308
+ 'T144-308028-IW1',
309
+ 'T144-308029-IW1',
310
+ 'T144-308030-IW1',
311
+ 'T144-308031-IW1',
312
+ ]
313
+ assert sorted(df_product['jpl_burst_id'].unique().tolist()) == sorted(burst_ids_expected)
314
+
315
+ post_ind = df_product.input_category == 'post'
316
+ df_product_post = df_product[post_ind].reset_index(drop=True)
317
+
318
+ pre_ind = df_product.input_category == 'pre'
319
+ df_product_pre = df_product[pre_ind].reset_index(drop=True)
320
+
321
+ pre_dates_expected = [
322
+ '2024-06-12',
323
+ '2024-05-31',
324
+ '2024-05-19',
325
+ '2024-05-07',
326
+ '2023-06-18',
327
+ '2023-06-06',
328
+ '2023-05-25',
329
+ '2022-06-11',
330
+ '2022-05-30',
331
+ '2022-05-18',
332
+ ]
333
+ assert sorted(df_product_pre['acq_date_for_mgrs_pass'].unique().tolist()) == sorted(pre_dates_expected)
334
+
335
+ assert df_product_post['acq_date_for_mgrs_pass'].unique().tolist() == ['2025-06-19']
336
+
337
+
338
+ @pytest.mark.integration
339
+ def test_dist_enum_one_with_multi_window_with_asf_daac_single_polarization() -> None:
340
+ """
341
+ Test enumeration of 1 product with multi_window strategy with single polarization data in post-image set.
342
+
343
+ The dataframe should be empty!
344
+
345
+ https://search.asf.alaska.edu/#/?maxResults=250&zoom=4.562&center=144.313,-7.683
346
+ &polygon=POLYGON((-242.1832%205.9478,-231.2276%205.9478,-231.2276%2018.4899,-242.1832%2018.4899,-242.1832%205.9478))
347
+ &dataset=OPERA-S1&productTypes=RTC&start=2024-10-18T08:00:00Z
348
+ &end=2024-10-31T07:59:59Z&resultsLoaded=true
349
+ &granule=OPERA_L2_RTC-S1_T069-146165-IW2_20241029T100013Z_20241029T204425Z_S1A_30_v1.0
350
+ &flightDirs=Ascending
351
+ """
352
+ with pytest.raises(ValueError, match='No RTC-S1 post-images found for track 69 in MGRS tile 51QUU.'):
353
+ _ = enumerate_one_dist_s1_product(
354
+ '51QUU',
355
+ track_number=69,
356
+ post_date='2024-10-29',
357
+ lookback_strategy='multi_window',
358
+ delta_lookback_days=(730, 365),
359
+ max_pre_imgs_per_burst=(3, 4),
360
+ )