dist-s1-enumerator 1.0.8 (dist_s1_enumerator-1.0.8-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dist_s1_enumerator/__init__.py +57 -0
- dist_s1_enumerator/asf.py +328 -0
- dist_s1_enumerator/constants.py +50 -0
- dist_s1_enumerator/data/jpl_burst_geo.parquet +0 -0
- dist_s1_enumerator/data/mgrs.parquet +0 -0
- dist_s1_enumerator/data/mgrs_burst_lookup_table.parquet +0 -0
- dist_s1_enumerator/dist_enum.py +425 -0
- dist_s1_enumerator/dist_enum_inputs.py +138 -0
- dist_s1_enumerator/exceptions.py +2 -0
- dist_s1_enumerator/mgrs_burst_data.py +170 -0
- dist_s1_enumerator/param_models.py +100 -0
- dist_s1_enumerator/py.typed +0 -0
- dist_s1_enumerator/rtc_s1_io.py +142 -0
- dist_s1_enumerator/tabular_models.py +91 -0
- dist_s1_enumerator-1.0.8.dist-info/METADATA +295 -0
- dist_s1_enumerator-1.0.8.dist-info/RECORD +19 -0
- dist_s1_enumerator-1.0.8.dist-info/WHEEL +5 -0
- dist_s1_enumerator-1.0.8.dist-info/licenses/LICENSE +202 -0
- dist_s1_enumerator-1.0.8.dist-info/top_level.txt +1 -0
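
The sections below reproduce the added files. For orientation, here is a minimal, hypothetical sketch of how the pieces are meant to compose; the function names come from the diffed modules, while the AOI coordinates and output directory are illustrative assumptions, not values from the package:

# Hypothetical end-to-end sketch (not from the package docs): find MGRS tiles
# over an area of interest, look up their burst ids, and localize RTC-S1 data.
from shapely.geometry import Point

from dist_s1_enumerator.mgrs_burst_data import (
    get_burst_ids_in_mgrs_tiles,
    get_mgrs_tiles_overlapping_geometry,
)
from dist_s1_enumerator.rtc_s1_io import localize_rtc_s1_ts

aoi = Point(-118.25, 34.05)  # illustrative lon/lat
df_tiles = get_mgrs_tiles_overlapping_geometry(aoi)
burst_ids = get_burst_ids_in_mgrs_tiles(df_tiles.mgrs_tile_id.tolist())

# An RTC-S1 metadata table (assembled by asf.py, not shown in this diff) could
# then be downloaded to a local directory:
# df_local = localize_rtc_s1_ts(df_rtc_ts, 'rtc_data', max_workers=5)
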
dist_s1_enumerator/mgrs_burst_data.py
@@ -0,0 +1,170 @@
from functools import lru_cache
from pathlib import Path

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, Polygon

from dist_s1_enumerator.exceptions import NoMGRSCoverage
from dist_s1_enumerator.tabular_models import burst_mgrs_lut_schema, burst_schema, mgrs_tile_schema, reorder_columns


DATA_DIR = Path(__file__).resolve().parent / 'data'


def get_mgrs_burst_lut_path() -> Path:
    parquet_path = DATA_DIR / 'mgrs_burst_lookup_table.parquet'
    return parquet_path


def get_mgrs_data_path() -> Path:
    parquet_path = DATA_DIR / 'mgrs.parquet'
    return parquet_path


def get_burst_data_path() -> Path:
    parquet_path = DATA_DIR / 'jpl_burst_geo.parquet'
    return parquet_path


def get_burst_table(burst_ids: list[str] | str | None = None) -> gpd.GeoDataFrame:
    parquet_path = get_burst_data_path()
    if burst_ids is None:
        df = gpd.read_parquet(parquet_path)
    else:
        if isinstance(burst_ids, str):
            burst_ids = [burst_ids]
        filters = [('jpl_burst_id', 'in', burst_ids)]
        df = gpd.read_parquet(parquet_path, filters=filters)
        if df.empty:
            burst_ids_str = ', '.join(map(str, burst_ids))
            raise ValueError(f'No burst data found for {burst_ids_str}.')
    burst_schema.validate(df)
    df = reorder_columns(df, burst_schema)
    return df.reset_index(drop=True)


@lru_cache
def get_mgrs_burst_lut() -> gpd.GeoDataFrame:
    parquet_path = get_mgrs_burst_lut_path()
    df = pd.read_parquet(parquet_path)
    burst_mgrs_lut_schema.validate(df)
    df = reorder_columns(df, burst_mgrs_lut_schema)
    return df.reset_index(drop=True)


def get_lut_by_mgrs_tile_ids(mgrs_tile_ids: str | list[str]) -> gpd.GeoDataFrame:
    if isinstance(mgrs_tile_ids, str):
        mgrs_tile_ids = [mgrs_tile_ids]
    parquet_path = get_mgrs_burst_lut_path()
    filters = [('mgrs_tile_id', 'in', mgrs_tile_ids)]
    df_mgrs_burst_lut = pd.read_parquet(parquet_path, filters=filters)
    if df_mgrs_burst_lut.empty:
        mgrs_tile_ids_str = ', '.join(map(str, mgrs_tile_ids))
        raise ValueError(f'No LUT data found for MGRS tile ids {mgrs_tile_ids_str}.')
    burst_mgrs_lut_schema.validate(df_mgrs_burst_lut)
    df_mgrs_burst_lut = reorder_columns(df_mgrs_burst_lut, burst_mgrs_lut_schema)
    return df_mgrs_burst_lut.reset_index(drop=True)


@lru_cache
def get_mgrs_table() -> gpd.GeoDataFrame:
    path = get_mgrs_data_path()
    df_mgrs = gpd.read_parquet(path)
    mgrs_tile_schema.validate(df_mgrs)
    df_mgrs = reorder_columns(df_mgrs, mgrs_tile_schema)
    return df_mgrs


def get_mgrs_tile_table_by_ids(mgrs_tile_ids: str | list[str]) -> gpd.GeoDataFrame:
    df_mgrs = get_mgrs_table()
    if isinstance(mgrs_tile_ids, str):
        mgrs_tile_ids = [mgrs_tile_ids]
    ind = df_mgrs.mgrs_tile_id.isin(mgrs_tile_ids)
    if not ind.any():
        mgrs_tile_ids_str = ', '.join(map(str, mgrs_tile_ids))
        raise ValueError(f'No MGRS tile data found for {mgrs_tile_ids_str}.')
    df_mgrs_subset = df_mgrs[ind].reset_index(drop=True)
    return df_mgrs_subset


def get_mgrs_tiles_overlapping_geometry(geometry: Polygon | Point) -> gpd.GeoDataFrame:
    df_mgrs = get_mgrs_table()
    ind = df_mgrs.intersects(geometry)
    if not ind.any():
        raise NoMGRSCoverage(
            'We only have MGRS tiles that overlap with DIST-HLS products (this is slightly less than Sentinel-2).'
        )
    df_mgrs_overlapping = df_mgrs[ind].reset_index(drop=True)
    mgrs_tile_schema.validate(df_mgrs_overlapping)
    df_mgrs_overlapping = reorder_columns(df_mgrs_overlapping, mgrs_tile_schema)
    return df_mgrs_overlapping


def get_burst_ids_in_mgrs_tiles(mgrs_tile_ids: list[str] | str, track_numbers: list[int] | None = None) -> list[str]:
    """Get all the burst ids in the provided MGRS tiles.

    If track numbers are provided, gets the burst ids for the single pass associated with those tracks
    in each MGRS tile. Raises an error if multiple acq_group_id_within_mgrs_tile values are found for a
    single MGRS tile.
    """
    df_mgrs_burst_luts = get_lut_by_mgrs_tile_ids(mgrs_tile_ids)
    if isinstance(mgrs_tile_ids, str):
        mgrs_tile_ids = [mgrs_tile_ids]
    if track_numbers is not None:
        if len(track_numbers) > 2:
            raise ValueError(
                'More than 2 track numbers provided. When track numbers are provided, we select data from a single '
                'pass so this is an invalid input.'
            )
        tile_data = []
        for mgrs_tile_id in mgrs_tile_ids:
            ind_temp = (df_mgrs_burst_luts.mgrs_tile_id == mgrs_tile_id) & (
                df_mgrs_burst_luts.track_number.isin(track_numbers)
            )
            df_lut_temp = df_mgrs_burst_luts[ind_temp].reset_index(drop=True)
            if df_lut_temp.empty:
                mgrs_tile_ids_str = ', '.join(map(str, mgrs_tile_ids))
                track_numbers_str = ', '.join(map(str, track_numbers))
                available_track_numbers = (
                    df_mgrs_burst_luts[df_mgrs_burst_luts.mgrs_tile_id == mgrs_tile_id].track_number.unique().tolist()
                )
                available_track_numbers_str = ', '.join(map(str, available_track_numbers))
                raise ValueError(
                    f'Mismatch - no LUT data found for MGRS tile ids {mgrs_tile_ids_str} '
                    f'and track numbers {track_numbers_str}. '
                    f'Available track numbers for tile {mgrs_tile_ids_str} are {available_track_numbers_str}.'
                )
            acq_ids = df_lut_temp.acq_group_id_within_mgrs_tile.unique().tolist()
            if len(acq_ids) != 1:
                track_numbers_str = ', '.join(map(str, track_numbers))
                raise ValueError(
                    f'Multiple acq_group_id_within_mgrs_tile found for mgrs_tile_id {mgrs_tile_id} and '
                    f'track_numbers {track_numbers_str}.'
                )
            acq_id = acq_ids[0]
            df_lut_pass = df_mgrs_burst_luts[df_mgrs_burst_luts.acq_group_id_within_mgrs_tile == acq_id].reset_index(
                drop=True
            )
            tile_data.append(df_lut_pass)
        df_mgrs_burst_luts = pd.concat(tile_data, axis=0)
        # Remove duplicates if sequential track numbers are provided.
        df_mgrs_burst_luts = df_mgrs_burst_luts.drop_duplicates().reset_index(drop=True)

    df_mgrs_burst_luts = df_mgrs_burst_luts.drop_duplicates(subset=['jpl_burst_id', 'mgrs_tile_id'])
    burst_ids = df_mgrs_burst_luts.jpl_burst_id.unique().tolist()
    return burst_ids


def get_burst_table_from_mgrs_tiles(mgrs_tile_ids: str | list[str]) -> gpd.GeoDataFrame:
    df_mgrs_burst_luts = get_lut_by_mgrs_tile_ids(mgrs_tile_ids)
    burst_ids = df_mgrs_burst_luts.jpl_burst_id.unique().tolist()
    df_burst = get_burst_table(burst_ids)
    df_burst = pd.merge(
        df_burst,
        df_mgrs_burst_luts[['jpl_burst_id', 'track_number', 'acq_group_id_within_mgrs_tile', 'mgrs_tile_id']],
        how='left',
        on='jpl_burst_id',
    )
    burst_schema.validate(df_burst)
    df_burst = reorder_columns(df_burst, burst_schema)
    return df_burst.reset_index(drop=True)
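
A short usage sketch for the helpers above; the tile id and track number are illustrative placeholders, and valid values depend on the bundled parquet tables:

from dist_s1_enumerator.mgrs_burst_data import (
    get_burst_ids_in_mgrs_tiles,
    get_burst_table_from_mgrs_tiles,
)

# '11SLT' is a placeholder MGRS tile id; use any id present in the LUT.
burst_ids = get_burst_ids_in_mgrs_tiles('11SLT')

# Restrict to a single pass by its track number(s); more than two raises ValueError.
burst_ids_one_pass = get_burst_ids_in_mgrs_tiles('11SLT', track_numbers=[71])

# Burst geometries merged with the LUT metadata for the same tile.
df_burst = get_burst_table_from_mgrs_tiles('11SLT')
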
dist_s1_enumerator/param_models.py
@@ -0,0 +1,100 @@
from pydantic import BaseModel, ValidationInfo, field_validator


class LookbackStrategyParams(BaseModel):
    """Pydantic model for validating lookback strategy parameters."""

    lookback_strategy: str
    max_pre_imgs_per_burst: int | list[int] | tuple[int, ...]
    delta_lookback_days: int | list[int] | tuple[int, ...]
    min_pre_imgs_per_burst: int
    delta_window_days: int

    @field_validator('delta_window_days')
    @classmethod
    def validate_delta_window_days(cls, v: int) -> int:
        """Validate that delta_window_days does not exceed 365 days."""
        if v > 365:
            raise ValueError('delta_window_days must not exceed 365 days.')
        return v

    @field_validator('lookback_strategy')
    @classmethod
    def validate_lookback_strategy(cls, v: str) -> str:
        """Validate that lookback_strategy is one of the supported values."""
        allowed_strategies = ['immediate_lookback', 'multi_window']
        if v not in allowed_strategies:
            raise ValueError(f'lookback_strategy must be one of {allowed_strategies}, got {v}.')
        return v

    @field_validator('max_pre_imgs_per_burst')
    @classmethod
    def validate_max_pre_imgs_per_burst(
        cls, v: int | list[int] | tuple[int, ...], info: ValidationInfo
    ) -> int | tuple[int, ...]:
        """Validate max_pre_imgs_per_burst based on lookback_strategy."""
        lookback_strategy = info.data.get('lookback_strategy')

        if lookback_strategy == 'immediate_lookback':
            if isinstance(v, list | tuple):
                raise ValueError('max_pre_imgs_per_burst must be a single integer for immediate lookback strategy.')

        elif lookback_strategy == 'multi_window':
            if isinstance(v, int):
                v = (v,) * 3
            elif isinstance(v, list):
                v = tuple(v)

        return v

    @field_validator('delta_lookback_days')
    @classmethod
    def validate_delta_lookback_days(
        cls, v: int | list[int] | tuple[int, ...], info: ValidationInfo
    ) -> int | tuple[int, ...]:
        """Validate delta_lookback_days based on lookback_strategy and max_pre_imgs_per_burst."""
        lookback_strategy = info.data.get('lookback_strategy')
        max_pre_imgs_per_burst = info.data.get('max_pre_imgs_per_burst')

        if lookback_strategy == 'immediate_lookback':
            if v != 0:
                raise ValueError('delta_lookback_days must be 0 for immediate lookback strategy.')

        elif lookback_strategy == 'multi_window':
            if isinstance(v, int):
                if isinstance(max_pre_imgs_per_burst, list | tuple):
                    v = tuple(v * i for i in range(1, len(max_pre_imgs_per_burst) + 1))
                else:
                    # Default to 3 windows when max_pre_imgs_per_burst is a single int.
                    v = tuple(v * i for i in range(1, 3 + 1))
            elif isinstance(v, list):
                v = tuple(v)

            if isinstance(max_pre_imgs_per_burst, list | tuple) and len(v) != len(max_pre_imgs_per_burst):
                raise ValueError(
                    'max_pre_imgs_per_burst and delta_lookback_days must have the same length. '
                    'If max_pre_imgs_per_burst is a single integer, it is interpreted as the maximum '
                    'number of pre-images on 3 anniversary dates, so ensure that `delta_lookback_days` '
                    'is a tuple of length 3 or an integer.'
                )

        return v

    @field_validator('min_pre_imgs_per_burst')
    @classmethod
    def validate_min_pre_imgs_per_burst(cls, v: int, info: ValidationInfo) -> int:
        """Validate that all max_pre_imgs_per_burst values are at least min_pre_imgs_per_burst."""
        max_pre_imgs_per_burst = info.data.get('max_pre_imgs_per_burst')
        lookback_strategy = info.data.get('lookback_strategy')

        if lookback_strategy == 'immediate_lookback':
            if isinstance(max_pre_imgs_per_burst, int) and max_pre_imgs_per_burst < v:
                raise ValueError('max_pre_imgs_per_burst must be greater than or equal to min_pre_imgs_per_burst.')

        elif lookback_strategy == 'multi_window':
            if isinstance(max_pre_imgs_per_burst, list | tuple):
                if any(m < v for m in max_pre_imgs_per_burst):
                    raise ValueError('All values in max_pre_imgs_per_burst must be at least min_pre_imgs_per_burst.')
            if isinstance(max_pre_imgs_per_burst, int) and max_pre_imgs_per_burst < v:
                raise ValueError('max_pre_imgs_per_burst must be greater than or equal to min_pre_imgs_per_burst.')

        return v
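
A brief sketch of how these validators behave, with illustrative parameter values; the broadcasting follows the multi_window branches above:

from dist_s1_enumerator.param_models import LookbackStrategyParams

# Immediate lookback: scalar parameters, delta_lookback_days pinned to 0.
params = LookbackStrategyParams(
    lookback_strategy='immediate_lookback',
    max_pre_imgs_per_burst=10,
    delta_lookback_days=0,
    min_pre_imgs_per_burst=2,
    delta_window_days=365,
)

# Multi-window lookback: a scalar max_pre_imgs_per_burst is broadcast to a
# 3-tuple, and a scalar delta_lookback_days is scaled per window.
params = LookbackStrategyParams(
    lookback_strategy='multi_window',
    max_pre_imgs_per_burst=5,
    delta_lookback_days=365,
    min_pre_imgs_per_burst=2,
    delta_window_days=60,
)
assert params.max_pre_imgs_per_burst == (5, 5, 5)
assert params.delta_lookback_days == (365, 730, 1095)
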
dist_s1_enumerator/py.typed
File without changes
dist_s1_enumerator/rtc_s1_io.py
@@ -0,0 +1,142 @@
import concurrent.futures
from pathlib import Path

import geopandas as gpd
import requests
from pandera.pandas import check_input
from rasterio.errors import RasterioIOError
from requests.exceptions import HTTPError, RequestException, Timeout
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
from tqdm.auto import tqdm

from dist_s1_enumerator.tabular_models import rtc_s1_schema


def generate_rtc_s1_local_paths(
    urls: list[str], data_dir: Path | str, track_tokens: list[str], date_tokens: list[str], mgrs_tokens: list[str]
) -> list[Path]:
    data_dir = Path(data_dir)
    data_dir.mkdir(parents=True, exist_ok=True)

    n = len(urls)
    bad_data = [
        (input_name, len(data))
        for (input_name, data) in zip(
            ['urls', 'date_tokens', 'mgrs_tokens', 'track_tokens'], [urls, date_tokens, mgrs_tokens, track_tokens]
        )
        if len(data) != n
    ]
    if bad_data:
        raise ValueError(f'Number of {bad_data[0][0]} (which is {bad_data[0][1]}) must match the number of URLs ({n}).')

    dst_dirs = [
        data_dir / mgrs_token / track_token / date_token
        for (mgrs_token, track_token, date_token) in zip(mgrs_tokens, track_tokens, date_tokens)
    ]
    for dst_dir in dst_dirs:
        dst_dir.mkdir(parents=True, exist_ok=True)

    local_paths = [dst_dir / url.split('/')[-1] for (dst_dir, url) in zip(dst_dirs, urls)]
    return local_paths


def append_local_paths(df_rtc_ts: gpd.GeoDataFrame, data_dir: Path | str) -> gpd.GeoDataFrame:
    copol_urls = df_rtc_ts['url_copol'].tolist()
    crosspol_urls = df_rtc_ts['url_crosspol'].tolist()
    track_tokens = df_rtc_ts['track_token'].tolist()
    date_tokens = df_rtc_ts['acq_date_for_mgrs_pass'].tolist()
    mgrs_tokens = df_rtc_ts['mgrs_tile_id'].tolist()

    out_paths_copol = generate_rtc_s1_local_paths(copol_urls, data_dir, track_tokens, date_tokens, mgrs_tokens)
    out_paths_crosspol = generate_rtc_s1_local_paths(crosspol_urls, data_dir, track_tokens, date_tokens, mgrs_tokens)
    df_out = df_rtc_ts.copy()
    df_out['loc_path_copol'] = out_paths_copol
    df_out['loc_path_crosspol'] = out_paths_crosspol
    return df_out


def create_download_session(max_workers: int = 5) -> requests.Session:
    """Create a requests session with appropriate settings for downloads.

    Args:
        max_workers: Number of concurrent download threads (used to size the connection pool).
    """
    session = requests.Session()
    session.headers.update({'User-Agent': 'dist-s1-enumerator/1.0'})

    # Size the connection pool based on the number of concurrent workers.
    pool_maxsize = max(max_workers * 2, 10)
    pool_maxsize = min(pool_maxsize, 50)

    adapter = requests.adapters.HTTPAdapter(
        pool_connections=10,
        pool_maxsize=pool_maxsize,
        max_retries=0,  # retries are handled with tenacity
    )
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


@retry(
    retry=retry_if_exception_type((ConnectionError, HTTPError, RasterioIOError, Timeout, RequestException)),
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    reraise=True,
)
def localize_one_rtc(url: str, out_path: Path, session: requests.Session | None = None) -> Path:
    """Download a single RTC file with retry logic."""
    if out_path.exists():
        return out_path

    if session is None:
        session = create_download_session()

    try:
        with session.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            out_path.parent.mkdir(parents=True, exist_ok=True)
            with out_path.open('wb') as f:
                for chunk in r.iter_content(chunk_size=16384):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
    except Exception:
        # Clean up the partial file on failure.
        if out_path.exists():
            out_path.unlink()
        raise
    return out_path


@check_input(rtc_s1_schema, 0)
def localize_rtc_s1_ts(
    df_rtc_ts: gpd.GeoDataFrame,
    data_dir: Path | str,
    max_workers: int = 5,
    tqdm_enabled: bool = True,
) -> gpd.GeoDataFrame:
    df_out = append_local_paths(df_rtc_ts, data_dir)
    urls = df_out['url_copol'].tolist() + df_out['url_crosspol'].tolist()
    out_paths = df_out['loc_path_copol'].tolist() + df_out['loc_path_crosspol'].tolist()

    # Create a shared session for connection pooling, sized for the concurrent workers.
    session = create_download_session(max_workers)

    def localize_one_rtc_with_session(data: tuple[str, Path]) -> Path:
        url, out_path = data
        return localize_one_rtc(url, out_path, session)

    disable_tqdm = not tqdm_enabled
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        _ = list(
            tqdm(
                executor.map(localize_one_rtc_with_session, zip(urls, out_paths)),
                total=len(urls),
                disable=disable_tqdm,
                desc='Downloading RTC-S1 burst data',
                dynamic_ncols=True,
            )
        )
    # Cast paths to strings for serialization.
    df_out['loc_path_copol'] = df_out['loc_path_copol'].astype(str)
    df_out['loc_path_crosspol'] = df_out['loc_path_crosspol'].astype(str)
    return df_out
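
A hedged sketch of downloading a single file with the retry-wrapped helper above; the URL and output path are placeholders, since real URLs come from the RTC-S1 metadata tables assembled in asf.py:

from pathlib import Path

from dist_s1_enumerator.rtc_s1_io import create_download_session, localize_one_rtc

url = 'https://example.com/OPERA_L2_RTC-S1_placeholder_VV.tif'  # placeholder
out_path = Path('rtc_data') / 'placeholder_VV.tif'

# Reuse one pooled session across downloads; retries are handled by tenacity.
session = create_download_session(max_workers=5)
local_file = localize_one_rtc(url, out_path, session)

# For a full time series table satisfying rtc_s1_schema:
# df_local = localize_rtc_s1_ts(df_rtc_ts, 'rtc_data', max_workers=5)
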
dist_s1_enumerator/tabular_models.py
@@ -0,0 +1,91 @@
import geopandas as gpd
from pandera.engines.pandas_engine import DateTime
from pandera.pandas import Column, DataFrameSchema


burst_schema = DataFrameSchema(
    {
        'jpl_burst_id': Column(str, required=True),
        'track_number': Column(int, required=False),
        'acq_group_id_within_mgrs_tile': Column(int, required=False),
        'mgrs_tile_id': Column(str, required=False),
        'geometry': Column('geometry', required=True),
    }
)

mgrs_tile_schema = DataFrameSchema(
    {
        'mgrs_tile_id': Column(str, required=True),
        'utm_epsg': Column(int, required=True),
        'utm_wkt': Column(str, required=True),
        'geometry': Column('geometry', required=True),
    }
)

# Response schema from the ASF DAAC API
rtc_s1_resp_schema = DataFrameSchema(
    {
        'opera_id': Column(str, required=True),
        'jpl_burst_id': Column(str, required=True),
        'acq_dt': Column(DateTime(tz='UTC'), required=True),
        'acq_date_for_mgrs_pass': Column(str, required=False),
        'polarizations': Column(str, required=True),
        'track_number': Column(int, required=True),
        # Integer number of 6-day periods since 2014-01-01
        'pass_id': Column(int, required=True),
        'url_crosspol': Column(str, required=True),
        'url_copol': Column(str, required=True),
        'geometry': Column('geometry', required=True),
    }
)

# Schema for RTC-S1 metadata with the MGRS tile and acquisition group id appended.
# Note: a single burst product may be associated with multiple MGRS tiles and acq group ids.
rtc_s1_schema = rtc_s1_resp_schema.add_columns(
    {
        'mgrs_tile_id': Column(str, required=True),
        'acq_group_id_within_mgrs_tile': Column(int, required=True),
        'track_token': Column(str, required=True),
        'geometry': Column('geometry', required=True),
    }
)

# Schema for inputs to the dist-s1 workflow
dist_s1_input_schema = rtc_s1_schema.add_columns(
    {
        'input_category': Column(str, required=True),
        'product_id': Column(int, required=False),
        'geometry': Column('geometry', required=True),
    }
)

# Schema for localized inputs
dist_s1_loc_input_schema = dist_s1_input_schema.add_columns(
    {
        'loc_path_copol': Column(str, required=True),
        'loc_path_crosspol': Column(str, required=True),
        'geometry': Column('geometry', required=True),
    }
)

burst_mgrs_lut_schema = DataFrameSchema(
    {
        'jpl_burst_id': Column(str, required=True),
        'mgrs_tile_id': Column(str, required=True),
        'track_number': Column(int, required=True),
        'acq_group_id_within_mgrs_tile': Column(int, required=True),
        'orbit_pass': Column(str, required=True),
        'area_per_acq_group_km2': Column(int, required=True),
        'n_bursts_per_acq_group': Column(int, required=True),
    }
)


def reorder_columns(df: gpd.GeoDataFrame, schema: DataFrameSchema) -> gpd.GeoDataFrame:
    if not df.empty:
        df = df[[col for col in schema.columns.keys() if col in df.columns]]
    else:
        cols = list(schema.columns.keys())
        geometry_col = 'geometry' if 'geometry' in cols else None
        df = gpd.GeoDataFrame(columns=cols, geometry=geometry_col)
    if 'geometry' in schema.columns.keys():
        # set_crs returns a new frame; assign it back (the original call discarded the result).
        df = df.set_crs(epsg=4326, allow_override=True)
    return df