PyPI - geoai-py - Versions diffs - 0.4.1__py2.py3-none-any.whl → 0.4.2__py2.py3-none-any.whl - Mend

geoai-py 0.4.1__py2.py3-none-any.whl → 0.4.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

geoai/__init__.py +1 -1
geoai/download.py +644 -0
geoai/extract.py +518 -0
geoai/geoai.py +9 -1
geoai/train.py +98 -12
geoai/utils.py +240 -8
{geoai_py-0.4.1.dist-info → geoai_py-0.4.2.dist-info}/METADATA +6 -6
geoai_py-0.4.2.dist-info/RECORD +15 -0
{geoai_py-0.4.1.dist-info → geoai_py-0.4.2.dist-info}/WHEEL +1 -1
geoai_py-0.4.1.dist-info/RECORD +0 -15
{geoai_py-0.4.1.dist-info → geoai_py-0.4.2.dist-info}/entry_points.txt +0 -0
{geoai_py-0.4.1.dist-info → geoai_py-0.4.2.dist-info/licenses}/LICENSE +0 -0
{geoai_py-0.4.1.dist-info → geoai_py-0.4.2.dist-info}/top_level.txt +0 -0

geoai/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@
 __author__ = """Qiusheng Wu"""
 __email__ = "giswqs@gmail.com"
-__version__ = "0.4.1"
+__version__ = "0.4.2"
 import os

geoai/download.py CHANGED Viewed

@@ -8,9 +8,11 @@ from typing import Any, Dict, List, Optional, Tuple
 import geopandas as gpd
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
 import planetary_computer as pc
 import requests
 import rioxarray
+import xarray as xr
 from pystac_client import Client
 from shapely.geometry import box
 from tqdm import tqdm
@@ -394,3 +396,645 @@ def extract_building_stats(geojson_file: str) -> Dict[str, Any]:
     except Exception as e:
         logger.error(f"Error extracting statistics: {str(e)}")
         return {"error": str(e)}
+def download_pc_stac_item(
+    item_url,
+    bands=None,
+    output_dir=None,
+    show_progress=True,
+    merge_bands=False,
+    merged_filename=None,
+    overwrite=False,
+    cell_size=None,
+):
+    """
+    Downloads a STAC item from Microsoft Planetary Computer with specified bands.
+    This function fetches a STAC item by URL, signs the assets using Planetary Computer
+    credentials, and downloads the specified bands with a progress bar. Can optionally
+    merge bands into a single multi-band GeoTIFF.
+    Args:
+        item_url (str): The URL of the STAC item to download.
+        bands (list, optional): List of specific bands to download (e.g., ['B01', 'B02']).
+                               If None, all available bands will be downloaded.
+        output_dir (str, optional): Directory to save downloaded bands. If None,
+                                   bands are returned as xarray DataArrays.
+        show_progress (bool, optional): Whether to display a progress bar. Default is True.
+        merge_bands (bool, optional): Whether to merge downloaded bands into a single
+                                     multi-band GeoTIFF file. Default is False.
+        merged_filename (str, optional): Filename for the merged bands. If None and
+                                        merge_bands is True, uses "{item_id}_merged.tif".
+        overwrite (bool, optional): Whether to overwrite existing files. Default is False.
+        cell_size (float, optional): Resolution in meters for the merged output. If None,
+                                    uses the resolution of the first band.
+    Returns:
+        dict: Dictionary mapping band names to their corresponding xarray DataArrays
+              or file paths if output_dir is provided. If merge_bands is True, also
+              includes a 'merged' key with the path to the merged file.
+    Raises:
+        ValueError: If the item cannot be retrieved or a requested band is not available.
+    """
+    from rasterio.enums import Resampling
+    # Get the item ID from the URL
+    item_id = item_url.split("/")[-1]
+    collection = item_url.split("/collections/")[1].split("/items/")[0]
+    # Connect to the Planetary Computer STAC API
+    catalog = Client.open(
+        "https://planetarycomputer.microsoft.com/api/stac/v1",
+        modifier=pc.sign_inplace,
+    )
+    # Search for the specific item
+    search = catalog.search(collections=[collection], ids=[item_id])
+    # Get the first item from the search results
+    items = list(search.get_items())
+    if not items:
+        raise ValueError(f"Item with ID {item_id} not found")
+    item = items[0]
+    # Determine which bands to download
+    available_assets = list(item.assets.keys())
+    if bands is None:
+        # If no bands specified, download all band assets
+        bands_to_download = [
+            asset for asset in available_assets if asset.startswith("B")
+        ]
+    else:
+        # Verify all requested bands exist
+        missing_bands = [band for band in bands if band not in available_assets]
+        if missing_bands:
+            raise ValueError(
+                f"The following bands are not available: {missing_bands}. "
+                f"Available assets are: {available_assets}"
+            )
+        bands_to_download = bands
+    # Create output directory if specified and doesn't exist
+    if output_dir and not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    result = {}
+    band_data_arrays = []
+    resampled_arrays = []
+    band_names = []  # Track band names in order
+    # Set up progress bar
+    progress_iter = (
+        tqdm(bands_to_download, desc="Downloading bands")
+        if show_progress
+        else bands_to_download
+    )
+    # Download each requested band
+    for band in progress_iter:
+        if band not in item.assets:
+            if show_progress and not isinstance(progress_iter, list):
+                progress_iter.write(
+                    f"Warning: Band {band} not found in assets, skipping."
+                )
+            continue
+        band_url = item.assets[band].href
+        if output_dir:
+            file_path = os.path.join(output_dir, f"{item.id}_{band}.tif")
+            # Check if file exists and skip if overwrite is False
+            if os.path.exists(file_path) and not overwrite:
+                if show_progress and not isinstance(progress_iter, list):
+                    progress_iter.write(
+                        f"File {file_path} already exists, skipping (use overwrite=True to force download)."
+                    )
+                # Still need to open the file to get the data for merging
+                if merge_bands:
+                    band_data = rioxarray.open_rasterio(file_path)
+                    band_data_arrays.append((band, band_data))
+                    band_names.append(band)
+                result[band] = file_path
+                continue
+        if show_progress and not isinstance(progress_iter, list):
+            progress_iter.set_description(f"Downloading {band}")
+        band_data = rioxarray.open_rasterio(band_url)
+        # Store the data array for potential merging later
+        if merge_bands:
+            band_data_arrays.append((band, band_data))
+            band_names.append(band)
+        if output_dir:
+            file_path = os.path.join(output_dir, f"{item.id}_{band}.tif")
+            band_data.rio.to_raster(file_path)
+            result[band] = file_path
+        else:
+            result[band] = band_data
+    # Merge bands if requested
+    if merge_bands and output_dir:
+        if merged_filename is None:
+            merged_filename = f"{item.id}_merged.tif"
+        merged_path = os.path.join(output_dir, merged_filename)
+        # Check if merged file exists and skip if overwrite is False
+        if os.path.exists(merged_path) and not overwrite:
+            if show_progress:
+                print(
+                    f"Merged file {merged_path} already exists, skipping (use overwrite=True to force creation)."
+                )
+            result["merged"] = merged_path
+        else:
+            if show_progress:
+                print("Resampling and merging bands...")
+            # Determine target cell size if not provided
+            if cell_size is None and band_data_arrays:
+                # Use the resolution of the first band (usually 10m for B02, B03, B04, B08)
+                # Get the affine transform (containing resolution info)
+                first_band_data = band_data_arrays[0][1]
+                # Extract resolution from transform
+                cell_size = abs(first_band_data.rio.transform()[0])
+                if show_progress:
+                    print(f"Using detected resolution: {cell_size}m")
+            elif cell_size is None:
+                # Default to 10m if no bands are available
+                cell_size = 10
+                if show_progress:
+                    print(f"Using default resolution: {cell_size}m")
+            # Process bands in memory-efficient way
+            for i, (band_name, data_array) in enumerate(band_data_arrays):
+                if show_progress:
+                    print(f"Processing band: {band_name}")
+                # Get current resolution
+                current_res = abs(data_array.rio.transform()[0])
+                # Resample if needed
+                if (
+                    abs(current_res - cell_size) > 0.01
+                ):  # Small tolerance for floating point comparison
+                    if show_progress:
+                        print(
+                            f"Resampling {band_name} from {current_res}m to {cell_size}m"
+                        )
+                    # Use bilinear for downsampling (higher to lower resolution)
+                    # Use nearest for upsampling (lower to higher resolution)
+                    resampling_method = (
+                        Resampling.bilinear
+                        if current_res < cell_size
+                        else Resampling.nearest
+                    )
+                    resampled = data_array.rio.reproject(
+                        data_array.rio.crs,
+                        resolution=(cell_size, cell_size),
+                        resampling=resampling_method,
+                    )
+                    resampled_arrays.append(resampled)
+                else:
+                    resampled_arrays.append(data_array)
+            if show_progress:
+                print("Stacking bands...")
+            # Concatenate all resampled arrays along the band dimension
+            try:
+                merged_data = xr.concat(resampled_arrays, dim="band")
+                if show_progress:
+                    print(f"Writing merged data to {merged_path}...")
+                # Add description metadata
+                merged_data.attrs["description"] = (
+                    f"Multi-band image containing {', '.join(band_names)}"
+                )
+                # Create a dictionary mapping band indices to band names
+                band_descriptions = {}
+                for i, name in enumerate(band_names):
+                    band_descriptions[i + 1] = name
+                # Write the merged data to file with band descriptions
+                merged_data.rio.to_raster(
+                    merged_path,
+                    tags={"BAND_NAMES": ",".join(band_names)},
+                    descriptions=band_names,
+                )
+                result["merged"] = merged_path
+                if show_progress:
+                    print(f"Merged bands saved to: {merged_path}")
+                    print(f"Band order in merged file: {', '.join(band_names)}")
+            except Exception as e:
+                if show_progress:
+                    print(f"Error during merging: {str(e)}")
+                    print(f"Error details: {type(e).__name__}: {str(e)}")
+                raise
+    return result
+def pc_collection_list(
+    endpoint="https://planetarycomputer.microsoft.com/api/stac/v1",
+    detailed=False,
+    filter_by=None,
+    sort_by="id",
+):
+    """
+    Retrieves and displays the list of available collections from Planetary Computer.
+    This function connects to the Planetary Computer STAC API and retrieves the
+    list of all available collections, with options to filter and sort the results.
+    Args:
+        endpoint (str, optional): STAC API endpoint URL.
+            Defaults to "https://planetarycomputer.microsoft.com/api/stac/v1".
+        detailed (bool, optional): Whether to return detailed information for each
+            collection. If False, returns only basic info. Defaults to False.
+        filter_by (dict, optional): Dictionary of field:value pairs to filter
+            collections. For example, {"license": "CC-BY-4.0"}. Defaults to None.
+        sort_by (str, optional): Field to sort the collections by.
+            Defaults to "id".
+    Returns:
+        pandas.DataFrame: DataFrame containing collection information.
+    Raises:
+        ConnectionError: If there's an issue connecting to the API.
+    """
+    # Initialize the STAC client
+    try:
+        catalog = Client.open(endpoint)
+    except Exception as e:
+        raise ConnectionError(f"Failed to connect to STAC API at {endpoint}: {str(e)}")
+    # Get all collections
+    try:
+        collections = list(catalog.get_collections())
+    except Exception as e:
+        raise Exception(f"Error retrieving collections: {str(e)}")
+    # Basic info to extract from all collections
+    collection_info = []
+    # Extract information based on detail level
+    for collection in collections:
+        # Basic information always included
+        info = {
+            "id": collection.id,
+            "title": collection.title or "No title",
+            "description": (
+                collection.description[:100] + "..."
+                if collection.description and len(collection.description) > 100
+                else collection.description
+            ),
+        }
+        # Add detailed information if requested
+        if detailed:
+            # Get temporal extent if available
+            temporal_extent = "Unknown"
+            if collection.extent and collection.extent.temporal:
+                interval = (
+                    collection.extent.temporal.intervals[0]
+                    if collection.extent.temporal.intervals
+                    else None
+                )
+                if interval:
+                    start = interval[0] or "Unknown Start"
+                    end = interval[1] or "Present"
+                    if isinstance(start, datetime.datetime):
+                        start = start.strftime("%Y-%m-%d")
+                    if isinstance(end, datetime.datetime):
+                        end = end.strftime("%Y-%m-%d")
+                    temporal_extent = f"{start} to {end}"
+            # Add additional details
+            info.update(
+                {
+                    "license": collection.license or "Unknown",
+                    "keywords": (
+                        ", ".join(collection.keywords)
+                        if collection.keywords
+                        else "None"
+                    ),
+                    "temporal_extent": temporal_extent,
+                    "asset_count": len(collection.assets) if collection.assets else 0,
+                    "providers": (
+                        ", ".join([p.name for p in collection.providers])
+                        if collection.providers
+                        else "Unknown"
+                    ),
+                }
+            )
+            # Add spatial extent if available
+            if collection.extent and collection.extent.spatial:
+                info["bbox"] = (
+                    str(collection.extent.spatial.bboxes[0])
+                    if collection.extent.spatial.bboxes
+                    else "Unknown"
+                )
+        collection_info.append(info)
+    # Convert to DataFrame for easier filtering and sorting
+    df = pd.DataFrame(collection_info)
+    # Apply filtering if specified
+    if filter_by:
+        for field, value in filter_by.items():
+            if field in df.columns:
+                df = df[df[field].astype(str).str.contains(value, case=False, na=False)]
+    # Apply sorting
+    if sort_by in df.columns:
+        df = df.sort_values(by=sort_by)
+    print(f"Retrieved {len(df)} collections from Planetary Computer")
+    # # Print a nicely formatted table
+    # if not df.empty:
+    #     print("\nAvailable collections:")
+    #     print(tabulate(df, headers="keys", tablefmt="grid", showindex=False))
+    return df
+def pc_stac_search(
+    collection,
+    bbox=None,
+    time_range=None,
+    query=None,
+    limit=10,
+    max_items=None,
+    endpoint="https://planetarycomputer.microsoft.com/api/stac/v1",
+):
+    """
+    Search for STAC items in the Planetary Computer catalog.
+    This function queries the Planetary Computer STAC API to find items matching
+    the specified criteria, including collection, bounding box, time range, and
+    additional query parameters.
+    Args:
+        collection (str): The STAC collection ID to search within.
+        bbox (list, optional): Bounding box coordinates [west, south, east, north].
+            Defaults to None.
+        time_range (str or tuple, optional): Time range as a string "start/end" or
+            a tuple of (start, end) datetime objects. Defaults to None.
+        query (dict, optional): Additional query parameters for filtering.
+            Defaults to None.
+        limit (int, optional): Number of items to return per page. Defaults to 10.
+        max_items (int, optional): Maximum total number of items to return.
+            Defaults to None (returns all matching items).
+        endpoint (str, optional): STAC API endpoint URL.
+            Defaults to "https://planetarycomputer.microsoft.com/api/stac/v1".
+    Returns:
+        list: List of STAC Item objects matching the search criteria.
+    Raises:
+        ValueError: If invalid parameters are provided.
+        ConnectionError: If there's an issue connecting to the API.
+    """
+    import datetime
+    # Initialize the STAC client
+    try:
+        catalog = Client.open(endpoint)
+    except Exception as e:
+        raise ConnectionError(f"Failed to connect to STAC API at {endpoint}: {str(e)}")
+    # Process time_range if provided
+    if time_range:
+        if isinstance(time_range, tuple) and len(time_range) == 2:
+            # Convert datetime objects to ISO format strings
+            start, end = time_range
+            if isinstance(start, datetime.datetime):
+                start = start.isoformat()
+            if isinstance(end, datetime.datetime):
+                end = end.isoformat()
+            time_str = f"{start}/{end}"
+        elif isinstance(time_range, str):
+            time_str = time_range
+        else:
+            raise ValueError(
+                "time_range must be a 'start/end' string or tuple of (start, end)"
+            )
+    else:
+        time_str = None
+    # Create the search object
+    search = catalog.search(
+        collections=[collection], bbox=bbox, datetime=time_str, query=query, limit=limit
+    )
+    # Collect the items
+    items = []
+    try:
+        # Use max_items if specified, otherwise get all items
+        if max_items:
+            items_gen = search.get_items()
+            for item in items_gen:
+                items.append(item)
+                if len(items) >= max_items:
+                    break
+        else:
+            items = list(search.get_items())
+    except Exception as e:
+        raise Exception(f"Error retrieving search results: {str(e)}")
+    print(f"Found {len(items)} items matching search criteria")
+    return items
+def pc_stac_download(
+    items,
+    output_dir=".",
+    asset_keys=None,
+    max_workers=4,
+    skip_existing=True,
+    sign_urls=True,
+):
+    """
+    Download assets from STAC items retrieved from the Planetary Computer.
+    This function downloads specified assets from a list of STAC items to the
+    specified output directory. It supports parallel downloads and can skip
+    already downloaded files.
+    Args:
+        items (list or pystac.Item): STAC Item object or list of STAC Item objects.
+        output_dir (str, optional): Directory where assets will be saved.
+            Defaults to current directory.
+        asset_keys (list, optional): List of asset keys to download. If None,
+            downloads all available assets. Defaults to None.
+        max_workers (int, optional): Maximum number of concurrent download threads.
+            Defaults to 4.
+        skip_existing (bool, optional): Skip download if the file already exists.
+            Defaults to True.
+        sign_urls (bool, optional): Whether to sign URLs for authenticated access.
+            Defaults to True.
+    Returns:
+        dict: Dictionary mapping STAC item IDs to dictionaries of their downloaded
+            assets {asset_key: file_path}.
+    Raises:
+        TypeError: If items is not a STAC Item or list of STAC Items.
+        IOError: If there's an error writing the downloaded assets to disk.
+    """
+    import pystac
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    # Handle single item case
+    if isinstance(items, pystac.Item):
+        items = [items]
+    elif not isinstance(items, list):
+        raise TypeError("items must be a STAC Item or list of STAC Items")
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+    # Function to sign URLs if needed
+    def get_signed_url(href):
+        if not sign_urls:
+            return href
+        # Planetary Computer typically requires signing URLs for accessing data
+        # Check if the URL is from Microsoft Planetary Computer
+        if "planetarycomputer" in href:
+            try:
+                sign_url = "https://planetarycomputer.microsoft.com/api/sas/v1/sign"
+                response = requests.get(sign_url, params={"href": href})
+                response.raise_for_status()
+                return response.json().get("href", href)
+            except Exception as e:
+                print(f"Warning: Failed to sign URL {href}: {str(e)}")
+                return href
+        return href
+    # Function to download a single asset
+    def download_asset(item, asset_key, asset):
+        item_id = item.id
+        # Get the asset URL and sign it if needed
+        asset_url = get_signed_url(asset.href)
+        # Determine output filename
+        if asset.media_type:
+            # Use appropriate file extension based on media type
+            if "tiff" in asset.media_type or "geotiff" in asset.media_type:
+                ext = ".tif"
+            elif "jpeg" in asset.media_type:
+                ext = ".jpg"
+            elif "png" in asset.media_type:
+                ext = ".png"
+            elif "json" in asset.media_type:
+                ext = ".json"
+            else:
+                # Default extension based on the original URL
+                ext = os.path.splitext(asset_url.split("?")[0])[1] or ".data"
+        else:
+            # Default extension based on the original URL
+            ext = os.path.splitext(asset_url.split("?")[0])[1] or ".data"
+        output_path = os.path.join(output_dir, f"{item_id}_{asset_key}{ext}")
+        # Skip if file exists and skip_existing is True
+        if skip_existing and os.path.exists(output_path):
+            print(f"Skipping existing asset: {asset_key} -> {output_path}")
+            return asset_key, output_path
+        try:
+            # Download the asset with progress bar
+            with requests.get(asset_url, stream=True) as r:
+                r.raise_for_status()
+                total_size = int(r.headers.get("content-length", 0))
+                with open(output_path, "wb") as f:
+                    with tqdm(
+                        total=total_size,
+                        unit="B",
+                        unit_scale=True,
+                        unit_divisor=1024,
+                        desc=f"Downloading {item_id}_{asset_key}",
+                        ncols=100,
+                    ) as pbar:
+                        for chunk in r.iter_content(chunk_size=8192):
+                            f.write(chunk)
+                            pbar.update(len(chunk))
+            return asset_key, output_path
+        except Exception as e:
+            print(f"Error downloading {asset_key} for item {item_id}: {str(e)}")
+            if os.path.exists(output_path):
+                os.remove(output_path)  # Clean up partial download
+            return asset_key, None
+    # Process all items and their assets
+    results = {}
+    for item in items:
+        item_assets = {}
+        item_id = item.id
+        print(f"Processing STAC item: {item_id}")
+        # Determine which assets to download
+        if asset_keys:
+            assets_to_download = {
+                k: v for k, v in item.assets.items() if k in asset_keys
+            }
+            if not assets_to_download:
+                print(
+                    f"Warning: None of the specified asset keys {asset_keys} found in item {item_id}"
+                )
+                print(f"Available asset keys: {list(item.assets.keys())}")
+                continue
+        else:
+            assets_to_download = item.assets
+        # Download assets concurrently
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all download tasks
+            future_to_asset = {
+                executor.submit(download_asset, item, asset_key, asset): (
+                    asset_key,
+                    asset,
+                )
+                for asset_key, asset in assets_to_download.items()
+            }
+            # Process results as they complete
+            for future in as_completed(future_to_asset):
+                asset_key, asset = future_to_asset[future]
+                try:
+                    key, path = future.result()
+                    if path:
+                        item_assets[key] = path
+                except Exception as e:
+                    print(
+                        f"Error processing asset {asset_key} for item {item_id}: {str(e)}"
+                    )
+        results[item_id] = item_assets
+    # Count total downloaded assets
+    total_assets = sum(len(assets) for assets in results.values())
+    print(f"\nDownloaded {total_assets} assets for {len(results)} items")
+    return results

geoai-py 0.4.1__py2.py3-none-any.whl → 0.4.2__py2.py3-none-any.whl

geoai-py 0.4.1__py2.py3-none-any.whl → 0.4.2__py2.py3-none-any.whl