eoml-0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eoml/__init__.py +74 -0
- eoml/automation/__init__.py +7 -0
- eoml/automation/configuration.py +105 -0
- eoml/automation/dag.py +233 -0
- eoml/automation/experience.py +618 -0
- eoml/automation/tasks.py +825 -0
- eoml/bin/__init__.py +6 -0
- eoml/bin/clean_checkpoint.py +146 -0
- eoml/bin/land_cover_mapping_toml.py +435 -0
- eoml/bin/mosaic_images.py +137 -0
- eoml/data/__init__.py +7 -0
- eoml/data/basic_geo_data.py +214 -0
- eoml/data/dataset_utils.py +98 -0
- eoml/data/persistence/__init__.py +7 -0
- eoml/data/persistence/generic.py +253 -0
- eoml/data/persistence/lmdb.py +379 -0
- eoml/data/persistence/serializer.py +82 -0
- eoml/raster/__init__.py +7 -0
- eoml/raster/band.py +141 -0
- eoml/raster/dataset/__init__.py +6 -0
- eoml/raster/dataset/extractor.py +604 -0
- eoml/raster/raster_reader.py +602 -0
- eoml/raster/raster_utils.py +116 -0
- eoml/torch/__init__.py +7 -0
- eoml/torch/cnn/__init__.py +7 -0
- eoml/torch/cnn/augmentation.py +150 -0
- eoml/torch/cnn/dataset_evaluator.py +68 -0
- eoml/torch/cnn/db_dataset.py +605 -0
- eoml/torch/cnn/map_dataset.py +579 -0
- eoml/torch/cnn/map_dataset_const_mem.py +135 -0
- eoml/torch/cnn/outputs_transformer.py +130 -0
- eoml/torch/cnn/torch_utils.py +404 -0
- eoml/torch/cnn/training_dataset.py +241 -0
- eoml/torch/cnn/windows_dataset.py +120 -0
- eoml/torch/dataset/__init__.py +6 -0
- eoml/torch/dataset/shade_dataset_tester.py +46 -0
- eoml/torch/dataset/shade_tree_dataset_creators.py +537 -0
- eoml/torch/model_low_use.py +507 -0
- eoml/torch/models.py +282 -0
- eoml/torch/resnet.py +437 -0
- eoml/torch/sample_statistic.py +260 -0
- eoml/torch/trainer.py +782 -0
- eoml/torch/trainer_v2.py +253 -0
- eoml-0.9.0.dist-info/METADATA +93 -0
- eoml-0.9.0.dist-info/RECORD +47 -0
- eoml-0.9.0.dist-info/WHEEL +4 -0
- eoml-0.9.0.dist-info/entry_points.txt +3 -0
eoml/bin/clean_checkpoint.py
ADDED
@@ -0,0 +1,146 @@
"""
Checkpoint cleanup utility for EOML.

This command-line utility helps manage disk space by cleaning up old checkpoint
files from model training. It keeps only the N most recent files in each
subdirectory, removing older checkpoints to save space.

Usage:
    python clean_checkpoint.py <root_directory>

The script will:
1. Scan all subdirectories for files
2. Identify files sorted by modification time
3. Keep the 4 most recent files in each directory
4. Remove older files after user confirmation
5. Report disk space saved
"""

import os
import typer
from pathlib import Path
from datetime import datetime


def get_dir_size(path):
    """
    Calculate total size of directory in bytes.

    Recursively computes the total size of all files in a directory and its
    subdirectories.

    Args:
        path (str or Path): Path to the directory to measure.

    Returns:
        int: Total size in bytes.

    Examples:
        >>> size = get_dir_size("/path/to/checkpoints")
        >>> print(f"Directory size: {size / (1024**3):.2f} GB")
    """
    total = 0
    with os.scandir(path) as it:
        for entry in it:
            if entry.is_file():
                total += entry.stat().st_size
            elif entry.is_dir():
                total += get_dir_size(entry.path)
    return total


def get_files_by_time(directory):
    """
    Get list of files sorted by modification time.

    Retrieves all files in a directory and sorts them by modification time
    in descending order (most recent first).

    Args:
        directory (str or Path): Path to the directory to scan.

    Returns:
        list: List of os.DirEntry objects sorted by modification time (newest first).

    Examples:
        >>> files = get_files_by_time("/path/to/checkpoints")
        >>> most_recent = files[0]  # Most recently modified file
    """
    files = []
    with os.scandir(directory) as it:
        for entry in it:
            if entry.is_file():
                files.append(entry)
    return sorted(files, key=lambda x: x.stat().st_mtime, reverse=True)


app = typer.Typer()


@app.command()
def cleanup_folders(
    root_dir: Path = typer.Argument(
        ...,
        exists=True,
        dir_okay=True,
        file_okay=False,
        help="Root directory to clean up"
    )
):
    """
    Clean up folders by keeping only 4 most recent files.

    This command scans all subdirectories under root_dir and keeps only the
    4 most recently modified files in each directory, removing older files
    to save disk space.

    Args:
        root_dir (Path): Root directory containing subdirectories with checkpoint files.

    The function will:
    - Show a list of files to be removed
    - Ask for user confirmation before deleting
    - Report the amount of disk space saved
    - Display the number of files removed

    Examples:
        To clean up a checkpoints directory:
        $ python clean_checkpoint.py /path/to/checkpoints
    """
    root_path = Path(root_dir)
    initial_size = get_dir_size(root_path)

    files_to_remove = []

    # Collect files to remove
    for folder in root_path.glob('*/'):
        if folder.is_dir():
            files = get_files_by_time(folder)
            if len(files) > 4:
                files_to_remove.extend([f.path for f in files[4:]])

    if not files_to_remove:
        typer.echo("No files need to be removed.")
        return

    # Show files to be removed
    typer.echo(f"The following {len(files_to_remove)} files will be removed:")
    for file in files_to_remove:
        typer.echo(f"  {file}")

    # Ask for confirmation
    if typer.confirm('Do you want to proceed?'):
        for file in files_to_remove:
            os.remove(file)

        final_size = get_dir_size(root_path)
        saved = initial_size - final_size

        typer.echo(f"\nSpace saved: {saved / (1024 * 1024):.2f} MB")
        typer.echo(f"Number of files removed: {len(files_to_remove)}")
    else:
        typer.echo("Operation cancelled.")


if __name__ == '__main__':
    app()
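The keep-the-4-most-recent policy above can be previewed before anything is deleted. The sketch below is illustrative only and not part of the package: it reuses the same mtime-based ordering as get_files_by_time to list what cleanup_folders would remove, without deleting anything; the checkpoint path and the preview_cleanup helper are placeholders introduced here for illustration.

import os
from pathlib import Path


def preview_cleanup(root_dir, keep=4):
    """List the files the cleanup would remove, without deleting anything (dry run)."""
    to_remove = []
    for folder in Path(root_dir).glob('*/'):
        if folder.is_dir():
            # Same ordering as get_files_by_time: newest first by modification time
            entries = sorted(
                (e for e in os.scandir(folder) if e.is_file()),
                key=lambda e: e.stat().st_mtime,
                reverse=True,
            )
            to_remove.extend(e.path for e in entries[keep:])
    return to_remove


if __name__ == '__main__':
    # Placeholder path; point this at an actual checkpoint directory.
    for path in preview_cleanup("/path/to/checkpoints"):
        print(path)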
eoml/bin/land_cover_mapping_toml.py
ADDED
@@ -0,0 +1,435 @@
"""
Land cover mapping script using TOML configuration.

This script demonstrates how to run a complete land cover mapping workflow
using configuration loaded from a TOML file, leveraging the ExperienceInfo
configuration system.

Usage:
    python land_cover_mapping_toml.py <path_to_config.toml>

Example:
    python land_cover_mapping_toml.py ../example_experience_config.toml
"""
import logging
import os
import sys
import random
from datetime import datetime
from pathlib import Path

import torch
from rasterio.enums import Resampling
from torch import nn
from torch.optim import AdamW

from rasterop.tiled_op.operation.mapping import CountCategoryToBand, MaxCategory, MaxScore

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

from eoml.automation.experience import ExperienceInfo
from eoml.automation.tasks import (
    samples_split_setup,
    samples_k_fold_setup,
    extract_sample,
    train_and_map, tiled_task,
)
from eoml.torch.cnn.augmentation import RandomTransform, CropTransform


def run_land_cover_mapping(config_path: str):
    """
    Run the complete land cover mapping workflow from a TOML configuration.

    Args:
        config_path: Path to the TOML configuration file.
    """
    # ----------------------------------------------------------------------------------------------------------------
    # Setup
    # ----------------------------------------------------------------------------------------------------------------

    # For GPU support in multiple threads (also needed for mapping)
    torch.multiprocessing.set_start_method('spawn')

    logger.info(f"Loading configuration from: {config_path}")

    # ----------------------------------------------------------------------------------------------------------------
    # Load Configuration from TOML
    # ----------------------------------------------------------------------------------------------------------------

    # Load and validate the experience configuration
    experience = ExperienceInfo.from_toml(config_path)

    logger.info("Configuration loaded successfully!")
    logger.info(f"  GPS file: {experience.experiment.gps_file}")
    logger.info(f"  Model: {experience.experiment.model_name}")
    logger.info(f"  Extract size: {experience.experiment.extract_size}")
    logger.info(f"  Network size: {experience.experiment.size}")
    logger.info(f"  Epochs: {experience.experiment.epoch}")
    logger.info(f"  Batch multiplier: {experience.experiment.batch_mult}")
    logger.info(f"  N-fold: {experience.experiment.nfold}")

    # Extract runtime objects from experience
    raster_reader = experience.raster_reader
    mapper_full = experience.mapper
    nn_output_transformer = experience.nn_output_transformer
    system_config = experience.system_config

    # Extract configuration values for convenient access
    map_bounds = experience.boundaries.map_bounds
    map_mask = experience.boundaries.map_mask
    sample_mask = experience.boundaries.sample_mask
    gps_file = experience.experiment.gps_file
    extract_size = experience.experiment.extract_size
    size = experience.experiment.size
    class_label = experience.experiment.class_label
    model_name = experience.experiment.model_name
    batch_mult = experience.experiment.batch_mult
    batch_mult_map = experience.experiment.batch_mult_map
    epoch = experience.experiment.epoch
    map_tag_name = experience.experiment.map_tag_name
    nfold = experience.experiment.nfold

    # ----------------------------------------------------------------------------------------------------------------
    # Random Seed Configuration
    # ----------------------------------------------------------------------------------------------------------------

    # Initialize all random seeds (Python, NumPy, PyTorch) and set deterministic mode if configured
    seed_info = experience.experiment.initialize_seeds(verbose=True)

    # ----------------------------------------------------------------------------------------------------------------
    # Device Configuration
    # ----------------------------------------------------------------------------------------------------------------

    device = experience.experiment.get_device()
    map_mode = experience.experiment.get_map_mode()

    logger.info(f"  Device: {device} (mode: {map_mode})")

    # Log additional device info for multi-GPU setup
    if isinstance(experience.experiment.device, list):
        logger.info(f"  Available GPUs: {experience.experiment.device}")
        if torch.cuda.is_available():
            for gpu_id in experience.experiment.device:
                if gpu_id < torch.cuda.device_count():
                    logger.info(f"    - GPU {gpu_id}: {torch.cuda.get_device_name(gpu_id)}")
                else:
                    logger.warning(f"    - GPU {gpu_id}: Not available")

    # ----------------------------------------------------------------------------------------------------------------
    # File Path Management
    # ----------------------------------------------------------------------------------------------------------------

    gps_path = gps_file
    db_path = f"{system_config.data_dir}/land_cover/samples/{gps_file.stem}_lmdb_NaN_to_0_{extract_size}"

    # Training output paths
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    run_name = f"ch-{timestamp}"
    run_stats_dir = f"{system_config.data_dir}/land_cover/nn_run_stats"
    model_base_path = f"{system_config.data_dir}/land_cover/nn/{run_name}"

    logger.info(f"Output directory: {model_base_path}")

    # ----------------------------------------------------------------------------------------------------------------
    # Sample Extraction Configuration
    # ----------------------------------------------------------------------------------------------------------------

    extractor_param = {
        "gps_path": gps_path,
        "raster_reader": raster_reader,
        "db_path": db_path,
        "windows_size": extract_size,
        "label_name": class_label,
        "mask_path": sample_mask,
        "force_write": False
    }

    # ----------------------------------------------------------------------------------------------------------------
    # Sample Split Configuration (K-Fold or Simple Split)
    # ----------------------------------------------------------------------------------------------------------------

    # K-fold cross-validation (recommended)
    sample_param_kfold = {
        "methode": samples_k_fold_setup,
        "param": {
            "db_path": db_path,
            "mapper": mapper_full,
            "n_fold": nfold
        }
    }

    # Simple train/validation split (alternative)
    sample_param_split = {
        "methode": samples_split_setup,
        "param": {
            "db_path": db_path,
            "mapper": mapper_full,
            "split": [0.8, 0.2]
        }
    }

    # Use K-fold by default
    sample_param = sample_param_kfold

    # ----------------------------------------------------------------------------------------------------------------
    # Data Augmentation Configuration
    # ----------------------------------------------------------------------------------------------------------------

    augmentation_param = {
        "methode": "no_dep",
        "transform_train": RandomTransform(
            width=size,
            p_rot=0.90,
            p_flip=0.50,
            p_scale=0.4,
            p_shear=0.3,
            p_blur=0.3
        ),
        "transform_valid": CropTransform(size)
    }

    # ----------------------------------------------------------------------------------------------------------------
    # DataLoader Configuration
    # ----------------------------------------------------------------------------------------------------------------

    dataloader_parameter = {
        "batch_size": int(batch_mult * 1024),
        "num_worker": 5,
        "prefetch": 1,
        "device": device,
        "balance_sample": False,
        "persistent_workers": True
    }

    # ----------------------------------------------------------------------------------------------------------------
    # Neural Network Configuration
    # ----------------------------------------------------------------------------------------------------------------

    nn_parameter = {
        "in_size": size,
        "n_bands": raster_reader.n_band,
        "n_out": len(mapper_full)
    }

    model_parameter = {
        "model_name": model_name,
        "type": "normal",
        "path": None,
        "device": device,
        "nn_parameter": nn_parameter
    }

    # ----------------------------------------------------------------------------------------------------------------
    # Optimizer Configuration
    # ----------------------------------------------------------------------------------------------------------------

    optimizer_parameter = {
        "loss": nn.CrossEntropyLoss(),
        "optimizer": AdamW,
        "optimizer_parameter": {
            "lr": 1.5 * 0.018 * 1e-2,
            "weight_decay": 0.001 * 0.0020
        },
        "scheduler_mode": "cycle",
        "scheduler_parameter": {
            "max_lr": 0.0008
        }
    }

    # ----------------------------------------------------------------------------------------------------------------
    # Training Configuration
    # ----------------------------------------------------------------------------------------------------------------

    dataset_parameter = {
        "db_path": db_path,
        "mapper": mapper_full
    }

    train_nn_parameter = {
        "max_epochs": epoch,
        "run_stats_dir": run_stats_dir,
        "model_base_path": model_base_path,
        "model_tag": model_name,
        "grad_clip_value": 0.1,
        "device": device
    }

    train_parameter = {
        "sample_param": sample_param,
        "augmentation_param": augmentation_param,
        "dataset_parameter": dataset_parameter,
        "dataloader_parameter": dataloader_parameter,
        "model_parameter": model_parameter,
        "optimizer_parameter": optimizer_parameter,
        "train_nn_parameter": train_nn_parameter
    }

    # ----------------------------------------------------------------------------------------------------------------
    # Mapping Configuration
    # ----------------------------------------------------------------------------------------------------------------

    map_parameter = {
        "raster_reader": raster_reader,
        "windows_size": size,
        "batch_size": int(batch_mult_map * 1024),
        "map_tag": map_tag_name,
        "transformer": nn_output_transformer,
        "mask": map_mask,
        "bounds": map_bounds,
        "mode": map_mode,
        "num_worker": 7,
        "prefetch": 1
    }

    # Map modes:
    # 0 - Full CPU, no pinning
    # 1 - Pinned memory in loader, moved asynchronously to GPU (recommended for GPU)
    # 2 - Start CUDA in each thread, prepare samples directly on GPU
    #     (uses ~1GB per thread, requires torch.multiprocessing.set_start_method('spawn'))

    train_map_parameter = train_parameter.copy()
    train_map_parameter.update({"map_parameter": map_parameter})

    # ----------------------------------------------------------------------------------------------------------------
    # Execute Workflow
    # ----------------------------------------------------------------------------------------------------------------

    logger.info("=" * 80)
    logger.info("STARTING LAND COVER MAPPING WORKFLOW")
    logger.info("=" * 80)

    # Create output directory
    os.makedirs(model_base_path, exist_ok=True)

    # Save configuration log
    with open(f"{model_base_path}/log.txt", "w") as log:
        log.write(repr(train_map_parameter))

    # Copy TOML configuration to output directory for reference
    import shutil
    shutil.copy(config_path, f"{model_base_path}/config.toml")
    logger.info(f"Configuration saved to: {model_base_path}/config.toml")

    # Step 1: Extract samples from raster data
    logger.info("[1/4] Extracting samples from raster data...")
    extract_sample(**extractor_param)
    logger.info("✓ Sample extraction complete")

    # Step 2: Train model and generate maps
    logger.info("[2/4] Training model and generating maps...")
    maps = train_and_map(**train_map_parameter)
    logger.info(f"✓ Training complete, generated {len(maps)} maps")

    # ----------------------------------------------------------------------------------------------------------------
    # Post-Processing: Merge and Aggregate Maps
    # ----------------------------------------------------------------------------------------------------------------

    logger.info("[3/4] Post-processing maps...")

    raster_out_merge = f"{model_base_path}/01_{run_name}_merged.tif"
    raster_out_score = f"{model_base_path}/02_{run_name}_max_arg.tif"
    raster_out_score_max = f"{model_base_path}/02_{run_name}_max_score.tif"

    default_op_param = {
        "bounds": map_bounds,
        "res": None,
        "resampling": Resampling.nearest,
        "target_aligned_pixels": False,
        "indexes": None,
        "src_kwds": None,
        "dst_kwds": None,
        "num_workers": 8
    }

    # Count categories across all maps
    category_count_op = CountCategoryToBand(max(mapper_full.map_values()), dtype="int16")
    category_count_param = {
        "maps": maps,
        "raster_out": raster_out_merge,
        "operation": category_count_op
    }
    category_count_param.update(default_op_param)

    # Find maximum category (mode)
    category_max_op = MaxCategory()
    category_max_param = {
        "maps": [raster_out_merge],
        "raster_out": raster_out_score,
        "operation": category_max_op
    }
    category_max_param.update(default_op_param)

    # Find maximum score (confidence)
    category_max_score_op = MaxScore()
    category_max_score_param = {
        "maps": [raster_out_merge],
        "raster_out": raster_out_score_max,
        "operation": category_max_score_op
    }
    category_max_score_param.update(default_op_param)

    # Execute tiled operations
    logger.info(f"  - Merging {len(maps)} maps...")
    tiled_task(**category_count_param)
    logger.info("  - Computing maximum category...")
    tiled_task(**category_max_param)
    logger.info("  - Computing maximum score...")
    tiled_task(**category_max_score_param)

    logger.info("✓ Post-processing complete")

    # ----------------------------------------------------------------------------------------------------------------
    # Done
    # ----------------------------------------------------------------------------------------------------------------

    logger.info("[4/4] Workflow complete!")
    logger.info("=" * 80)
    logger.info("RESULTS")
    logger.info("=" * 80)
    logger.info(f"Output directory: {model_base_path}")
    logger.info(f"Maps generated: {len(maps)}")
    for i, map_path in enumerate(maps, 1):
        logger.info(f"  [{i}] {map_path}")
    logger.info(f"\nMerged output: {raster_out_merge}")
    logger.info(f"Category map: {raster_out_score}")
    logger.info(f"Confidence map: {raster_out_score_max}")
    logger.info("=" * 80)


def main():
    """Main entry point for the script."""
    if len(sys.argv) < 2:
        logger.error("Usage: python land_cover_mapping_toml.py <path_to_config.toml>")
        logger.info("Example:")
        logger.info("  python land_cover_mapping_toml.py ../example_experience_config.toml")
        sys.exit(1)

    config_path = sys.argv[1]

    if not os.path.exists(config_path):
        logger.error(f"Configuration file not found: {config_path}")
        sys.exit(1)

    try:
        run_land_cover_mapping(config_path)
    except Exception as e:
        logger.exception(f"Error during execution: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()
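The first thing the script does is load and validate the experience configuration, and all later steps read fields off that object. The sketch below is illustrative only and not part of the package: it exercises just that configuration-loading step, printing the same attributes run_land_cover_mapping reads after ExperienceInfo.from_toml, without starting the workflow. The config filename is a placeholder.

from eoml.automation.experience import ExperienceInfo

# Placeholder path; point this at a real experience configuration file.
experience = ExperienceInfo.from_toml("example_experience_config.toml")

# These are the experiment attributes the script logs and unpacks after loading.
exp = experience.experiment
for name in ("gps_file", "model_name", "extract_size", "size",
             "epoch", "batch_mult", "batch_mult_map", "nfold"):
    print(f"{name}: {getattr(exp, name)}")

# System and boundary settings used for paths and mapping extents.
print(f"data_dir: {experience.system_config.data_dir}")
print(f"map_bounds: {experience.boundaries.map_bounds}")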