geoai-py 0.18.1__py2.py3-none-any.whl → 0.19.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,383 @@
1
+ """
2
+ Landcover Classification Utilities - Enhanced Tile Export Module
3
+
4
+ This module extends the base geoai functionality with specialized utilities
5
+ for discrete landcover classification. It provides enhanced tile generation
6
+ with background filtering capabilities to improve training efficiency.
7
+
8
+ Key Features:
9
+ - Enhanced tile filtering with configurable feature ratio thresholds
10
+ - Separate statistics tracking for different skip reasons
11
+ - Maintains full compatibility with base geoai workflow
12
+ - Optimized for discrete landcover classification tasks
13
+
14
+ Date: November 2025
15
+ """
16
+
17
+ import os
18
+ import warnings
19
+ from pathlib import Path
20
+ from typing import Any, Dict, List, Optional, Tuple, Union
21
+
22
+ import geopandas as gpd
23
+ import numpy as np
24
+ import rasterio
25
+ from rasterio import features
26
+ from rasterio.windows import Window
27
+ from tqdm import tqdm
28
+
29
+
30
def _rasterize_class_vector(
    gdf: gpd.GeoDataFrame,
    src: Any,
    class_value_field: str,
    buffer_radius: float,
    all_touched: bool,
    quiet: bool,
) -> np.ndarray:
    """Burn vector class features into a uint8 array aligned with ``src``.

    Args:
        gdf: Features to rasterize. The caller's object is never modified.
        src: Open rasterio dataset supplying the CRS, transform and shape.
        class_value_field: Column of ``gdf`` holding the value to burn.
        buffer_radius: Buffer distance applied to every geometry when > 0,
            expressed in the units of the raster CRS.
        all_touched: Forwarded to ``rasterio.features.rasterize``.
        quiet: Suppress the reprojection message.

    Returns:
        Array of shape ``src.shape`` with 0 wherever no feature was burned.
    """
    if gdf.crs != src.crs:
        if not quiet:
            print(f"Reprojecting mask from {gdf.crs} to {src.crs}")
        gdf = gdf.to_crs(src.crs)

    # Buffer into a separate GeoSeries instead of assigning back to
    # ``gdf.geometry``: when no reprojection happened ``gdf`` is still the
    # caller's object and must not be altered.
    geometries = gdf.geometry
    if buffer_radius > 0:
        geometries = geometries.buffer(buffer_radius)

    shapes = list(zip(geometries, gdf[class_value_field]))
    if not shapes:
        # NOTE(review): rasterize is expected to reject an empty shape list;
        # an empty layer is treated as "everything is background" instead.
        return np.zeros(src.shape, dtype=np.uint8)

    return features.rasterize(
        shapes,
        out_shape=src.shape,
        transform=src.transform,
        all_touched=all_touched,
        fill=0,
        dtype=np.uint8,
    )


def export_landcover_tiles(
    in_raster: str,
    out_folder: str,
    in_class_data: Optional[Union[str, gpd.GeoDataFrame]] = None,
    tile_size: int = 256,
    stride: int = 128,
    class_value_field: str = "class",
    buffer_radius: float = 0,
    max_tiles: Optional[int] = None,
    quiet: bool = False,
    all_touched: bool = True,
    create_overview: bool = False,
    skip_empty_tiles: bool = False,
    min_feature_ratio: Union[bool, float] = False,
    metadata_format: str = "PASCAL_VOC",
) -> Dict[str, Any]:
    """
    Export GeoTIFF tiles optimized for landcover classification training.

    The input raster is cut into ``tile_size`` x ``tile_size`` windows on a
    regular grid with step ``stride``. Image tiles are written to
    ``<out_folder>/images`` and, when class data is supplied, matching label
    tiles are written to ``<out_folder>/labels`` under the same file name.
    Tiles can be dropped when they contain no features or too few of them.

    Args:
        in_raster: Path to input raster (image to tile)
        out_folder: Output directory for tiles (created if missing)
        in_class_data: Class/label source (optional for image-only export).
            One of:
            - path to a raster mask (.tif, .tiff, .img, .jp2, .png, .bmp,
              .gif); band 1 is read per tile and resampled to the tile size.
              A CRS mismatch with the image only produces a warning.
            - path to a vector file readable by geopandas
            - a GeoDataFrame
            Vector data is reprojected to the image CRS when needed and
            rasterized once for the whole image.
        tile_size: Size of output tiles in pixels (default: 256)
        stride: Stride for sliding window in pixels (default: 128)
        class_value_field: Field name containing class values (default: "class")
        buffer_radius: Buffer distance applied to vector features, in the
            units of the raster CRS (default: 0, no buffering)
        max_tiles: Maximum number of tile positions to process (default: None)
        quiet: Suppress progress output (default: False)
        all_touched: Include pixels touched by geometry (default: True)
        create_overview: Accepted for signature compatibility; not used by
            this function.
        skip_empty_tiles: Skip tiles with no features (default: False)
        min_feature_ratio: Minimum ratio of non-background pixels required to keep tile
            - False: Disable ratio filtering (default)
            - 0.0-1.0: Minimum ratio threshold (e.g., 0.1 = 10% features required)
            The ratio filter is only applied when ``skip_empty_tiles`` is True.
            Invalid values trigger a warning and disable the filter.
        metadata_format: Annotation format ("PASCAL_VOC" or "YOLO"). The only
            effect here is that "PASCAL_VOC" creates an ``annotations``
            directory inside ``out_folder``.

    Returns:
        Dictionary containing:
            - tiles_exported: Number of tiles successfully exported
            - tiles_skipped_empty: Number of completely empty tiles skipped
            - tiles_skipped_ratio: Number of tiles filtered by min_feature_ratio
            - output_dirs: Dictionary with paths to images and labels directories

    Raises:
        ValueError: If ``tile_size`` or ``stride`` is not positive.

    Examples:
        # Original behavior (no filtering)
        export_landcover_tiles(
            "input.tif",
            "output",
            "mask.shp",
            skip_empty_tiles=True
        )

        # Light filtering (keep tiles with ≥5% features)
        export_landcover_tiles(
            "input.tif",
            "output",
            "mask.shp",
            skip_empty_tiles=True,
            min_feature_ratio=0.05
        )

        # Moderate filtering (keep tiles with ≥15% features)
        export_landcover_tiles(
            "input.tif",
            "output",
            "mask.shp",
            skip_empty_tiles=True,
            min_feature_ratio=0.15
        )

    Note:
        This function is designed for discrete landcover classification where
        class 0 typically represents background/no data. The min_feature_ratio
        parameter counts non-zero pixels as "features". Tiles are named after
        their position index in the tiling grid, so skipped tiles leave gaps
        in the numbering. Label tiles are written as uint8.
    """
    # Fail early, before any directory is created, instead of surfacing an
    # opaque range() error (stride == 0) or silently exporting nothing.
    if tile_size <= 0 or stride <= 0:
        raise ValueError(
            "tile_size and stride must be positive, "
            f"got tile_size={tile_size}, stride={stride}"
        )

    # Validate min_feature_ratio parameter
    if min_feature_ratio is not False:
        if not isinstance(min_feature_ratio, (int, float)):
            warnings.warn(
                f"min_feature_ratio must be a number between 0.0 and 1.0, got {type(min_feature_ratio)}. "
                "Disabling ratio filtering."
            )
            min_feature_ratio = False
        elif not (0.0 <= min_feature_ratio <= 1.0):
            warnings.warn(
                f"min_feature_ratio must be between 0.0 and 1.0, got {min_feature_ratio}. "
                "Disabling ratio filtering."
            )
            min_feature_ratio = False

    # Create output directories
    out_folder = Path(out_folder)
    out_folder.mkdir(parents=True, exist_ok=True)

    images_dir = out_folder / "images"
    labels_dir = out_folder / "labels"
    images_dir.mkdir(exist_ok=True)
    labels_dir.mkdir(exist_ok=True)

    if metadata_format == "PASCAL_VOC":
        (out_folder / "annotations").mkdir(exist_ok=True)

    # Initialize statistics
    stats = {
        "tiles_exported": 0,
        "tiles_skipped_empty": 0,
        "tiles_skipped_ratio": 0,
        "output_dirs": {"images": str(images_dir), "labels": str(labels_dir)},
    }

    raster_mask_suffixes = {".tif", ".tiff", ".img", ".jp2", ".png", ".bmp", ".gif"}

    with rasterio.open(in_raster) as src:
        height, width = src.shape

        class_src = None  # open raster mask dataset, if the mask is a raster
        mask_array = None  # full-size rasterized mask, if the mask is vector

        try:
            if in_class_data is not None:
                if isinstance(in_class_data, (str, os.PathLike)):
                    if Path(in_class_data).suffix.lower() in raster_mask_suffixes:
                        try:
                            class_src = rasterio.open(in_class_data)
                        except Exception as e:
                            # Best effort: fall through to the vector reader.
                            if not quiet:
                                print(f"Could not open as raster, trying vector: {e}")
                        else:
                            if class_src.crs != src.crs and not quiet:
                                print(
                                    f"Warning: CRS mismatch between image ({src.crs}) and mask ({class_src.crs})"
                                )

                    # Not a raster (or the raster open failed): read as vector
                    if class_src is None:
                        mask_array = _rasterize_class_vector(
                            gpd.read_file(in_class_data),
                            src,
                            class_value_field,
                            buffer_radius,
                            all_touched,
                            quiet,
                        )
                else:
                    # Assume GeoDataFrame passed directly
                    mask_array = _rasterize_class_vector(
                        in_class_data,
                        src,
                        class_value_field,
                        buffer_radius,
                        all_touched,
                        quiet,
                    )

            # Upper-left pixel corner of every tile that fits fully inside
            # the raster, in row-major order.
            tile_positions = [
                (x, y)
                for y in range(0, height - tile_size + 1, stride)
                for x in range(0, width - tile_size + 1, stride)
            ]

            if max_tiles:
                tile_positions = tile_positions[:max_tiles]

            pbar = tqdm(tile_positions, desc="Exporting tiles", disable=quiet)

            for tile_idx, (x, y) in enumerate(pbar):
                window = Window(x, y, tile_size, tile_size)
                tile_transform = src.window_transform(window)

                # Read image tile
                image_tile = src.read(window=window)

                mask_tile = None
                has_features = False

                if class_src is not None:
                    # Georeferenced bounds of this tile, derived from the
                    # tile's affine transform (origin plus pixel size).
                    minx = tile_transform[2]
                    maxy = tile_transform[5]
                    maxx = minx + tile_size * tile_transform[0]
                    miny = maxy + tile_size * tile_transform[4]

                    # Matching window in the mask raster's own pixel grid
                    window_class = rasterio.windows.from_bounds(
                        minx, miny, maxx, maxy, class_src.transform
                    )

                    try:
                        # boundless + out_shape: tiles reaching past the mask
                        # extent are padded and the result always has the
                        # tile's dimensions.
                        mask_tile = class_src.read(
                            1,
                            window=window_class,
                            boundless=True,
                            out_shape=(tile_size, tile_size),
                        )
                    except Exception as e:
                        # Unreadable label: skip the tile (not counted in stats)
                        if not quiet:
                            pbar.write(f"Error reading mask tile at ({x}, {y}): {e}")
                        continue

                    has_features = np.any(mask_tile > 0)

                elif mask_array is not None:
                    # For vector masks (pre-rasterized)
                    mask_tile = mask_array[y : y + tile_size, x : x + tile_size]
                    has_features = np.any(mask_tile > 0)

                # Skip empty tiles if requested
                if skip_empty_tiles and not has_features:
                    stats["tiles_skipped_empty"] += 1
                    continue

                # Ratio filtering is a refinement of skip_empty_tiles and is
                # deliberately inactive when skip_empty_tiles is False.
                if skip_empty_tiles and has_features and min_feature_ratio is not False:
                    feature_ratio = np.sum(mask_tile > 0) / mask_tile.size
                    if feature_ratio < min_feature_ratio:
                        stats["tiles_skipped_ratio"] += 1
                        continue

                # File name is based on the grid position index, not on the
                # number of tiles exported so far.
                tile_name = f"tile_{tile_idx:06d}.tif"

                # Write image
                with rasterio.open(
                    images_dir / tile_name,
                    "w",
                    driver="GTiff",
                    height=tile_size,
                    width=tile_size,
                    count=src.count,
                    dtype=src.dtypes[0],
                    crs=src.crs,
                    transform=tile_transform,
                    compress="lzw",
                ) as dst:
                    dst.write(image_tile)

                # Save mask tile if available
                if mask_tile is not None:
                    with rasterio.open(
                        labels_dir / tile_name,
                        "w",
                        driver="GTiff",
                        height=tile_size,
                        width=tile_size,
                        count=1,
                        dtype=np.uint8,
                        crs=src.crs,
                        transform=tile_transform,
                        compress="lzw",
                    ) as dst:
                        dst.write(mask_tile, 1)

                stats["tiles_exported"] += 1

                # Update progress bar description with selection count
                if not quiet:
                    pbar.set_description(
                        f"Exporting tiles ({stats['tiles_exported']}/{tile_idx + 1})"
                    )
        finally:
            # Release the raster mask even when tiling fails part-way through
            if class_src is not None:
                class_src.close()

    # Print summary
    if not quiet:
        print(f"\n{'='*60}")
        print("TILE EXPORT SUMMARY")
        print(f"{'='*60}")
        print(f"Tiles exported: {stats['tiles_exported']}/{len(tile_positions)}")
        if skip_empty_tiles:
            print(f"Tiles skipped (empty): {stats['tiles_skipped_empty']}")
            if min_feature_ratio is not False:
                print(
                    f"Tiles skipped (low feature ratio < {min_feature_ratio}): {stats['tiles_skipped_ratio']}"
                )
        print("\nOutput directories:")
        print(f"  Images: {stats['output_dirs']['images']}")
        print(f"  Labels: {stats['output_dirs']['labels']}")
        print(f"{'='*60}\n")

    return stats