giga-spatial 0.6.9__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
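The headline additions in 0.7.1, visible in the hunks below, are: reprojection support in TifProcessor (a target_crs/reprojection_resolution pair plus a public reproject_to()), geometry- and bounds-based clipping, pixel-adjacency graph export via networkx/scipy, chunked DataFrame conversion, and memory guards around the expensive conversions. A sketch of the new surface, inferred only from the signatures added in this diff (the file path, CRS code, and import path are illustrative assumptions, not from package docs):

    # Hedged sketch based on the added method signatures below.
    from gigaspatial.processing.tif_processor import TifProcessor  # assumed import path

    tp = TifProcessor(dataset_path="elevation.tif", mode="single", target_crs="EPSG:3857")
    info = tp.get_raster_info()                           # consolidated metadata dict
    df = tp.to_dataframe_chunked(target_memory_mb=256)    # memory-bounded conversion
    clipped = tp.clip_to_bounds((0.0, 0.0, 10_000.0, 10_000.0))  # returns a TifProcessor
    g = tp.to_graph(connectivity=8, graph_type="sparse")  # pixel-adjacency CSR matrix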
@@ -1,7 +1,9 @@
  import numpy as np
  import pandas as pd
  import geopandas as gpd
- from typing import List, Optional, Tuple, Union, Literal, Callable
+ import networkx as nx
+ import scipy.sparse as sp
+ from typing import List, Optional, Tuple, Union, Literal, Callable, Dict, Any
  from pydantic import ConfigDict
  from pydantic.dataclasses import dataclass
  from contextlib import contextmanager
@@ -15,12 +17,17 @@ from functools import partial
  import multiprocessing
  from tqdm import tqdm
  import tempfile
+ import shutil
  import os

  from gigaspatial.core.io.data_store import DataStore
  from gigaspatial.core.io.local_data_store import LocalDataStore
  from gigaspatial.config import config

+ # Global variables for multiprocessing workers
+ src_handle = None
+ memfile_handle = None
+

  @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
  class TifProcessor:
@@ -35,50 +42,164 @@ class TifProcessor:
      merge_method: Literal["first", "last", "min", "max", "mean"] = "first"
      target_crs: Optional[str] = None  # For reprojection if needed
      resampling_method: Resampling = Resampling.nearest
+     reprojection_resolution: Optional[Tuple[float, float]] = None

      def __post_init__(self):
          """Validate inputs, merge rasters if needed, and set up logging."""
          self.data_store = self.data_store or LocalDataStore()
          self.logger = config.get_logger(self.__class__.__name__)
          self._cache = {}
+         self._temp_dir = tempfile.mkdtemp()
          self._merged_file_path = None
-         self._temp_dir = None
+         self._reprojected_file_path = None

          # Handle multiple dataset paths
          if isinstance(self.dataset_path, list):
-             self.dataset_paths = [Path(p) for p in self.dataset_path]
-             self._validate_multiple_datasets()
-             self._merge_rasters()
-             self.dataset_path = self._merged_file_path
+             if len(self.dataset_path) > 1:
+                 self.dataset_paths = [Path(p) for p in self.dataset_path]
+                 self._validate_multiple_datasets()
+                 self._merge_rasters()
+                 self.dataset_path = self._merged_file_path
          else:
              self.dataset_paths = [Path(self.dataset_path)]
-             if not self.data_store.file_exists(self.dataset_path):
+             if not self.data_store.file_exists(str(self.dataset_path)):
                  raise FileNotFoundError(f"Dataset not found at {self.dataset_path}")

+         # Reproject single raster during initialization if target_crs is set
+         if self.target_crs:
+             self.logger.info(f"Reprojecting single raster to {self.target_crs}...")
+             with self.data_store.open(str(self.dataset_path), "rb") as f:
+                 with rasterio.MemoryFile(f.read()) as memfile:
+                     with memfile.open() as src:
+                         self._reprojected_file_path = self._reproject_to_temp_file(
+                             src, self.target_crs
+                         )
+             self.dataset_path = self._reprojected_file_path
+
          self._load_metadata()
+         self._validate_mode_band_compatibility()

-         # Validate mode and band count
-         if self.mode == "rgba" and self.count != 4:
-             raise ValueError("RGBA mode requires a 4-band TIF file")
-         if self.mode == "rgb" and self.count != 3:
-             raise ValueError("RGB mode requires a 3-band TIF file")
-         if self.mode == "single" and self.count != 1:
-             raise ValueError("Single mode requires a 1-band TIF file")
-         if self.mode == "multi" and self.count < 2:
-             raise ValueError("Multi mode requires a TIF file with 2 or more bands")
+     @contextmanager
+     def open_dataset(self):
+         """Context manager for accessing the dataset, handling temporary reprojected files."""
+         if self._merged_file_path:
+             with rasterio.open(self._merged_file_path) as src:
+                 yield src
+         elif self._reprojected_file_path:
+             with rasterio.open(self._reprojected_file_path) as src:
+                 yield src
+         elif isinstance(self.data_store, LocalDataStore):
+             with rasterio.open(str(self.dataset_path)) as src:
+                 yield src
+         else:
+             with self.data_store.open(str(self.dataset_path), "rb") as f:
+                 with rasterio.MemoryFile(f.read()) as memfile:
+                     with memfile.open() as src:
+                         yield src
+
+     def reproject_to(
+         self,
+         target_crs: str,
+         output_path: Optional[Union[str, Path]] = None,
+         resampling_method: Optional[Resampling] = None,
+         resolution: Optional[Tuple[float, float]] = None,
+     ):
+         """
+         Reprojects the current raster to a new CRS and optionally saves it.
+
+         Args:
+             target_crs: The CRS to reproject to (e.g., "EPSG:4326").
+             output_path: The path to save the reprojected raster. If None,
+                 it is saved to a temporary file.
+             resampling_method: The resampling method to use.
+             resolution: The target resolution (pixel size) in the new CRS.
+         """
+         self.logger.info(f"Reprojecting raster to {target_crs}...")
+
+         # Use provided or default values
+         resampling_method = resampling_method or self.resampling_method
+         resolution = resolution or self.reprojection_resolution
+
+         with self.open_dataset() as src:
+             if src.crs.to_string() == target_crs:
+                 self.logger.info(
+                     "Raster is already in the target CRS. No reprojection needed."
+                 )
+                 # If output_path is specified, copy the file
+                 if output_path:
+                     self.data_store.copy_file(str(self.dataset_path), output_path)
+                 return self.dataset_path
+
+             dst_path = output_path or os.path.join(
+                 self._temp_dir, f"reprojected_single_{os.urandom(8).hex()}.tif"
+             )
+
+             with rasterio.open(
+                 dst_path,
+                 "w",
+                 **self._get_reprojection_profile(src, target_crs, resolution),
+             ) as dst:
+                 for band_idx in range(1, src.count + 1):
+                     reproject(
+                         source=rasterio.band(src, band_idx),
+                         destination=rasterio.band(dst, band_idx),
+                         src_transform=src.transform,
+                         src_crs=src.crs,
+                         dst_transform=dst.transform,
+                         dst_crs=dst.crs,
+                         resampling=resampling_method,
+                         num_threads=multiprocessing.cpu_count(),
+                     )
+
+         self.logger.info(f"Reprojection complete. Output saved to {dst_path}")
+         return Path(dst_path)
+
+     def get_raster_info(self) -> Dict[str, Any]:
+         """Get comprehensive raster information."""
+         return {
+             "count": self.count,
+             "width": self.width,
+             "height": self.height,
+             "crs": self.crs,
+             "bounds": self.bounds,
+             "transform": self.transform,
+             "dtypes": self.dtype,
+             "nodata": self.nodata,
+             "mode": self.mode,
+             "is_merged": self.is_merged,
+             "source_count": self.source_count,
+         }
+
+     def _reproject_to_temp_file(
+         self, src: rasterio.DatasetReader, target_crs: str
+     ) -> str:
+         """Helper to reproject a raster and save it to a temporary file."""
+         dst_path = os.path.join(
+             self._temp_dir, f"reprojected_temp_{os.urandom(8).hex()}.tif"
+         )
+         profile = self._get_reprojection_profile(
+             src, target_crs, self.reprojection_resolution
+         )
+
+         with rasterio.open(dst_path, "w", **profile) as dst:
+             for band_idx in range(1, src.count + 1):
+                 reproject(
+                     source=rasterio.band(src, band_idx),
+                     destination=rasterio.band(dst, band_idx),
+                     src_transform=src.transform,
+                     src_crs=src.crs,
+                     dst_transform=dst.transform,
+                     dst_crs=dst.crs,
+                     resampling=self.resampling_method,
+                 )
+         return dst_path

      def _validate_multiple_datasets(self):
          """Validate that all datasets exist and have compatible properties."""
          if len(self.dataset_paths) < 2:
              raise ValueError("Multiple dataset paths required for merging")

-         # Check if all files exist
-         for path in self.dataset_paths:
-             if not self.data_store.file_exists(path):
-                 raise FileNotFoundError(f"Dataset not found at {path}")
-
-         # Load first dataset to get reference properties
-         with self.data_store.open(self.dataset_paths[0], "rb") as f:
+         with self.data_store.open(str(self.dataset_paths[0]), "rb") as f:
              with rasterio.MemoryFile(f.read()) as memfile:
                  with memfile.open() as ref_src:
                      ref_count = ref_src.count
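The new reproject_to added above can be exercised directly once a processor is constructed; a minimal sketch (the input/output paths, CRS, and resolution are illustrative, and reproject_to falls back to a temporary file when output_path is omitted):

    from rasterio.enums import Resampling

    tp = TifProcessor(dataset_path="input.tif", mode="single")
    out_path = tp.reproject_to(
        "EPSG:4326",
        output_path="reprojected.tif",
        resampling_method=Resampling.bilinear,
        resolution=(0.001, 0.001),  # target pixel size in the destination CRS
    )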
@@ -87,9 +208,8 @@ class TifProcessor:
                      ref_transform = ref_src.transform
                      ref_nodata = ref_src.nodata

-         # Validate all other datasets against reference
          for i, path in enumerate(self.dataset_paths[1:], 1):
-             with self.data_store.open(path, "rb") as f:
+             with self.data_store.open(str(path), "rb") as f:
                  with rasterio.MemoryFile(f.read()) as memfile:
                      with memfile.open() as src:
                          if src.count != ref_count:
@@ -100,9 +220,10 @@ class TifProcessor:
                              raise ValueError(
                                  f"Dataset {i} has dtype {src.dtypes[0]}, expected {ref_dtype}"
                              )
-                         if self.target_crs is None and src.crs != ref_crs:
-                             raise ValueError(
-                                 f"Dataset {i} has CRS {src.crs}, expected {ref_crs}. Consider setting target_crs parameter."
+                         if not self.target_crs and src.crs != ref_crs:
+                             self.logger.warning(
+                                 f"Dataset {i} has CRS {src.crs}, expected {ref_crs}. "
+                                 "Consider setting target_crs parameter for reprojection before merging."
                              )
                          if self.target_crs is None and not self._transforms_compatible(
                              src.transform, ref_transform
@@ -115,6 +236,46 @@ class TifProcessor:
                              f"Dataset {i} has different nodata value: {src.nodata} vs {ref_nodata}"
                          )

+     def _get_reprojection_profile(
+         self,
+         src: rasterio.DatasetReader,
+         target_crs: str,
+         resolution: Optional[Tuple[float, float]],
+         compression: str = "lzw",
+     ):
+         """Calculates and returns the profile for a reprojected raster."""
+         if resolution:
+             src_res = (abs(src.transform.a), abs(src.transform.e))
+             self.logger.info(
+                 f"Using target resolution: {resolution}. Source resolution: {src_res}."
+             )
+             # Calculate transform and dimensions based on the new resolution
+             dst_transform, width, height = calculate_default_transform(
+                 src.crs,
+                 target_crs,
+                 src.width,
+                 src.height,
+                 *src.bounds,
+                 resolution=resolution,
+             )
+         else:
+             # Keep original resolution but reproject
+             dst_transform, width, height = calculate_default_transform(
+                 src.crs, target_crs, src.width, src.height, *src.bounds
+             )
+
+         profile = src.profile.copy()
+         profile.update(
+             {
+                 "crs": target_crs,
+                 "transform": dst_transform,
+                 "width": width,
+                 "height": height,
+                 "compress": compression,  # Add compression to save space
+             }
+         )
+         return profile
+
      def _transforms_compatible(self, transform1, transform2, tolerance=1e-6):
          """Check if two transforms have compatible pixel sizes."""
          return (
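The helper above is a thin wrapper over rasterio's calculate_default_transform; the same computation standalone looks like this (a sketch with illustrative CRS codes, dimensions, and bounds):

    from rasterio.crs import CRS
    from rasterio.warp import calculate_default_transform

    dst_transform, width, height = calculate_default_transform(
        CRS.from_epsg(4326), CRS.from_epsg(3857),
        3600, 1800,                    # source width and height in pixels
        -20.0, 30.0, 10.0, 60.0,       # source bounds: left, bottom, right, top
        resolution=(1000.0, 1000.0),   # optional target pixel size in metres
    )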
@@ -126,151 +287,77 @@ class TifProcessor:
          """Merge multiple rasters into a single raster."""
          self.logger.info(f"Merging {len(self.dataset_paths)} rasters...")

-         # Create temporary directory for merged file
-         self._temp_dir = tempfile.mkdtemp()
-         merged_filename = "merged_raster.tif"
-         self._merged_file_path = os.path.join(self._temp_dir, merged_filename)
-
          # Open all datasets and handle reprojection if needed
-         src_files = []
-         reprojected_files = []
-
+         datasets_to_merge = []
+         temp_reprojected_files = []
          try:
              for path in self.dataset_paths:
-                 with self.data_store.open(path, "rb") as f:
-                     # Create temporary file for each dataset
-                     temp_file = tempfile.NamedTemporaryFile(suffix=".tif", delete=False)
-                     temp_file.write(f.read())
-                     temp_file.close()
-                     src_files.append(rasterio.open(temp_file.name))
-
-             # Handle reprojection if target_crs is specified
-             if self.target_crs:
-                 self.logger.info(f"Reprojecting rasters to {self.target_crs}...")
-                 processed_files = self._reproject_rasters(src_files, self.target_crs)
-                 reprojected_files = processed_files
-             else:
-                 processed_files = src_files
+                 with self.data_store.open(str(path), "rb") as f:
+                     with rasterio.MemoryFile(f.read()) as memfile:
+                         with memfile.open() as src:
+                             if self.target_crs and src.crs != self.target_crs:
+                                 self.logger.info(
+                                     f"Reprojecting {path.name} to {self.target_crs} before merging."
+                                 )
+                                 reprojected_path = self._reproject_to_temp_file(
+                                     src, self.target_crs
+                                 )
+                                 temp_reprojected_files.append(reprojected_path)
+                                 datasets_to_merge.append(
+                                     rasterio.open(reprojected_path)
+                                 )
+                             else:
+                                 temp_path = os.path.join(
+                                     self._temp_dir,
+                                     f"temp_{path.stem}_{os.urandom(4).hex()}.tif",
+                                 )
+                                 temp_reprojected_files.append(temp_path)
+
+                                 profile = src.profile
+                                 with rasterio.open(temp_path, "w", **profile) as dst:
+                                     dst.write(src.read())
+                                 datasets_to_merge.append(rasterio.open(temp_path))
+
+             self._merged_file_path = os.path.join(self._temp_dir, "merged_raster.tif")

              if self.merge_method == "mean":
-                 # For mean, we need to handle it manually
-                 merged_array, merged_transform = self._merge_with_mean(src_files)
-
-                 # Use first source as reference for metadata
-                 ref_src = src_files[0]
-                 profile = ref_src.profile.copy()
-                 profile.update(
-                     {
-                         "height": merged_array.shape[-2],
-                         "width": merged_array.shape[-1],
-                         "transform": merged_transform,
-                     }
+                 merged_array, merged_transform = self._merge_with_mean(
+                     datasets_to_merge
                  )
-
-                 # Write merged raster
-                 with rasterio.open(self._merged_file_path, "w", **profile) as dst:
-                     dst.write(merged_array)
-
              else:
-                 # Use rasterio's merge function
                  merged_array, merged_transform = merge(
-                     src_files,
+                     datasets_to_merge,
                      method=self.merge_method,
                      resampling=self.resampling_method,
                  )

-                 # Use first source as reference for metadata
-                 ref_src = src_files[0]
-                 profile = ref_src.profile.copy()
-                 profile.update(
-                     {
-                         "height": merged_array.shape[-2],
-                         "width": merged_array.shape[-1],
-                         "transform": merged_transform,
-                     }
-                 )
-
-                 if self.target_crs:
-                     profile["crs"] = self.target_crs
-
-                 # Write merged raster
-                 with rasterio.open(self._merged_file_path, "w", **profile) as dst:
-                     dst.write(merged_array)
-
-         finally:
-             # Clean up source files
-             for src in src_files:
-                 temp_path = src.name
-                 src.close()
-                 try:
-                     os.unlink(temp_path)
-                 except:
-                     pass
-
-             # Clean up reprojected files
-             for src in reprojected_files:
-                 if src not in src_files:  # Don't double-close
-                     temp_path = src.name
-                     src.close()
-                     try:
-                         os.unlink(temp_path)
-                     except:
-                         pass
-
-         self.logger.info("Raster merging completed!")
-
-     def _reproject_rasters(self, src_files, target_crs):
-         """Reproject all rasters to a common CRS before merging."""
-         reprojected_files = []
-
-         for i, src in enumerate(src_files):
-             if src.crs.to_string() == target_crs:
-                 # No reprojection needed
-                 reprojected_files.append(src)
-                 continue
-
-             # Calculate transform and dimensions for reprojection
-             transform, width, height = calculate_default_transform(
-                 src.crs,
-                 target_crs,
-                 src.width,
-                 src.height,
-                 *src.bounds,
-                 resolution=self.resolution if hasattr(self, "resolution") else None,
-             )
-
-             # Create temporary file for reprojected raster
-             temp_file = tempfile.NamedTemporaryFile(suffix=".tif", delete=False)
-             temp_file.close()
-
-             # Set up profile for reprojected raster
-             profile = src.profile.copy()
+             # Get profile from the first file in the list (all should be compatible now)
+             ref_src = datasets_to_merge[0]
+             profile = ref_src.profile.copy()
              profile.update(
                  {
-                     "crs": target_crs,
-                     "transform": transform,
-                     "width": width,
-                     "height": height,
+                     "height": merged_array.shape[-2],
+                     "width": merged_array.shape[-1],
+                     "transform": merged_transform,
+                     "crs": self.target_crs if self.target_crs else ref_src.crs,
                  }
              )

-             # Reproject and write to temporary file
-             with rasterio.open(temp_file.name, "w", **profile) as dst:
-                 for band_idx in range(1, src.count + 1):
-                     reproject(
-                         source=rasterio.band(src, band_idx),
-                         destination=rasterio.band(dst, band_idx),
-                         src_transform=src.transform,
-                         src_crs=src.crs,
-                         dst_transform=transform,
-                         dst_crs=target_crs,
-                         resampling=self.resampling_method,
-                     )
+             with rasterio.open(self._merged_file_path, "w", **profile) as dst:
+                 dst.write(merged_array)
+         finally:
+             for dataset in datasets_to_merge:
+                 if hasattr(dataset, "close"):
+                     dataset.close()

-             # Open reprojected file
-             reprojected_files.append(rasterio.open(temp_file.name))
+         # Clean up temporary files immediately
+         for temp_file in temp_reprojected_files:
+             try:
+                 os.remove(temp_file)
+             except OSError:
+                 pass

-         return reprojected_files
+         self.logger.info("Raster merging completed!")

      def _merge_with_mean(self, src_files):
          """Merge rasters using mean aggregation."""
@@ -295,6 +382,12 @@ class TifProcessor:
              bounds.left, bounds.bottom, bounds.right, bounds.top, width, height
          )

+         estimated_memory = height * width * src_files[0].count * 8  # float64
+         if estimated_memory > 1e9:  # 1GB threshold
+             self.logger.warning(
+                 f"Large memory usage expected: {estimated_memory/1e9:.1f}GB"
+             )
+
          # Initialize arrays for sum and count
          sum_array = np.zeros((src_files[0].count, height, width), dtype=np.float64)
          count_array = np.zeros((height, width), dtype=np.int32)
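The estimate added above is width × height × bands × 8 bytes for the float64 accumulator: a single-band 20,000 × 20,000 mosaic accumulates 20000 × 20000 × 1 × 8 = 3.2 GB for the sum array alone, comfortably past the 1 GB warning threshold.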
@@ -336,33 +429,9 @@ class TifProcessor:

          return mean_array.astype(src_files[0].dtypes[0]), merged_transform

-     def __del__(self):
-         """Cleanup temporary files."""
-         if self._temp_dir and os.path.exists(self._temp_dir):
-             try:
-                 import shutil
-
-                 shutil.rmtree(self._temp_dir)
-             except:
-                 pass
-
-     @contextmanager
-     def open_dataset(self):
-         """Context manager for accessing the dataset"""
-         if self._merged_file_path:
-             # Open merged file directly
-             with rasterio.open(self._merged_file_path) as src:
-                 yield src
-         else:
-             # Original single file logic
-             with self.data_store.open(self.dataset_path, "rb") as f:
-                 with rasterio.MemoryFile(f.read()) as memfile:
-                     with memfile.open() as src:
-                         yield src
-
      def _load_metadata(self):
          """Load metadata from the TIF file if not already cached"""
-         if not self._cache:
+         try:
              with self.open_dataset() as src:
                  self._cache["transform"] = src.transform
                  self._cache["crs"] = src.crs.to_string()
@@ -375,6 +444,10 @@ class TifProcessor:
                  self._cache["nodata"] = src.nodata
                  self._cache["count"] = src.count
                  self._cache["dtype"] = src.dtypes[0]
+         except (rasterio.errors.RasterioIOError, FileNotFoundError) as e:
+             raise FileNotFoundError(f"Could not read raster metadata: {e}")
+         except Exception as e:
+             raise RuntimeError(f"Unexpected error loading metadata: {e}")

      @property
      def is_merged(self) -> bool:
@@ -386,7 +459,6 @@ class TifProcessor:
          """Get the number of source rasters."""
          return len(self.dataset_paths)

-     # All other methods remain the same...
      @property
      def transform(self):
          """Get the transform from the TIF file"""
@@ -428,53 +500,48 @@ class TifProcessor:
          return self._cache["nodata"]

      @property
-     def tabular(self) -> pd.DataFrame:
-         """Get the data from the TIF file"""
-         self.logger.warning(
-             "The `tabular` property is deprecated, use `to_dataframe` instead"
-         )
-         if not hasattr(self, "_tabular"):
-             try:
-                 if self.mode == "single":
-                     self._tabular = self._to_band_dataframe(
-                         drop_nodata=True, drop_values=[]
-                     )
-                 elif self.mode == "rgb":
-                     self._tabular = self._to_rgb_dataframe(drop_nodata=True)
-                 elif self.mode == "rgba":
-                     self._tabular = self._to_rgba_dataframe(drop_transparent=True)
-                 elif self.mode == "multi":
-                     self._tabular = self._to_multi_band_dataframe(
-                         drop_nodata=True,
-                         drop_values=[],
-                         band_names=None,  # Use default band naming
-                     )
-                 else:
-                     raise ValueError(
-                         f"Invalid mode: {self.mode}. Must be one of: single, rgb, rgba, multi"
-                     )
-             except Exception as e:
-                 raise ValueError(
-                     f"Failed to process TIF file in mode '{self.mode}'. "
-                     f"Please ensure the file is valid and matches the selected mode. "
-                     f"Original error: {str(e)}"
-                 )
+     def dtype(self):
+         """Get the data types from the TIF file"""
+         return self._cache.get("dtype", [])
+
+     @property
+     def width(self):
+         return self._cache["width"]
+
+     @property
+     def height(self):
+         return self._cache["height"]
+
+     def to_dataframe(
+         self, drop_nodata=True, check_memory=True, **kwargs
+     ) -> pd.DataFrame:
+         """
+         Convert raster to DataFrame.
+
+         Args:
+             drop_nodata: Whether to drop nodata values
+             check_memory: Whether to check memory before operation (default True)
+             **kwargs: Additional arguments

-         return self._tabular
+         Returns:
+             pd.DataFrame with raster data
+         """
+         # Memory guard check
+         if check_memory:
+             self._memory_guard("conversion", threshold_percent=80.0)

-     def to_dataframe(self, drop_nodata=True, **kwargs) -> pd.DataFrame:
          try:
              if self.mode == "single":
-                 df = self._to_band_dataframe(drop_nodata=drop_nodata, **kwargs)
-             elif self.mode == "rgb":
-                 df = self._to_rgb_dataframe(drop_nodata=drop_nodata)
-             elif self.mode == "rgba":
-                 df = self._to_rgba_dataframe(drop_transparent=drop_nodata)
-             elif self.mode == "multi":
-                 df = self._to_multi_band_dataframe(drop_nodata=drop_nodata, **kwargs)
+                 return self._to_dataframe(
+                     band_number=kwargs.get("band_number", 1),
+                     drop_nodata=drop_nodata,
+                     band_names=kwargs.get("band_names", None),
+                 )
              else:
-                 raise ValueError(
-                     f"Invalid mode: {self.mode}. Must be one of: single, rgb, rgba, multi"
+                 return self._to_dataframe(
+                     band_number=None,  # All bands
+                     drop_nodata=drop_nodata,
+                     band_names=kwargs.get("band_names", None),
                  )
          except Exception as e:
              raise ValueError(
@@ -485,12 +552,23 @@ class TifProcessor:

          return df

-     def to_geodataframe(self, **kwargs) -> gpd.GeoDataFrame:
+     def to_geodataframe(self, check_memory=True, **kwargs) -> gpd.GeoDataFrame:
          """
          Convert the processed TIF data into a GeoDataFrame, where each row represents a pixel zone.
          Each zone is defined by its bounding box, based on pixel resolution and coordinates.
+
+         Args:
+             check_memory: Whether to check memory before operation
+             **kwargs: Additional arguments passed to to_dataframe()
+
+         Returns:
+             gpd.GeoDataFrame with raster data
          """
-         df = self.to_dataframe(**kwargs)
+         # Memory guard check
+         if check_memory:
+             self._memory_guard("conversion", threshold_percent=80.0)
+
+         df = self.to_dataframe(check_memory=False, **kwargs)

          x_res, y_res = self.resolution

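The consolidated converters can be called like this (a sketch; the column names follow the auto-detection logic shown later in _to_dataframe):

    df = tp.to_dataframe(drop_nodata=True)   # single mode: lon, lat, pixel_value
    gdf = tp.to_geodataframe()               # adds a per-pixel bounding-box geometry
    rgb_df = TifProcessor(dataset_path="image.tif", mode="rgb").to_dataframe()
    # rgb mode: lon, lat, red, green, blue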
@@ -504,30 +582,300 @@ class TifProcessor:

          return gdf

-     def get_zoned_geodataframe(self) -> gpd.GeoDataFrame:
+     def to_dataframe_chunked(
+         self, drop_nodata=True, chunk_size=None, target_memory_mb=500, **kwargs
+     ):
          """
-         Convert the processed TIF data into a GeoDataFrame, where each row represents a pixel zone.
-         Each zone is defined by its bounding box, based on pixel resolution and coordinates.
+         Convert raster to DataFrame using chunked processing for memory efficiency.
+
+         Automatically routes to the appropriate chunked method based on mode.
+         Chunk size is automatically calculated based on target memory usage.
+
+         Args:
+             drop_nodata: Whether to drop nodata values
+             chunk_size: Number of rows per chunk (auto-calculated if None)
+             target_memory_mb: Target memory per chunk in MB (default 500)
+             **kwargs: Additional arguments (band_number, band_names, etc.)
+         """
+
+         if chunk_size is None:
+             chunk_size = self._calculate_optimal_chunk_size(
+                 "conversion", target_memory_mb
+             )
+
+         windows = self._get_chunk_windows(chunk_size)
+
+         # SIMPLE ROUTING
+         if self.mode == "single":
+             return self._to_dataframe_chunked(
+                 windows,
+                 band_number=kwargs.get("band_number", 1),
+                 drop_nodata=drop_nodata,
+                 band_names=kwargs.get("band_names", None),
+             )
+         else:  # rgb, rgba, multi
+             return self._to_dataframe_chunked(
+                 windows,
+                 band_number=None,
+                 drop_nodata=drop_nodata,
+                 band_names=kwargs.get("band_names", None),
+             )
+
+     def clip_to_geometry(
+         self,
+         geometry: Union[
+             Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
+         ],
+         crop: bool = True,
+         all_touched: bool = True,
+         invert: bool = False,
+         nodata: Optional[Union[int, float]] = None,
+         pad: bool = False,
+         pad_width: float = 0.5,
+         return_clipped_processor: bool = True,
+     ) -> Union["TifProcessor", tuple]:
+         """
+         Clip raster to geometry boundaries.
+
+         Parameters:
+         -----------
+         geometry : various
+             Geometry to clip to. Can be:
+             - Shapely Polygon or MultiPolygon
+             - GeoDataFrame or GeoSeries
+             - List of GeoJSON-like dicts
+             - Single GeoJSON-like dict
+         crop : bool, default True
+             Whether to crop the raster to the extent of the geometry
+         all_touched : bool, default True
+             Include pixels that touch the geometry boundary
+         invert : bool, default False
+             If True, mask pixels inside geometry instead of outside
+         nodata : int or float, optional
+             Value to use for masked pixels. If None, uses raster's nodata value
+         pad : bool, default False
+             Pad geometry by half pixel before clipping
+         pad_width : float, default 0.5
+             Width of padding in pixels if pad=True
+         return_clipped_processor : bool, default True
+             If True, returns new TifProcessor with clipped data
+             If False, returns (clipped_array, transform, metadata)
+
+         Returns:
+         --------
+         TifProcessor or tuple
+             Either new TifProcessor instance or (array, transform, metadata) tuple
+         """
+         # Handle different geometry input types
+         shapes = self._prepare_geometry_for_clipping(geometry)
+
+         # Validate CRS compatibility
+         self._validate_geometry_crs(geometry)
+
+         # Perform the clipping
+         with self.open_dataset() as src:
+             try:
+                 clipped_data, clipped_transform = mask(
+                     dataset=src,
+                     shapes=shapes,
+                     crop=crop,
+                     all_touched=all_touched,
+                     invert=invert,
+                     nodata=nodata,
+                     pad=pad,
+                     pad_width=pad_width,
+                     filled=True,
+                 )
+
+                 # Update metadata for the clipped raster
+                 clipped_meta = src.meta.copy()
+                 clipped_meta.update(
+                     {
+                         "height": clipped_data.shape[1],
+                         "width": clipped_data.shape[2],
+                         "transform": clipped_transform,
+                         "nodata": nodata if nodata is not None else src.nodata,
+                     }
+                 )
+
+             except ValueError as e:
+                 if "Input shapes do not overlap raster" in str(e):
+                     raise ValueError(
+                         "The geometry does not overlap with the raster. "
+                         "Check that both are in the same coordinate reference system."
+                     ) from e
+                 else:
+                     raise e
+
+         if return_clipped_processor:
+             # Create a new TifProcessor with the clipped data
+             return self._create_clipped_processor(clipped_data, clipped_meta)
+         else:
+             return clipped_data, clipped_transform, clipped_meta
+
+     def clip_to_bounds(
+         self,
+         bounds: tuple,
+         bounds_crs: Optional[str] = None,
+         return_clipped_processor: bool = True,
+     ) -> Union["TifProcessor", tuple]:
+         """
+         Clip raster to rectangular bounds.
+
+         Parameters:
+         -----------
+         bounds : tuple
+             Bounding box as (minx, miny, maxx, maxy)
+         bounds_crs : str, optional
+             CRS of the bounds. If None, assumes same as raster CRS
+         return_clipped_processor : bool, default True
+             If True, returns new TifProcessor, else returns (array, transform, metadata)
+
+         Returns:
+         --------
+         TifProcessor or tuple
+             Either new TifProcessor instance or (array, transform, metadata) tuple
          """
-         self.logger.warning(
-             "The `get_zoned_geodataframe` method is deprecated, use `to_geodataframe` instead"
+         # Create bounding box geometry
+         bbox_geom = box(*bounds)
+
+         # If bounds_crs is specified and different from raster CRS, create GeoDataFrame for reprojection
+         if bounds_crs is not None:
+             raster_crs = self.crs
+
+             if not self.crs == bounds_crs:
+                 # Create GeoDataFrame with bounds CRS and reproject
+                 bbox_gdf = gpd.GeoDataFrame([1], geometry=[bbox_geom], crs=bounds_crs)
+                 bbox_gdf = bbox_gdf.to_crs(raster_crs)
+                 bbox_geom = bbox_gdf.geometry.iloc[0]
+
+         return self.clip_to_geometry(
+             geometry=bbox_geom,
+             crop=True,
+             return_clipped_processor=return_clipped_processor,
          )
-         self.logger.info("Converting data to GeoDataFrame with zones...")

-         df = self.tabular
+     def to_graph(
+         self,
+         connectivity: Literal[4, 8] = 4,
+         band: Optional[int] = None,
+         include_coordinates: bool = False,
+         graph_type: Literal["networkx", "sparse"] = "networkx",
+         check_memory: bool = True,
+     ) -> Union[nx.Graph, sp.csr_matrix]:
+         """
+         Convert raster to graph based on pixel adjacency.

-         x_res, y_res = self.resolution
+         Args:
+             connectivity: 4 or 8-connectivity
+             band: Band number (1-indexed)
+             include_coordinates: Include x,y coordinates in nodes
+             graph_type: 'networkx' or 'sparse'
+             check_memory: Whether to check memory before operation

-         # create bounding box for each pixel
-         geometries = [
-             box(lon - x_res / 2, lat - y_res / 2, lon + x_res / 2, lat + y_res / 2)
-             for lon, lat in zip(df["lon"], df["lat"])
-         ]
+         Returns:
+             Graph representation of raster
+         """

-         gdf = gpd.GeoDataFrame(df, geometry=geometries, crs=self.crs)
+         # Memory guard check
+         if check_memory:
+             self._memory_guard("graph", threshold_percent=80.0)

-         self.logger.info("Conversion to GeoDataFrame complete!")
-         return gdf
+         with self.open_dataset() as src:
+             band_idx = band - 1 if band is not None else 0
+             if band_idx < 0 or band_idx >= src.count:
+                 raise ValueError(
+                     f"Band {band} not available. Raster has {src.count} bands"
+                 )
+
+             data = src.read(band_idx + 1)
+             nodata = src.nodata if src.nodata is not None else self.nodata
+             valid_mask = (
+                 data != nodata if nodata is not None else np.ones_like(data, dtype=bool)
+             )
+
+             height, width = data.shape
+
+             # Find all valid pixels
+             valid_rows, valid_cols = np.where(valid_mask)
+             num_valid_pixels = len(valid_rows)
+
+             # Create a sequential mapping from (row, col) to a node ID
+             node_map = np.full(data.shape, -1, dtype=int)
+             node_map[valid_rows, valid_cols] = np.arange(num_valid_pixels)
+
+             # Define neighborhood offsets
+             if connectivity == 4:
+                 # von Neumann neighborhood (4-connectivity)
+                 offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
+             else:  # connectivity == 8
+                 # Moore neighborhood (8-connectivity)
+                 offsets = [
+                     (-1, -1),
+                     (-1, 0),
+                     (-1, 1),
+                     (0, -1),
+                     (0, 1),
+                     (1, -1),
+                     (1, 0),
+                     (1, 1),
+                 ]
+
+             # Collect nodes and edges
+             nodes_to_add = []
+             edges_to_add = []
+
+             for i in range(num_valid_pixels):
+                 row, col = valid_rows[i], valid_cols[i]
+                 current_node_id = node_map[row, col]
+
+                 # Prepare node attributes
+                 node_attrs = {"value": float(data[row, col])}
+                 if include_coordinates:
+                     x, y = src.xy(row, col)
+                     node_attrs["x"] = x
+                     node_attrs["y"] = y
+                 nodes_to_add.append((current_node_id, node_attrs))
+
+                 # Find neighbors and collect edges
+                 for dy, dx in offsets:
+                     neighbor_row, neighbor_col = row + dy, col + dx
+
+                     # Check if neighbor is within bounds and is a valid pixel
+                     if (
+                         0 <= neighbor_row < height
+                         and 0 <= neighbor_col < width
+                         and valid_mask[neighbor_row, neighbor_col]
+                     ):
+                         neighbor_node_id = node_map[neighbor_row, neighbor_col]
+
+                         # Ensure each edge is added only once
+                         if current_node_id < neighbor_node_id:
+                             neighbor_value = float(data[neighbor_row, neighbor_col])
+                             edges_to_add.append(
+                                 (current_node_id, neighbor_node_id, neighbor_value)
+                             )
+
+             if graph_type == "networkx":
+                 G = nx.Graph()
+                 G.add_nodes_from(nodes_to_add)
+                 G.add_weighted_edges_from(edges_to_add)
+                 return G
+             else:  # sparse matrix
+                 edges_array = np.array(edges_to_add)
+                 row_indices = edges_array[:, 0]
+                 col_indices = edges_array[:, 1]
+                 weights = edges_array[:, 2]
+
+                 # Add reverse edges for symmetric matrix
+                 from_idx = np.append(row_indices, col_indices)
+                 to_idx = np.append(col_indices, row_indices)
+                 weights = np.append(weights, weights)
+
+                 return sp.coo_matrix(
+                     (weights, (from_idx, to_idx)),
+                     shape=(num_valid_pixels, num_valid_pixels),
+                 ).tocsr()

      def sample_by_coordinates(
          self, coordinate_list: List[Tuple[float, float]], **kwargs
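The clipping and graph APIs added in this hunk compose naturally; a sketch (the bounds, CRS, and shapely geometry are illustrative):

    from shapely.geometry import box

    aoi = box(-10.0, 40.0, 0.0, 50.0)
    clipped = tp.clip_to_geometry(aoi, crop=True)       # new TifProcessor by default
    sub = tp.clip_to_bounds((-10.0, 40.0, 0.0, 50.0), bounds_crs="EPSG:4326")

    g = tp.to_graph(connectivity=4, include_coordinates=True)   # networkx.Graph
    adj = tp.to_graph(connectivity=8, graph_type="sparse")      # scipy CSR matrix

One caveat worth knowing: the sparse branch above assumes at least one edge exists — np.array([]) on an empty edge list has shape (0,) and cannot be indexed with [:, 0] — so fully masked rasters are safer routed through graph_type="networkx".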
@@ -661,11 +1009,63 @@ class TifProcessor:
          stat: Union[str, Callable] = "mean",
          batch_size: int = 100,
          n_workers: int = 4,
+         show_progress: bool = True,
+         check_memory: bool = True,
          **kwargs,
      ) -> np.ndarray:
          """
          Sample raster values by polygons in parallel using batching.
+
+         Args:
+             polygon_list: List of Shapely Polygon or MultiPolygon objects
+             stat: Statistic to compute
+             batch_size: Number of polygons per batch
+             n_workers: Number of worker processes
+             show_progress: Whether to display progress bar
+             check_memory: Whether to check memory before operation
+             **kwargs: Additional arguments
+
+         Returns:
+             np.ndarray of statistics for each polygon
          """
+         import sys
+         import warnings  # needed by the memory-guard branch below, not only the platform check
+
+         # Memory guard check with n_workers consideration
+         if check_memory:
+             is_safe = self._memory_guard(
+                 "batched_sampling",
+                 threshold_percent=85.0,
+                 n_workers=n_workers,
+                 raise_error=False,
+             )
+
+             if not is_safe:
+                 # Suggest reducing n_workers
+                 memory_info = self._check_available_memory()
+                 estimates = self._estimate_memory_usage("batched_sampling", n_workers=1)
+
+                 # Calculate optimal workers
+                 suggested_workers = max(
+                     1, int(memory_info["available"] * 0.7 / estimates["per_worker"])
+                 )
+
+                 warnings.warn(
+                     f"Consider reducing n_workers from {n_workers} to {suggested_workers} "
+                     f"to reduce memory pressure.",
+                     ResourceWarning,
+                 )
+
+         # Platform check
+         if sys.platform in ["win32", "darwin"]:
+             import multiprocessing as mp
+
+             if mp.get_start_method(allow_none=True) != "fork":
+                 warnings.warn(
+                     "Batched sampling may not work on Windows/macOS. "
+                     "Use sample_by_polygons() if you encounter errors.",
+                     RuntimeWarning,
+                 )

          def _chunk_list(data_list, chunk_size):
              """Yield successive chunks from data_list."""
@@ -676,20 +1076,22 @@ class TifProcessor:
              return np.array([])

          stat_func = stat if callable(stat) else getattr(np, stat)
-
          polygon_chunks = list(_chunk_list(polygon_list, batch_size))

          with multiprocessing.Pool(
              initializer=self._initializer_worker, processes=n_workers
          ) as pool:
              process_func = partial(self._process_polygon_batch, stat_func=stat_func)
-             batched_results = list(
-                 tqdm(
-                     pool.imap(process_func, polygon_chunks),
-                     total=len(polygon_chunks),
-                     desc=f"Sampling polygons",
+             if show_progress:
+                 batched_results = list(
+                     tqdm(
+                         pool.imap(process_func, polygon_chunks),
+                         total=len(polygon_chunks),
+                         desc=f"Sampling polygons",
+                     )
                  )
-             )
+             else:
+                 batched_results = list(pool.imap(process_func, polygon_chunks))

          results = [item for sublist in batched_results for item in sublist]

@@ -701,24 +1103,46 @@ class TifProcessor:
          Opens the raster dataset and stores it in a process-local variable.
          This function runs once per worker, not for every task.
          """
+         global src_handle, memfile_handle
+
+         # Priority: merged > reprojected > original (same as open_dataset)
+         local_file_path = None
+         if self._merged_file_path:
+             # Merged file is a local temp file
+             local_file_path = self._merged_file_path
+         elif self._reprojected_file_path:
+             # Reprojected file is a local temp file
+             local_file_path = self._reprojected_file_path
+         elif isinstance(self.data_store, LocalDataStore):
+             # Local file - can open directly
+             local_file_path = str(self.dataset_path)
+
+         if local_file_path:
+             # Open local file directly
+             with open(local_file_path, "rb") as f:
+                 memfile_handle = rasterio.MemoryFile(f.read())
+                 src_handle = memfile_handle.open()
+         else:
+             # Custom DataStore
+             with self.data_store.open(str(self.dataset_path), "rb") as f:
+                 memfile_handle = rasterio.MemoryFile(f.read())
+                 src_handle = memfile_handle.open()
+
+     def _get_worker_dataset(self):
+         """Get dataset handle for worker process."""
          global src_handle
-         with self.data_store.open(self.dataset_path, "rb") as f:
-             with rasterio.MemoryFile(f.read()) as memfile:
-                 src_handle = memfile.open()
+         if src_handle is None:
+             raise RuntimeError("Raster dataset not initialized in this process.")
+         return src_handle

      def _process_single_polygon(self, polygon, stat_func):
          """
          Helper function to process a single polygon.
          This will be run in a separate process.
          """
-         global src_handle
-         if src_handle is None:
-             # This should not happen if the initializer is set up correctly,
-             # but it's a good defensive check.
-             raise RuntimeError("Raster dataset not initialized in this process.")
-
          try:
-             out_image, _ = mask(src_handle, [polygon], crop=True, filled=False)
+             src = self._get_worker_dataset()
+             out_image, _ = mask(src, [polygon], crop=True, filled=False)

              if hasattr(out_image, "mask"):
                  valid_data = out_image.compressed()
@@ -729,11 +1153,12 @@ class TifProcessor:
                      else out_image.flatten()
                  )

-             if len(valid_data) == 0:
-                 return np.nan
-             else:
-                 return stat_func(valid_data)
-         except Exception:
+             return stat_func(valid_data) if len(valid_data) > 0 else np.nan
+         except RuntimeError as e:
+             self.logger.error(f"Worker not initialized: {e}")
+             return np.nan
+         except Exception as e:
+             self.logger.debug(f"Error processing polygon: {e}")
              return np.nan

      def _process_polygon_batch(self, polygon_batch, stat_func):
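The worker refactor above also fixes a lifetime problem: the old initializer opened the MemoryFile in a with block, so it was already closed by the time workers touched src_handle; the new version keeps memfile_handle alive in a module-level global. The underlying pool-initializer pattern, in miniature (a generic sketch, not gigaspatial API; data.bin is illustrative):

    import multiprocessing

    _handle = None  # populated once per worker by the pool initializer

    def _init_worker(path):
        global _handle
        _handle = open(path, "rb")  # stand-in for the MemoryFile/open dance above

    def _read_byte(offset):
        _handle.seek(offset)
        return _handle.read(1)

    if __name__ == "__main__":
        with multiprocessing.Pool(
            processes=2, initializer=_init_worker, initargs=("data.bin",)
        ) as pool:
            print(pool.map(_read_byte, [0, 1, 2]))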
@@ -745,226 +1170,226 @@ class TifProcessor:
              for polygon in polygon_batch
          ]

-     def _to_rgba_dataframe(self, drop_transparent: bool = False) -> pd.DataFrame:
-         """
-         Convert RGBA TIF to DataFrame with separate columns for R, G, B, A values.
+     def _to_dataframe(
+         self,
+         band_number: Optional[int] = None,
+         drop_nodata: bool = True,
+         band_names: Optional[Union[str, List[str]]] = None,
+     ) -> pd.DataFrame:
          """
-         self.logger.info("Processing RGBA dataset...")
-
-         with self.open_dataset() as src:
-             if self.count != 4:
-                 raise ValueError("RGBA mode requires a 4-band TIF file")
-
-             # Read all four bands
-             red, green, blue, alpha = src.read()
-
-             x_coords, y_coords = self._get_pixel_coordinates()
-
-             if drop_transparent:
-                 mask = alpha > 0
-                 red = np.extract(mask, red)
-                 green = np.extract(mask, green)
-                 blue = np.extract(mask, blue)
-                 alpha = np.extract(mask, alpha)
-                 lons = np.extract(mask, x_coords)
-                 lats = np.extract(mask, y_coords)
-             else:
-                 lons = x_coords.flatten()
-                 lats = y_coords.flatten()
-                 red = red.flatten()
-                 green = green.flatten()
-                 blue = blue.flatten()
-                 alpha = alpha.flatten()
-
-             # Create DataFrame with RGBA values
-             data = pd.DataFrame(
-                 {
-                     "lon": lons,
-                     "lat": lats,
-                     "red": red,
-                     "green": green,
-                     "blue": blue,
-                     "alpha": alpha,
-                 }
-             )
-
-             # Normalize alpha values if they're not in [0, 1] range
-             if data["alpha"].max() > 1:
-                 data["alpha"] = data["alpha"] / data["alpha"].max()
-
-             self.logger.info("RGBA dataset is processed!")
-             return data
+         Process TIF to DataFrame - handles both single-band and multi-band.

-     def _to_rgb_dataframe(self, drop_nodata: bool = True) -> pd.DataFrame:
-         """Convert RGB TIF to DataFrame with separate columns for R, G, B values."""
-         if self.mode != "rgb":
-             raise ValueError("Use appropriate method for current mode")
-
-         self.logger.info("Processing RGB dataset...")
+         Args:
+             band_number: Specific band to read (1-indexed). If None, reads all bands.
+             drop_nodata: Whether to drop nodata values
+             band_names: Custom names for bands (multi-band only)

+         Returns:
+             pd.DataFrame with lon, lat, and band value(s)
+         """
          with self.open_dataset() as src:
-             if self.count != 3:
-                 raise ValueError("RGB mode requires a 3-band TIF file")
+             if band_number is not None:
+                 # SINGLE BAND MODE
+                 band = src.read(band_number)
+                 mask = self._build_data_mask(band, drop_nodata, src.nodata)
+                 lons, lats = self._extract_coordinates_with_mask(mask)
+                 pixel_values = (
+                     np.extract(mask, band) if mask is not None else band.flatten()
+                 )
+                 band_name = band_names if isinstance(band_names, str) else "pixel_value"

-             # Read all three bands
-             red, green, blue = src.read()
+                 return pd.DataFrame({"lon": lons, "lat": lats, band_name: pixel_values})
+             else:
+                 # MULTI-BAND MODE (all bands)
+                 stack = src.read()
+
+                 # Auto-detect band names by mode
+                 if band_names is None:
+                     if self.mode == "rgb":
+                         band_names = ["red", "green", "blue"]
+                     elif self.mode == "rgba":
+                         band_names = ["red", "green", "blue", "alpha"]
+                     else:
+                         band_names = [
+                             src.descriptions[i] or f"band_{i+1}"
+                             for i in range(self.count)
+                         ]

-             x_coords, y_coords = self._get_pixel_coordinates()
+                 # Build mask (checks ALL bands!)
+                 mask = self._build_multi_band_mask(stack, drop_nodata, src.nodata)

-             if drop_nodata:
-                 nodata_value = src.nodata
-                 if nodata_value is not None:
-                     mask = ~(
-                         (red == nodata_value)
-                         | (green == nodata_value)
-                         | (blue == nodata_value)
-                     )
-                     red = np.extract(mask, red)
-                     green = np.extract(mask, green)
-                     blue = np.extract(mask, blue)
-                     lons = np.extract(mask, x_coords)
-                     lats = np.extract(mask, y_coords)
-                 else:
-                     lons = x_coords.flatten()
-                     lats = y_coords.flatten()
-                     red = red.flatten()
-                     green = green.flatten()
-                     blue = blue.flatten()
-             else:
-                 lons = x_coords.flatten()
-                 lats = y_coords.flatten()
-                 red = red.flatten()
-                 green = green.flatten()
-                 blue = blue.flatten()
+                 # Create DataFrame
+                 data_dict = self._bands_to_dict(stack, self.count, band_names, mask)
+                 df = pd.DataFrame(data_dict)

-             data = pd.DataFrame(
-                 {
-                     "lon": lons,
-                     "lat": lats,
-                     "red": red,
-                     "green": green,
-                     "blue": blue,
-                 }
-             )
+                 # RGBA: normalize alpha if needed
+                 if (
+                     self.mode == "rgba"
+                     and "alpha" in df.columns
+                     and df["alpha"].max() > 1
+                 ):
+                     df["alpha"] = df["alpha"] / 255.0

-             self.logger.info("RGB dataset is processed!")
-             return data
+                 return df

-     def _to_band_dataframe(
-         self, band_number: int = 1, drop_nodata: bool = True, drop_values: list = []
+     def _to_dataframe_chunked(
+         self,
+         windows: List[rasterio.windows.Window],
+         band_number: Optional[int] = None,
+         drop_nodata: bool = True,
+         band_names: Optional[Union[str, List[str]]] = None,
+         show_progress: bool = True,
      ) -> pd.DataFrame:
-         """Process single-band TIF to DataFrame."""
-         if self.mode != "single":
-             raise ValueError("Use appropriate method for current mode")
+         """Universal chunked converter for ALL modes."""

-         self.logger.info("Processing single-band dataset...")
-
-         if band_number <= 0 or band_number > self.count:
-             self.logger.error(
-                 f"Error: Band number {band_number} is out of range. The file has {self.count} bands."
-             )
-             return None
+         chunks = []
+         iterator = tqdm(windows, desc="Processing chunks") if show_progress else windows

          with self.open_dataset() as src:
+             # Auto-detect band names ONCE (before loop)
+             if band_number is None and band_names is None:
+                 if self.mode == "rgb":
+                     band_names = ["red", "green", "blue"]
+                 elif self.mode == "rgba":
+                     band_names = ["red", "green", "blue", "alpha"]
+                 else:  # multi
+                     band_names = [
+                         src.descriptions[i] or f"band_{i+1}" for i in range(self.count)
+                     ]

-             band = src.read(band_number)
-
-             x_coords, y_coords = self._get_pixel_coordinates()
+             for window in iterator:
+                 if band_number is not None:
+                     # SINGLE BAND
+                     band_chunk = src.read(band_number, window=window)
+                     mask = self._build_data_mask(band_chunk, drop_nodata, src.nodata)
+                     lons, lats = self._get_chunk_coordinates(window, src)
+                     band_name = (
+                         band_names if isinstance(band_names, str) else "pixel_value"
+                     )

-             values_to_mask = []
-             if drop_nodata:
-                 nodata_value = src.nodata
-                 if nodata_value is not None:
-                     values_to_mask.append(nodata_value)
+                     # Build chunk DataFrame (could use helper but simple enough)
+                     if mask is not None:
+                         mask_flat = mask.flatten()
+                         chunk_df = pd.DataFrame(
+                             {
+                                 "lon": lons[mask_flat],
+                                 "lat": lats[mask_flat],
+                                 band_name: band_chunk.flatten()[mask_flat],
+                             }
+                         )
+                     else:
+                         chunk_df = pd.DataFrame(
+                             {"lon": lons, "lat": lats, band_name: band_chunk.flatten()}
+                         )
+                 else:
+                     # MULTI-BAND (includes RGB/RGBA)
+                     stack_chunk = src.read(window=window)
+                     mask = self._build_multi_band_mask(
+                         stack_chunk, drop_nodata, src.nodata
+                     )
+                     lons, lats = self._get_chunk_coordinates(window, src)

-             if drop_values:
-                 values_to_mask.extend(drop_values)
+                     # Build DataFrame using helper
+                     band_dict = {
+                         band_names[i]: stack_chunk[i] for i in range(self.count)
+                     }
+                     chunk_df = self._build_chunk_dataframe(lons, lats, band_dict, mask)

-             if values_to_mask:
-                 data_mask = ~np.isin(band, values_to_mask)
-                 pixel_values = np.extract(data_mask, band)
-                 lons = np.extract(data_mask, x_coords)
-                 lats = np.extract(data_mask, y_coords)
-             else:
-                 pixel_values = band.flatten()
-                 lons = x_coords.flatten()
-                 lats = y_coords.flatten()
+                     # RGBA: normalize alpha
+                     if self.mode == "rgba" and "alpha" in chunk_df.columns:
+                         if chunk_df["alpha"].max() > 1:
+                             chunk_df["alpha"] = chunk_df["alpha"] / 255.0

-         data = pd.DataFrame({"lon": lons, "lat": lats, "pixel_value": pixel_values})
+                 chunks.append(chunk_df)

-         self.logger.info("Dataset is processed!")
-         return data
+         result = pd.concat(chunks, ignore_index=True)
+         return result

-     def _to_multi_band_dataframe(
+     def _prepare_geometry_for_clipping(
          self,
-         drop_nodata: bool = True,
-         drop_values: list = [],
-         band_names: Optional[List[str]] = None,
-     ) -> pd.DataFrame:
-         """
-         Process multi-band TIF to DataFrame with all bands included.
-
-         Args:
-             drop_nodata (bool): Whether to drop nodata values. Defaults to True.
-             drop_values (list): Additional values to drop from the dataset. Defaults to empty list.
-             band_names (Optional[List[str]]): Custom names for the bands. If None, bands will be named using
-                                               the band descriptions from the GeoTIFF metadata if available,
-                                               otherwise 'band_1', 'band_2', etc.
-
-         Returns:
-             pd.DataFrame: DataFrame containing coordinates and all band values
-         """
-         self.logger.info("Processing multi-band dataset...")
-
-         with self.open_dataset() as src:
-             # Read all bands
-             stack = src.read()
+         geometry: Union[
+             Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
+         ],
+     ) -> List[dict]:
+         """Convert various geometry formats to list of GeoJSON-like dicts for rasterio.mask"""
+
+         if isinstance(geometry, (Polygon, MultiPolygon)):
+             # Shapely geometry
+             return [geometry.__geo_interface__]
+
+         elif isinstance(geometry, gpd.GeoDataFrame):
+             # GeoDataFrame - use all geometries
+             return [
+                 geom.__geo_interface__ for geom in geometry.geometry if geom is not None
+             ]
+
+         elif isinstance(geometry, gpd.GeoSeries):
+             # GeoSeries
+             return [geom.__geo_interface__ for geom in geometry if geom is not None]
+
+         elif isinstance(geometry, dict):
+             # Single GeoJSON-like dict
+             return [geometry]
+
+         elif isinstance(geometry, list):
+             # List of GeoJSON-like dicts
+             return geometry

-             x_coords, y_coords = self._get_pixel_coordinates()
-
-             # Initialize dictionary with coordinates
-             data_dict = {"lon": x_coords.flatten(), "lat": y_coords.flatten()}
+         else:
+             raise TypeError(
+                 f"Unsupported geometry type: {type(geometry)}. "
+                 "Supported types: Shapely geometries, GeoDataFrame, GeoSeries, "
+                 "GeoJSON-like dict, or list of GeoJSON-like dicts."
+             )

-             # Get band descriptions from metadata if available
-             if band_names is None and hasattr(src, "descriptions") and src.descriptions:
-                 band_names = [
-                     desc if desc else f"band_{i+1}"
-                     for i, desc in enumerate(src.descriptions)
-                 ]
+     def _validate_geometry_crs(
+         self,
+         original_geometry: Any,
+     ) -> None:
+         """Validate that geometry CRS matches raster CRS"""
+
+         # Get raster CRS
+         raster_crs = self.crs
+
+         # Try to get geometry CRS
+         geometry_crs = None
+
+         if isinstance(original_geometry, (gpd.GeoDataFrame, gpd.GeoSeries)):
+             geometry_crs = original_geometry.crs
+         elif hasattr(original_geometry, "crs"):
+             geometry_crs = original_geometry.crs
+
+         # Warn if CRS mismatch detected
+         if geometry_crs is not None and raster_crs is not None:
+             if not raster_crs == geometry_crs:
+                 self.logger.warning(
+                     f"CRS mismatch detected! Raster CRS: {raster_crs}, "
+                     f"Geometry CRS: {geometry_crs}. "
+                     "Consider reprojecting geometry to match raster CRS for accurate clipping."
+                 )

-             # Process each band
-             for band_idx in range(self.count):
-                 band_data = stack[band_idx]
-
-                 # Handle nodata and other values to drop
-                 if drop_nodata or drop_values:
-                     values_to_mask = []
-                     if drop_nodata and src.nodata is not None:
-                         values_to_mask.append(src.nodata)
-                     if drop_values:
-                         values_to_mask.extend(drop_values)
-
-                     if values_to_mask:
-                         data_mask = ~np.isin(band_data, values_to_mask)
-                         band_values = np.extract(data_mask, band_data)
-                         if band_idx == 0:  # Only need to mask coordinates once
-                             data_dict["lon"] = np.extract(data_mask, x_coords)
-                             data_dict["lat"] = np.extract(data_mask, y_coords)
-                     else:
-                         band_values = band_data.flatten()
-                 else:
-                     band_values = band_data.flatten()
+     def _create_clipped_processor(
+         self, clipped_data: np.ndarray, clipped_meta: dict
+     ) -> "TifProcessor":
+         """
+         Helper to create a new TifProcessor instance from clipped data.
+         Saves the clipped data to a temporary file and initializes a new TifProcessor.
+         """
+         clipped_file_path = os.path.join(
+             self._temp_dir, f"clipped_temp_{os.urandom(8).hex()}.tif"
+         )
+         with rasterio.open(clipped_file_path, "w", **clipped_meta) as dst:
+             dst.write(clipped_data)

-                 # Use custom band names if provided, otherwise use descriptions or default naming
-                 band_name = (
-                     band_names[band_idx]
-                     if band_names and len(band_names) > band_idx
-                     else f"band_{band_idx + 1}"
-                 )
-                 data_dict[band_name] = band_values
+         self.logger.info(f"Clipped raster saved to temporary file: {clipped_file_path}")

-             self.logger.info("Multi-band dataset is processed!")
-             return pd.DataFrame(data_dict)
+         # Create a new TifProcessor instance with the clipped data
+         # Pass relevant parameters from the current instance to maintain consistency
+         return TifProcessor(
+             dataset_path=clipped_file_path,
+             data_store=self.data_store,
+             mode=self.mode,
+         )

      def _get_pixel_coordinates(self):
          """Helper method to generate coordinate arrays for all pixels"""
@@ -991,60 +1416,322 @@ class TifProcessor:
991
1416
 
992
1417
  return self._cache["pixel_coords"]
993
1418
 
1419
+ def _get_chunk_coordinates(self, window, src):
1420
+ """Get coordinates for a specific window chunk."""
1421
+ transform = src.window_transform(window)
1422
+ rows, cols = np.meshgrid(
1423
+ np.arange(window.height), np.arange(window.width), indexing="ij"
1424
+ )
1425
+ xs, ys = rasterio.transform.xy(transform, rows.flatten(), cols.flatten())
1426
+ return np.array(xs), np.array(ys)
994
1427
 
995
- def sample_multiple_tifs_by_coordinates(
996
- tif_processors: List[TifProcessor], coordinate_list: List[Tuple[float, float]]
997
- ):
998
- """
999
- Sample raster values from multiple TIFF files for given coordinates.
1428
+ def _extract_coordinates_with_mask(self, mask=None):
1429
+ """Extract flattened coordinates, optionally applying a mask."""
1430
+ x_coords, y_coords = self._get_pixel_coordinates()
1000
1431
 
1001
- Parameters:
1002
- - tif_processors: List of TifProcessor instances.
1003
- - coordinate_list: List of (x, y) coordinates.
1432
+ if mask is not None:
1433
+ return np.extract(mask, x_coords), np.extract(mask, y_coords)
1004
1434
 
1005
- Returns:
1006
- - A NumPy array of sampled values, taking the first non-nodata value encountered.
1007
- """
1008
- sampled_values = np.full(len(coordinate_list), np.nan, dtype=np.float32)
1435
+ return x_coords.flatten(), y_coords.flatten()
1436
+
1437
+ def _build_data_mask(self, data, drop_nodata=True, nodata_value=None):
1438
+ """Build a boolean mask for filtering data based on nodata values."""
1439
+ if not drop_nodata or nodata_value is None:
1440
+ return None
1441
+
1442
+ return data != nodata_value
1443
+
1444
+ def _build_multi_band_mask(
1445
+ self,
1446
+ bands: np.ndarray,
1447
+ drop_nodata: bool = True,
1448
+ nodata_value: Optional[float] = None,
1449
+ ) -> Optional[np.ndarray]:
1450
+ """
1451
+ Build mask for multi-band data - drops pixels where ANY band has nodata.
1452
+
1453
+ Args:
1454
+ bands: 3D array of shape (n_bands, height, width)
1455
+ drop_nodata: Whether to drop nodata values
1456
+ nodata_value: The nodata value to check
1457
+
1458
+ Returns:
1459
+ Boolean mask or None if no masking needed
1460
+ """
1461
+ if not drop_nodata or nodata_value is None:
1462
+ return None
1463
+
1464
+ # Check if ANY band has nodata at each pixel location
1465
+ has_nodata = np.any(bands == nodata_value, axis=0)
1466
+
1467
+ # Return True where ALL bands are valid
1468
+ valid_mask = ~has_nodata
1469
+
1470
+ return valid_mask if not valid_mask.all() else None
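A minimal numpy illustration of the ANY-band semantics above: a pixel survives only if every band holds valid data.

import numpy as np

bands = np.array([
    [[1, -9999], [3, 4]],   # band 1
    [[5, 6], [-9999, 8]],   # band 2
])  # shape (2, 2, 2), nodata = -9999

has_nodata = np.any(bands == -9999, axis=0)
valid_mask = ~has_nodata
# valid_mask:
# [[ True, False],
#  [False,  True]]  -> kept only where all bands are valid
values_band1 = np.extract(valid_mask, bands[0])  # array([1, 4])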
1471
+
1472
+ def _bands_to_dict(self, bands, band_count, band_names, mask=None):
1473
+ """Read specified bands and return as a dictionary with optional masking."""
1474
+
1475
+ lons, lats = self._extract_coordinates_with_mask(mask)
1476
+ data_dict = {"lon": lons, "lat": lats}
1477
+
1478
+ for idx, name in enumerate(band_names[:band_count]):
1479
+ band_data = bands[idx]
1480
+ data_dict[name] = (
1481
+ np.extract(mask, band_data) if mask is not None else band_data.flatten()
1482
+ )
1483
+
1484
+ return data_dict
1485
+
1486
+ def _calculate_optimal_chunk_size(
1487
+ self, operation: str = "conversion", target_memory_mb: int = 500
1488
+ ) -> int:
1489
+ """
1490
+ Calculate optimal chunk size (number of rows) based on target memory usage.
1009
1491
 
1010
- for tp in tif_processors:
1011
- values = tp.sample_by_coordinates(coordinate_list=coordinate_list)
1492
+ Args:
1493
+ operation: Type of operation ('conversion', 'graph')
1494
+ target_memory_mb: Target memory per chunk in megabytes
1012
1495
 
1013
- if tp.nodata is not None:
1014
- mask = (np.isnan(sampled_values)) & (
1015
- values != tp.nodata
1016
- ) # Replace only NaNs
1496
+ Returns:
1497
+ Number of rows per chunk
1498
+ """
1499
+ bytes_per_element = np.dtype(self.dtype).itemsize
1500
+ n_bands = self.count
1501
+ width = self.width
1502
+
1503
+ # Adjust for operation type
1504
+ if operation == "conversion":
1505
+ # DataFrame overhead is roughly 2x
1506
+ bytes_per_row = width * n_bands * bytes_per_element * 2
1507
+ elif operation == "graph":
1508
+ # Graph needs additional space for edges
1509
+ bytes_per_row = width * bytes_per_element * 4 # Estimate
1017
1510
  else:
1018
- mask = np.isnan(sampled_values) # No explicit nodata, replace all NaNs
1511
+ bytes_per_row = width * n_bands * bytes_per_element
1019
1512
 
1020
- sampled_values[mask] = values[mask] # Update only missing values
1513
+ target_bytes = target_memory_mb * 1024 * 1024
1514
+ chunk_rows = max(1, int(target_bytes / bytes_per_row))
1021
1515
 
1022
- return sampled_values
1516
+ # Ensure chunk size doesn't exceed total height
1517
+ chunk_rows = min(chunk_rows, self.height)
1023
1518
 
1519
+ self.logger.info(
1520
+ f"Calculated chunk size: {chunk_rows} rows "
1521
+ f"(~{self._format_bytes(chunk_rows * bytes_per_row)} per chunk)"
1522
+ )
1024
1523
 
1025
- def sample_multiple_tifs_by_polygons(
1026
- tif_processors: List[TifProcessor],
1027
- polygon_list: List[Union[Polygon, MultiPolygon]],
1028
- stat: str = "mean",
1029
- ) -> np.ndarray:
1030
- """
1031
- Sample raster values from multiple TIFF files for polygons in a list and join the results.
1524
+ return chunk_rows
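A worked instance of the formula, assuming a 3-band float32 raster 10,000 pixels wide and the default 500 MB target:

bytes_per_element = 4                # float32
width, n_bands = 10_000, 3
bytes_per_row = width * n_bands * bytes_per_element * 2  # "conversion": ~2x DataFrame overhead
target_bytes = 500 * 1024 * 1024
chunk_rows = max(1, int(target_bytes / bytes_per_row))   # -> 2184 rows per chunk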
1525
+
1526
+ def _get_chunk_windows(self, chunk_size: int) -> List[rasterio.windows.Window]:
1527
+ """
1528
+ Generate window objects for chunked reading.
1032
1529
 
1033
- Parameters:
1034
- - tif_processors: List of TifProcessor instances.
1035
- - polygon_list: List of polygon geometries (can include MultiPolygons).
1036
- - stat: Aggregation statistic to compute within each polygon (mean, median, sum, min, max).
1530
+ Args:
1531
+ chunk_size: Number of rows per chunk
1037
1532
 
1038
- Returns:
1039
- - A NumPy array of sampled values, taking the first non-nodata value encountered.
1040
- """
1041
- sampled_values = np.full(len(polygon_list), np.nan, dtype=np.float32)
1533
+ Returns:
1534
+ List of rasterio.windows.Window objects
1535
+ """
1536
+ windows = []
1537
+ for row_start in range(0, self.height, chunk_size):
1538
+ row_end = min(row_start + chunk_size, self.height)
1539
+ window = rasterio.windows.Window(
1540
+ col_off=0,
1541
+ row_off=row_start,
1542
+ width=self.width,
1543
+ height=row_end - row_start,
1544
+ )
1545
+ windows.append(window)
1546
+
1547
+ return windows
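These windows are meant to be consumed one at a time; a sketch of the reading side, with an illustrative path and a fixed chunk size:

import rasterio
import rasterio.windows

with rasterio.open("input.tif") as src:  # path illustrative
    chunk_size = 1024  # rows per chunk, e.g. from _calculate_optimal_chunk_size
    for row_off in range(0, src.height, chunk_size):
        height = min(chunk_size, src.height - row_off)
        window = rasterio.windows.Window(0, row_off, src.width, height)
        block = src.read(window=window)  # shape: (bands, height, src.width)
        # ...process block...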
1548
+
1549
+ def _format_bytes(self, bytes_value: int) -> str:
1550
+ """Convert bytes to human-readable format."""
1551
+ for unit in ["B", "KB", "MB", "GB", "TB"]:
1552
+ if bytes_value < 1024.0:
1553
+ return f"{bytes_value:.2f} {unit}"
1554
+ bytes_value /= 1024.0
1555
+ return f"{bytes_value:.2f} PB"
1556
+
1557
+ def _check_available_memory(self) -> dict:
1558
+ """
1559
+ Check available system memory.
1560
+
1561
+ Returns:
1562
+ Dict with total, available, and used memory info
1563
+ """
1564
+ import psutil
1565
+
1566
+ memory = psutil.virtual_memory()
1567
+ return {
1568
+ "total": memory.total,
1569
+ "available": memory.available,
1570
+ "used": memory.used,
1571
+ "percent": memory.percent,
1572
+ "available_human": self._format_bytes(memory.available),
1573
+ }
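psutil is imported inside the method, so it only needs to be installed when memory checks are actually used. The underlying call looks like this (values machine-dependent):

import psutil

mem = psutil.virtual_memory()
print(mem.total, mem.available, mem.used, mem.percent)
# e.g. 16 GB total with ~9 GB available at ~45% usage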
1574
+
1575
+ def _estimate_memory_usage(
1576
+ self, operation: str = "conversion", n_workers: int = 1
1577
+ ) -> dict:
1578
+ """
1579
+ Estimate memory usage for various operations.
1580
+
1581
+ Args:
1582
+ operation: Type of operation ('conversion', 'batched_sampling', 'merge', 'graph')
1583
+ n_workers: Number of workers (for batched_sampling)
1584
+
1585
+ Returns:
1586
+ Dict with estimated memory usage in bytes and human-readable format
1587
+ """
1588
+ bytes_per_element = np.dtype(self.dtype).itemsize
1589
+ n_pixels = self.width * self.height
1590
+ n_bands = self.count
1591
+
1592
+ estimates = {}
1593
+
1594
+ if operation == "conversion":
1595
+ # to_dataframe/to_geodataframe: full raster + DataFrame overhead
1596
+ raster_memory = n_pixels * n_bands * bytes_per_element
1597
+ # DataFrame overhead (roughly 2x for storage + processing)
1598
+ dataframe_memory = (
1599
+ n_pixels * n_bands * 16
1600
+ ) # 16 bytes per value in DataFrame
1601
+ total = raster_memory + dataframe_memory
1602
+ estimates["raster"] = raster_memory
1603
+ estimates["dataframe"] = dataframe_memory
1604
+ estimates["total"] = total
1605
+
1606
+ elif operation == "batched_sampling":
1607
+ # Each worker loads full raster into MemoryFile
1608
+ # Need to get file size
1609
+ if self._merged_file_path:
1610
+ file_path = self._merged_file_path
1611
+ elif self._reprojected_file_path:
1612
+ file_path = self._reprojected_file_path
1613
+ else:
1614
+ file_path = str(self.dataset_path)
1615
+
1616
+ try:
1617
+ file_size = os.path.getsize(file_path)
1618
+ except OSError:
1621
+ # Estimate if can't get file size
1622
+ file_size = n_pixels * n_bands * bytes_per_element * 1.2 # Add overhead
1623
+
1624
+ estimates["per_worker"] = file_size
1625
+ estimates["total"] = file_size * n_workers
1626
+
1627
+ elif operation == "merge":
1628
+ # _merge_with_mean uses float64 arrays
1629
+ raster_memory = n_pixels * n_bands * 8 # float64
1630
+ estimates["sum_array"] = raster_memory
1631
+ estimates["count_array"] = n_pixels * 4 # int32
1632
+ estimates["total"] = raster_memory + n_pixels * 4
1633
+
1634
+ elif operation == "graph":
1635
+ # to_graph: data + node_map + edges
1636
+ data_memory = n_pixels * bytes_per_element
1637
+ node_map_memory = n_pixels * 4 # int32
1638
+ # Estimate edges (rough: 4-connectivity = 4 edges per pixel)
1639
+ edges_memory = n_pixels * 4 * 3 * 8 # 3 values per edge, float64
1640
+ total = data_memory + node_map_memory + edges_memory
1641
+ estimates["data"] = data_memory
1642
+ estimates["node_map"] = node_map_memory
1643
+ estimates["edges"] = edges_memory
1644
+ estimates["total"] = total
1645
+
1646
+ # Add human-readable format
1647
+ estimates["human_readable"] = self._format_bytes(estimates["total"])
1648
+
1649
+ return estimates
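A worked "conversion" estimate for a hypothetical 10,000 x 10,000 single-band float32 raster:

n_pixels = 10_000 * 10_000
raster_memory = n_pixels * 1 * 4      # 400 MB of raw float32 pixels
dataframe_memory = n_pixels * 1 * 16  # 1.6 GB at 16 bytes per DataFrame value
total = raster_memory + dataframe_memory  # ~2 GB estimated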
1650
+
1651
+ def _memory_guard(
1652
+ self,
1653
+ operation: str,
1654
+ threshold_percent: float = 80.0,
1655
+ n_workers: Optional[int] = None,
1656
+ raise_error: bool = False,
1657
+ ) -> bool:
1658
+ """
1659
+ Check if operation is safe to perform given memory constraints.
1660
+
1661
+ Args:
1662
+ operation: Type of operation to check
1663
+ threshold_percent: Maximum % of available memory to use (default 80%)
1664
+ n_workers: Number of workers (for batched operations)
1665
+ raise_error: If True, raise MemoryError instead of warning
1666
+
1667
+ Returns:
1668
+ True if operation is safe, False otherwise
1042
1669
 
1043
- for tp in tif_processors:
1044
- values = tp.sample_by_polygons(polygon_list=polygon_list, stat=stat)
1670
+ Raises:
1671
+ MemoryError: If raise_error=True and memory insufficient
1672
+ """
1673
+ import warnings
1674
+
1675
+ estimates = self._estimate_memory_usage(operation, n_workers=n_workers or 1)
1676
+ memory_info = self._check_available_memory()
1677
+
1678
+ estimated_usage = estimates["total"]
1679
+ available = memory_info["available"]
1680
+ threshold = available * (threshold_percent / 100.0)
1681
+
1682
+ is_safe = estimated_usage <= threshold
1683
+
1684
+ if not is_safe:
1685
+ usage_str = self._format_bytes(estimated_usage)
1686
+ available_str = memory_info["available_human"]
1687
+
1688
+ message = (
1689
+ f"Memory warning: {operation} operation may require {usage_str} "
1690
+ f"but only {available_str} is available. "
1691
+ f"Current memory usage: {memory_info['percent']:.1f}%"
1692
+ )
1045
1693
 
1046
- mask = np.isnan(sampled_values) # replace all NaNs
1694
+ if raise_error:
1695
+ raise MemoryError(message)
1696
+ else:
1697
+ warnings.warn(message, ResourceWarning)
1698
+ if hasattr(self, "logger"):
1699
+ self.logger.warning(message)
1700
+
1701
+ return is_safe
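A hedged usage sketch: check before a full-raster conversion and fall back to chunked processing when the estimate does not fit. to_dataframe is the conversion the estimates above refer to; the fallback branch is illustrative rather than prescribed by the package.

tp = TifProcessor(dataset_path="input.tif")  # path illustrative

if tp._memory_guard("conversion", threshold_percent=80.0):
    df = tp.to_dataframe()
else:
    chunk_rows = tp._calculate_optimal_chunk_size("conversion")
    for window in tp._get_chunk_windows(chunk_rows):
        ...  # process the raster window-by-window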
1702
+
1703
+ def _validate_mode_band_compatibility(self):
1704
+ """Validate that mode matches band count."""
1705
+ mode_requirements = {
1706
+ "single": (1, "1-band"),
1707
+ "rgb": (3, "3-band"),
1708
+ "rgba": (4, "4-band"),
1709
+ }
1710
+
1711
+ if self.mode in mode_requirements:
1712
+ required_count, description = mode_requirements[self.mode]
1713
+ if self.count != required_count:
1714
+ raise ValueError(
1715
+ f"{self.mode.upper()} mode requires a {description} TIF file"
1716
+ )
1717
+ elif self.mode == "multi" and self.count < 2:
1718
+ raise ValueError("Multi mode requires a TIF file with 2 or more bands")
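Because this runs from __post_init__, a mode/band mismatch now fails at construction time; for example (path illustrative):

try:
    tp = TifProcessor(dataset_path="single_band.tif", mode="rgb")
except ValueError as err:
    print(err)  # "RGB mode requires a 3-band TIF file"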
1047
1719
 
1048
- sampled_values[mask] = values[mask] # Update only values with samapled value
1720
+ def __enter__(self):
1721
+ return self
1049
1722
 
1050
- return sampled_values
1723
+ def __del__(self):
1724
+ """Clean up temporary files and directories."""
1725
+ if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
1726
+ shutil.rmtree(self._temp_dir, ignore_errors=True)
1727
+
1728
+ def cleanup(self):
1729
+ """Explicit cleanup method for better control."""
1730
+ if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
1731
+ shutil.rmtree(self._temp_dir)
1732
+ self.logger.info("Cleaned up temporary files")
1733
+
1734
+ def __exit__(self, exc_type, exc_value, traceback):
1735
+ """Proper context manager exit with cleanup."""
1736
+ self.cleanup()
1737
+ return False
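With the context-manager protocol in place, the temporary directory holding merged or reprojected rasters can be scoped to a with-block (paths illustrative):

with TifProcessor(dataset_path=["a.tif", "b.tif"], merge_method="mean") as tp:
    df = tp.to_dataframe()
# cleanup() has removed the temporary files here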