giga-spatial 0.6.9__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,9 @@
1
1
  import numpy as np
2
2
  import pandas as pd
3
3
  import geopandas as gpd
4
- from typing import List, Optional, Tuple, Union, Literal, Callable
4
+ import networkx as nx
5
+ import scipy.sparse as sp
6
+ from typing import List, Optional, Tuple, Union, Literal, Callable, Dict, Any
5
7
  from pydantic import ConfigDict
6
8
  from pydantic.dataclasses import dataclass
7
9
  from contextlib import contextmanager
@@ -15,6 +17,7 @@ from functools import partial
15
17
  import multiprocessing
16
18
  from tqdm import tqdm
17
19
  import tempfile
20
+ import shutil
18
21
  import os
19
22
 
20
23
  from gigaspatial.core.io.data_store import DataStore
@@ -35,26 +38,40 @@ class TifProcessor:
35
38
  merge_method: Literal["first", "last", "min", "max", "mean"] = "first"
36
39
  target_crs: Optional[str] = None # For reprojection if needed
37
40
  resampling_method: Resampling = Resampling.nearest
41
+ reprojection_resolution: Optional[Tuple[float, float]] = None
38
42
 
39
43
  def __post_init__(self):
40
44
  """Validate inputs, merge rasters if needed, and set up logging."""
41
45
  self.data_store = self.data_store or LocalDataStore()
42
46
  self.logger = config.get_logger(self.__class__.__name__)
43
47
  self._cache = {}
48
+ self._temp_dir = tempfile.mkdtemp()
44
49
  self._merged_file_path = None
45
- self._temp_dir = None
50
+ self._reprojected_file_path = None
46
51
 
47
52
  # Handle multiple dataset paths
48
53
  if isinstance(self.dataset_path, list):
49
- self.dataset_paths = [Path(p) for p in self.dataset_path]
50
- self._validate_multiple_datasets()
51
- self._merge_rasters()
52
- self.dataset_path = self._merged_file_path
54
+ if len(self.dataset_path) > 1:
55
+ self.dataset_paths = [Path(p) for p in self.dataset_path]
56
+ self._validate_multiple_datasets()
57
+ self._merge_rasters()
58
+ self.dataset_path = self._merged_file_path
53
59
  else:
54
60
  self.dataset_paths = [Path(self.dataset_path)]
55
- if not self.data_store.file_exists(self.dataset_path):
61
+ if not self.data_store.file_exists(str(self.dataset_path)):
56
62
  raise FileNotFoundError(f"Dataset not found at {self.dataset_path}")
57
63
 
64
+ # Reproject single raster during initialization if target_crs is set
65
+ if self.target_crs:
66
+ self.logger.info(f"Reprojecting single raster to {self.target_crs}...")
67
+ with self.data_store.open(str(self.dataset_path), "rb") as f:
68
+ with rasterio.MemoryFile(f.read()) as memfile:
69
+ with memfile.open() as src:
70
+ self._reprojected_file_path = self._reproject_to_temp_file(
71
+ src, self.target_crs
72
+ )
73
+ self.dataset_path = self._reprojected_file_path
74
+
58
75
  self._load_metadata()
59
76
 
60
77
  # Validate mode and band count
@@ -67,18 +84,124 @@ class TifProcessor:
67
84
  if self.mode == "multi" and self.count < 2:
68
85
  raise ValueError("Multi mode requires a TIF file with 2 or more bands")
69
86
 
87
@contextmanager
def open_dataset(self):
    """
    Context manager yielding an open rasterio dataset for this processor.

    A merged temporary raster takes priority, then a reprojected temporary
    raster; both are opened straight from the local path. When neither
    exists, the bytes are read through the data store and opened from an
    in-memory file.
    """
    local_path = self._merged_file_path or self._reprojected_file_path
    if local_path:
        with rasterio.open(local_path) as dataset:
            yield dataset
        return

    # No local temporary file: pull the raster through the data store.
    with self.data_store.open(str(self.dataset_path), "rb") as handle:
        with rasterio.MemoryFile(handle.read()) as memfile:
            with memfile.open() as dataset:
                yield dataset
101
+
102
def reproject_to(
    self,
    target_crs: str,
    output_path: Optional[Union[str, Path]] = None,
    resampling_method: Optional[Resampling] = None,
    resolution: Optional[Tuple[float, float]] = None,
):
    """
    Reprojects the current raster to a new CRS and optionally saves it.

    Args:
        target_crs: The CRS to reproject to (e.g., "EPSG:4326").
        output_path: The path to save the reprojected raster. If None,
            it is saved to a temporary file inside the processor's
            temporary directory.
        resampling_method: The resampling method to use. Falls back to
            ``self.resampling_method`` only when None is passed.
        resolution: The target resolution (pixel size) in the new CRS.
            Falls back to ``self.reprojection_resolution`` when None.

    Returns:
        ``self.dataset_path`` when the raster is already in ``target_crs``
        (after copying it to ``output_path`` if one was given), otherwise
        a ``Path`` pointing at the newly written raster.
    """
    self.logger.info(f"Reprojecting raster to {target_crs}...")

    # Compare against None rather than relying on truthiness:
    # Resampling.nearest is an IntEnum member equal to 0, so
    # `resampling_method or default` would silently discard an explicit
    # request for nearest-neighbour resampling.
    if resampling_method is None:
        resampling_method = self.resampling_method
    if resolution is None:
        resolution = self.reprojection_resolution

    with self.open_dataset() as src:
        if src.crs.to_string() == target_crs:
            self.logger.info(
                "Raster is already in the target CRS. No reprojection needed."
            )
            # If output_path is specified, copy the file
            if output_path:
                self.data_store.copy_file(str(self.dataset_path), output_path)
            return self.dataset_path

        dst_path = output_path or os.path.join(
            self._temp_dir, f"reprojected_single_{os.urandom(8).hex()}.tif"
        )
        profile = self._get_reprojection_profile(src, target_crs, resolution)

        with rasterio.open(dst_path, "w", **profile) as dst:
            # Warp band by band so each band is written straight to disk.
            for band_idx in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, band_idx),
                    destination=rasterio.band(dst, band_idx),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=dst.transform,
                    dst_crs=dst.crs,
                    resampling=resampling_method,
                    num_threads=multiprocessing.cpu_count(),
                )

        self.logger.info(f"Reprojection complete. Output saved to {dst_path}")
        return Path(dst_path)
158
+
159
def get_raster_info(self) -> Dict[str, Any]:
    """
    Collect the processor's raster properties into a single dictionary.

    Returns:
        A dict with the keys ``count``, ``width``, ``height``, ``crs``,
        ``bounds``, ``transform``, ``dtypes``, ``nodata``, ``mode``,
        ``is_merged`` and ``source_count``, each read from the attribute
        of the same name (``dtypes`` is read from ``self.dtype``).
    """
    # (output key, attribute name) pairs, in the order they are reported.
    field_map = (
        ("count", "count"),
        ("width", "width"),
        ("height", "height"),
        ("crs", "crs"),
        ("bounds", "bounds"),
        ("transform", "transform"),
        ("dtypes", "dtype"),
        ("nodata", "nodata"),
        ("mode", "mode"),
        ("is_merged", "is_merged"),
        ("source_count", "source_count"),
    )
    return {key: getattr(self, attr) for key, attr in field_map}
174
+
175
def _reproject_to_temp_file(
    self, src: rasterio.DatasetReader, target_crs: str
) -> str:
    """
    Reproject an open raster into a randomly named file in the temp directory.

    The output profile comes from ``_get_reprojection_profile`` using the
    processor's ``reprojection_resolution``; every band is warped with the
    processor's ``resampling_method``.

    Returns:
        The path of the reprojected file, as a string.
    """
    temp_name = f"reprojected_temp_{os.urandom(8).hex()}.tif"
    dst_path = os.path.join(self._temp_dir, temp_name)
    profile = self._get_reprojection_profile(
        src, target_crs, self.reprojection_resolution
    )

    with rasterio.open(dst_path, "w", **profile) as dst:
        # Georeferencing arguments are the same for every band.
        warp_kwargs = dict(
            src_transform=src.transform,
            src_crs=src.crs,
            dst_transform=dst.transform,
            dst_crs=dst.crs,
            resampling=self.resampling_method,
        )
        for band_number in range(1, src.count + 1):
            reproject(
                source=rasterio.band(src, band_number),
                destination=rasterio.band(dst, band_number),
                **warp_kwargs,
            )
    return dst_path
198
+
70
199
  def _validate_multiple_datasets(self):
71
200
  """Validate that all datasets exist and have compatible properties."""
72
201
  if len(self.dataset_paths) < 2:
73
202
  raise ValueError("Multiple dataset paths required for merging")
74
203
 
75
- # Check if all files exist
76
- for path in self.dataset_paths:
77
- if not self.data_store.file_exists(path):
78
- raise FileNotFoundError(f"Dataset not found at {path}")
79
-
80
- # Load first dataset to get reference properties
81
- with self.data_store.open(self.dataset_paths[0], "rb") as f:
204
+ with self.data_store.open(str(self.dataset_paths[0]), "rb") as f:
82
205
  with rasterio.MemoryFile(f.read()) as memfile:
83
206
  with memfile.open() as ref_src:
84
207
  ref_count = ref_src.count
@@ -87,9 +210,8 @@ class TifProcessor:
87
210
  ref_transform = ref_src.transform
88
211
  ref_nodata = ref_src.nodata
89
212
 
90
- # Validate all other datasets against reference
91
213
  for i, path in enumerate(self.dataset_paths[1:], 1):
92
- with self.data_store.open(path, "rb") as f:
214
+ with self.data_store.open(str(path), "rb") as f:
93
215
  with rasterio.MemoryFile(f.read()) as memfile:
94
216
  with memfile.open() as src:
95
217
  if src.count != ref_count:
@@ -100,9 +222,10 @@ class TifProcessor:
100
222
  raise ValueError(
101
223
  f"Dataset {i} has dtype {src.dtypes[0]}, expected {ref_dtype}"
102
224
  )
103
- if self.target_crs is None and src.crs != ref_crs:
104
- raise ValueError(
105
- f"Dataset {i} has CRS {src.crs}, expected {ref_crs}. Consider setting target_crs parameter."
225
+ if not self.target_crs and src.crs != ref_crs:
226
+ self.logger.warning(
227
+ f"Dataset {i} has CRS {src.crs}, expected {ref_crs}. "
228
+ "Consider setting target_crs parameter for reprojection before merging."
106
229
  )
107
230
  if self.target_crs is None and not self._transforms_compatible(
108
231
  src.transform, ref_transform
@@ -115,6 +238,46 @@ class TifProcessor:
115
238
  f"Dataset {i} has different nodata value: {src.nodata} vs {ref_nodata}"
116
239
  )
117
240
 
241
def _get_reprojection_profile(
    self,
    src: rasterio.DatasetReader,
    target_crs: str,
    resolution: Optional[Tuple[float, float]],
    compression: str = "lzw",
):
    """
    Build the rasterio profile for writing ``src`` reprojected to ``target_crs``.

    Args:
        src: Open source dataset.
        target_crs: CRS of the output raster.
        resolution: Target pixel size. When falsy, no resolution is passed
            and the default transform calculation decides the pixel size.
        compression: Value stored under the profile's ``compress`` key.

    Returns:
        A copy of ``src.profile`` with ``crs``, ``transform``, ``width``,
        ``height`` and ``compress`` overridden.
    """
    transform_kwargs = {}
    if resolution:
        src_res = (abs(src.transform.a), abs(src.transform.e))
        self.logger.info(
            f"Using target resolution: {resolution}. Source resolution: {src_res}."
        )
        # Only forward the resolution when the caller actually supplied one.
        transform_kwargs["resolution"] = resolution

    dst_transform, width, height = calculate_default_transform(
        src.crs,
        target_crs,
        src.width,
        src.height,
        *src.bounds,
        **transform_kwargs,
    )

    profile = src.profile.copy()
    profile.update(
        crs=target_crs,
        transform=dst_transform,
        width=width,
        height=height,
        compress=compression,  # compress the output to save space
    )
    return profile
280
+
118
281
  def _transforms_compatible(self, transform1, transform2, tolerance=1e-6):
119
282
  """Check if two transforms have compatible pixel sizes."""
120
283
  return (
@@ -126,151 +289,77 @@ class TifProcessor:
126
289
  """Merge multiple rasters into a single raster."""
127
290
  self.logger.info(f"Merging {len(self.dataset_paths)} rasters...")
128
291
 
129
- # Create temporary directory for merged file
130
- self._temp_dir = tempfile.mkdtemp()
131
- merged_filename = "merged_raster.tif"
132
- self._merged_file_path = os.path.join(self._temp_dir, merged_filename)
133
-
134
292
  # Open all datasets and handle reprojection if needed
135
- src_files = []
136
- reprojected_files = []
137
-
293
+ datasets_to_merge = []
294
+ temp_reprojected_files = []
138
295
  try:
139
296
  for path in self.dataset_paths:
140
- with self.data_store.open(path, "rb") as f:
141
- # Create temporary file for each dataset
142
- temp_file = tempfile.NamedTemporaryFile(suffix=".tif", delete=False)
143
- temp_file.write(f.read())
144
- temp_file.close()
145
- src_files.append(rasterio.open(temp_file.name))
146
-
147
- # Handle reprojection if target_crs is specified
148
- if self.target_crs:
149
- self.logger.info(f"Reprojecting rasters to {self.target_crs}...")
150
- processed_files = self._reproject_rasters(src_files, self.target_crs)
151
- reprojected_files = processed_files
152
- else:
153
- processed_files = src_files
297
+ with self.data_store.open(str(path), "rb") as f:
298
+ with rasterio.MemoryFile(f.read()) as memfile:
299
+ with memfile.open() as src:
300
+ if self.target_crs and src.crs != self.target_crs:
301
+ self.logger.info(
302
+ f"Reprojecting {path.name} to {self.target_crs} before merging."
303
+ )
304
+ reprojected_path = self._reproject_to_temp_file(
305
+ src, self.target_crs
306
+ )
307
+ temp_reprojected_files.append(reprojected_path)
308
+ datasets_to_merge.append(
309
+ rasterio.open(reprojected_path)
310
+ )
311
+ else:
312
+ temp_path = os.path.join(
313
+ self._temp_dir,
314
+ f"temp_{path.stem}_{os.urandom(4).hex()}.tif",
315
+ )
316
+ temp_reprojected_files.append(temp_path)
317
+
318
+ profile = src.profile
319
+ with rasterio.open(temp_path, "w", **profile) as dst:
320
+ dst.write(src.read())
321
+ datasets_to_merge.append(rasterio.open(temp_path))
322
+
323
+ self._merged_file_path = os.path.join(self._temp_dir, "merged_raster.tif")
154
324
 
155
325
  if self.merge_method == "mean":
156
- # For mean, we need to handle it manually
157
- merged_array, merged_transform = self._merge_with_mean(src_files)
158
-
159
- # Use first source as reference for metadata
160
- ref_src = src_files[0]
161
- profile = ref_src.profile.copy()
162
- profile.update(
163
- {
164
- "height": merged_array.shape[-2],
165
- "width": merged_array.shape[-1],
166
- "transform": merged_transform,
167
- }
326
+ merged_array, merged_transform = self._merge_with_mean(
327
+ datasets_to_merge
168
328
  )
169
-
170
- # Write merged raster
171
- with rasterio.open(self._merged_file_path, "w", **profile) as dst:
172
- dst.write(merged_array)
173
-
174
329
  else:
175
- # Use rasterio's merge function
176
330
  merged_array, merged_transform = merge(
177
- src_files,
331
+ datasets_to_merge,
178
332
  method=self.merge_method,
179
333
  resampling=self.resampling_method,
180
334
  )
181
335
 
182
- # Use first source as reference for metadata
183
- ref_src = src_files[0]
184
- profile = ref_src.profile.copy()
185
- profile.update(
186
- {
187
- "height": merged_array.shape[-2],
188
- "width": merged_array.shape[-1],
189
- "transform": merged_transform,
190
- }
191
- )
192
-
193
- if self.target_crs:
194
- profile["crs"] = self.target_crs
195
-
196
- # Write merged raster
197
- with rasterio.open(self._merged_file_path, "w", **profile) as dst:
198
- dst.write(merged_array)
199
-
200
- finally:
201
- # Clean up source files
202
- for src in src_files:
203
- temp_path = src.name
204
- src.close()
205
- try:
206
- os.unlink(temp_path)
207
- except:
208
- pass
209
-
210
- # Clean up reprojected files
211
- for src in reprojected_files:
212
- if src not in src_files: # Don't double-close
213
- temp_path = src.name
214
- src.close()
215
- try:
216
- os.unlink(temp_path)
217
- except:
218
- pass
219
-
220
- self.logger.info("Raster merging completed!")
221
-
222
- def _reproject_rasters(self, src_files, target_crs):
223
- """Reproject all rasters to a common CRS before merging."""
224
- reprojected_files = []
225
-
226
- for i, src in enumerate(src_files):
227
- if src.crs.to_string() == target_crs:
228
- # No reprojection needed
229
- reprojected_files.append(src)
230
- continue
231
-
232
- # Calculate transform and dimensions for reprojection
233
- transform, width, height = calculate_default_transform(
234
- src.crs,
235
- target_crs,
236
- src.width,
237
- src.height,
238
- *src.bounds,
239
- resolution=self.resolution if hasattr(self, "resolution") else None,
240
- )
241
-
242
- # Create temporary file for reprojected raster
243
- temp_file = tempfile.NamedTemporaryFile(suffix=".tif", delete=False)
244
- temp_file.close()
245
-
246
- # Set up profile for reprojected raster
247
- profile = src.profile.copy()
336
+ # Get profile from the first file in the list (all should be compatible now)
337
+ ref_src = datasets_to_merge[0]
338
+ profile = ref_src.profile.copy()
248
339
  profile.update(
249
340
  {
250
- "crs": target_crs,
251
- "transform": transform,
252
- "width": width,
253
- "height": height,
341
+ "height": merged_array.shape[-2],
342
+ "width": merged_array.shape[-1],
343
+ "transform": merged_transform,
344
+ "crs": self.target_crs if self.target_crs else ref_src.crs,
254
345
  }
255
346
  )
256
347
 
257
- # Reproject and write to temporary file
258
- with rasterio.open(temp_file.name, "w", **profile) as dst:
259
- for band_idx in range(1, src.count + 1):
260
- reproject(
261
- source=rasterio.band(src, band_idx),
262
- destination=rasterio.band(dst, band_idx),
263
- src_transform=src.transform,
264
- src_crs=src.crs,
265
- dst_transform=transform,
266
- dst_crs=target_crs,
267
- resampling=self.resampling_method,
268
- )
348
+ with rasterio.open(self._merged_file_path, "w", **profile) as dst:
349
+ dst.write(merged_array)
350
+ finally:
351
+ for dataset in datasets_to_merge:
352
+ if hasattr(dataset, "close"):
353
+ dataset.close()
269
354
 
270
- # Open reprojected file
271
- reprojected_files.append(rasterio.open(temp_file.name))
355
+ # Clean up temporary files immediately
356
+ for temp_file in temp_reprojected_files:
357
+ try:
358
+ os.remove(temp_file)
359
+ except OSError:
360
+ pass
272
361
 
273
- return reprojected_files
362
+ self.logger.info("Raster merging completed!")
274
363
 
275
364
  def _merge_with_mean(self, src_files):
276
365
  """Merge rasters using mean aggregation."""
@@ -295,6 +384,12 @@ class TifProcessor:
295
384
  bounds.left, bounds.bottom, bounds.right, bounds.top, width, height
296
385
  )
297
386
 
387
+ estimated_memory = height * width * src_files[0].count * 8 # float64
388
+ if estimated_memory > 1e9: # 1GB threshold
389
+ self.logger.warning(
390
+ f"Large memory usage expected: {estimated_memory/1e9:.1f}GB"
391
+ )
392
+
298
393
  # Initialize arrays for sum and count
299
394
  sum_array = np.zeros((src_files[0].count, height, width), dtype=np.float64)
300
395
  count_array = np.zeros((height, width), dtype=np.int32)
@@ -336,33 +431,9 @@ class TifProcessor:
336
431
 
337
432
  return mean_array.astype(src_files[0].dtypes[0]), merged_transform
338
433
 
339
- def __del__(self):
340
- """Cleanup temporary files."""
341
- if self._temp_dir and os.path.exists(self._temp_dir):
342
- try:
343
- import shutil
344
-
345
- shutil.rmtree(self._temp_dir)
346
- except:
347
- pass
348
-
349
- @contextmanager
350
- def open_dataset(self):
351
- """Context manager for accessing the dataset"""
352
- if self._merged_file_path:
353
- # Open merged file directly
354
- with rasterio.open(self._merged_file_path) as src:
355
- yield src
356
- else:
357
- # Original single file logic
358
- with self.data_store.open(self.dataset_path, "rb") as f:
359
- with rasterio.MemoryFile(f.read()) as memfile:
360
- with memfile.open() as src:
361
- yield src
362
-
363
434
  def _load_metadata(self):
364
435
  """Load metadata from the TIF file if not already cached"""
365
- if not self._cache:
436
+ try:
366
437
  with self.open_dataset() as src:
367
438
  self._cache["transform"] = src.transform
368
439
  self._cache["crs"] = src.crs.to_string()
@@ -375,6 +446,10 @@ class TifProcessor:
375
446
  self._cache["nodata"] = src.nodata
376
447
  self._cache["count"] = src.count
377
448
  self._cache["dtype"] = src.dtypes[0]
449
+ except (rasterio.errors.RasterioIOError, FileNotFoundError) as e:
450
+ raise FileNotFoundError(f"Could not read raster metadata: {e}")
451
+ except Exception as e:
452
+ raise RuntimeError(f"Unexpected error loading metadata: {e}")
378
453
 
379
454
  @property
380
455
  def is_merged(self) -> bool:
@@ -386,7 +461,6 @@ class TifProcessor:
386
461
  """Get the number of source rasters."""
387
462
  return len(self.dataset_paths)
388
463
 
389
- # All other methods remain the same...
390
464
  @property
391
465
  def transform(self):
392
466
  """Get the transform from the TIF file"""
@@ -428,39 +502,17 @@ class TifProcessor:
428
502
  return self._cache["nodata"]
429
503
 
430
504
  @property
431
- def tabular(self) -> pd.DataFrame:
432
- """Get the data from the TIF file"""
433
- self.logger.warning(
434
- "The `tabular` property is deprecated, use `to_dataframe` instead"
435
- )
436
- if not hasattr(self, "_tabular"):
437
- try:
438
- if self.mode == "single":
439
- self._tabular = self._to_band_dataframe(
440
- drop_nodata=True, drop_values=[]
441
- )
442
- elif self.mode == "rgb":
443
- self._tabular = self._to_rgb_dataframe(drop_nodata=True)
444
- elif self.mode == "rgba":
445
- self._tabular = self._to_rgba_dataframe(drop_transparent=True)
446
- elif self.mode == "multi":
447
- self._tabular = self._to_multi_band_dataframe(
448
- drop_nodata=True,
449
- drop_values=[],
450
- band_names=None, # Use default band naming
451
- )
452
- else:
453
- raise ValueError(
454
- f"Invalid mode: {self.mode}. Must be one of: single, rgb, rgba, multi"
455
- )
456
- except Exception as e:
457
- raise ValueError(
458
- f"Failed to process TIF file in mode '{self.mode}'. "
459
- f"Please ensure the file is valid and matches the selected mode. "
460
- f"Original error: {str(e)}"
461
- )
505
def dtype(self):
    """Return the cached ``dtype`` entry, or an empty list if it is absent."""
    cache = self._cache
    return cache["dtype"] if "dtype" in cache else []

@property
def width(self):
    """Return the raster width stored in the metadata cache."""
    return self._cache["width"]

@property
def height(self):
    """Return the raster height stored in the metadata cache."""
    return self._cache["height"]
464
516
 
465
517
  def to_dataframe(self, drop_nodata=True, **kwargs) -> pd.DataFrame:
466
518
  try:
@@ -504,30 +556,115 @@ class TifProcessor:
504
556
 
505
557
  return gdf
506
558
 
507
- def get_zoned_geodataframe(self) -> gpd.GeoDataFrame:
559
def to_graph(
    self,
    connectivity: Literal[4, 8] = 4,
    band: Optional[int] = None,
    include_coordinates: bool = False,
    graph_type: Literal["networkx", "sparse"] = "networkx",
    chunk_size: Optional[int] = None,
) -> "Union[nx.Graph, sp.csr_matrix]":
    """
    Convert raster to graph based on pixel adjacency.

    Every valid (non-nodata) pixel of the selected band becomes a node,
    numbered sequentially in row-major order. Two valid pixels are joined
    by an edge when they are neighbours under the chosen connectivity. Each
    edge is recorded once, from the lower node id to the higher one, and
    its weight is the band value of the higher-id pixel.

    Args:
        connectivity: 4 for the von Neumann neighbourhood; any other value
            selects the 8-neighbour Moore neighbourhood.
        band: 1-based band number; defaults to the first band.
        include_coordinates: When True, each node also carries ``x`` and
            ``y`` attributes taken from ``src.xy(row, col)`` (used by the
            networkx output only).
        graph_type: "networkx" returns an ``nx.Graph`` whose nodes carry a
            ``value`` attribute; anything else returns a symmetric
            ``scipy.sparse.csr_matrix`` adjacency matrix of edge weights.
        chunk_size: Reserved; any non-None value raises.

    Returns:
        The graph in the requested representation.

    Raises:
        NotImplementedError: If ``chunk_size`` is given.
        ValueError: If ``band`` is outside the raster's band range.
    """
    if chunk_size is not None:
        raise NotImplementedError("Chunked processing is not yet implemented.")

    with self.open_dataset() as src:
        band_idx = band - 1 if band is not None else 0
        if band_idx < 0 or band_idx >= src.count:
            raise ValueError(
                f"Band {band} not available. Raster has {src.count} bands"
            )

        data = src.read(band_idx + 1)
        nodata = src.nodata if src.nodata is not None else self.nodata
        if nodata is None:
            valid_mask = np.ones_like(data, dtype=bool)
        elif isinstance(nodata, (float, np.floating)) and np.isnan(nodata):
            # NaN never compares equal to itself, so `data != nodata` would
            # mark every pixel (including the NaN ones) as valid.
            valid_mask = ~np.isnan(data)
        else:
            valid_mask = data != nodata

        height, width = data.shape

        # Find all valid pixels
        valid_rows, valid_cols = np.where(valid_mask)
        num_valid_pixels = len(valid_rows)

        # Sequential mapping from (row, col) to node id; -1 marks invalid pixels.
        node_map = np.full(data.shape, -1, dtype=int)
        node_map[valid_rows, valid_cols] = np.arange(num_valid_pixels)

        # Define neighborhood offsets
        if connectivity == 4:
            # von Neumann neighborhood (4-connectivity)
            offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        else:
            # Moore neighborhood (8-connectivity)
            offsets = [
                (-1, -1),
                (-1, 0),
                (-1, 1),
                (0, -1),
                (0, 1),
                (1, -1),
                (1, 0),
                (1, 1),
            ]

        nodes_to_add = []
        # Edges are kept as three parallel lists so the sparse output can
        # build properly typed index and weight arrays.
        edge_sources = []
        edge_targets = []
        edge_weights = []

        for node_id in range(num_valid_pixels):
            # node_map assigns ids in np.where order, so the i-th valid
            # pixel has node id i.
            row, col = valid_rows[node_id], valid_cols[node_id]

            node_attrs = {"value": float(data[row, col])}
            if include_coordinates:
                x, y = src.xy(row, col)
                node_attrs["x"] = x
                node_attrs["y"] = y
            nodes_to_add.append((node_id, node_attrs))

            for dy, dx in offsets:
                neighbor_row, neighbor_col = row + dy, col + dx

                # Skip neighbours that fall off the raster or are nodata.
                if not (
                    0 <= neighbor_row < height
                    and 0 <= neighbor_col < width
                    and valid_mask[neighbor_row, neighbor_col]
                ):
                    continue

                neighbor_id = int(node_map[neighbor_row, neighbor_col])

                # Ensure each edge is added only once
                if node_id < neighbor_id:
                    edge_sources.append(node_id)
                    edge_targets.append(neighbor_id)
                    edge_weights.append(float(data[neighbor_row, neighbor_col]))

    if graph_type == "networkx":
        G = nx.Graph()
        G.add_nodes_from(nodes_to_add)
        G.add_weighted_edges_from(zip(edge_sources, edge_targets, edge_weights))
        return G

    # Sparse adjacency matrix. Explicit dtypes keep the index arrays
    # integral and make the empty-edge case well defined.
    rows = np.asarray(edge_sources, dtype=np.int64)
    cols = np.asarray(edge_targets, dtype=np.int64)
    weights = np.asarray(edge_weights, dtype=np.float64)

    # Mirror every edge so the matrix is symmetric.
    return sp.coo_matrix(
        (
            np.concatenate([weights, weights]),
            (np.concatenate([rows, cols]), np.concatenate([cols, rows])),
        ),
        shape=(num_valid_pixels, num_valid_pixels),
    ).tocsr()
531
668
 
532
669
  def sample_by_coordinates(
533
670
  self, coordinate_list: List[Tuple[float, float]], **kwargs
@@ -701,10 +838,10 @@ class TifProcessor:
701
838
  Opens the raster dataset and stores it in a process-local variable.
702
839
  This function runs once per worker, not for every task.
703
840
  """
704
- global src_handle
705
- with self.data_store.open(self.dataset_path, "rb") as f:
706
- with rasterio.MemoryFile(f.read()) as memfile:
707
- src_handle = memfile.open()
841
+ global src_handle, memfile_handle
842
+ with self.data_store.open(str(self.dataset_path), "rb") as f:
843
+ memfile_handle = rasterio.MemoryFile(f.read())
844
+ src_handle = memfile_handle.open()
708
845
 
709
846
  def _process_single_polygon(self, polygon, stat_func):
710
847
  """
@@ -991,6 +1128,25 @@ class TifProcessor:
991
1128
 
992
1129
  return self._cache["pixel_coords"]
993
1130
 
1131
def __enter__(self):
    """Enter the context manager and hand back this processor."""
    return self

def __del__(self):
    """Remove the temporary directory on garbage collection, ignoring errors."""
    if not hasattr(self, "_temp_dir"):
        return
    if os.path.exists(self._temp_dir):
        shutil.rmtree(self._temp_dir, ignore_errors=True)

def cleanup(self):
    """Delete the temporary directory if it still exists and log the removal."""
    if not hasattr(self, "_temp_dir"):
        return
    if not os.path.exists(self._temp_dir):
        return
    shutil.rmtree(self._temp_dir)
    self.logger.info("Cleaned up temporary files")

def __exit__(self, exc_type, exc_value, traceback):
    """Run cleanup on leaving the ``with`` block; exceptions are not suppressed."""
    self.cleanup()
    return False
1149
+
994
1150
 
995
1151
  def sample_multiple_tifs_by_coordinates(
996
1152
  tif_processors: List[TifProcessor], coordinate_list: List[Tuple[float, float]]