py2ls 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release.


This version of py2ls might be problematic.

Files changed (72)
  1. py2ls/.DS_Store +0 -0
  2. py2ls/.git/.DS_Store +0 -0
  3. py2ls/.git/index +0 -0
  4. py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
  5. py2ls/.git/objects/.DS_Store +0 -0
  6. py2ls/.git/refs/.DS_Store +0 -0
  7. py2ls/ImageLoader.py +621 -0
  8. py2ls/__init__.py +7 -5
  9. py2ls/apptainer2ls.py +3940 -0
  10. py2ls/batman.py +164 -42
  11. py2ls/bio.py +2595 -0
  12. py2ls/cell_image_clf.py +1632 -0
  13. py2ls/container2ls.py +4635 -0
  14. py2ls/corr.py +475 -0
  15. py2ls/data/.DS_Store +0 -0
  16. py2ls/data/email/email_html_template.html +88 -0
  17. py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
  18. py2ls/data/hyper_param_tabrepo_2024.py +1753 -0
  19. py2ls/data/mygenes_fields_241022.txt +355 -0
  20. py2ls/data/re_common_pattern.json +173 -0
  21. py2ls/data/sns_info.json +74 -0
  22. py2ls/data/styles/.DS_Store +0 -0
  23. py2ls/data/styles/example/.DS_Store +0 -0
  24. py2ls/data/styles/stylelib/.DS_Store +0 -0
  25. py2ls/data/styles/stylelib/grid.mplstyle +15 -0
  26. py2ls/data/styles/stylelib/high-contrast.mplstyle +6 -0
  27. py2ls/data/styles/stylelib/high-vis.mplstyle +4 -0
  28. py2ls/data/styles/stylelib/ieee.mplstyle +15 -0
  29. py2ls/data/styles/stylelib/light.mplstyl +6 -0
  30. py2ls/data/styles/stylelib/muted.mplstyle +6 -0
  31. py2ls/data/styles/stylelib/nature-reviews-latex.mplstyle +616 -0
  32. py2ls/data/styles/stylelib/nature-reviews.mplstyle +616 -0
  33. py2ls/data/styles/stylelib/nature.mplstyle +31 -0
  34. py2ls/data/styles/stylelib/no-latex.mplstyle +10 -0
  35. py2ls/data/styles/stylelib/notebook.mplstyle +36 -0
  36. py2ls/data/styles/stylelib/paper.mplstyle +290 -0
  37. py2ls/data/styles/stylelib/paper2.mplstyle +305 -0
  38. py2ls/data/styles/stylelib/retro.mplstyle +4 -0
  39. py2ls/data/styles/stylelib/sans.mplstyle +10 -0
  40. py2ls/data/styles/stylelib/scatter.mplstyle +7 -0
  41. py2ls/data/styles/stylelib/science.mplstyle +48 -0
  42. py2ls/data/styles/stylelib/std-colors.mplstyle +4 -0
  43. py2ls/data/styles/stylelib/vibrant.mplstyle +6 -0
  44. py2ls/data/tiles.csv +146 -0
  45. py2ls/data/usages_pd.json +1417 -0
  46. py2ls/data/usages_sns.json +31 -0
  47. py2ls/docker2ls.py +5446 -0
  48. py2ls/ec2ls.py +61 -0
  49. py2ls/fetch_update.py +145 -0
  50. py2ls/ich2ls.py +1955 -296
  51. py2ls/im2.py +8242 -0
  52. py2ls/image_ml2ls.py +2100 -0
  53. py2ls/ips.py +33909 -3418
  54. py2ls/ml2ls.py +7700 -0
  55. py2ls/mol.py +289 -0
  56. py2ls/mount2ls.py +1307 -0
  57. py2ls/netfinder.py +873 -351
  58. py2ls/nl2ls.py +283 -0
  59. py2ls/ocr.py +1581 -458
  60. py2ls/plot.py +10394 -314
  61. py2ls/rna2ls.py +311 -0
  62. py2ls/ssh2ls.md +456 -0
  63. py2ls/ssh2ls.py +5933 -0
  64. py2ls/ssh2ls_v01.py +2204 -0
  65. py2ls/stats.py +66 -172
  66. py2ls/temp20251124.py +509 -0
  67. py2ls/translator.py +2 -0
  68. py2ls/utils/decorators.py +3564 -0
  69. py2ls/utils_bio.py +3453 -0
  70. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/METADATA +113 -224
  71. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/RECORD +72 -16
  72. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/WHEEL +0 -0
py2ls/.DS_Store CHANGED
Binary file
py2ls/.git/.DS_Store ADDED
Binary file
py2ls/.git/index CHANGED
Binary file
py2ls/.git/logs/refs/remotes/origin/HEAD CHANGED
@@ -138,3 +138,4 @@
  a15389729850729fc7bd78a54f26fce77f30be12 a15389729850729fc7bd78a54f26fce77f30be12 Jianfeng Liu <macjianfeng@jflmbp.speedport.ip> 1723527981 +0200 remote set-head
  6dc2cdf4a84e538e5d4777486aeff87e42f41799 6dc2cdf4a84e538e5d4777486aeff87e42f41799 Jianfeng Liu <macjianfeng@jflmbp.speedport.ip> 1723527990 +0200 remote set-head
  86e288b46f8fe179907e4413f665aeb5053fddb1 86e288b46f8fe179907e4413f665aeb5053fddb1 Jianfeng Liu <macjianfeng@JFLMBP.cin.medizin.uni-tuebingen.de> 1725537218 +0200 remote set-head
+ 86e288b46f8fe179907e4413f665aeb5053fddb1 86e288b46f8fe179907e4413f665aeb5053fddb1 Jianfeng Liu <macjianfeng@JFLMBP.cin.medizin.uni-tuebingen.de> 1734612687 +0100 remote set-head
Binary file
Binary file
py2ls/ImageLoader.py ADDED
@@ -0,0 +1,621 @@
+ # 250604_204034:
+ import os
+ import numpy as np
+ import pandas as pd
+ import cv2
+ import hashlib
+ import warnings
+ import functools
+ from typing import Union, Optional, Dict, Tuple, List
+ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
+ from tqdm import tqdm
+ from PIL import Image, UnidentifiedImageError
+ from sklearn.preprocessing import LabelEncoder
+ from tensorflow.keras.preprocessing.image import ImageDataGenerator
+ from tensorflow.keras.utils import to_categorical
+ import logging
+
+ """
+ -------------------------------------------------------------------------------
+ Image Preprocessing
+ This file provides a comprehensive image preprocessing pipeline with the following key features:
+
+ Core Functionality:
+ Image loading and processing (resizing, normalization, CLAHE enhancement)
+ Parallel processing using ThreadPoolExecutor/ProcessPoolExecutor
+ Chunk-based processing for memory efficiency
+ Caching mechanism to store processed images
+
+ Key Components:
+ _apply_clahe(): Contrast Limited Adaptive Histogram Equalization
+ _process_single_image(): Handles individual image processing
+ ImageLoader class: Main preprocessing pipeline
+
+ Use Cases:
+ When you need to preprocess large image datasets efficiently
+ When memory management is important (chunk-based processing)
+ When you want to cache preprocessed images for future use
+ When you need parallel processing for faster preprocessing
+
+ You only need image preprocessing without ML
+ You're working with very large datasets that need chunking
+ You want to cache preprocessed images for future use
+ You need efficient parallel processing of images
+ Your focus is on image enhancement/normalization
+ -------------------------------------------------------------------------------
+ """
+ def _apply_clahe(
+     img: np.ndarray,
+     clip_limit: float = 2.0,
+     tile_grid_size: Tuple[int, int] = (8, 8),
+ ) -> np.ndarray:
+     """Apply Contrast Limited Adaptive Histogram Equalization (CLAHE).
+
+     Input image should be uint8 or float32 scaled 0-1.
+     Returns uint8 image after CLAHE.
+     """
+     if img.dtype != np.uint8:
+         img = (img * 255).astype(np.uint8)
+
+     if len(img.shape) == 2 or (len(img.shape) == 3 and img.shape[2] == 1):
+         clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
+         return clahe.apply(img)
+     else:
+         lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)
+         l, a, b = cv2.split(lab)
+         clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
+         l = clahe.apply(l)
+         lab = cv2.merge((l, a, b))
+         return cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)
+
+ def _process_single_image(
+     path: str, target_size: Tuple[int, int], grayscale: bool, scaler: str
+ ) -> Optional[np.ndarray]:
+     """Process a single image file with error handling."""
+     try:
+         with Image.open(path) as img:
+             if grayscale:
+                 img = img.convert("L")
+             else:
+                 img = img.convert("RGB")
+
+             img = img.resize(target_size[::-1]) # PIL uses (width, height)
+             img_array = np.array(img)
+
+             if scaler == "normalize":
+                 img_array = img_array.astype(np.float32) / 255.0
+             elif scaler == "standardize":
+                 img_array = img_array.astype(np.float32)
+                 mean, std = img_array.mean(), img_array.std()
+                 img_array = (img_array - mean) / std if std > 0 else (img_array - mean)
+             elif scaler == "clahe":
+                 img_array = _apply_clahe(img_array).astype(np.float32) / 255.0
+
+             return img_array
+     except (OSError, UnidentifiedImageError, ValueError, TypeError) as e:
+         logging.warning(f"Failed to process image {path}: {e}")
+         return None
+
+
+ class ImageLoader:
+     """
+     A scalable image preprocessing pipeline that can handle datasets of any size
+     with efficient memory usage and parallel processing capabilities.
+
+     # Usage:
+     preprocessor = ImageLoader(
+         target_size=(32, 32),
+         chunk_size=2001, # Process 5,000 images at a time
+         cache_dir="./big_dataset_cache",
+         backend="threading",
+         n_jobs=8,
+         grayscale=True,
+     )
+
+     # This will process in chunks and cache results
+     result_train = preprocessor.process(
+         df_train,
+         x_col="path",
+         y_col="Label",
+         output="df",
+         cache=True,
+     )
+     result_test = preprocessor.process(
+         df_train,
+         x_col="path",
+         y_col="Label",
+         output="df",
+         cache=True,
+     )
+     """
+
+     def __init__(
+         self,
+         target_size: Tuple[int, int] = (128, 128),
+         grayscale: bool = False,
+         scaler: str = "normalize",
+         n_jobs: int = min(8, os.cpu_count()),
+         cache_dir: str = "./preprocessing_cache",
+         chunk_size: int = 1000,
+         backend: str = "threading",
+         verbose: bool = True,
+     ):
+         """
+         Initialize the preprocessor with processing parameters.
+
+         Args:
+             target_size: Target dimensions (height, width) for resizing
+             grayscale: Convert to grayscale if True
+             scaler: Preprocessing method ('normalize', 'standardize', 'clahe', 'raw')
+             n_jobs: Number of parallel workers
+             cache_dir: Directory for caching processed data
+             chunk_size: Number of images to process at once
+             backend: Parallel processing backend ('threading' or 'multiprocessing')
+             verbose: Print progress information
+         """
+         self.target_size = target_size
+         self.grayscale = grayscale
+         self.scaler = scaler
+         self.n_jobs = min(n_jobs, os.cpu_count() or 1)
+         self.cache_dir = cache_dir
+         self.chunk_size = chunk_size
+         self.backend = backend
+         self.verbose = verbose
+         str_usage="""
+         preprocessor = ImageLoader(
+             target_size=(32, 32),
+             chunk_size=2001, # Process 5,000 images at a time
+             cache_dir="./big_dataset_cache",
+             backend="threading",
+             n_jobs=8,
+             grayscale=True,
+         )
+
+         # This will process in chunks and cache results
+         result_train = preprocessor.process(
+             df_train,
+             x_col="path",
+             y_col="Label",
+             output="df",
+             cache=True,
+         )
+         result_test = preprocessor.process(
+             df_test,
+             x_col="path",
+             y_col="Label",
+             output="df",
+             cache=True,
+         )
+         # Sample the same 1000 rows from the training set
+         sampled_train = result_train.sample(100)
+
+         x_train = sampled_train.drop(columns=["label"])
+         y_train = sampled_train["label"]
+
+         # Sample 1000 rows from the test set, and align
+         sampled_test = result_test.sample(100)
+
+         x_true = sampled_test.drop(columns=["label"])
+         y_true = sampled_test["label"]
+
+         # Run prediction
+         res_pred_stack = ml2ls.predict(
+             x_train=x_train,
+             y_train=y_train,
+             x_true=x_true,
+             y_true=y_true,
+             # cls="light", # or "light", etc.
+             voting=False,
+         )
+         """
+         if self.verbose:
+             print(str_usage)
+         # Create cache directory if needed
+         os.makedirs(self.cache_dir, exist_ok=True)
+
+     def _parallel_process(self, paths: List[str]) -> np.ndarray:
+         """
+         Process images in parallel using the configured backend.
+
+         Args:
+             paths: List of image paths to process
+
+         Returns:
+             Stacked array of processed images
+         """
+         # Create a partial function with fixed parameters
+         worker = functools.partial(
+             _process_single_image,
+             target_size=self.target_size,
+             grayscale=self.grayscale,
+             scaler=self.scaler,
+         )
+
+         if self.backend == "threading":
+             with ThreadPoolExecutor(max_workers=self.n_jobs) as pool:
+                 futures = [pool.submit(worker, path) for path in paths]
+                 results = []
+                 for future in tqdm(
+                     as_completed(futures), total=len(futures), disable=not self.verbose
+                 ):
+                     try:
+                         result = future.result()
+                         if result is not None:
+                             results.append(result)
+                     except Exception as e:
+                         if self.verbose:
+                             warnings.warn(f"Image processing failed: {str(e)}")
+         else:
+             # Use smaller chunks for multiprocessing to reduce overhead
+             chunk_size = max(1, len(paths) // (self.n_jobs * 2))
+             with ProcessPoolExecutor(max_workers=self.n_jobs) as pool:
+                 futures = []
+                 for i in range(0, len(paths), chunk_size):
+                     chunk = paths[i : i + chunk_size]
+                     futures.append(pool.submit(self._process_chunk, chunk))
+
+                 results = []
+                 for future in tqdm(
+                     as_completed(futures), total=len(futures), disable=not self.verbose
+                 ):
+                     try:
+                         chunk_results = future.result()
+                         results.extend(chunk_results)
+                     except Exception as e:
+                         if self.verbose:
+                             warnings.warn(f"Chunk processing failed: {str(e)}")
+
+         # Filter out failed images
+         valid_results = [res for res in results if res is not None]
+         if valid_results:
+             # Handle different image dimensions
+             try:
+                 return np.stack(valid_results)
+             except ValueError as e:
+                 if "must have the same shape" in str(e):
+                     # Handle variable channel issue
+                     return np.array(valid_results, dtype=object)
+                 raise
+         return np.array([])
+
+     def _process_chunk(self, paths: List[str]) -> List[np.ndarray]:
+         """Process a chunk of images (used for multiprocessing)."""
+         results = []
+         for path in paths:
+             try:
+                 img = _process_single_image(
+                     path, self.target_size, self.grayscale, self.scaler
+                 )
+                 if img is not None:
+                     results.append(img)
+             except Exception:
+                 continue
+         return results
+
+     def _get_cache_filename(self, data_hash: str) -> str:
+         """
+         Generate a unique cache filename based on processing parameters.
+
+         Args:
+             data_hash: Hash of the input data
+
+         Returns:
+             Full path to cache file
+         """
+         params = {
+             "target_size": self.target_size,
+             "grayscale": self.grayscale,
+             "scaler": self.scaler,
+             "chunk_size": self.chunk_size,
+         }
+         param_hash = hashlib.md5(str(params).encode()).hexdigest()
+         return os.path.join(self.cache_dir, f"img_cache_{data_hash}_{param_hash}.npz")
+
+     @staticmethod
+     def _load_from_cache(cache_file: str) -> Optional[Tuple[np.ndarray, np.ndarray]]:
+         """
+         Load processed data from cache.
+
+         Args:
+             cache_file: Path to cache file
+
+         Returns:
+             Tuple of (images, labels) or None if cache is invalid
+         """
+         try:
+             with np.load(cache_file) as data:
+                 return data["images"], data["labels"]
+         except Exception:
+             return None
+
+     @staticmethod
+     def _save_to_cache(cache_file: str, images: np.ndarray, labels: np.ndarray):
+         """
+         Save processed data to cache.
+
+         Args:
+             cache_file: Path to cache file
+             images: Processed images array
+             labels: Corresponding labels array
+         """
+         np.savez_compressed(cache_file, images=images, labels=labels)
+
+     def _process_labels(
+         self,
+         data: pd.DataFrame,
+         y_col: Optional[str],
+         encoder: str,
+         label_encoder: Optional[LabelEncoder] = None,
+     ) -> Optional[np.ndarray]:
+         """
+         Process and encode labels according to specified method.
+
+         Args:
+             data: Input DataFrame
+             y_col: Name of column containing labels
+             encoder: Encoding method ('label', 'onehot', 'binary', None)
+             label_encoder: Pre-fitted LabelEncoder (optional)
+
+         Returns:
+             Array of processed labels or None
+         """
+         if y_col is None or encoder is None:
+             return None
+
+         labels = data[y_col].values
+
+         if encoder == "binary":
+             unique_labels = np.unique(labels)
+             if len(unique_labels) != 2:
+                 raise ValueError("Binary encoding requires exactly 2 classes")
+             return (labels == unique_labels[0]).astype(int)
+         elif encoder == "onehot":
+             if label_encoder is None:
+                 label_encoder = LabelEncoder()
+                 labels = label_encoder.fit_transform(labels)
+             else:
+                 labels = label_encoder.transform(labels)
+             return to_categorical(labels)
+         elif encoder == "label":
+             if label_encoder is None:
+                 label_encoder = LabelEncoder()
+                 labels = label_encoder.fit_transform(labels)
+             else:
+                 labels = label_encoder.transform(labels)
+             return labels
+
+         return labels
+
+     def _format_output(
+         self, images: np.ndarray, labels: Optional[np.ndarray], output: str
+     ) -> Union[ImageDataGenerator, Tuple[np.ndarray, np.ndarray], pd.DataFrame]:
+         """
+         Format the processed data according to requested output type.
+
+         Args:
+             images: Processed images array
+             labels: Processed labels array
+             output: Requested output type ('generator', 'array', 'dataframe')
+
+         Returns:
+             Processed data in requested format
+         """
+         if output == "generator":
+             # Create a memory-efficient generator
+             def generator():
+                 for i in range(0, len(images), self.chunk_size):
+                     batch_images = images[i : i + self.chunk_size]
+                     batch_labels = (
+                         labels[i : i + self.chunk_size] if labels is not None else None
+                     )
+                     yield (
+                         (batch_images, batch_labels)
+                         if batch_labels is not None
+                         else batch_images
+                     )
+
+             return generator()
+         elif output == "array":
+             return (images, labels) if labels is not None else images
+         else: # dataframe
+             # Handle variable image dimensions
+             if images.dtype == object:
+                 # Convert to uniform array
+                 images = np.array([img for img in images], dtype=np.float32)
+             if len(images) == 0:
+                 warnings.warn("No images to process; returning empty DataFrame.")
+                 return pd.DataFrame()
+             # Ensure image dimensions are known
+             if images.ndim == 4:
+                 n, h, w, c = images.shape
+             elif images.ndim == 3:
+                 n, h, w = images.shape
+                 c = 1
+                 images = images.reshape(n, h, w, c)
+             else:
+                 print(f"image type: {type(images)}")
+                 print(f"image shape: {images.shape}")
+                 print(f"image dtype: {images.dtype}")
+                 raise ValueError(f"Unexpected image shape: {images.shape}")
+
+             # Flatten image data
+             images_flat = images.reshape(len(images), -1)
+
+             # Create column names based on channels
+             if c == 3:
+                 col_names = (
+                     [f"pixel_{i}_r" for i in range(h * w)] +
+                     [f"pixel_{i}_g" for i in range(h * w)] +
+                     [f"pixel_{i}_b" for i in range(h * w)]
+                 )
+             elif c == 1:
+                 col_names = [f"pixel_{i}" for i in range(h * w)]
+             else:
+                 # fallback
+                 col_names = [f"pixel_{i}" for i in range(images_flat.shape[1])]
+
+             # Create DataFrame
+             df = pd.DataFrame(images_flat, columns=col_names)
+
+             # Append labels if present
+             if labels is not None:
+                 df["label"] = labels
+             print(f"dataframe shape: {df.shape}")
+             return df
+
+     def process(
+         self,
+         data: pd.DataFrame,
+         x_col: str,
+         y_col: Optional[str] = None,
+         encoder: str = "label",
+         label_encoder: Optional[LabelEncoder] = None,
+         output: str = "dataframe",
+         cache: bool = True,
+         max_samples: Optional[int] = None,
+         **kwargs,
+     ) -> Union[ImageDataGenerator, Tuple[np.ndarray, np.ndarray], pd.DataFrame]:
+         """
+         Main processing method that handles the entire pipeline.
+
+         Args:
+             data: Input DataFrame containing image paths and labels
+             x_col: Name of column containing image paths
+             y_col: Name of column containing labels (optional)
+             encoder: Label encoding method ('label', 'onehot', 'binary', None)
+             label_encoder: Pre-fitted LabelEncoder (optional)
+             output: Requested output format ('generator', 'array', 'dataframe')
+             cache: Whether to use disk caching
+             max_samples: Maximum number of samples to process
+             kwargs: Additional arguments for processing
+
+         Returns:
+             Processed data in requested format
+         """
+         # Validate inputs
+         if x_col not in data.columns:
+             raise ValueError(f"Column '{x_col}' not found in DataFrame")
+
+         if y_col is not None and y_col not in data.columns:
+             raise ValueError(f"Column '{y_col}' not found in DataFrame")
+
+         # Limit samples if requested
+         if max_samples is not None:
+             data = data.iloc[:max_samples]
+
+         # Generate data hash for caching
+         data_hash = hashlib.md5(pd.util.hash_pandas_object(data).values).hexdigest()
+         cache_file = self._get_cache_filename(data_hash) if cache else None
+
+         # Try loading from cache
+         if cache and cache_file and os.path.exists(cache_file):
+             if self.verbose:
+                 print("Loading from cache...")
+             cached_data = self._load_from_cache(cache_file)
+             if cached_data is not None:
+                 images, labels = cached_data
+                 return self._format_output(images, labels, output)
+
+         # Process labels first
+         labels = self._process_labels(data, y_col, encoder, label_encoder)
+
+         # Process images in chunks if dataset is large
+         total_images = len(data)
+         use_chunking = total_images > self.chunk_size * 2
+
+         if use_chunking and self.verbose:
+             print(f"Processing {total_images} images in chunks of {self.chunk_size}...")
+
+         # Process all images at once if not chunking
+         if not use_chunking:
+             images = self._parallel_process(data[x_col].values)
+         else:
+             # Process images chunk by chunk
+             images = []
+             for i in tqdm(
+                 range(0, total_images, self.chunk_size), disable=not self.verbose
+             ):
+                 chunk_paths = data[x_col].iloc[i : i + self.chunk_size].values
+                 chunk_images = self._parallel_process(chunk_paths)
+                 if len(chunk_images) > 0:
+                     images.append(chunk_images)
+
+             # Handle empty chunks
+             if images:
+                 try:
+                     images = np.concatenate(images)
+                 except ValueError:
+                     # Handle case with no images processed
+                     images = np.array([])
+             else:
+                 images = np.array([])
+
+         # Align labels with successfully processed images
+         if labels is not None and len(images) > 0:
+             labels = labels[: len(images)]
+
+         # Save to cache if requested
+         if cache and cache_file and len(images) > 0:
+             self._save_to_cache(cache_file, images, labels)
+
+         return self._format_output(images, labels, output)
+
+
+ def create_augmentation_generator(
+     data: pd.DataFrame,
+     x_col: str,
+     y_col: str,
+     target_size: Tuple[int, int] = (224, 224),
+     batch_size: int = 32,
+     class_mode: str = "raw",
+     augment_params: Optional[Dict] = None,
+     grayscale: bool = False,
+     shuffle: bool = True,
+     seed: Optional[int] = None,
+ ) -> ImageDataGenerator:
+     """
+     Create an augmented image data generator for training.
+
+     Args:
+         data: Input DataFrame
+         x_col: Column with image paths
+         y_col: Column with labels
+         target_size: Target image dimensions
+         batch_size: Images per batch
+         class_mode: Type of label output
+         augment_params: Dictionary of augmentation parameters
+         grayscale: Convert to grayscale
+         shuffle: Shuffle the data
+         seed: Random seed
+
+     Returns:
+         Configured ImageDataGenerator
+     """
+     default_augment = {
+         "rotation_range": 20,
+         "width_shift_range": 0.1,
+         "height_shift_range": 0.1,
+         "shear_range": 0.1,
+         "zoom_range": 0.1,
+         "horizontal_flip": True,
+         "vertical_flip": False,
+         "brightness_range": [0.9, 1.1],
+         "fill_mode": "reflect",
+     }
+
+     if augment_params:
+         default_augment.update(augment_params)
+
+     datagen = ImageDataGenerator(**default_augment)
+
+     return datagen.flow_from_dataframe(
+         dataframe=data,
+         x_col=x_col,
+         y_col=y_col,
+         target_size=target_size,
+         color_mode="grayscale" if grayscale else "rgb",
+         class_mode=class_mode,
+         batch_size=batch_size,
+         shuffle=shuffle,
+         seed=seed,
+     )
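
Reviewer note (not part of the diff): the new ImageLoader.py above centers on a chunked, cached, parallel preprocessing pipeline. The sketch below shows how it might be driven, based on the docstrings in the diff; the DataFrame, column names, and file paths are illustrative assumptions, and the module's TensorFlow, OpenCV, and scikit-learn dependencies must be installed.

# Illustrative sketch only, not part of the released package.
import pandas as pd
from py2ls.ImageLoader import ImageLoader

# Hypothetical inputs: image paths on disk plus their class labels.
df = pd.DataFrame({
    "path": ["images/img_001.png", "images/img_002.png"],
    "Label": ["cat", "dog"],
})

loader = ImageLoader(
    target_size=(64, 64),        # (height, width) after resizing
    grayscale=False,
    scaler="clahe",              # 'normalize', 'standardize', 'clahe', or 'raw'
    chunk_size=1000,             # images processed per chunk
    cache_dir="./preprocessing_cache",
    backend="threading",         # or "multiprocessing"
    n_jobs=4,
)

# Returns a DataFrame of flattened pixel columns plus a "label" column;
# results are cached under cache_dir and reused on the next call.
df_features = loader.process(
    df,
    x_col="path",
    y_col="Label",
    encoder="label",             # 'label', 'onehot', 'binary', or None
    output="dataframe",          # 'dataframe', 'array', or 'generator'
    cache=True,
)

Note that the class docstring passes output="df", which falls through to the same DataFrame branch of _format_output, since only "generator" and "array" are matched explicitly.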
py2ls/__init__.py CHANGED
@@ -1,8 +1,10 @@
  """
  __init__ of the pyos module
  """
- from .ips import *
- from .translator import *
- from .netfinder import *
- from .plot import *
- from .export_requirements import *
+ # try:
+ #     import tensorflow as tf
+ #     print("Eager execution enabled:", tf.executing_eagerly())
+ #     tf.config.set_visible_devices([], "GPU")
+ # except Exception as e:
+ #     print("Error importing tensorflow:", e)
+ from .ips import *
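
Reviewer note (not part of the diff): with the trimmed __init__, `import py2ls` now only star-imports the ips namespace; the translator, netfinder, plot, and export_requirements star imports are gone. Other modules shipped in this release would need to be imported explicitly, as in this hedged sketch (module names taken from the file list above; each module's own optional dependencies still apply):

# Illustrative only: explicit submodule imports under the trimmed __init__.
from py2ls import ips        # its names are also re-exported via `from .ips import *`
from py2ls import plot       # no longer star-imported at package level
from py2ls import netfinder  # import explicitly when needed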