py2ls 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of py2ls might be problematic. Click here for more details.
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/ImageLoader.py +621 -0
- py2ls/__init__.py +7 -5
- py2ls/apptainer2ls.py +3940 -0
- py2ls/batman.py +164 -42
- py2ls/bio.py +2595 -0
- py2ls/cell_image_clf.py +1632 -0
- py2ls/container2ls.py +4635 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/email/email_html_template.html +88 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/hyper_param_tabrepo_2024.py +1753 -0
- py2ls/data/mygenes_fields_241022.txt +355 -0
- py2ls/data/re_common_pattern.json +173 -0
- py2ls/data/sns_info.json +74 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/styles/stylelib/.DS_Store +0 -0
- py2ls/data/styles/stylelib/grid.mplstyle +15 -0
- py2ls/data/styles/stylelib/high-contrast.mplstyle +6 -0
- py2ls/data/styles/stylelib/high-vis.mplstyle +4 -0
- py2ls/data/styles/stylelib/ieee.mplstyle +15 -0
- py2ls/data/styles/stylelib/light.mplstyl +6 -0
- py2ls/data/styles/stylelib/muted.mplstyle +6 -0
- py2ls/data/styles/stylelib/nature-reviews-latex.mplstyle +616 -0
- py2ls/data/styles/stylelib/nature-reviews.mplstyle +616 -0
- py2ls/data/styles/stylelib/nature.mplstyle +31 -0
- py2ls/data/styles/stylelib/no-latex.mplstyle +10 -0
- py2ls/data/styles/stylelib/notebook.mplstyle +36 -0
- py2ls/data/styles/stylelib/paper.mplstyle +290 -0
- py2ls/data/styles/stylelib/paper2.mplstyle +305 -0
- py2ls/data/styles/stylelib/retro.mplstyle +4 -0
- py2ls/data/styles/stylelib/sans.mplstyle +10 -0
- py2ls/data/styles/stylelib/scatter.mplstyle +7 -0
- py2ls/data/styles/stylelib/science.mplstyle +48 -0
- py2ls/data/styles/stylelib/std-colors.mplstyle +4 -0
- py2ls/data/styles/stylelib/vibrant.mplstyle +6 -0
- py2ls/data/tiles.csv +146 -0
- py2ls/data/usages_pd.json +1417 -0
- py2ls/data/usages_sns.json +31 -0
- py2ls/docker2ls.py +5446 -0
- py2ls/ec2ls.py +61 -0
- py2ls/fetch_update.py +145 -0
- py2ls/ich2ls.py +1955 -296
- py2ls/im2.py +8242 -0
- py2ls/image_ml2ls.py +2100 -0
- py2ls/ips.py +33909 -3418
- py2ls/ml2ls.py +7700 -0
- py2ls/mol.py +289 -0
- py2ls/mount2ls.py +1307 -0
- py2ls/netfinder.py +873 -351
- py2ls/nl2ls.py +283 -0
- py2ls/ocr.py +1581 -458
- py2ls/plot.py +10394 -314
- py2ls/rna2ls.py +311 -0
- py2ls/ssh2ls.md +456 -0
- py2ls/ssh2ls.py +5933 -0
- py2ls/ssh2ls_v01.py +2204 -0
- py2ls/stats.py +66 -172
- py2ls/temp20251124.py +509 -0
- py2ls/translator.py +2 -0
- py2ls/utils/decorators.py +3564 -0
- py2ls/utils_bio.py +3453 -0
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/METADATA +113 -224
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/RECORD +72 -16
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/WHEEL +0 -0
py2ls/.DS_Store
CHANGED
|
Binary file
|
py2ls/.git/.DS_Store
ADDED
|
Binary file
|
py2ls/.git/index
CHANGED
|
Binary file
|
|
@@ -138,3 +138,4 @@ a15389729850729fc7bd78a54f26fce77f30be12 a15389729850729fc7bd78a54f26fce77f30be1
|
|
|
138
138
|
a15389729850729fc7bd78a54f26fce77f30be12 a15389729850729fc7bd78a54f26fce77f30be12 Jianfeng Liu <macjianfeng@jflmbp.speedport.ip> 1723527981 +0200 remote set-head
|
|
139
139
|
6dc2cdf4a84e538e5d4777486aeff87e42f41799 6dc2cdf4a84e538e5d4777486aeff87e42f41799 Jianfeng Liu <macjianfeng@jflmbp.speedport.ip> 1723527990 +0200 remote set-head
|
|
140
140
|
86e288b46f8fe179907e4413f665aeb5053fddb1 86e288b46f8fe179907e4413f665aeb5053fddb1 Jianfeng Liu <macjianfeng@JFLMBP.cin.medizin.uni-tuebingen.de> 1725537218 +0200 remote set-head
|
|
141
|
+
86e288b46f8fe179907e4413f665aeb5053fddb1 86e288b46f8fe179907e4413f665aeb5053fddb1 Jianfeng Liu <macjianfeng@JFLMBP.cin.medizin.uni-tuebingen.de> 1734612687 +0100 remote set-head
|
|
Binary file
|
|
Binary file
|
py2ls/ImageLoader.py
ADDED
|
@@ -0,0 +1,621 @@
|
|
|
1
|
+
# 250604_204034:
|
|
2
|
+
import os
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import cv2
|
|
6
|
+
import hashlib
|
|
7
|
+
import warnings
|
|
8
|
+
import functools
|
|
9
|
+
from typing import Union, Optional, Dict, Tuple, List
|
|
10
|
+
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
from PIL import Image, UnidentifiedImageError
|
|
13
|
+
from sklearn.preprocessing import LabelEncoder
|
|
14
|
+
from tensorflow.keras.preprocessing.image import ImageDataGenerator
|
|
15
|
+
from tensorflow.keras.utils import to_categorical
|
|
16
|
+
import logging
|
|
17
|
+
|
|
18
|
+
"""
|
|
19
|
+
-------------------------------------------------------------------------------
|
|
20
|
+
Image Preprocessing
|
|
21
|
+
This file provides a comprehensive image preprocessing pipeline with the following key features:
|
|
22
|
+
|
|
23
|
+
Core Functionality:
|
|
24
|
+
Image loading and processing (resizing, normalization, CLAHE enhancement)
|
|
25
|
+
Parallel processing using ThreadPoolExecutor/ProcessPoolExecutor
|
|
26
|
+
Chunk-based processing for memory efficiency
|
|
27
|
+
Caching mechanism to store processed images
|
|
28
|
+
|
|
29
|
+
Key Components:
|
|
30
|
+
_apply_clahe(): Contrast Limited Adaptive Histogram Equalization
|
|
31
|
+
_process_single_image(): Handles individual image processing
|
|
32
|
+
ImageLoader class: Main preprocessing pipeline
|
|
33
|
+
|
|
34
|
+
Use Cases:
|
|
35
|
+
When you need to preprocess large image datasets efficiently
|
|
36
|
+
When memory management is important (chunk-based processing)
|
|
37
|
+
When you want to cache preprocessed images for future use
|
|
38
|
+
When you need parallel processing for faster preprocessing
|
|
39
|
+
|
|
40
|
+
You only need image preprocessing without ML
|
|
41
|
+
You're working with very large datasets that need chunking
|
|
42
|
+
You want to cache preprocessed images for future use
|
|
43
|
+
You need efficient parallel processing of images
|
|
44
|
+
Your focus is on image enhancement/normalization
|
|
45
|
+
-------------------------------------------------------------------------------
|
|
46
|
+
"""
|
|
47
|
+
def _to_uint8(img: np.ndarray) -> np.ndarray:
    """Return *img* as uint8, scaling non-uint8 data from the 0-1 range to 0-255.

    Values outside 0-1 are clipped before the cast; without the clip an
    out-of-range float wraps around (or is platform dependent) when converted
    to uint8, which shows up as speckle artefacts in the output.
    """
    if img.dtype == np.uint8:
        return img
    return (np.clip(img, 0.0, 1.0) * 255).astype(np.uint8)


def _apply_clahe(
    img: np.ndarray,
    clip_limit: float = 2.0,
    tile_grid_size: Tuple[int, int] = (8, 8),
) -> np.ndarray:
    """Apply Contrast Limited Adaptive Histogram Equalization (CLAHE).

    Args:
        img: Image as uint8, or as float scaled 0-1 (converted to uint8 first).
            2-D arrays and 3-D arrays with a single channel are equalized
            directly; anything else is treated as an RGB image.
        clip_limit: Forwarded to ``cv2.createCLAHE`` as ``clipLimit``.
        tile_grid_size: Forwarded to ``cv2.createCLAHE`` as ``tileGridSize``.

    Returns:
        The equalized image as returned by OpenCV for the uint8 input.
    """
    img = _to_uint8(img)
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)

    is_single_channel = img.ndim == 2 or (img.ndim == 3 and img.shape[2] == 1)
    if is_single_channel:
        return clahe.apply(img)

    # Colour image: equalize only the lightness channel in LAB space so the
    # colour channels (a, b) are left untouched.
    lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)
    lightness, a, b = cv2.split(lab)
    lab = cv2.merge((clahe.apply(lightness), a, b))
    return cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)
|
|
70
|
+
|
|
71
|
+
def _process_single_image(
    path: str, target_size: Tuple[int, int], grayscale: bool, scaler: str
) -> Optional[np.ndarray]:
    """Load one image file, resize it and apply the requested scaling.

    Args:
        path: Location of the image file.
        target_size: Desired (height, width) of the output.
        grayscale: Convert to single-channel "L" mode if True, else to "RGB".
        scaler: "normalize" divides by 255, "standardize" subtracts the image
            mean and divides by its standard deviation (when non-zero),
            "clahe" runs ``_apply_clahe`` and divides by 255. Any other value
            leaves the resized pixels unchanged.

    Returns:
        The processed pixel array, or None if loading or processing raised
        OSError, UnidentifiedImageError, ValueError or TypeError (a warning is
        logged in that case).
    """
    try:
        with Image.open(path) as source:
            converted = source.convert("L" if grayscale else "RGB")
            # target_size is (height, width) while PIL expects (width, height).
            resized = converted.resize(target_size[::-1])
            pixels = np.array(resized)

            if scaler == "normalize":
                pixels = pixels.astype(np.float32) / 255.0
            elif scaler == "standardize":
                pixels = pixels.astype(np.float32)
                mean, std = pixels.mean(), pixels.std()
                centered = pixels - mean
                # A constant image has std == 0; only centre it in that case.
                pixels = centered / std if std > 0 else centered
            elif scaler == "clahe":
                pixels = _apply_clahe(pixels).astype(np.float32) / 255.0

            return pixels
    except (OSError, UnidentifiedImageError, ValueError, TypeError) as exc:
        logging.warning(f"Failed to process image {path}: {exc}")
        return None
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class ImageLoader:
    """
    A scalable image preprocessing pipeline that can handle datasets of any size
    with efficient memory usage and parallel processing capabilities.

    Images are read from the paths stored in a DataFrame column, processed by
    ``_process_single_image`` in a thread or process pool, optionally cached on
    disk, and returned as arrays, a DataFrame of flattened pixels, or a batch
    generator. Images that fail to load are dropped together with their labels.

    # Usage:
    preprocessor = ImageLoader(
        target_size=(32, 32),
        chunk_size=2001,  # Process 2,001 images at a time
        cache_dir="./big_dataset_cache",
        backend="threading",
        n_jobs=8,
        grayscale=True,
    )

    # This will process in chunks and cache results
    result_train = preprocessor.process(
        df_train,
        x_col="path",
        y_col="Label",
        output="df",
        cache=True,
    )
    result_test = preprocessor.process(
        df_test,
        x_col="path",
        y_col="Label",
        output="df",
        cache=True,
    )
    """

    def __init__(
        self,
        target_size: Tuple[int, int] = (128, 128),
        grayscale: bool = False,
        scaler: str = "normalize",
        # os.cpu_count() may return None; fall back to a single worker.
        n_jobs: int = min(8, os.cpu_count() or 1),
        cache_dir: str = "./preprocessing_cache",
        chunk_size: int = 1000,
        backend: str = "threading",
        verbose: bool = True,
    ):
        """
        Initialize the preprocessor with processing parameters.

        Creates ``cache_dir`` if it does not exist and, when ``verbose`` is
        True, prints a usage example.

        Args:
            target_size: Target dimensions (height, width) for resizing
            grayscale: Convert to grayscale if True
            scaler: Preprocessing method ('normalize', 'standardize', 'clahe', 'raw')
            n_jobs: Number of parallel workers (clamped to 1..cpu count)
            cache_dir: Directory for caching processed data
            chunk_size: Number of images to process at once
            backend: Parallel processing backend ('threading'; any other value
                selects the process pool)
            verbose: Print progress information
        """
        self.target_size = target_size
        self.grayscale = grayscale
        self.scaler = scaler
        # At least one worker is required by the executors and by the chunk
        # size computation used for the process pool.
        self.n_jobs = max(1, min(n_jobs, os.cpu_count() or 1))
        self.cache_dir = cache_dir
        self.chunk_size = chunk_size
        self.backend = backend
        self.verbose = verbose
        str_usage = """
        preprocessor = ImageLoader(
            target_size=(32, 32),
            chunk_size=2001,  # Process 2,001 images at a time
            cache_dir="./big_dataset_cache",
            backend="threading",
            n_jobs=8,
            grayscale=True,
        )

        # This will process in chunks and cache results
        result_train = preprocessor.process(
            df_train,
            x_col="path",
            y_col="Label",
            output="df",
            cache=True,
        )
        result_test = preprocessor.process(
            df_test,
            x_col="path",
            y_col="Label",
            output="df",
            cache=True,
        )
        # Sample 100 rows from the training set
        sampled_train = result_train.sample(100)

        x_train = sampled_train.drop(columns=["label"])
        y_train = sampled_train["label"]

        # Sample 100 rows from the test set
        sampled_test = result_test.sample(100)

        x_true = sampled_test.drop(columns=["label"])
        y_true = sampled_test["label"]

        # Run prediction
        res_pred_stack = ml2ls.predict(
            x_train=x_train,
            y_train=y_train,
            x_true=x_true,
            y_true=y_true,
            # cls="light", # or "light", etc.
            voting=False,
        )
        """
        if self.verbose:
            print(str_usage)
        # Create cache directory if needed
        os.makedirs(self.cache_dir, exist_ok=True)

    @staticmethod
    def _stack_images(images: List[np.ndarray]) -> np.ndarray:
        """Stack processed images into one array.

        Returns an empty array for an empty list. If the images do not share a
        shape, returns a 1-D object array holding the individual images.
        """
        if not images:
            return np.array([])
        try:
            return np.stack(images)
        except ValueError as e:
            if "must have the same shape" in str(e):
                # Fill element by element: np.array(..., dtype=object) may try
                # to broadcast arrays of different rank and raise.
                ragged = np.empty(len(images), dtype=object)
                for idx, img in enumerate(images):
                    ragged[idx] = img
                return ragged
            raise

    def _process_chunk_aligned(self, paths: List[str]) -> List[Optional[np.ndarray]]:
        """Process paths sequentially; one entry per path, None for failures."""
        aligned: List[Optional[np.ndarray]] = []
        for path in paths:
            try:
                aligned.append(
                    _process_single_image(
                        path, self.target_size, self.grayscale, self.scaler
                    )
                )
            except Exception as e:
                # Best effort: a single bad image must not abort the chunk,
                # but the failure should be visible.
                logging.warning(f"Failed to process image {path}: {e}")
                aligned.append(None)
        return aligned

    def _process_paths_aligned(self, paths) -> List[Optional[np.ndarray]]:
        """Process paths in parallel, keeping results aligned with the input.

        Args:
            paths: Sequence of image paths.

        Returns:
            A list with exactly one entry per input path, in input order; the
            entry is None when that image could not be processed.
        """
        paths = list(paths)
        aligned: List[Optional[np.ndarray]] = [None] * len(paths)
        if not paths:
            return aligned

        if self.backend == "threading":
            worker = functools.partial(
                _process_single_image,
                target_size=self.target_size,
                grayscale=self.grayscale,
                scaler=self.scaler,
            )
            with ThreadPoolExecutor(max_workers=self.n_jobs) as pool:
                # Remember each future's input position: as_completed yields
                # in completion order, not submission order.
                positions = {
                    pool.submit(worker, path): idx for idx, path in enumerate(paths)
                }
                for future in tqdm(
                    as_completed(positions),
                    total=len(positions),
                    disable=not self.verbose,
                ):
                    try:
                        aligned[positions[future]] = future.result()
                    except Exception as e:
                        if self.verbose:
                            warnings.warn(f"Image processing failed: {str(e)}")
        else:
            # Use smaller chunks for multiprocessing to reduce overhead
            chunk_size = max(1, len(paths) // (self.n_jobs * 2))
            with ProcessPoolExecutor(max_workers=self.n_jobs) as pool:
                starts = {
                    pool.submit(
                        self._process_chunk_aligned, paths[start : start + chunk_size]
                    ): start
                    for start in range(0, len(paths), chunk_size)
                }
                for future in tqdm(
                    as_completed(starts), total=len(starts), disable=not self.verbose
                ):
                    try:
                        chunk_results = future.result()
                    except Exception as e:
                        if self.verbose:
                            warnings.warn(f"Chunk processing failed: {str(e)}")
                        continue
                    start = starts[future]
                    aligned[start : start + len(chunk_results)] = chunk_results
        return aligned

    def _parallel_process(self, paths: List[str]) -> np.ndarray:
        """
        Process images in parallel using the configured backend.

        Args:
            paths: List of image paths to process

        Returns:
            Stacked array of the successfully processed images, in input
            order; an empty array if none succeeded.
        """
        aligned = self._process_paths_aligned(paths)
        return self._stack_images([img for img in aligned if img is not None])

    def _process_chunk(self, paths: List[str]) -> List[np.ndarray]:
        """Process a chunk of images, returning only the successful ones."""
        return [img for img in self._process_chunk_aligned(paths) if img is not None]

    def _get_cache_filename(self, data_hash: str) -> str:
        """
        Generate a unique cache filename based on processing parameters.

        Args:
            data_hash: Hash of the input data

        Returns:
            Full path to cache file
        """
        params = {
            "target_size": self.target_size,
            "grayscale": self.grayscale,
            "scaler": self.scaler,
            "chunk_size": self.chunk_size,
        }
        param_hash = hashlib.md5(str(params).encode()).hexdigest()
        return os.path.join(self.cache_dir, f"img_cache_{data_hash}_{param_hash}.npz")

    @staticmethod
    def _load_from_cache(
        cache_file: str,
    ) -> Optional[Tuple[np.ndarray, Optional[np.ndarray]]]:
        """
        Load processed data from cache.

        Args:
            cache_file: Path to cache file

        Returns:
            Tuple of (images, labels) or None if the cache cannot be read.
            ``labels`` is None when the cache was written without labels.
        """
        try:
            with np.load(cache_file) as data:
                labels = data["labels"] if "labels" in data.files else None
                return data["images"], labels
        except Exception as e:
            # An unreadable cache is not fatal: the caller reprocesses.
            logging.warning(f"Could not load cache file {cache_file}: {e}")
            return None

    @staticmethod
    def _save_to_cache(
        cache_file: str, images: np.ndarray, labels: Optional[np.ndarray]
    ):
        """
        Save processed data to cache.

        Args:
            cache_file: Path to cache file
            images: Processed images array
            labels: Corresponding labels array, or None when there are none
        """
        if labels is None:
            # Storing None would create a pickled object array that np.load
            # refuses to read back by default, so omit the entry instead.
            np.savez_compressed(cache_file, images=images)
        else:
            np.savez_compressed(cache_file, images=images, labels=labels)

    def _process_labels(
        self,
        data: pd.DataFrame,
        y_col: Optional[str],
        encoder: str,
        label_encoder: "Optional[LabelEncoder]" = None,
    ) -> Optional[np.ndarray]:
        """
        Process and encode labels according to specified method.

        Args:
            data: Input DataFrame
            y_col: Name of column containing labels
            encoder: Encoding method ('label', 'onehot', 'binary', None); any
                other value returns the raw column values
            label_encoder: Pre-fitted LabelEncoder (optional); when omitted a
                new one is fitted on the labels

        Returns:
            Array of processed labels or None

        Raises:
            ValueError: If 'binary' is requested and the column does not hold
                exactly two distinct values.
        """
        if y_col is None or encoder is None:
            return None

        labels = data[y_col].values

        if encoder == "binary":
            unique_labels = np.unique(labels)
            if len(unique_labels) != 2:
                raise ValueError("Binary encoding requires exactly 2 classes")
            # NOTE: the first class in sorted order maps to 1, the other to 0.
            return (labels == unique_labels[0]).astype(int)

        if encoder in ("onehot", "label"):
            if label_encoder is None:
                encoded = LabelEncoder().fit_transform(labels)
            else:
                encoded = label_encoder.transform(labels)
            return to_categorical(encoded) if encoder == "onehot" else encoded

        return labels

    def _iter_batches(self, images: np.ndarray, labels: Optional[np.ndarray]):
        """Yield ``chunk_size`` slices of images (and labels when present)."""
        for start in range(0, len(images), self.chunk_size):
            stop = start + self.chunk_size
            if labels is None:
                yield images[start:stop]
            else:
                yield images[start:stop], labels[start:stop]

    def _format_output(
        self, images: np.ndarray, labels: Optional[np.ndarray], output: str
    ) -> "Union[ImageDataGenerator, Tuple[np.ndarray, np.ndarray], pd.DataFrame]":
        """
        Format the processed data according to requested output type.

        Args:
            images: Processed images array
            labels: Processed labels array
            output: 'generator' returns a Python generator of batches, 'array'
                returns ``(images, labels)`` (or just ``images`` without
                labels); any other value returns a DataFrame with one column
                per flattened pixel value plus a 'label' column

        Returns:
            Processed data in requested format

        Raises:
            ValueError: If the image array is neither 3- nor 4-dimensional
                when a DataFrame is requested.
        """
        if output == "generator":
            return self._iter_batches(images, labels)
        if output == "array":
            return (images, labels) if labels is not None else images

        # dataframe
        if images.dtype == object:
            # Convert to uniform array
            images = np.array([img for img in images], dtype=np.float32)
        if len(images) == 0:
            warnings.warn("No images to process; returning empty DataFrame.")
            return pd.DataFrame()

        if images.ndim == 4:
            n, h, w, c = images.shape
        elif images.ndim == 3:
            n, h, w = images.shape
            c = 1
        else:
            raise ValueError(f"Unexpected image shape: {images.shape}")

        images_flat = images.reshape(n, -1)

        if c == 3:
            # Flattening (h, w, 3) interleaves the channels per pixel, so the
            # names must follow the same r, g, b, r, g, b, ... order.
            col_names = [
                f"pixel_{i}_{channel}"
                for i in range(h * w)
                for channel in ("r", "g", "b")
            ]
        else:
            # Single channel, or fallback for any other channel count.
            col_names = [f"pixel_{i}" for i in range(images_flat.shape[1])]

        df = pd.DataFrame(images_flat, columns=col_names)

        if labels is not None:
            df["label"] = labels
        if self.verbose:
            print(f"dataframe shape: {df.shape}")
        return df

    def process(
        self,
        data: pd.DataFrame,
        x_col: str,
        y_col: Optional[str] = None,
        encoder: str = "label",
        label_encoder: "Optional[LabelEncoder]" = None,
        output: str = "dataframe",
        cache: bool = True,
        max_samples: Optional[int] = None,
        **kwargs,
    ) -> "Union[ImageDataGenerator, Tuple[np.ndarray, np.ndarray], pd.DataFrame]":
        """
        Main processing method that handles the entire pipeline.

        Args:
            data: Input DataFrame containing image paths and labels
            x_col: Name of column containing image paths
            y_col: Name of column containing labels (optional)
            encoder: Label encoding method ('label', 'onehot', 'binary', None)
            label_encoder: Pre-fitted LabelEncoder (optional)
            output: Requested output format ('generator', 'array', 'dataframe')
            cache: Whether to use disk caching
            max_samples: Maximum number of samples to process
            kwargs: Accepted for compatibility; currently unused

        Returns:
            Processed data in requested format. Rows whose image could not be
            processed are dropped from both images and labels.

        Raises:
            ValueError: If ``x_col`` or ``y_col`` is not a column of ``data``.
        """
        # Validate inputs
        if x_col not in data.columns:
            raise ValueError(f"Column '{x_col}' not found in DataFrame")

        if y_col is not None and y_col not in data.columns:
            raise ValueError(f"Column '{y_col}' not found in DataFrame")

        # Limit samples if requested
        if max_samples is not None:
            data = data.iloc[:max_samples]

        cache_file = None
        if cache:
            # The key covers the rows and the arguments that change the
            # result, so a different encoder or column does not reuse a
            # stale cache entry.
            hasher = hashlib.md5(pd.util.hash_pandas_object(data).values)
            hasher.update(repr((x_col, y_col, encoder)).encode())
            cache_file = self._get_cache_filename(hasher.hexdigest())

            if os.path.exists(cache_file):
                if self.verbose:
                    print("Loading from cache...")
                cached_data = self._load_from_cache(cache_file)
                if cached_data is not None:
                    images, labels = cached_data
                    return self._format_output(images, labels, output)

        # Process labels first
        labels = self._process_labels(data, y_col, encoder, label_encoder)

        # Process images in chunks if dataset is large
        paths = data[x_col].values
        total_images = len(paths)
        use_chunking = total_images > self.chunk_size * 2

        if use_chunking:
            if self.verbose:
                print(
                    f"Processing {total_images} images in chunks of {self.chunk_size}..."
                )
            aligned: List[Optional[np.ndarray]] = []
            for start in tqdm(
                range(0, total_images, self.chunk_size), disable=not self.verbose
            ):
                aligned.extend(
                    self._process_paths_aligned(paths[start : start + self.chunk_size])
                )
        else:
            aligned = self._process_paths_aligned(paths)

        succeeded = np.array([img is not None for img in aligned], dtype=bool)
        images = self._stack_images([img for img in aligned if img is not None])

        if self.verbose and not succeeded.all():
            warnings.warn(
                f"{int((~succeeded).sum())} of {total_images} images could not "
                "be processed and were dropped"
            )

        # Keep only the labels of successfully processed images so that
        # images[i] and labels[i] always refer to the same input row.
        if labels is not None:
            labels = np.asarray(labels)[succeeded]

        # Save to cache if requested
        if cache_file and len(images) > 0:
            self._save_to_cache(cache_file, images, labels)

        return self._format_output(images, labels, output)
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def create_augmentation_generator(
    data: pd.DataFrame,
    x_col: str,
    y_col: str,
    target_size: Tuple[int, int] = (224, 224),
    batch_size: int = 32,
    class_mode: str = "raw",
    augment_params: Optional[Dict] = None,
    grayscale: bool = False,
    shuffle: bool = True,
    seed: Optional[int] = None,
) -> ImageDataGenerator:
    """
    Build an augmenting Keras data iterator over the images listed in a DataFrame.

    A default set of augmentation options is used; entries in
    ``augment_params`` override or extend those defaults.

    Args:
        data: DataFrame passed to ``flow_from_dataframe``
        x_col: Column with image paths
        y_col: Column with labels
        target_size: Target image dimensions
        batch_size: Images per batch
        class_mode: Type of label output
        augment_params: Overrides for the default ImageDataGenerator options
        grayscale: Use the "grayscale" colour mode instead of "rgb"
        shuffle: Shuffle the data
        seed: Random seed

    Returns:
        The result of ``ImageDataGenerator.flow_from_dataframe`` for the
        given settings.
    """
    settings = dict(
        rotation_range=20,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=False,
        brightness_range=[0.9, 1.1],
        fill_mode="reflect",
    )
    if augment_params:
        settings.update(augment_params)

    color_mode = "grayscale" if grayscale else "rgb"
    augmenter = ImageDataGenerator(**settings)
    return augmenter.flow_from_dataframe(
        dataframe=data,
        x_col=x_col,
        y_col=y_col,
        target_size=target_size,
        color_mode=color_mode,
        class_mode=class_mode,
        batch_size=batch_size,
        shuffle=shuffle,
        seed=seed,
    )
|
py2ls/__init__.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
"""
|
|
2
2
|
__init__ of the pyos module
|
|
3
3
|
"""
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
4
|
+
# try:
|
|
5
|
+
# import tensorflow as tf
|
|
6
|
+
# print("Eager execution enabled:", tf.executing_eagerly())
|
|
7
|
+
# tf.config.set_visible_devices([], "GPU")
|
|
8
|
+
# except Exception as e:
|
|
9
|
+
# print("Error importing tensorflow:", e)
|
|
10
|
+
from .ips import *
|