PoLab-analyzer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polab_analyzer-0.1.0/PKG-INFO +9 -0
- polab_analyzer-0.1.0/README.md +2 -0
- polab_analyzer-0.1.0/pyproject.toml +19 -0
- polab_analyzer-0.1.0/setup.cfg +4 -0
- polab_analyzer-0.1.0/src/PoLab_analyzer.egg-info/PKG-INFO +9 -0
- polab_analyzer-0.1.0/src/PoLab_analyzer.egg-info/SOURCES.txt +11 -0
- polab_analyzer-0.1.0/src/PoLab_analyzer.egg-info/dependency_links.txt +1 -0
- polab_analyzer-0.1.0/src/PoLab_analyzer.egg-info/entry_points.txt +3 -0
- polab_analyzer-0.1.0/src/PoLab_analyzer.egg-info/requires.txt +5 -0
- polab_analyzer-0.1.0/src/PoLab_analyzer.egg-info/top_level.txt +1 -0
- polab_analyzer-0.1.0/src/scripts/__init__.py +0 -0
- polab_analyzer-0.1.0/src/scripts/eve.py +787 -0
- polab_analyzer-0.1.0/src/scripts/hb.py +375 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "PoLab_analyzer"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "CLI tools for PoLab eve and hunchback in-situ image analysis"
|
|
9
|
+
dependencies = [
|
|
10
|
+
"numpy",
|
|
11
|
+
"pandas",
|
|
12
|
+
"matplotlib",
|
|
13
|
+
"scipy",
|
|
14
|
+
"openpyxl"
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.scripts]
|
|
18
|
+
eve = "scripts.eve:main"
|
|
19
|
+
hb = "scripts.hb:main"
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/PoLab_analyzer.egg-info/PKG-INFO
|
|
4
|
+
src/PoLab_analyzer.egg-info/SOURCES.txt
|
|
5
|
+
src/PoLab_analyzer.egg-info/dependency_links.txt
|
|
6
|
+
src/PoLab_analyzer.egg-info/entry_points.txt
|
|
7
|
+
src/PoLab_analyzer.egg-info/requires.txt
|
|
8
|
+
src/PoLab_analyzer.egg-info/top_level.txt
|
|
9
|
+
src/scripts/__init__.py
|
|
10
|
+
src/scripts/eve.py
|
|
11
|
+
src/scripts/hb.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
scripts
|
|
File without changes
|
|
@@ -0,0 +1,787 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import os
|
|
4
|
+
from posixpath import abspath
|
|
5
|
+
import re
|
|
6
|
+
import sys
|
|
7
|
+
import warnings
|
|
8
|
+
import matplotlib.pyplot as plt
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from openpyxl import load_workbook
|
|
12
|
+
from openpyxl.drawing.image import Image as XLImage
|
|
13
|
+
from scipy.signal import find_peaks, savgol_filter
|
|
14
|
+
|
|
15
|
+
warnings.simplefilter(action="ignore", category=FutureWarning)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# --- Helper: test run without saving ---
|
|
19
|
+
def test_parameters(excel_file, lower_lim, upper_lim, distance, prominence, height):
|
|
20
|
+
discarded_sheets = 0
|
|
21
|
+
total_sheets = 0
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
xl = pd.ExcelFile(excel_file)
|
|
25
|
+
# Process sheets that match the 's' followed by digits pattern
|
|
26
|
+
sheet_names = [s for s in xl.sheet_names if re.match(r"^s\d+$", s)]
|
|
27
|
+
except Exception:
|
|
28
|
+
return 0, 0
|
|
29
|
+
|
|
30
|
+
for sheet_name in sheet_names:
|
|
31
|
+
try:
|
|
32
|
+
data = pd.read_excel(excel_file, sheet_name=sheet_name)
|
|
33
|
+
length = data.iloc[:, 0]
|
|
34
|
+
intensity = data.iloc[:, 1]
|
|
35
|
+
|
|
36
|
+
norm_intensity = intensity / intensity.max()
|
|
37
|
+
inverted_intensity = 1 - norm_intensity
|
|
38
|
+
norm_length = length / length.max()
|
|
39
|
+
percent_length = norm_length * 100
|
|
40
|
+
|
|
41
|
+
smoothed = savgol_filter(inverted_intensity, 11, 2)
|
|
42
|
+
diff = np.diff(smoothed, prepend=smoothed[0])
|
|
43
|
+
change_points = np.diff(np.sign(diff), prepend=0)
|
|
44
|
+
change_point_flags = (change_points < 0).astype(int)
|
|
45
|
+
|
|
46
|
+
peaks, _ = find_peaks(
|
|
47
|
+
smoothed, distance=distance, prominence=prominence, height=height
|
|
48
|
+
)
|
|
49
|
+
peak_percent_lengths = percent_length.iloc[peaks].values
|
|
50
|
+
valid_peak_mask = (peak_percent_lengths > lower_lim) & (
|
|
51
|
+
peak_percent_lengths < upper_lim
|
|
52
|
+
)
|
|
53
|
+
peak_percent_lengths = peak_percent_lengths[valid_peak_mask]
|
|
54
|
+
|
|
55
|
+
data["Percent Length"] = percent_length
|
|
56
|
+
data["Change Point"] = change_point_flags
|
|
57
|
+
|
|
58
|
+
change_df = data[
|
|
59
|
+
(data["Change Point"] == 1) & (data["Percent Length"] < upper_lim)
|
|
60
|
+
]
|
|
61
|
+
closest_matches = []
|
|
62
|
+
for px in peak_percent_lengths:
|
|
63
|
+
if not change_df.empty:
|
|
64
|
+
closest = change_df.iloc[
|
|
65
|
+
(change_df["Percent Length"] - px).abs().argsort()[:1]
|
|
66
|
+
]
|
|
67
|
+
closest_val = closest["Percent Length"].values[0]
|
|
68
|
+
closest_matches.append(closest_val)
|
|
69
|
+
|
|
70
|
+
total_sheets += 1
|
|
71
|
+
if len(closest_matches) != 7:
|
|
72
|
+
discarded_sheets += 1
|
|
73
|
+
|
|
74
|
+
except Exception:
|
|
75
|
+
discarded_sheets += 1
|
|
76
|
+
|
|
77
|
+
return discarded_sheets, total_sheets
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_parameters2(input_folder, lower_lim, upper_lim, distance, prominence, height):
|
|
82
|
+
discarded_files = 0
|
|
83
|
+
total_files = 0
|
|
84
|
+
|
|
85
|
+
for filename in os.listdir(input_folder):
|
|
86
|
+
if filename.endswith(".xlsx"):
|
|
87
|
+
input_path = os.path.join(input_folder, filename)
|
|
88
|
+
try:
|
|
89
|
+
data = pd.read_excel(input_path)
|
|
90
|
+
length = data.iloc[:, 0]
|
|
91
|
+
intensity = data.iloc[:, 1]
|
|
92
|
+
|
|
93
|
+
norm_intensity = intensity / intensity.max()
|
|
94
|
+
inverted_intensity = 1 - norm_intensity
|
|
95
|
+
norm_length = length / length.max()
|
|
96
|
+
percent_length = norm_length * 100
|
|
97
|
+
|
|
98
|
+
smoothed = savgol_filter(inverted_intensity, 11, 2)
|
|
99
|
+
diff = np.diff(smoothed, prepend=smoothed[0])
|
|
100
|
+
change_points = np.diff(np.sign(diff), prepend=0)
|
|
101
|
+
change_point_flags = (change_points < 0).astype(int)
|
|
102
|
+
|
|
103
|
+
peaks, _ = find_peaks(
|
|
104
|
+
smoothed, distance=distance, prominence=prominence, height=height
|
|
105
|
+
)
|
|
106
|
+
peak_percent_lengths = percent_length.iloc[peaks].values
|
|
107
|
+
valid_peak_mask = (peak_percent_lengths > lower_lim) & (
|
|
108
|
+
peak_percent_lengths < upper_lim
|
|
109
|
+
)
|
|
110
|
+
peak_percent_lengths = peak_percent_lengths[valid_peak_mask]
|
|
111
|
+
|
|
112
|
+
data["Percent Length"] = percent_length
|
|
113
|
+
data["Change Point"] = change_point_flags
|
|
114
|
+
|
|
115
|
+
change_df = data[
|
|
116
|
+
(data["Change Point"] == 1) & (data["Percent Length"] < upper_lim)
|
|
117
|
+
]
|
|
118
|
+
closest_matches = []
|
|
119
|
+
for px in peak_percent_lengths:
|
|
120
|
+
if not change_df.empty:
|
|
121
|
+
closest = change_df.iloc[
|
|
122
|
+
(change_df["Percent Length"] - px).abs().argsort()[:1]
|
|
123
|
+
]
|
|
124
|
+
closest_val = closest["Percent Length"].values[0]
|
|
125
|
+
closest_matches.append(closest_val)
|
|
126
|
+
|
|
127
|
+
total_files += 1
|
|
128
|
+
if len(closest_matches) != 7:
|
|
129
|
+
discarded_files += 1
|
|
130
|
+
|
|
131
|
+
except Exception:
|
|
132
|
+
discarded_files += 1
|
|
133
|
+
|
|
134
|
+
return discarded_files, total_files
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def generate_stripe_summary(output_folder):
    """Collect the peak positions of every processed workbook into one sheet.

    Scans ``output_folder`` for ``processed_*.xlsx`` files, reads the first
    column of each file's ``Peaks`` sheet (header row skipped), and writes
    one row per file with exactly seven peaks to ``stripe_summary.xlsx`` in
    the same folder.  Files without a ``Peaks`` sheet are skipped silently;
    files with a different number of peaks are skipped with a message.

    Args:
        output_folder: Directory containing the processed workbooks; the
            summary file is written here as well.
    """
    summary_output_path = os.path.join(output_folder, "stripe_summary.xlsx")

    def _last_number(name):
        # Order files by the last run of digits in the name; names without
        # any digits sort first.
        numbers = re.findall(r"(\d+)", name)
        return int(numbers[-1]) if numbers else 0

    processed_files = sorted(
        (
            f
            for f in os.listdir(output_folder)
            if f.startswith("processed_") and f.endswith(".xlsx")
        ),
        key=_last_number,
    )

    all_rows = []
    for fname in processed_files:
        fpath = os.path.join(output_folder, fname)
        try:
            wb = load_workbook(fpath, data_only=True)
            try:
                if "Peaks" not in wb.sheetnames:
                    continue
                peaks = [
                    row[0]
                    for row in wb["Peaks"].iter_rows(min_row=2, values_only=True)
                    if row[0] is not None
                ]
            finally:
                # Release the workbook as soon as the values are extracted.
                wb.close()

            if len(peaks) != 7:
                print(f"\033[91mSkipping {fname}\033[0m: found {len(peaks)} stripes, not 7.")
                continue

            row_data = {"File": fname}
            for i, p in enumerate(peaks, start=1):
                row_data[f"Stripe-{i}"] = p
            all_rows.append(row_data)

        except Exception as e:
            print(f"Error reading {fname}: {e}")

    # Save summary
    summary_df = pd.DataFrame(all_rows)
    summary_df.to_excel(summary_output_path, index=False)
    print(f"\033[92mStripe summary written to: {summary_output_path}\033[0m")
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def run_pipeline(
    excel_file,
    lower_lim,
    upper_lim,
    test,
    distance,
    prominence,
    height,
    distances,
    prom_range,
    height_range,
):
    """Choose peak-detection parameters and process one Excel workbook.

    When ``test`` is true, every combination of ``distances`` x prominence
    grid x height grid is scored with ``test_parameters`` and the first
    combination with the fewest discarded sheets is used.  Otherwise the
    given ``distance``, ``prominence`` and ``height`` are used directly.
    The chosen parameters are then passed to ``process_excel_file``.

    Args:
        excel_file: Path of the workbook containing sheets s1, s2, s3, ...
        lower_lim: Lower limit for percent length.
        upper_lim: Upper limit for percent length.
        test: Whether to run the parameter grid search.
        distance: Peak distance used when ``test`` is false.
        prominence: Peak prominence used when ``test`` is false.
        height: Peak height used when ``test`` is false.
        distances: Distances to try during the grid search.
        prom_range: ``(start, end, step)`` for the prominence grid.
        height_range: ``(start, end, step)`` for the height grid.

    Raises:
        ValueError: If the grid search is requested but the grid is empty.
    """
    if test:
        # --- Grid search mode ---
        # ``end + step`` is passed as the stop value so that ``end`` itself
        # is normally part of the grid.
        prominences = np.arange(
            prom_range[0],
            prom_range[1] + prom_range[2],
            prom_range[2],
        )
        heights = np.arange(
            height_range[0],
            height_range[1] + height_range[2],
            height_range[2],
        )

        total_iterations = len(distances) * len(prominences) * len(heights)
        if total_iterations == 0:
            # Without this guard best_params would stay None and the report
            # below would fail with an unhelpful TypeError.
            raise ValueError(
                "Parameter grid is empty: check --distances, --prom_range and --height_range."
            )

        best_params = None
        best_discarded = float("inf")
        current_iteration = 0

        print(f"\n\033[94mRunning parameter grid search over sheets in '{os.path.basename(excel_file)}'...\033[0m")

        for d in distances:
            for p in prominences:
                for h in heights:
                    discarded, _ = test_parameters(
                        excel_file, lower_lim, upper_lim, d, p, h
                    )
                    # Strict comparison: ties keep the earliest combination.
                    if discarded < best_discarded:
                        best_discarded = discarded
                        best_params = (d, p, h)

                    current_iteration += 1
                    percent = (current_iteration / total_iterations) * 100

                    sys.stdout.write(
                        f"\r\033[92mProgress: [{percent:3.0f}%]\033[0m Processing parameter set {current_iteration}/{total_iterations}..."
                    )
                    sys.stdout.flush()

        print("\n")

        print(
            f"\033[96mFound best parameters:\033[0m distance={best_params[0]}, "
            f"prominence={best_params[1]:.2f}, height={best_params[2]:.2f} "
            f"(\033[91mdiscarded {best_discarded} sheets\033[0m)"
        )

        final_distance, final_prominence, final_height = best_params

    else:
        # --- Direct mode ---
        final_distance = distance
        final_prominence = prominence
        final_height = height

        print(
            f"\n Using user-specified parameters: "
            f"distance={final_distance}, prominence={final_prominence}, height={final_height}"
        )

    process_excel_file(
        excel_file,
        lower_lim,
        upper_lim,
        distance=final_distance,
        prominence=final_prominence,
        height=final_height,
    )
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def run_pipeline2(input_folder, lower_lim, upper_lim, test, distance, prominence, height, distances, prom_range, height_range):
    """Choose peak-detection parameters and process a folder of workbooks.

    When ``test`` is true, every combination of ``distances`` x prominence
    grid x height grid is scored with ``test_parameters2`` and the first
    combination with the fewest discarded files is used.  Otherwise the
    given ``distance``, ``prominence`` and ``height`` are used directly.
    The chosen parameters are then passed to ``process_folder``.

    Args:
        input_folder: Directory containing the ``.xlsx`` files.
        lower_lim: Lower limit for percent length.
        upper_lim: Upper limit for percent length.
        test: Whether to run the parameter grid search.
        distance: Peak distance used when ``test`` is false.
        prominence: Peak prominence used when ``test`` is false.
        height: Peak height used when ``test`` is false.
        distances: Distances to try during the grid search.
        prom_range: ``(start, end, step)`` for the prominence grid.
        height_range: ``(start, end, step)`` for the height grid.

    Raises:
        ValueError: If the grid search is requested but the grid is empty.
    """
    if test:
        # --- Grid search mode ---
        # ``end + step`` is passed as the stop value so that ``end`` itself
        # is normally part of the grid.
        prominences = np.arange(
            prom_range[0],
            prom_range[1] + prom_range[2],
            prom_range[2],
        )
        heights = np.arange(
            height_range[0],
            height_range[1] + height_range[2],
            height_range[2],
        )

        total_iterations = len(distances) * len(prominences) * len(heights)
        if total_iterations == 0:
            # Without this guard best_params would stay None and the report
            # below would fail with an unhelpful TypeError.
            raise ValueError(
                "Parameter grid is empty: check --distances, --prom_range and --height_range."
            )

        best_params = None
        best_discarded = float("inf")
        current_iteration = 0

        print(f"\n\033[94mRunning parameter grid search over files in '{os.path.basename(input_folder)}'...\033[0m")

        for d in distances:
            for p in prominences:
                for h in heights:
                    discarded, _ = test_parameters2(
                        input_folder, lower_lim, upper_lim, d, p, h
                    )
                    # Strict comparison: ties keep the earliest combination.
                    if discarded < best_discarded:
                        best_discarded = discarded
                        best_params = (d, p, h)

                    current_iteration += 1
                    percent = (current_iteration / total_iterations) * 100

                    sys.stdout.write(
                        f"\r\033[92mProgress: [{percent:3.0f}%]\033[0m Processing parameter set {current_iteration}/{total_iterations}..."
                    )
                    sys.stdout.flush()

        print("\n")

        print(
            f"\033[96mFound best parameters:\033[0m distance={best_params[0]}, "
            f"prominence={best_params[1]:.2f}, height={best_params[2]:.2f} "
            f"(\033[91mdiscarded {best_discarded} files\033[0m)"
        )

        final_distance, final_prominence, final_height = best_params

    else:
        # --- Direct mode ---
        final_distance = distance
        final_prominence = prominence
        final_height = height
        print(
            f"\n Using user-specified parameters: "
            f"distance={final_distance}, prominence={final_prominence}, height={final_height}"
        )

    process_folder(
        input_folder,
        lower_lim,
        upper_lim,
        distance=final_distance,
        prominence=final_prominence,
        height=final_height,
    )
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def process_folder(input_folder, lower_lim, upper_lim, distance=10, prominence=0.03, height=0.1):
    """Analyse every ``.xlsx`` file in a folder and write results beside it.

    For each file (default sheet only) the function writes
    ``results/processed_<filename>`` containing the derived columns and an
    embedded plot of the smoothed inverted intensity.  Files with exactly
    seven matched stripes additionally get ``Peaks``, ``Macro`` and
    ``Macro_600points`` sheets; other files are reported and skipped.
    Afterwards the ``Macro`` sheets of all processed files are resampled to
    600 points and combined into ``combined_macro_600points.xlsx``, and
    ``generate_stripe_summary`` is run on the results folder.

    Args:
        input_folder: Directory containing the input ``.xlsx`` files.
        lower_lim: Exclusive lower bound on percent length.
        upper_lim: Exclusive upper bound on percent length.
        distance: ``distance`` argument forwarded to ``find_peaks``.
        prominence: ``prominence`` argument forwarded to ``find_peaks``.
        height: ``height`` argument forwarded to ``find_peaks``.
    """
    output_folder = os.path.join(input_folder, "results")
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.endswith(".xlsx"):
            continue

        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, f"processed_{filename}")
        temp_plot_path = "temp_plot.png"
        try:
            data = pd.read_excel(input_path)
            length = data.iloc[:, 0]
            intensity = data.iloc[:, 1]

            norm_intensity = intensity / intensity.max()
            inverted_intensity = 1 - norm_intensity
            norm_length = length / length.max()
            percent_length = norm_length * 100

            smoothed = savgol_filter(inverted_intensity, 11, 2)
            diff = np.diff(smoothed, prepend=smoothed[0])
            # A negative jump in the sign of the slope marks a local maximum
            # of the smoothed curve.
            change_points = np.diff(np.sign(diff), prepend=0)
            change_point_flags = (change_points < 0).astype(int)

            # Detect peaks
            peaks, _ = find_peaks(smoothed, distance=distance, prominence=prominence, height=height)
            peak_percent_lengths = percent_length.iloc[peaks].values
            peak_values = smoothed[peaks]
            valid_peak_mask = (peak_percent_lengths > lower_lim) & (peak_percent_lengths < upper_lim)
            peak_percent_lengths = peak_percent_lengths[valid_peak_mask]
            peak_values = peak_values[valid_peak_mask]

            data['Normalized Intensity'] = norm_intensity
            data['Inverted Intensity'] = inverted_intensity
            data['Normalized Length'] = norm_length
            data['Percent Length'] = percent_length
            data['Smoothed'] = smoothed
            data['Difference'] = diff
            data['Change Point'] = change_point_flags
            data.to_excel(output_path, index=False)

            plt.figure(figsize=(5, 3))
            plt.plot(percent_length, smoothed, label="Smoothed", color='blue')
            plt.scatter(peak_percent_lengths, peak_values, color='red', s=50, label=f"Peaks > {lower_lim}%")
            plt.title("Smoothed Inverted Intensity")
            plt.xlabel("Percent Length (%)")
            plt.ylabel("Inverted Intensity")
            plt.legend()
            plt.tight_layout()
            plt.savefig(temp_plot_path, dpi=200)
            plt.close()

            # Embed the plot and save once, so that even files skipped below
            # keep their data columns and figure.
            wb = load_workbook(output_path)
            ws = wb.active
            img = XLImage(temp_plot_path)
            img.width = 360
            img.height = 220
            ws.add_image(img, "K2")
            wb.save(output_path)

            # Add Peaks Sheets
            if "Peaks" in wb.sheetnames:
                del wb["Peaks"]
            peak_ws = wb.create_sheet("Peaks")
            peak_ws.append(["Percent Length", "Inverted Intensity"])
            for x, y in zip(peak_percent_lengths, peak_values):
                peak_ws.append([x, y])

            # Match peaks with change_df data frame
            change_df = data[
                (data["Change Point"] == 1)
                & (data["Percent Length"] > lower_lim)
                & (data["Percent Length"] < upper_lim)
            ]
            closest_matches = []
            for px in peak_percent_lengths:
                if not change_df.empty:
                    closest = change_df.iloc[(change_df["Percent Length"] - px).abs().argsort()[:1]]
                    closest_matches.append(closest["Percent Length"].values[0])

            # Discard files which have not exactly 7 stripes.
            if len(closest_matches) != 7:
                print(
                    f"\033[91mSkipping {filename}:\033[0m found {len(closest_matches)} stripes, not exactly 7."
                )
                continue
            wb.save(output_path)

            # Add macro sheets.
            macro_df = pd.DataFrame({
                "Percent Length": percent_length,
                "Inverted Intensity": inverted_intensity
            })
            with pd.ExcelWriter(output_path, engine='openpyxl', mode='a', if_sheet_exists="replace") as writer:
                macro_df.to_excel(writer, sheet_name="Macro", index=False)

            # Add macro 600 points sheet
            indices = np.linspace(0, len(macro_df) - 1, 600, dtype=int)
            squeezed_macro_df = macro_df.iloc[indices].reset_index(drop=True)
            with pd.ExcelWriter(output_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
                squeezed_macro_df.to_excel(writer, sheet_name='Macro_600points', index=False)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
        finally:
            # Remove the temporary plot on every path (success, skip, error)
            # so a failed file never leaves it behind in the working directory.
            if os.path.exists(temp_plot_path):
                os.remove(temp_plot_path)

    def _last_number(name):
        # Order files by the last run of digits in the name; names without
        # any digits sort first instead of raising IndexError.
        numbers = re.findall(r'(\d+)', name)
        return int(numbers[-1]) if numbers else 0

    # Combine all Macro_600points into one Excel
    macro_percent_cols = []
    macro_intensity_cols = []
    processed_files = sorted(
        [f for f in os.listdir(output_folder) if f.startswith("processed_") and f.endswith(".xlsx")],
        key=_last_number,
    )
    for idx, fname in enumerate(processed_files, start=1):
        fpath = os.path.join(output_folder, fname)
        try:
            df = pd.read_excel(fpath, sheet_name="Macro")
            percent_col = df.iloc[:, 0] / df.iloc[:, 0].max()
            intensity_col = df.iloc[:, 1]
            indices = np.linspace(0, len(df) - 1, 600, dtype=int)
            percent_squeezed = percent_col.iloc[indices].reset_index(drop=True)
            intensity_squeezed = intensity_col.iloc[indices].reset_index(drop=True)

            label = os.path.basename(input_folder) + f"_{idx}"
            macro_percent_cols.append(percent_squeezed.rename(label))
            macro_intensity_cols.append(intensity_squeezed.rename(label))
        except Exception:
            # Deliberately silent: files skipped above never receive a
            # "Macro" sheet, so failing to read one here is expected.
            continue

    # pd.concat raises on an empty list, so only build the combined workbook
    # when at least one file contributed data.
    if macro_percent_cols and macro_intensity_cols:
        percent_df = pd.concat(macro_percent_cols, axis=1)
        intensity_df = pd.concat(macro_intensity_cols, axis=1)
        # Blank spacer column between the samples and their average.
        percent_df[""] = ""
        intensity_df[""] = ""
        percent_df["Average"] = percent_df.select_dtypes(include=[np.number]).mean(axis=1)
        intensity_df["Average"] = intensity_df.select_dtypes(include=[np.number]).mean(axis=1)
        combined_macro_path = os.path.join(output_folder, "combined_macro_600points.xlsx")
        with pd.ExcelWriter(combined_macro_path) as writer:
            percent_df.to_excel(writer, sheet_name="Percent Length", index=False)
            intensity_df.to_excel(writer, sheet_name="Inverted Intensity", index=False)

    generate_stripe_summary(output_folder)
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def process_excel_file(
    excel_file, lower_lim, upper_lim, distance=10, prominence=0.03, height=0.1
):
    """Analyse every ``s<digits>`` sheet of one workbook and write results.

    For each matching sheet the function writes
    ``results/processed_<workbook>_<sheet>.xlsx`` next to the input file,
    containing the derived columns and an embedded plot of the smoothed
    inverted intensity.  Sheets with exactly seven matched stripes
    additionally get ``Peaks``, ``Macro`` and ``Macro_600points`` sheets;
    other sheets are reported and skipped.  Afterwards the ``Macro`` sheets
    of the processed files in the results folder are resampled to 600 points
    and combined into ``combined_macro_600points.xlsx``, and
    ``generate_stripe_summary`` is run on the results folder.

    Args:
        excel_file: Path of the workbook containing sheets s1, s2, s3, ...
        lower_lim: Exclusive lower bound on percent length.
        upper_lim: Exclusive upper bound on percent length.
        distance: ``distance`` argument forwarded to ``find_peaks``.
        prominence: ``prominence`` argument forwarded to ``find_peaks``.
        height: ``height`` argument forwarded to ``find_peaks``.
    """
    parent_folder = os.path.dirname(os.path.abspath(excel_file))
    base_name = os.path.splitext(os.path.basename(excel_file))[0]
    output_folder = os.path.join(parent_folder, "results")
    os.makedirs(output_folder, exist_ok=True)

    # Open the workbook only long enough to list its sheets, then release
    # the handle.
    with pd.ExcelFile(excel_file) as xl:
        sheet_names = sorted(
            [s for s in xl.sheet_names if re.match(r"^s\d+$", s)],
            key=lambda x: int(re.findall(r"(\d+)", x)[0]),
        )

    for sheet_name in sheet_names:
        output_path = os.path.join(output_folder, f"processed_{base_name}_{sheet_name}.xlsx")
        temp_plot_path = f"temp_plot_{sheet_name}.png"
        try:
            data = pd.read_excel(excel_file, sheet_name=sheet_name)
            length = data.iloc[:, 0]
            intensity = data.iloc[:, 1]

            norm_intensity = intensity / intensity.max()
            inverted_intensity = 1 - norm_intensity
            norm_length = length / length.max()
            percent_length = norm_length * 100

            smoothed = savgol_filter(inverted_intensity, 11, 2)
            diff = np.diff(smoothed, prepend=smoothed[0])
            # A negative jump in the sign of the slope marks a local maximum
            # of the smoothed curve.
            change_points = np.diff(np.sign(diff), prepend=0)
            change_point_flags = (change_points < 0).astype(int)

            # Detect peaks
            peaks, _ = find_peaks(
                smoothed, distance=distance, prominence=prominence, height=height
            )
            peak_percent_lengths = percent_length.iloc[peaks].values
            peak_values = smoothed[peaks]
            valid_peak_mask = (peak_percent_lengths > lower_lim) & (
                peak_percent_lengths < upper_lim
            )
            peak_percent_lengths = peak_percent_lengths[valid_peak_mask]
            peak_values = peak_values[valid_peak_mask]

            data["Normalized Intensity"] = norm_intensity
            data["Inverted Intensity"] = inverted_intensity
            data["Normalized Length"] = norm_length
            data["Percent Length"] = percent_length
            data["Smoothed"] = smoothed
            data["Difference"] = diff
            data["Change Point"] = change_point_flags
            data.to_excel(output_path, index=False)

            plt.figure(figsize=(5, 3))
            plt.plot(percent_length, smoothed, label="Smoothed", color="blue")
            plt.scatter(
                peak_percent_lengths,
                peak_values,
                color="red",
                s=50,
                label=f"Peaks > {lower_lim}%",
            )
            plt.title("Smoothed Inverted Intensity")
            plt.xlabel("Percent Length (%)")
            plt.ylabel("Inverted Intensity")
            plt.legend()
            plt.tight_layout()
            plt.savefig(temp_plot_path, dpi=200)
            plt.close()

            # Embed the plot and save once, so that even sheets skipped below
            # keep their data columns and figure.
            wb = load_workbook(output_path)
            ws = wb.active
            img = XLImage(temp_plot_path)
            img.width = 360
            img.height = 220
            ws.add_image(img, "K2")
            wb.save(output_path)

            # Add Peaks Sheets
            if "Peaks" in wb.sheetnames:
                del wb["Peaks"]
            peak_ws = wb.create_sheet("Peaks")
            peak_ws.append(["Percent Length", "Inverted Intensity"])
            for x, y in zip(peak_percent_lengths, peak_values):
                peak_ws.append([x, y])

            # Match peaks with change_df data frame
            change_df = data[
                (data["Change Point"] == 1)
                & (data["Percent Length"] > lower_lim)
                & (data["Percent Length"] < upper_lim)
            ]
            closest_matches = []
            for px in peak_percent_lengths:
                if not change_df.empty:
                    closest = change_df.iloc[
                        (change_df["Percent Length"] - px).abs().argsort()[:1]
                    ]
                    closest_matches.append(closest["Percent Length"].values[0])

            # Discard files which have not exactly 7 stripes.
            if len(closest_matches) != 7:
                print(
                    f"\033[91mSkipping sheet {sheet_name}:\033[0m found {len(closest_matches)} stripes, not exactly 7."
                )
                continue
            wb.save(output_path)

            # Add macro sheets.
            macro_df = pd.DataFrame(
                {
                    "Percent Length": percent_length,
                    "Inverted Intensity": inverted_intensity,
                }
            )
            with pd.ExcelWriter(
                output_path, engine="openpyxl", mode="a", if_sheet_exists="replace"
            ) as writer:
                macro_df.to_excel(writer, sheet_name="Macro", index=False)

            # Add macro 600 points sheet
            indices = np.linspace(0, len(macro_df) - 1, 600, dtype=int)
            squeezed_macro_df = macro_df.iloc[indices].reset_index(drop=True)
            with pd.ExcelWriter(
                output_path, engine="openpyxl", mode="a", if_sheet_exists="replace"
            ) as writer:
                squeezed_macro_df.to_excel(
                    writer, sheet_name="Macro_600points", index=False
                )
        except Exception as e:
            print(f"Error processing sheet {sheet_name}: {e}")
        finally:
            # One cleanup point covers the success, skip and error paths.
            if os.path.exists(temp_plot_path):
                os.remove(temp_plot_path)

    def _last_number(name):
        # Order files by the last run of digits in the name; names without
        # any digits sort first.
        numbers = re.findall(r"(\d+)", name)
        return int(numbers[-1]) if numbers else 0

    # Combine all Macro_600points into one Excel
    # NOTE(review): this picks up every processed_*.xlsx in the results
    # folder, including files produced from other input workbooks in the
    # same directory -- confirm that is intended.
    macro_percent_cols = []
    macro_intensity_cols = []
    processed_files = sorted(
        [
            f
            for f in os.listdir(output_folder)
            if f.startswith("processed_")
            and f.endswith(".xlsx")
            and "stripe_summary" not in f
            and "combined_macro" not in f
        ],
        key=_last_number,
    )

    for idx, fname in enumerate(processed_files, start=1):
        fpath = os.path.join(output_folder, fname)
        try:
            df = pd.read_excel(fpath, sheet_name="Macro")
            percent_col = df.iloc[:, 0] / df.iloc[:, 0].max()
            intensity_col = df.iloc[:, 1]
            indices = np.linspace(0, len(df) - 1, 600, dtype=int)
            percent_squeezed = percent_col.iloc[indices].reset_index(drop=True)
            intensity_squeezed = intensity_col.iloc[indices].reset_index(drop=True)

            label = f"{base_name}_{idx}"
            macro_percent_cols.append(percent_squeezed.rename(label))
            macro_intensity_cols.append(intensity_squeezed.rename(label))
        except Exception:
            # Deliberately silent: sheets skipped above never receive a
            # "Macro" sheet, so failing to read one here is expected.
            continue

    if macro_percent_cols and macro_intensity_cols:
        percent_df = pd.concat(macro_percent_cols, axis=1)
        intensity_df = pd.concat(macro_intensity_cols, axis=1)
        # Blank spacer column between the samples and their average.
        percent_df[""] = ""
        intensity_df[""] = ""
        percent_df["Average"] = percent_df.select_dtypes(include=[np.number]).mean(axis=1)
        intensity_df["Average"] = intensity_df.select_dtypes(include=[np.number]).mean(axis=1)

        combined_macro_path = os.path.join(output_folder, "combined_macro_600points.xlsx")
        with pd.ExcelWriter(combined_macro_path) as writer:
            percent_df.to_excel(writer, sheet_name="Percent Length", index=False)
            intensity_df.to_excel(writer, sheet_name="Inverted Intensity", index=False)

    generate_stripe_summary(output_folder)
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def main():
    """Command-line entry point for the stripe analysis.

    Parses the arguments, validates the ``--file`` or ``--folder`` path and
    hands over to ``run_pipeline`` (single workbook) or ``run_pipeline2``
    (folder of workbooks).  Exits with status 1 when the given path does
    not exist.
    """
    parser = argparse.ArgumentParser(
        description="Embryo stripe analysis: auto-tunes parameters to minimize discarded sheets from a single Excel file."
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--file", "-file", help="Path to a single Excel file containing sheets s1, s2, s3..."
    )
    group.add_argument(
        "--folder",
        "-folder",
        # Folder mode reads only the first sheet of each workbook, so the
        # help text must not promise s1, s2, s3... handling.
        help="Path to a folder containing multiple Excel files (the first sheet of each file is analysed)",
    )
    parser.add_argument(
        "--lower",
        "-l",
        type=float,
        required=True,
        help="Lower limit for percent length",
    )
    parser.add_argument(
        "--upper",
        "-u",
        type=float,
        required=True,
        help="Upper limit for percent length",
    )

    parser.add_argument(
        "--test",
        # Only the literal word "true" (any case) enables the grid search.
        type=lambda x: str(x).lower() == "true",
        default=True,
        help="Whether to run parameter testing (true/false, default: true)",
    )
    parser.add_argument(
        "--distance",
        type=int,
        default=15,
        help="Distance for peak detection (used if --test false)",
    )
    parser.add_argument(
        "--prominence",
        type=float,
        default=0.03,
        help="Prominence for peak detection (used if --test false)",
    )
    parser.add_argument(
        "--height",
        type=float,
        default=0.1,
        help="Height for peak detection (used if --test false)",
    )

    # For test mode
    parser.add_argument(
        "--distances",
        nargs="+",
        type=int,
        default=[10, 15, 20, 25, 30, 35],
        help="List of distances to test (default: 10 15 20 25 30 35)",
    )
    parser.add_argument(
        "--prom_range",
        nargs=3,
        type=float,
        default=[0.005, 0.08, 0.05],
        # Help text now states the default that is actually applied.
        help="Prominence range: start end step (default: 0.005 0.08 0.05)",
    )
    parser.add_argument(
        "--height_range",
        nargs=3,
        type=float,
        default=[0.01, 0.5, 0.05],
        help="Height range: start end step (default: 0.01 0.5 0.05)",
    )

    args = parser.parse_args()

    pipeline_kwargs = dict(
        lower_lim=args.lower,
        upper_lim=args.upper,
        test=args.test,
        distance=args.distance,
        prominence=args.prominence,
        height=args.height,
        distances=args.distances,
        prom_range=args.prom_range,
        height_range=args.height_range,
    )

    if args.file:
        # --- Single-file mode: file contains sheets s1, s2, s3... ---
        # Validate up front, mirroring the folder check below, instead of
        # failing later inside the pipeline.
        if not os.path.isfile(args.file):
            print(f"\033[91mError: '{args.file}' is not a valid file.\033[0m")
            sys.exit(1)

        run_pipeline(args.file, **pipeline_kwargs)

    else:
        # --- Folder mode ---
        input_folder = args.folder

        if not os.path.isdir(input_folder):
            print(f"\033[91mError: '{input_folder}' is not a valid directory.\033[0m")
            sys.exit(1)

        # Simply pass the folder path ONCE to the pipeline instead of looping here.
        # os.path.abspath follows the host platform's path rules, unlike
        # posixpath.abspath.
        print(f"\033[94m\nStarting analysis on folder: {os.path.abspath(input_folder)}\033[0m")
        run_pipeline2(input_folder, **pipeline_kwargs)

    print("\n Completed !!!")
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
if __name__ == "__main__":
|
|
787
|
+
main()
|
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import sys
|
|
6
|
+
import warnings
|
|
7
|
+
import matplotlib.pyplot as plt
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from openpyxl import load_workbook
|
|
11
|
+
from openpyxl.drawing.image import Image as XLImage
|
|
12
|
+
from scipy.signal import savgol_filter, find_peaks
|
|
13
|
+
|
|
14
|
+
warnings.simplefilter(action="ignore", category=FutureWarning)
|
|
15
|
+
|
|
16
|
+
def analyze_intensity_profile(x_orig, y_orig, sample_name):
    """Locate the midpoint of the intensity decrease in a single profile.

    The length axis is rescaled to 0-100, the intensity is min-max normalised
    and inverted (``1 - value``), smoothed with a Savitzky-Golay filter and
    then searched for peaks.  With two or more peaks, the midpoint of the
    longest run of clearly negative gradient between the two tallest peaks is
    reported.  Otherwise the point of steepest decrease after the single peak
    (or after the global maximum when no peak is found) is reported.

    Args:
        x_orig: 1-D sequence of positions along the sample.
        y_orig: 1-D sequence of intensities, same length as ``x_orig``.
        sample_name: Label used only in the "Skipping ..." messages.

    Returns:
        ``None`` when the profile cannot be analysed (too few points, or a
        constant length/intensity axis).  Otherwise a dict with the keys
        ``x_norm``, ``y_proc``, ``y_smooth``, ``peak_indices`` and
        ``change_point_x``; ``change_point_x`` is ``None`` when no decrease
        could be located.
    """
    polyorder = 3
    max_window = 51

    x_orig = np.asarray(x_orig, dtype=float)
    y_orig = np.asarray(y_orig, dtype=float)

    # Use the largest odd window that fits inside the data (capped at
    # max_window).  savgol_filter rejects a window longer than the input and
    # needs the polynomial order to be smaller than the window.
    n_points = len(y_orig)
    window_length = min(max_window, n_points if n_points % 2 else n_points - 1)
    if window_length <= polyorder:
        print(f" - Skipping {sample_name}: Not enough data points to process.")
        return None

    # Normalize length to a 0-100 scale
    x_span = x_orig.max() - x_orig.min()
    if x_span == 0:
        print(f" - Skipping {sample_name}: Length data is constant.")
        return None
    x_norm = 100 * (x_orig - x_orig.min()) / x_span

    # Normalize intensity to 0-1 and then invert it (1 - normalized_value)
    y_span = y_orig.max() - y_orig.min()
    if y_span == 0:
        print(f" - Skipping {sample_name}: Intensity data is constant.")
        return None
    y_proc = 1 - (y_orig - y_orig.min()) / y_span

    # Smooth the PROCESSED data
    y_smooth = savgol_filter(y_proc, window_length, polyorder)

    # Find peaks on the PROCESSED data
    peaks, _ = find_peaks(y_smooth, prominence=0.05, height=0.05)

    change_point_x = None
    peak_indices = []

    if len(peaks) >= 2:
        # --- Two-Peak Method ---
        # Keep the two tallest peaks (ranked by smoothed height) in
        # left-to-right order.
        peak_heights = y_smooth[peaks]
        top_two = np.argsort(peak_heights)[-2:]
        peak_indices = sorted(peaks[top_two])
        first_peak_idx, second_peak_idx = peak_indices

        section = slice(first_peak_idx, second_peak_idx + 1)
        if len(y_smooth[section]) > 1:
            intensity_gradient = np.gradient(y_smooth[section])
            decreasing_indices = np.where(intensity_gradient < -0.002)[0]
            if len(decreasing_indices) > 0:
                # Split into runs of consecutive indices and take the middle
                # of the longest run as the change point.
                run_breaks = np.where(np.diff(decreasing_indices) > 1)[0] + 1
                groups = np.split(decreasing_indices, run_breaks)
                longest_group = max(groups, key=len)
                midpoint_local_idx = (longest_group[0] + longest_group[-1]) // 2
                change_point_x = x_norm[section][midpoint_local_idx]
    else:
        # --- Single-Peak Method ---
        # Fall back to the global maximum when no peak passed the thresholds.
        first_peak_idx = np.argmax(y_smooth) if len(peaks) == 0 else peaks[0]
        peak_indices = [first_peak_idx]

        section = slice(first_peak_idx, len(y_smooth))
        if len(y_smooth[section]) > 1:
            intensity_gradient = np.gradient(y_smooth[section])
            steepest_decrease_local_idx = np.argmin(intensity_gradient)
            change_point_x = x_norm[section][steepest_decrease_local_idx]

    return {
        "x_norm": x_norm,
        "y_proc": y_proc,
        "y_smooth": y_smooth,
        "peak_indices": peak_indices,
        "change_point_x": change_point_x,
    }
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _natural_sort_key(s):
|
|
87
|
+
return [int(t) if t.isdigit() else t.lower() for t in re.split(r"(\d+)", s)]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def generate_midpoint_summary(output_folder):
    """Collect the midpoint of every processed workbook into one summary file.

    Each ``processed_*.xlsx`` file in ``output_folder`` is opened in natural
    sort order and the first data row of its "Midpoint" sheet is read.  The
    collected rows are written to ``midpoint_summary.xlsx`` in the same
    folder.  Workbooks without a "Midpoint" sheet are skipped; workbooks that
    cannot be read are reported and skipped.

    Args:
        output_folder: Directory holding the processed workbooks.  The
            summary workbook is written here as well.
    """
    columns = ["Sample", "Midpoint (% Egg Length)"]
    summary_output_path = os.path.join(output_folder, "midpoint_summary.xlsx")

    processed_files = sorted(
        (
            f
            for f in os.listdir(output_folder)
            if f.startswith("processed_") and f.endswith(".xlsx")
        ),
        key=_natural_sort_key,
    )

    all_rows = []
    for fname in processed_files:
        fpath = os.path.join(output_folder, fname)
        wb = None
        try:
            wb = load_workbook(fpath, data_only=True)
            if "Midpoint" not in wb.sheetnames:
                continue

            # max_col=2 pins the row to exactly (sample, midpoint), so a
            # sheet with extra columns no longer breaks the unpacking below.
            rows = list(
                wb["Midpoint"].iter_rows(
                    min_row=2, max_row=2, max_col=2, values_only=True
                )
            )
            if not rows:
                continue

            sample, midpoint = rows[0]
            all_rows.append(
                {
                    "Sample": sample,
                    "Midpoint (% Egg Length)": midpoint if midpoint is not None else "N/A",
                }
            )
        except Exception as e:
            # Best effort: one unreadable workbook must not stop the summary.
            print(f"Error reading {fname}: {e}")
        finally:
            if wb is not None:
                wb.close()

    # Passing the columns explicitly keeps the header row even when no
    # midpoint could be collected.
    summary_df = pd.DataFrame(all_rows, columns=columns)
    summary_df.to_excel(summary_output_path, index=False)
    print(f"\033[92mMidpoint summary written to: {summary_output_path} \033[0m")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _save_profile_plot(result, sample_name, plot_path):
    """Draw the processed profile, its peaks and the midpoint, and save a PNG."""
    x_norm = result["x_norm"]
    change_point_x = result["change_point_x"]

    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        ax.plot(x_norm, result["y_proc"], color="grey", alpha=0.6, label="Raw datapoints")
        ax.plot(x_norm, result["y_smooth"], color="black", linewidth=2, label="Smoothed Line")

        # zip() stops after two colours, so only the first two peaks are marked.
        peak_styles = zip(result["peak_indices"], ("blue", "green"))
        for number, (idx, colour) in enumerate(peak_styles, start=1):
            ax.axvline(
                x=x_norm[idx],
                color=colour,
                linestyle="--",
                label=f"Peak {number} (at {x_norm[idx]:.2f})",
            )

        if change_point_x is not None:
            ax.axvline(
                x=change_point_x,
                color="red",
                linestyle="--",
                linewidth=2,
                label="Midpoint of Decrease",
            )
            # Place the annotation halfway up the current y-axis range.
            y_low, y_high = ax.get_ylim()
            ax.text(
                change_point_x + 2,
                y_low + (y_high - y_low) * 0.5,
                f"Midpoint: {change_point_x:.2f}",
                color="red",
                fontsize=12,
                bbox=dict(facecolor="white", alpha=0.8, edgecolor="red"),
            )

        ax.set_title(f"Intensity Profile for: {sample_name}", fontsize=16)
        ax.set_xlabel("Embryo Length (%)", fontsize=12)
        ax.set_ylabel("Intensity", fontsize=12)
        ax.grid(True, which="both", linestyle="--", linewidth=0.5)
        ax.legend()
        fig.tight_layout()
        fig.savefig(plot_path, dpi=200)
    finally:
        # Always release the figure so a failure mid-plot does not leave one
        # open figure behind per sample.
        plt.close(fig)


def process_dataset(x_orig, y_orig, sample_name, output_path):
    """Analyse one (x, y) profile and write its results workbook.

    Shared by --folder mode (one xlsx file = one sample) and --file mode (one
    sheet = one sample).  The workbook at ``output_path`` receives the
    original and processed data, an embedded plot, and a "Midpoint" sheet
    holding the sample name and midpoint.

    Args:
        x_orig: Positions along the sample.
        y_orig: Intensities, same length as ``x_orig``.
        sample_name: Label used in the plot title, the Midpoint sheet and
            messages.
        output_path: Destination ``.xlsx`` path.  A temporary PNG is created
            next to it and removed again before returning.

    Returns:
        The midpoint in percent of length.  ``None`` when the sample was
        skipped, when no midpoint could be located, or when any step failed
        (the failure is printed).
    """
    base_name = os.path.splitext(os.path.basename(output_path))[0]
    temp_plot_path = os.path.join(
        os.path.dirname(output_path), f"temp_plot_{base_name}.png"
    )

    try:
        # The analysis runs inside the try so that one malformed sample is
        # reported and skipped instead of aborting the whole batch.
        result = analyze_intensity_profile(x_orig, y_orig, sample_name)
        if result is None:
            return None
        change_point_x = result["change_point_x"]

        # Save processed data
        out_df = pd.DataFrame(
            {
                "Length (orig)": x_orig,
                "Intensity (orig)": y_orig,
                "Percent Length": result["x_norm"],
                "Processed Intensity": result["y_proc"],
                "Smoothed Intensity": result["y_smooth"],
            }
        )
        out_df.to_excel(output_path, index=False)

        # Plot the results using PROCESSED data.
        _save_profile_plot(result, sample_name, temp_plot_path)

        # Embed the plot image into the processed xlsx.
        wb = load_workbook(output_path)
        ws = wb.active
        img = XLImage(temp_plot_path)
        img.width = 480
        img.height = 280
        ws.add_image(img, "H2")

        # Replace any existing Midpoint sheet so the workbook holds one result.
        if "Midpoint" in wb.sheetnames:
            del wb["Midpoint"]
        mp_ws = wb.create_sheet("Midpoint")
        mp_ws.append(["Sample", "Midpoint (% Egg Length)"])
        mp_ws.append([sample_name, change_point_x if change_point_x is not None else "N/A"])

        wb.save(output_path)

    except Exception as e:
        print(f" - Error processing {sample_name}: {e}")
        return None
    finally:
        # The PNG must exist until the workbook is saved; remove it afterwards.
        if os.path.exists(temp_plot_path):
            os.remove(temp_plot_path)

    return change_point_x
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def process_excel_file(excel_file, output_folder):
    """Handle one workbook in --folder mode, where each file is one sample.

    The first two columns read from the workbook are taken as the length and
    intensity profile and handed to ``process_dataset``, which writes
    ``processed_<name>.xlsx`` into ``output_folder``.

    Returns:
        Whatever ``process_dataset`` returns, or ``None`` when the file could
        not be read.
    """
    file_name = os.path.basename(excel_file)
    sample_name = os.path.splitext(file_name)[0]

    try:
        frame = pd.read_excel(excel_file)
        lengths = frame.iloc[:, 0].values
        intensities = frame.iloc[:, 1].values
    except Exception as err:
        print(f" - Error reading {file_name}: {err}")
        return None

    destination = os.path.join(output_folder, f"processed_{sample_name}.xlsx")
    return process_dataset(lengths, intensities, sample_name, destination)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def process_file_sheets(excel_file, output_folder):
    """Handle --file mode: one workbook whose sheets s1, s2, ... are samples.

    Sheets whose names match ``s<number>`` (case-insensitive) are processed
    in numeric order.  For each one, the first two columns are passed to
    ``process_dataset``, which writes ``processed_<file>_<sheet>.xlsx`` into
    ``output_folder``.  Sheets that cannot be read are reported and skipped.

    Args:
        excel_file: Path of the multi-sheet workbook.
        output_folder: Directory that receives the processed workbooks.

    Returns:
        None.  Problems opening the workbook, or finding no matching sheet,
        are printed rather than raised.
    """
    base_name = os.path.splitext(os.path.basename(excel_file))[0]

    try:
        xl = pd.ExcelFile(excel_file)
    except Exception as e:
        print(f"\033[91mError reading '{excel_file}': {e}\033[0m")
        return

    # Keep the workbook open for the whole loop and close it afterwards:
    # every sheet is parsed from this one handle instead of re-opening the
    # file per sheet.
    with xl:
        sheet_names = sorted(
            (s for s in xl.sheet_names if re.match(r"^s\d+$", s, re.IGNORECASE)),
            key=lambda s: int(re.findall(r"\d+", s)[0]),
        )

        if not sheet_names:
            print(
                f"\033[91mNo sheets matching the 's1', 's2', ... pattern were found in "
                f"'{excel_file}'.\033[0m"
            )
            return

        total_sheets = len(sheet_names)
        print(
            f"\033[96m\nFound {total_sheets} sheets in '{os.path.basename(excel_file)}'.\033[0m "
            f"\n\033[91mStarting analysis...\033[0m"
        )

        for i, sheet_name in enumerate(sheet_names, start=1):
            percent = (i / total_sheets) * 100
            # "\r" rewrites the same terminal line to act as a progress bar.
            sys.stdout.write(
                f"\r\033[92mProgress: [{percent:3.0f}%]\033[0m Processing sheets {i}/{total_sheets}..."
            )
            sys.stdout.flush()

            sample_name = f"{base_name}_{sheet_name}"
            output_path = os.path.join(output_folder, f"processed_{sample_name}.xlsx")

            try:
                data = xl.parse(sheet_name)
                x_orig = data.iloc[:, 0].values
                y_orig = data.iloc[:, 1].values
            except Exception as e:
                print(f"\n - Error reading sheet {sheet_name}: {e}")
                continue

            process_dataset(x_orig, y_orig, sample_name, output_path)

    # Terminate the progress line.
    print()
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def main():
    """Command-line entry point for the Hb intensity-profile analyzer.

    Exactly one of ``--folder`` (one sample per .xlsx file) or ``--file``
    (one sample per sheet s1, s2, ...) must be given.  Results are written to
    a ``results`` sub-folder next to the input, and a midpoint summary
    workbook is generated afterwards.  Exits with status 1 when the given
    path is invalid and with status 0 when the folder holds no input
    workbooks.
    """
    parser = argparse.ArgumentParser(
        description="Hb fluorescence intensity profile batch analyzer: finds the midpoint "
        "of the intensity decrease for every sample file in a folder."
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--folder",
        "-folder",
        help="Path to a folder containing input .xlsx sample files (one sample per file)",
    )
    group.add_argument(
        "--file",
        "-file",
        help="Path to a single .xlsx file containing sheets s1, s2, s3, ... "
        "(one sample per sheet)",
    )
    args = parser.parse_args()

    if args.folder:
        input_folder = args.folder

        if not os.path.isdir(input_folder):
            print(f"\033[91mError: '{input_folder}' is not a valid directory.\033[0m")
            sys.exit(1)

        xlsx_files = sorted(
            f
            for f in os.listdir(input_folder)
            if f.lower().endswith(".xlsx")
            # "~$name.xlsx" is the lock file Excel keeps next to an open
            # workbook; it is not a readable workbook.
            and not f.startswith("~$")
            # Skip this tool's own outputs.
            and not f.startswith("processed_")
            and "summary" not in f.lower()
        )

        if not xlsx_files:
            print(f"\nNo .xlsx files found in '{input_folder}'. Exiting.")
            sys.exit(0)

        output_folder = os.path.join(input_folder, "results")
        os.makedirs(output_folder, exist_ok=True)

        print(f"\033[96m\nFound total {len(xlsx_files)} Excel files.\033[0m \n\033[91mStarting analysis...\033[0m")

        total_files = len(xlsx_files)
        for i, fname in enumerate(xlsx_files, start=1):
            fpath = os.path.join(input_folder, fname)
            percent = (i / total_files) * 100

            # "\r" rewrites the same terminal line to act as a progress bar.
            sys.stdout.write(
                f"\r\033[92mProgress: [{percent:3.0f}%]\033[0m Processing Excel Files {i}/{total_files}..."
            )
            sys.stdout.flush()

            process_excel_file(fpath, output_folder)

        print("\n Completed !!!")

    else:
        input_file = args.file

        if not os.path.isfile(input_file):
            print(f"\033[91mError: '{input_file}' is not a valid file.\033[0m")
            sys.exit(1)

        output_folder = os.path.join(os.path.dirname(os.path.abspath(input_file)), "results")
        os.makedirs(output_folder, exist_ok=True)

        process_file_sheets(input_file, output_folder)

        print("\n Completed !!!")

    generate_midpoint_summary(output_folder)
|
|
373
|
+
|
|
374
|
+
if __name__ == "__main__":
|
|
375
|
+
main()
|