chromstream 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chromstream/__init__.py +15 -0
- chromstream/data_processing.py +363 -0
- chromstream/objects.py +358 -0
- chromstream/parsers.py +745 -0
- chromstream/py.typed +0 -0
- chromstream-0.0.1.dist-info/METADATA +141 -0
- chromstream-0.0.1.dist-info/RECORD +10 -0
- chromstream-0.0.1.dist-info/WHEEL +5 -0
- chromstream-0.0.1.dist-info/licenses/LICENSE.md +9 -0
- chromstream-0.0.1.dist-info/top_level.txt +1 -0
chromstream/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Init data"""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from importlib.metadata import version
|
|
6
|
+
|
|
7
|
+
from .parsers import *
|
|
8
|
+
|
|
9
|
+
from .objects import *
|
|
10
|
+
|
|
11
|
+
from .data_processing import *
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Load the version
|
|
15
|
+
__version__ = version("chromstream")
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data processing functions for chromatogram analysis
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from scipy.integrate import trapezoid
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from .objects import ChannelChromatograms, Chromatogram
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Baseline functions
|
|
17
|
+
def min_subtract(data: pd.DataFrame) -> pd.Series:
    """
    Shift the signal so that its smallest value becomes zero.

    Args:
        data: DataFrame whose second column holds the signal. Only that
            column is read.

    Returns:
        The second column minus its own minimum, as a pandas Series.
    """
    signal_col = data.columns[1]
    lowest = data[signal_col].min()
    return data[signal_col] - lowest
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def time_window_baseline(
    data: pd.DataFrame, time_window: tuple[float, float] = (0, 1)
) -> pd.Series:
    """
    Subtract the mean signal inside a time window from the whole signal.

    Args:
        data: DataFrame whose first column is the time axis and whose second
            column is the signal.
        time_window: ``(start, end)`` bounds of the baseline window, both
            inclusive. Use the same unit as the chromatogram.

    Returns:
        The signal column minus the mean of the signal inside the window,
        as a pandas Series.

    Raises:
        ValueError: If no data point falls inside ``time_window``.
    """
    start_time, end_time = time_window
    time_col = data.columns[0]
    signal_col = data.columns[1]

    # Select the rows whose time lies inside the (inclusive) window
    in_window = (data[time_col] >= start_time) & (data[time_col] <= end_time)
    if not in_window.any():
        # The mean of an empty selection is NaN; subtracting it would
        # silently turn the complete corrected signal into NaN.
        raise ValueError(
            f"No data points found in the baseline time window "
            f"({start_time}, {end_time})."
        )

    baseline_value = data.loc[in_window, signal_col].mean()

    return data[signal_col] - baseline_value  # type: ignore[operator]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def time_point_baseline(data: pd.DataFrame, time_point: float) -> pd.Series:
    """
    Subtract the signal value found nearest to a given time from the signal.

    Args:
        data: DataFrame whose first column is the time axis and whose second
            column is the signal.
        time_point: Time at which the baseline value is read. The row whose
            time is closest to this value is used. Use the same unit as the
            chromatogram.

    Returns:
        The signal column minus the signal value at the closest time point,
        as a pandas Series.
    """
    times = data[data.columns[0]]
    signal = data[data.columns[1]]

    # Index label of the row with the smallest absolute distance to time_point
    nearest_label = (times - time_point).abs().idxmin()

    return signal - signal.loc[nearest_label]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def linear_baseline(
    data: pd.DataFrame, start_time: float, end_time: float
) -> pd.Series:
    """
    Determines a linear baseline between the signal values at the two specified
    time points and subtracts it from the signal.

    The anchor points are the rows whose time is closest to ``start_time`` and
    ``end_time``. The straight line through them is evaluated over the whole
    time axis, not only between the anchors.

    Args:
        data: DataFrame whose first column is the time axis and whose second
            column is the signal.
        start_time: Time point to define the start of the baseline. Use the
            same unit as the chromatogram.
        end_time: Time point to define the end of the baseline. Use the same
            unit as the chromatogram.

    Returns:
        Corrected signal as pandas Series

    Raises:
        ValueError: If both time points resolve to the same time value, so
            that no line can be defined.
    """
    time_col = data.columns[0]
    signal_col = data.columns[1]
    times = data[time_col]

    # Find the closest data points to the specified times
    start_index = (times - start_time).abs().idxmin()
    end_index = (times - end_time).abs().idxmin()

    anchor_start_time = data.loc[start_index, time_col]
    anchor_end_time = data.loc[end_index, time_col]
    if anchor_end_time == anchor_start_time:
        # A zero time difference would make the slope a division by zero and
        # silently yield a NaN/inf baseline.
        raise ValueError(
            f"start_time ({start_time}) and end_time ({end_time}) resolve to "
            f"the same data point; cannot define a linear baseline."
        )

    # Get the signal values at these points
    start_value = data.loc[start_index, signal_col]
    end_value = data.loc[end_index, signal_col]

    # Calculate the slope and intercept of the baseline line
    slope = (end_value - start_value) / (  # type: ignore[operator]
        anchor_end_time - anchor_start_time
    )
    intercept = start_value - slope * anchor_start_time  # type: ignore[operator]

    # Calculate the baseline for each time point
    baseline = slope * times + intercept  # type: ignore[operator]

    return data[signal_col] - baseline
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# Integration functions
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def integrate_single_chromatogram(
    chromatogram: Chromatogram, peaklist: dict, column: None | str = None
) -> dict:
    """
    Compute trapezoidal peak areas for one chromatogram.

    Args:
        chromatogram: Object providing ``data`` (a DataFrame whose first
            column is the time axis) and ``injection_time``.
        peaklist: Mapping of peak name to a ``[start, end]`` pair giving the
            inclusive integration window. Example:
            ```
            Peaks_TCD = {"N2": [20, 26], "H2": [16, 19]}
            ```
            The window bounds must be in the same unit as the time column.
        column: Name of the signal column to integrate. When None, the
            second column of ``chromatogram.data`` is used.

    Returns:
        Dictionary with the key ``"Timestamp"`` holding
        ``chromatogram.injection_time`` followed by one entry per peak name
        holding its integrated area.
    """
    frame = chromatogram.data
    # The time axis is taken from the first column.
    # need to implement handling of pd.datetime here
    times = frame[frame.columns[0]]
    signal_name = frame.columns[1] if column is None else column

    areas = {"Timestamp": chromatogram.injection_time}
    for name, bounds in peaklist.items():
        lower, upper = bounds
        # Rows whose time lies inside the inclusive window
        window = (times >= lower) & (times <= upper)
        areas[name] = trapezoid(
            frame.loc[window, signal_name], frame.loc[window, frame.columns[0]]
        )

    return areas
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def integrate_channel(
    chromatogram: ChannelChromatograms, peaklist: dict, column: None | str = None
) -> pd.DataFrame:
    """
    Integrate every chromatogram of a channel and collect the results.

    Args:
        chromatogram: Object whose ``chromatograms`` mapping holds the
            individual chromatograms; its values are integrated in iteration
            order.
        peaklist: Mapping of peak name to a ``[start, end]`` integration
            window. Example:
            ```
            Peaks_TCD = {"N2": [20, 26], "H2": [16, 19]}
            ```
            The window bounds must be in the same unit as the chromatogram.
        column: Name of the signal column to integrate. Passed through to
            ``integrate_single_chromatogram``; None selects the second column.

    Returns:
        DataFrame with one row per chromatogram, built from the dictionaries
        returned by ``integrate_single_chromatogram``.
    """
    per_injection = [
        integrate_single_chromatogram(single, peaklist, column)
        for single in chromatogram.chromatograms.values()
    ]
    return pd.DataFrame(per_injection)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def get_temp_and_valves_MTO(Integral_Frame, Log):
    """
    Attach oven temperature and valve states from a log to an integral table.

    For every row of ``Integral_Frame`` the log row with the nearest
    ``"Timestamp"`` is looked up and its ``"Oven Temperature"``,
    ``"v10-bubbler"`` and ``"v11-reactor"`` values are appended.

    Args:
        Integral_Frame: DataFrame of chromatogram integrals. If it has no
            ``"Timestamp"`` column, its index is moved into the columns and
            a resulting column named ``"index"`` is renamed to
            ``"Timestamp"``.
        Log: DataFrame with a ``"Timestamp"`` column and the three log
            columns listed above.

    Returns:
        The merged DataFrame, sorted by and indexed on ``"Timestamp"``.
        The inputs are not modified.
    """
    log_fields = ["Timestamp", "Oven Temperature", "v10-bubbler", "v11-reactor"]

    integrals = Integral_Frame.copy()
    if "Timestamp" not in integrals.columns:
        integrals = integrals.reset_index()
        integrals = integrals.rename(columns={"index": "Timestamp"})

    # merge_asof needs both sides ordered by the merge key
    integrals = integrals.sort_values("Timestamp")
    ordered_log = Log.sort_values("Timestamp")

    # Pick up the nearest log entry for every integral row in one pass
    merged = pd.merge_asof(
        integrals,
        ordered_log[log_fields],
        left_on="Timestamp",
        right_on="Timestamp",
        direction="nearest",
    )

    return merged.set_index("Timestamp")
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def add_log_data(
    Integral_Frame: pd.DataFrame, Log: pd.DataFrame, columns: list[str] | str = "all"
) -> pd.DataFrame:
    """
    For a dataframe that contains a timestamp column, data from a log dataframe is added.
    The log dataframe must similarly contain a timestamp column.

    Each row of ``Integral_Frame`` receives the values of the log row whose
    ``"Timestamp"`` is nearest to its own.

    Args:
        Integral_Frame (pd.DataFrame): DataFrame containing e.g. chromatogram integrals.
        Log (pd.DataFrame): DataFrame containing log data with a timestamp column.
        columns (list[str] | 'all', optional): List of columns from the log to add.
            If 'all', all columns except timestamp are added. Defaults to 'all'.

    Returns:
        pd.DataFrame: DataFrame containing the original dataframe data with log data added,
            sorted by ``"Timestamp"``.

    Raises:
        ValueError: If either frame lacks a ``"Timestamp"`` column, if the
            time ranges of the two frames do not overlap, or if ``columns``
            is neither a list nor the string ``'all'``.
    """

    # Data validation
    if "Timestamp" not in Integral_Frame.columns:
        raise ValueError("Integral_Frame must contain a 'Timestamp' column.")
    if "Timestamp" not in Log.columns:
        raise ValueError("Log must contain a 'Timestamp' column.")

    # The log must not start after the last timestamp of the integral frame
    if Log["Timestamp"].min() > Integral_Frame["Timestamp"].max():
        raise ValueError(
            "The first timestamp of the log is after the last timestamp of the "
            "Integral_Frame. Check whether the right files are selected."
        )

    # The log must not end before the first timestamp of the integral frame
    if Log["Timestamp"].max() < Integral_Frame["Timestamp"].min():
        raise ValueError(
            "The last timestamp of the log is before the first timestamp of the "
            "Integral_Frame. Check whether the right files are selected."
        )

    # merge_asof requires both frames to be sorted by the merge key
    sorted_integrals = Integral_Frame.sort_values("Timestamp")
    sorted_log = Log.sort_values("Timestamp")

    # Only compare against "all" for real strings, so that array-like inputs
    # do not trigger an ambiguous element-wise comparison.
    if isinstance(columns, str) and columns == "all":
        # If 'all', add all columns except timestamp
        selected = [col for col in sorted_log.columns if col != "Timestamp"]
    elif isinstance(columns, list):
        selected = columns
    else:
        raise ValueError("columns must be a list of column names or 'all'.")

    # Merging the dataframes
    merged = pd.merge_asof(
        sorted_integrals,
        sorted_log[["Timestamp"] + selected],
        on="Timestamp",
        direction="nearest",
    )

    return merged
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
# To do - separate integrate chrom function
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
# Splitting
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def split_chromatogram(
    chromatogram: Chromatogram,
    n_injections: int,
    start_offset: int = 0,
    end_offset: int = 0,
    reset_time: bool = True,
) -> list[Chromatogram]:
    """
    When multiple injections are contained in a single chromatogram, this function
    splits the chromatogram into multiple chromatograms of equal length.

    An important constraint is that the length of the chromatogram (after removing
    the offsets) must be divisible by the number of injections.
    The injection time of each split chromatogram is the original injection time
    plus the time value of the first data point of its segment.

    Args:
        chromatogram (Chromatogram): The chromatogram to be split. Its time column
            must be the first column of ``chromatogram.data``.
        n_injections (int): The number of injections to split the chromatogram into.
            Must be at least 1.
        start_offset (int, optional): Number of data points to skip at the start of the chromatogram. Defaults to 0.
        end_offset (int, optional): Number of data points to skip at the end of the chromatogram. Defaults to 0.
        reset_time (bool, optional): Whether to reset the time column to start from 0 for each split chromatogram. Defaults to True.

    Returns:
        list[Chromatogram]: A list of split chromatograms, in time order.

    Raises:
        ValueError: If ``n_injections`` is smaller than 1, if no data points
            remain after applying the offsets, if the remaining length is not
            divisible by ``n_injections``, or if ``chromatogram.time_unit`` is
            neither ``"min"`` nor ``"s"``.
    """
    if n_injections < 1:
        # 0 would fail with ZeroDivisionError below and negative values would
        # silently return a single unsplit chromatogram.
        raise ValueError(f"n_injections must be at least 1, got {n_injections}.")

    end_index = len(chromatogram.data)
    chrom = (
        chromatogram.data.iloc[start_offset : (end_index - end_offset)]
        .reset_index(drop=True)
        .copy()
    )

    # Check if divisible by n_injections
    if len(chrom) % n_injections != 0:
        raise ValueError(
            f"Cannot split chromatograms, as length is not divisible by {n_injections}. Padding needs to be implemented."
        )

    # Map the chromatogram's time unit to the matching pd.Timedelta keyword.
    # Checked once here because it does not change between segments.
    timedelta_keyword = {"min": "minutes", "s": "seconds"}.get(chromatogram.time_unit)
    if timedelta_keyword is None:
        raise ValueError(
            f"Unknown time unit {chromatogram.time_unit}, cannot split chromatogram."
        )

    if len(chrom) == 0:
        # Without data there is no first time value to derive injection times from.
        raise ValueError(
            "Cannot split chromatogram, as no data points remain after applying the offsets."
        )

    # Local import, presumably to avoid a circular import with .objects
    from .objects import Chromatogram

    segment_length = len(chrom) // n_injections
    time_col = chrom.columns[0]  # the time column must be the first column
    split_chromatograms = []

    for segment_number in range(n_injections):
        # Slice the data for the current segment
        first_row = segment_number * segment_length
        data = (
            chrom.iloc[first_row : first_row + segment_length]
            .reset_index(drop=True)
            .copy()
        )

        # The segment's injection time is offset by its first time value
        segment_start = data[time_col].iloc[0]
        injection_time = chromatogram.injection_time + pd.Timedelta(
            **{timedelta_keyword: segment_start}
        )

        if reset_time:
            # reset the time column to start from 0
            data[time_col] = data[time_col] - segment_start

        # Create a new Chromatogram object for the split segment.
        # NOTE(review): metadata is passed on as the same object for every
        # segment and time_unit is not forwarded - confirm this is intended.
        split_chromatograms.append(
            Chromatogram(
                data=data,
                injection_time=injection_time,
                metadata=chromatogram.metadata,
                channel=chromatogram.channel,
                path=chromatogram.path,
            )
        )

    return split_chromatograms
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def list_baseline_functions():
    """
    List the names of the available baseline functions.

    Returns:
        A single string with one baseline function name per line.
    """
    names = (
        "min_subtract",
        "time_window_baseline",
        "time_point_baseline",
        "linear_baseline",
    )
    return "\n".join(names)
|