chromstream 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ """Init data"""
2
+
3
+ from __future__ import annotations
4
+
5
+ from importlib.metadata import version
6
+
7
+ from .parsers import *
8
+
9
+ from .objects import *
10
+
11
+ from .data_processing import *
12
+
13
+
14
+ # Load the version
15
+ __version__ = version("chromstream")
@@ -0,0 +1,363 @@
1
+ """
2
+ Data processing functions for chromatogram analysis
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ import pandas as pd
10
+ from scipy.integrate import trapezoid
11
+
12
+ if TYPE_CHECKING:
13
+ from .objects import ChannelChromatograms, Chromatogram
14
+
15
+
16
+ # Baseline functions
17
def min_subtract(data: pd.DataFrame) -> pd.Series:
    """
    Baseline-correct a signal by shifting its minimum to zero.

    Args:
        data: DataFrame whose second column holds the signal
            (the first column is expected to hold the time axis).

    Returns:
        The second column with its own minimum subtracted, as a pandas Series.
    """
    signal_col = data.columns[1]
    lowest = data[signal_col].min()
    return data[signal_col] - lowest
29
+
30
+
31
def time_window_baseline(
    data: pd.DataFrame, time_window: tuple[float, float] = (0, 1)
) -> pd.Series:
    """
    Use mean of signal in a specific time window as baseline

    Args:
        data: DataFrame containing time (first column) and signal (second column)
        time_window: Tuple specifying the start and end time of the baseline
            window (both bounds inclusive). Use the same unit as the chromatogram.

    Returns:
        Corrected signal as pandas Series

    Raises:
        ValueError: If no data point lies inside ``time_window``. Without this
            check the baseline would be NaN and the whole returned signal would
            silently become NaN.
    """
    start_time, end_time = time_window
    time_col = data.columns[0]  # time must be the first column
    signal_col = data.columns[1]

    # Select the data points inside the (inclusive) baseline window
    mask = (data[time_col] >= start_time) & (data[time_col] <= end_time)
    if not mask.any():
        raise ValueError(
            f"No data points found in the time window {time_window}; "
            "cannot determine a baseline."
        )

    baseline_value = data.loc[mask, signal_col].mean()
    return data[signal_col] - baseline_value  # type: ignore[operator]
53
+
54
+
55
def time_point_baseline(data: pd.DataFrame, time_point: float) -> pd.Series:
    """
    Subtract the signal value found at (or nearest to) a given time.

    Args:
        data: DataFrame with time in the first column and signal in the second
        time_point: Reference time for the baseline. Use the same unit as the
            chromatogram. The sample whose time is closest to this value is used.

    Returns:
        Corrected signal as pandas Series
    """
    times = data[data.columns[0]]
    signal = data[data.columns[1]]

    # Index label of the sample closest in time to the requested point
    nearest_label = (times - time_point).abs().idxmin()
    reference = signal.loc[nearest_label]

    return signal - reference  # type: ignore[operator]
75
+
76
+
77
def linear_baseline(
    data: pd.DataFrame, start_time: float, end_time: float
) -> pd.Series:
    """
    Determines a linear baseline between the signal values at the two specified
    time points and subtracts it from the signal.

    The requested times are snapped to the nearest sampled time points; the
    baseline is the straight line through the signal values at those samples.

    Args:
        data: DataFrame containing time (first column) and signal (second column).
            Index labels are assumed to be unique.
        start_time: Time point to define the start of the baseline. Use the same unit as the chromatogram.
        end_time: Time point to define the end of the baseline. Use the same unit as the chromatogram.

    Returns:
        Corrected signal as pandas Series

    Raises:
        ValueError: If both time points resolve to the same time value, since
            the slope would require a division by zero and the result would
            silently be NaN/inf.
    """
    time_col = data.columns[0]  # time must be the first column
    signal_col = data.columns[1]

    # Find the closest data points to the specified times
    start_index = (data[time_col] - start_time).abs().idxmin()
    end_index = (data[time_col] - end_time).abs().idxmin()

    # Actual sampled times and signal values at the anchor points
    anchor_start = data.loc[start_index, time_col]
    anchor_end = data.loc[end_index, time_col]
    start_value = data.loc[start_index, signal_col]
    end_value = data.loc[end_index, signal_col]

    if anchor_start == anchor_end:
        raise ValueError(
            f"start_time ({start_time}) and end_time ({end_time}) resolve to the "
            f"same data point (time {anchor_start}); cannot define a linear baseline."
        )

    # Calculate the slope and intercept of the baseline line
    slope = (end_value - start_value) / (anchor_end - anchor_start)  # type: ignore[operator]
    intercept = start_value - slope * anchor_start  # type: ignore[operator]

    # Calculate the baseline for each time point
    baseline = slope * data[time_col] + intercept  # type: ignore[operator]

    return data[signal_col] - baseline
115
+
116
+
117
+ # Integration functions
118
+
119
+
120
def integrate_single_chromatogram(
    chromatogram: Chromatogram, peaklist: dict, column: None | str = None
) -> dict:
    """
    Compute peak areas for one chromatogram using the trapezoidal rule.

    Args:
        chromatogram: Chromatogram object; its ``data`` frame and
            ``injection_time`` are read.
        peaklist: Mapping of peak name to a ``[start, end]`` time window
            (both bounds inclusive), e.g.
            ```
            Peaks_TCD = {"N2": [20, 26], "H2": [16, 19]}
            ```
            The window values must be in the same unit as the chromatogram.
        column: Name of the signal column to integrate. If None, the second
            column of the data is used.

    Returns:
        Dictionary holding the injection time under "Timestamp" followed by
        one entry per peak with its integrated area.
    """
    frame = chromatogram.data
    times = frame[frame.columns[0]]  # the time column must be the first!
    # TODO: handling of pd.datetime time columns still needs to be implemented

    signal_col = frame.columns[1] if column is None else column

    def _window_area(start, end):
        # Trapezoidal area of the signal restricted to start <= time <= end
        inside = (times >= start) & (times <= end)
        return trapezoid(frame.loc[inside, signal_col], times[inside])

    areas = {
        peak_name: _window_area(start, end)
        for peak_name, (start, end) in peaklist.items()
    }
    return {"Timestamp": chromatogram.injection_time, **areas}
154
+
155
+
156
def integrate_channel(
    chromatogram: ChannelChromatograms, peaklist: dict, column: None | str = None
) -> pd.DataFrame:
    """
    Integrate every chromatogram of a channel and tabulate the peak areas.

    Args:
        chromatogram: ChannelChromatograms object; the values of its
            ``chromatograms`` mapping are integrated one by one.
        peaklist: Dictionary defining the peaks to integrate. Example:
            ```
            Peaks_TCD = {"N2": [20, 26], "H2": [16, 19]}
            ```
            The list values must be in the same unit as the chromatogram.
        column: Optional column name to use for integration. If None, uses second column.

    Returns:
        DataFrame with one row per injection, as produced by
        ``integrate_single_chromatogram``.
    """
    per_injection = [
        integrate_single_chromatogram(single, peaklist, column)
        for single in chromatogram.chromatograms.values()
    ]
    return pd.DataFrame(per_injection)
181
+
182
+
183
def get_temp_and_valves_MTO(Integral_Frame, Log):
    """
    Attach oven temperature and valve states from a log to integral data.

    Each row of ``Integral_Frame`` is matched with the log row whose
    "Timestamp" is nearest. If ``Integral_Frame`` has no "Timestamp" column,
    its index is turned into one. The inputs themselves are not modified.

    Args:
        Integral_Frame: DataFrame with chromatogram integrals and timestamps.
        Log: DataFrame with the columns "Timestamp", "Oven Temperature",
            "v10-bubbler" and "v11-reactor".

    Returns:
        The integral data, sorted and indexed by "Timestamp", with the three
        log columns added.
    """
    log_fields = ["Timestamp", "Oven Temperature", "v10-bubbler", "v11-reactor"]

    integrals = Integral_Frame.copy()
    if "Timestamp" not in integrals.columns:
        # Promote the index to a regular column so it can serve as merge key
        integrals = integrals.reset_index()
        integrals = integrals.rename(columns={"index": "Timestamp"})

    # merge_asof requires both sides to be sorted by the key
    integrals = integrals.sort_values("Timestamp")
    ordered_log = Log.sort_values("Timestamp")

    combined = pd.merge_asof(
        integrals,
        ordered_log[log_fields],
        left_on="Timestamp",
        right_on="Timestamp",
        direction="nearest",
    )
    return combined.set_index("Timestamp")
210
+
211
+
212
def add_log_data(
    Integral_Frame: pd.DataFrame, Log: pd.DataFrame, columns: list[str] | str = "all"
) -> pd.DataFrame:
    """
    For a dataframe that contains a timestamp column, data from a log dataframe is added.
    The log dataframe must similarly contain a timestamp column. Each row of
    ``Integral_Frame`` receives the values of the log row with the nearest timestamp.

    Args:
        Integral_Frame (pd.DataFrame): DataFrame containing e.g. chromatogram integrals.
        Log (pd.DataFrame): DataFrame containing log data with a timestamp column.
        columns (list[str] | 'all', optional): List of columns from the log to add.
            If 'all', all columns except timestamp are added. A 'Timestamp' entry
            in the list is ignored, since it is always used as merge key.
            Defaults to 'all'.

    Returns:
        pd.DataFrame: DataFrame containing the original dataframe data (sorted by
        timestamp) with log data added.

    Raises:
        ValueError: If either frame lacks a 'Timestamp' column, if the time ranges
            of the two frames do not overlap, or if ``columns`` is neither a list
            nor 'all'.
        KeyError: If a requested column does not exist in ``Log``.
    """

    # Data validation
    if "Timestamp" not in Integral_Frame.columns:
        raise ValueError("Integral_Frame must contain a 'Timestamp' column.")
    if "Timestamp" not in Log.columns:
        raise ValueError("Log must contain a 'Timestamp' column.")

    # The log must not start after the last timestamp of the integral frame ...
    if Log["Timestamp"].min() > Integral_Frame["Timestamp"].max():
        raise ValueError(
            "The first timestamp of the log is after the last timestamp of the "
            "Integral_Frame. Check whether the right files are selected."
        )

    # ... and must not end before the first timestamp of the integral frame
    if Log["Timestamp"].max() < Integral_Frame["Timestamp"].min():
        raise ValueError(
            "The last timestamp of the log is before the first timestamp of the "
            "Integral_Frame. Check whether the right files are selected."
        )

    # merge_asof requires both frames to be sorted by the key
    Integral_Frame = Integral_Frame.sort_values("Timestamp")
    Log = Log.sort_values("Timestamp")

    # The isinstance guard keeps array-like inputs from reaching an ambiguous
    # element-wise comparison against "all"
    if isinstance(columns, str) and columns == "all":
        # If 'all', add all columns except timestamp
        columns = [col for col in Log.columns if col != "Timestamp"]
    elif isinstance(columns, list):
        # The key is always selected below; listing it again would select the
        # column twice and make the merge key ambiguous
        columns = [col for col in columns if col != "Timestamp"]
    else:
        raise ValueError("columns must be a list of column names or 'all'.")

    missing = [col for col in columns if col not in Log.columns]
    if missing:
        raise KeyError(f"Columns not found in Log: {missing}")

    # Merging the dataframes
    merged = pd.merge_asof(
        Integral_Frame,
        Log[["Timestamp"] + columns],
        on="Timestamp",
        direction="nearest",
    )

    return merged
264
+
265
+
266
+ # To do - separate integrate chrom function
267
+
268
+
269
+ # Splitting
270
+
271
+
272
def split_chromatogram(
    chromatogram: Chromatogram,
    n_injections: int,
    start_offset: int = 0,
    end_offset: int = 0,
    reset_time=True,
) -> list[Chromatogram]:
    """
    When multiple injections are contained in a single chromatogram, this function
    splits the chromatogram into multiple chromatograms of equal length.

    An important constraint is that the length of the chromatogram (after applying
    the offsets) must be divisible by the number of injections.
    The injection time of each split chromatogram is the original injection time
    plus the time value of the first data point of that segment.

    Note:
        The ``metadata``, ``channel`` and ``path`` of the original chromatogram
        are passed on unchanged (not copied) to every split chromatogram.

    Args:
        chromatogram (Chromatogram): The chromatogram to be split.
        n_injections (int): The number of injections to split the chromatogram into.
        start_offset (int, optional): Number of data points to skip at the start of the chromatogram. Defaults to 0.
        end_offset (int, optional): Number of data points to skip at the end of the chromatogram. Defaults to 0.
        reset_time (bool, optional): Whether to reset the time column to start from 0 for each split chromatogram. Defaults to True.

    Returns:
        list[Chromatogram]: A list of split chromatograms.

    Raises:
        ValueError: If ``n_injections`` is smaller than 1, if no data points remain
            after applying the offsets, if the remaining length is not divisible by
            ``n_injections``, or if the chromatogram's time unit is neither
            "min" nor "s".
    """
    if n_injections < 1:
        raise ValueError(
            f"n_injections must be a positive integer, got {n_injections}."
        )

    # end_index - end_offset (rather than -end_offset) keeps end_offset=0 working
    end_index = len(chromatogram.data)
    chrom = (
        chromatogram.data.iloc[start_offset : (end_index - end_offset)]
        .reset_index(drop=True)
        .copy()
    )

    if len(chrom) == 0:
        raise ValueError(
            "Cannot split chromatogram, no data points remain after applying "
            f"start_offset={start_offset} and end_offset={end_offset}."
        )

    # Check if divisible by n_injections
    if len(chrom) % n_injections != 0:
        raise ValueError(
            f"Cannot split chromatograms, as length is not divisible by {n_injections}. Padding needs to be implemented."
        )

    # Map the chromatogram's time unit to the matching pd.Timedelta keyword;
    # validated once up front instead of on every segment
    timedelta_keyword = {"min": "minutes", "s": "seconds"}.get(chromatogram.time_unit)
    if timedelta_keyword is None:
        raise ValueError(
            f"Unknown time unit {chromatogram.time_unit}, cannot split chromatogram."
        )

    # Imported at call time; the module only imports .objects under TYPE_CHECKING
    from .objects import Chromatogram

    time_col = chrom.columns[0]  # the time column must be the first column
    segment_length = len(chrom) // n_injections

    split_chromatograms = []
    for segment_start in range(0, len(chrom), segment_length):
        # Slice the data for the current segment
        data = (
            chrom.iloc[segment_start : segment_start + segment_length]
            .reset_index(drop=True)
            .copy()
        )

        # Shift the injection time by the time elapsed until this segment starts
        first_time = data[time_col].iloc[0]
        injection_time = chromatogram.injection_time + pd.Timedelta(
            **{timedelta_keyword: first_time}
        )

        if reset_time:
            # reset the time column to start from 0
            data[time_col] = data[time_col] - first_time

        # Create a new Chromatogram object for the split segment
        split_chromatograms.append(
            Chromatogram(
                data=data,
                injection_time=injection_time,
                metadata=chromatogram.metadata,
                channel=chromatogram.channel,
                path=chromatogram.path,
            )
        )

    return split_chromatograms
354
+
355
+
356
def list_baseline_functions():
    """Return the names of the baseline functions as a newline-separated string."""
    names = (
        "min_subtract",
        "time_window_baseline",
        "time_point_baseline",
        "linear_baseline",
    )
    separator = "\n"
    return separator.join(names)