sigima 0.0.1.dev0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigima/__init__.py +142 -2
- sigima/client/__init__.py +105 -0
- sigima/client/base.py +780 -0
- sigima/client/remote.py +469 -0
- sigima/client/stub.py +814 -0
- sigima/client/utils.py +90 -0
- sigima/config.py +444 -0
- sigima/data/logo/Sigima.svg +135 -0
- sigima/data/tests/annotations.json +798 -0
- sigima/data/tests/curve_fitting/exponential_fit.txt +511 -0
- sigima/data/tests/curve_fitting/gaussian_fit.txt +100 -0
- sigima/data/tests/curve_fitting/piecewiseexponential_fit.txt +1022 -0
- sigima/data/tests/curve_fitting/polynomial_fit.txt +100 -0
- sigima/data/tests/curve_fitting/twohalfgaussian_fit.txt +1000 -0
- sigima/data/tests/curve_formats/bandwidth.txt +201 -0
- sigima/data/tests/curve_formats/boxcar.npy +0 -0
- sigima/data/tests/curve_formats/datetime.txt +1001 -0
- sigima/data/tests/curve_formats/dynamic_parameters.txt +4000 -0
- sigima/data/tests/curve_formats/fw1e2.txt +301 -0
- sigima/data/tests/curve_formats/fwhm.txt +319 -0
- sigima/data/tests/curve_formats/multiple_curves.csv +29 -0
- sigima/data/tests/curve_formats/noised_saw.mat +0 -0
- sigima/data/tests/curve_formats/oscilloscope.csv +111 -0
- sigima/data/tests/curve_formats/other/other2/recursive2.txt +5 -0
- sigima/data/tests/curve_formats/other/recursive1.txt +5 -0
- sigima/data/tests/curve_formats/paracetamol.npy +0 -0
- sigima/data/tests/curve_formats/paracetamol.txt +1010 -0
- sigima/data/tests/curve_formats/paracetamol_dx_dy.csv +1000 -0
- sigima/data/tests/curve_formats/paracetamol_dy.csv +1001 -0
- sigima/data/tests/curve_formats/pulse1.npy +0 -0
- sigima/data/tests/curve_formats/pulse2.npy +0 -0
- sigima/data/tests/curve_formats/simple.txt +5 -0
- sigima/data/tests/curve_formats/spectrum.mca +2139 -0
- sigima/data/tests/curve_formats/square2.npy +0 -0
- sigima/data/tests/curve_formats/step.npy +0 -0
- sigima/data/tests/fabry-perot1.jpg +0 -0
- sigima/data/tests/fabry-perot2.jpg +0 -0
- sigima/data/tests/flower.npy +0 -0
- sigima/data/tests/image_formats/NF 180338201.scor-data +11003 -0
- sigima/data/tests/image_formats/binary_image.npy +0 -0
- sigima/data/tests/image_formats/binary_image.png +0 -0
- sigima/data/tests/image_formats/centroid_test.npy +0 -0
- sigima/data/tests/image_formats/coordinated_text/complex_image.txt +10011 -0
- sigima/data/tests/image_formats/coordinated_text/complex_ref_image.txt +10010 -0
- sigima/data/tests/image_formats/coordinated_text/image.txt +15 -0
- sigima/data/tests/image_formats/coordinated_text/image2.txt +14 -0
- sigima/data/tests/image_formats/coordinated_text/image_no_unit_no_label.txt +14 -0
- sigima/data/tests/image_formats/coordinated_text/image_with_nan.txt +15 -0
- sigima/data/tests/image_formats/coordinated_text/image_with_unit.txt +14 -0
- sigima/data/tests/image_formats/fiber.csv +480 -0
- sigima/data/tests/image_formats/fiber.jpg +0 -0
- sigima/data/tests/image_formats/fiber.png +0 -0
- sigima/data/tests/image_formats/fiber.txt +480 -0
- sigima/data/tests/image_formats/gaussian_spot_with_noise.npy +0 -0
- sigima/data/tests/image_formats/mr-brain.dcm +0 -0
- sigima/data/tests/image_formats/noised_gaussian.mat +0 -0
- sigima/data/tests/image_formats/sif_reader/nd_lum_image_no_glue.sif +0 -0
- sigima/data/tests/image_formats/sif_reader/raman1.sif +0 -0
- sigima/data/tests/image_formats/tiling.txt +10 -0
- sigima/data/tests/image_formats/uint16.tiff +0 -0
- sigima/data/tests/image_formats/uint8.tiff +0 -0
- sigima/data/tests/laser_beam/TEM00_z_13.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_18.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_23.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_30.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_35.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_40.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_45.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_50.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_55.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_60.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_65.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_70.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_75.jpg +0 -0
- sigima/data/tests/laser_beam/TEM00_z_80.jpg +0 -0
- sigima/enums.py +195 -0
- sigima/io/__init__.py +123 -0
- sigima/io/base.py +311 -0
- sigima/io/common/__init__.py +5 -0
- sigima/io/common/basename.py +164 -0
- sigima/io/common/converters.py +189 -0
- sigima/io/common/objmeta.py +181 -0
- sigima/io/common/textreader.py +58 -0
- sigima/io/convenience.py +157 -0
- sigima/io/enums.py +17 -0
- sigima/io/ftlab.py +395 -0
- sigima/io/image/__init__.py +9 -0
- sigima/io/image/base.py +177 -0
- sigima/io/image/formats.py +1016 -0
- sigima/io/image/funcs.py +414 -0
- sigima/io/signal/__init__.py +9 -0
- sigima/io/signal/base.py +129 -0
- sigima/io/signal/formats.py +290 -0
- sigima/io/signal/funcs.py +723 -0
- sigima/objects/__init__.py +260 -0
- sigima/objects/base.py +937 -0
- sigima/objects/image/__init__.py +88 -0
- sigima/objects/image/creation.py +556 -0
- sigima/objects/image/object.py +524 -0
- sigima/objects/image/roi.py +904 -0
- sigima/objects/scalar/__init__.py +57 -0
- sigima/objects/scalar/common.py +215 -0
- sigima/objects/scalar/geometry.py +502 -0
- sigima/objects/scalar/table.py +784 -0
- sigima/objects/shape.py +290 -0
- sigima/objects/signal/__init__.py +133 -0
- sigima/objects/signal/constants.py +27 -0
- sigima/objects/signal/creation.py +1428 -0
- sigima/objects/signal/object.py +444 -0
- sigima/objects/signal/roi.py +274 -0
- sigima/params.py +405 -0
- sigima/proc/__init__.py +96 -0
- sigima/proc/base.py +381 -0
- sigima/proc/decorator.py +330 -0
- sigima/proc/image/__init__.py +513 -0
- sigima/proc/image/arithmetic.py +335 -0
- sigima/proc/image/base.py +260 -0
- sigima/proc/image/detection.py +519 -0
- sigima/proc/image/edges.py +329 -0
- sigima/proc/image/exposure.py +406 -0
- sigima/proc/image/extraction.py +458 -0
- sigima/proc/image/filtering.py +219 -0
- sigima/proc/image/fourier.py +147 -0
- sigima/proc/image/geometry.py +661 -0
- sigima/proc/image/mathops.py +340 -0
- sigima/proc/image/measurement.py +195 -0
- sigima/proc/image/morphology.py +155 -0
- sigima/proc/image/noise.py +107 -0
- sigima/proc/image/preprocessing.py +182 -0
- sigima/proc/image/restoration.py +235 -0
- sigima/proc/image/threshold.py +217 -0
- sigima/proc/image/transformations.py +393 -0
- sigima/proc/signal/__init__.py +376 -0
- sigima/proc/signal/analysis.py +206 -0
- sigima/proc/signal/arithmetic.py +551 -0
- sigima/proc/signal/base.py +262 -0
- sigima/proc/signal/extraction.py +60 -0
- sigima/proc/signal/features.py +310 -0
- sigima/proc/signal/filtering.py +484 -0
- sigima/proc/signal/fitting.py +276 -0
- sigima/proc/signal/fourier.py +259 -0
- sigima/proc/signal/mathops.py +420 -0
- sigima/proc/signal/processing.py +580 -0
- sigima/proc/signal/stability.py +175 -0
- sigima/proc/title_formatting.py +227 -0
- sigima/proc/validation.py +272 -0
- sigima/tests/__init__.py +7 -0
- sigima/tests/common/__init__.py +0 -0
- sigima/tests/common/arithmeticparam_unit_test.py +26 -0
- sigima/tests/common/basename_unit_test.py +126 -0
- sigima/tests/common/client_unit_test.py +412 -0
- sigima/tests/common/converters_unit_test.py +77 -0
- sigima/tests/common/decorator_unit_test.py +176 -0
- sigima/tests/common/examples_unit_test.py +104 -0
- sigima/tests/common/kernel_normalization_unit_test.py +242 -0
- sigima/tests/common/roi_basic_unit_test.py +73 -0
- sigima/tests/common/roi_geometry_unit_test.py +171 -0
- sigima/tests/common/scalar_builder_unit_test.py +142 -0
- sigima/tests/common/scalar_unit_test.py +991 -0
- sigima/tests/common/shape_unit_test.py +183 -0
- sigima/tests/common/stat_unit_test.py +138 -0
- sigima/tests/common/title_formatting_unit_test.py +338 -0
- sigima/tests/common/tools_coordinates_unit_test.py +60 -0
- sigima/tests/common/transformations_unit_test.py +178 -0
- sigima/tests/common/validation_unit_test.py +205 -0
- sigima/tests/conftest.py +129 -0
- sigima/tests/data.py +998 -0
- sigima/tests/env.py +280 -0
- sigima/tests/guiutils.py +163 -0
- sigima/tests/helpers.py +532 -0
- sigima/tests/image/__init__.py +28 -0
- sigima/tests/image/binning_unit_test.py +128 -0
- sigima/tests/image/blob_detection_unit_test.py +312 -0
- sigima/tests/image/centroid_unit_test.py +170 -0
- sigima/tests/image/check_2d_array_unit_test.py +63 -0
- sigima/tests/image/contour_unit_test.py +172 -0
- sigima/tests/image/convolution_unit_test.py +178 -0
- sigima/tests/image/datatype_unit_test.py +67 -0
- sigima/tests/image/edges_unit_test.py +155 -0
- sigima/tests/image/enclosingcircle_unit_test.py +88 -0
- sigima/tests/image/exposure_unit_test.py +223 -0
- sigima/tests/image/fft2d_unit_test.py +189 -0
- sigima/tests/image/filtering_unit_test.py +166 -0
- sigima/tests/image/geometry_unit_test.py +654 -0
- sigima/tests/image/hough_circle_unit_test.py +147 -0
- sigima/tests/image/imageobj_unit_test.py +737 -0
- sigima/tests/image/morphology_unit_test.py +71 -0
- sigima/tests/image/noise_unit_test.py +57 -0
- sigima/tests/image/offset_correction_unit_test.py +72 -0
- sigima/tests/image/operation_unit_test.py +518 -0
- sigima/tests/image/peak2d_limits_unit_test.py +41 -0
- sigima/tests/image/peak2d_unit_test.py +133 -0
- sigima/tests/image/profile_unit_test.py +159 -0
- sigima/tests/image/projections_unit_test.py +121 -0
- sigima/tests/image/restoration_unit_test.py +141 -0
- sigima/tests/image/roi2dparam_unit_test.py +53 -0
- sigima/tests/image/roi_advanced_unit_test.py +588 -0
- sigima/tests/image/roi_grid_unit_test.py +279 -0
- sigima/tests/image/spectrum2d_unit_test.py +40 -0
- sigima/tests/image/threshold_unit_test.py +91 -0
- sigima/tests/io/__init__.py +0 -0
- sigima/tests/io/addnewformat_unit_test.py +125 -0
- sigima/tests/io/convenience_funcs_unit_test.py +470 -0
- sigima/tests/io/coordinated_text_format_unit_test.py +495 -0
- sigima/tests/io/datetime_csv_unit_test.py +198 -0
- sigima/tests/io/imageio_formats_test.py +41 -0
- sigima/tests/io/ioregistry_unit_test.py +69 -0
- sigima/tests/io/objmeta_unit_test.py +87 -0
- sigima/tests/io/readobj_unit_test.py +130 -0
- sigima/tests/io/readwriteobj_unit_test.py +67 -0
- sigima/tests/signal/__init__.py +0 -0
- sigima/tests/signal/analysis_unit_test.py +135 -0
- sigima/tests/signal/check_1d_arrays_unit_test.py +169 -0
- sigima/tests/signal/convolution_unit_test.py +404 -0
- sigima/tests/signal/datetime_unit_test.py +176 -0
- sigima/tests/signal/fft1d_unit_test.py +303 -0
- sigima/tests/signal/filters_unit_test.py +403 -0
- sigima/tests/signal/fitting_unit_test.py +929 -0
- sigima/tests/signal/fwhm_unit_test.py +111 -0
- sigima/tests/signal/noise_unit_test.py +128 -0
- sigima/tests/signal/offset_correction_unit_test.py +34 -0
- sigima/tests/signal/operation_unit_test.py +489 -0
- sigima/tests/signal/peakdetection_unit_test.py +145 -0
- sigima/tests/signal/processing_unit_test.py +657 -0
- sigima/tests/signal/pulse/__init__.py +112 -0
- sigima/tests/signal/pulse/crossing_times_unit_test.py +123 -0
- sigima/tests/signal/pulse/plateau_detection_unit_test.py +102 -0
- sigima/tests/signal/pulse/pulse_unit_test.py +1824 -0
- sigima/tests/signal/roi_advanced_unit_test.py +392 -0
- sigima/tests/signal/signalobj_unit_test.py +603 -0
- sigima/tests/signal/stability_unit_test.py +431 -0
- sigima/tests/signal/uncertainty_unit_test.py +611 -0
- sigima/tests/vistools.py +1030 -0
- sigima/tools/__init__.py +59 -0
- sigima/tools/checks.py +290 -0
- sigima/tools/coordinates.py +308 -0
- sigima/tools/datatypes.py +26 -0
- sigima/tools/image/__init__.py +97 -0
- sigima/tools/image/detection.py +451 -0
- sigima/tools/image/exposure.py +77 -0
- sigima/tools/image/extraction.py +48 -0
- sigima/tools/image/fourier.py +260 -0
- sigima/tools/image/geometry.py +190 -0
- sigima/tools/image/preprocessing.py +165 -0
- sigima/tools/signal/__init__.py +86 -0
- sigima/tools/signal/dynamic.py +254 -0
- sigima/tools/signal/features.py +135 -0
- sigima/tools/signal/filtering.py +171 -0
- sigima/tools/signal/fitting.py +1171 -0
- sigima/tools/signal/fourier.py +466 -0
- sigima/tools/signal/interpolation.py +70 -0
- sigima/tools/signal/peakdetection.py +126 -0
- sigima/tools/signal/pulse.py +1626 -0
- sigima/tools/signal/scaling.py +50 -0
- sigima/tools/signal/stability.py +258 -0
- sigima/tools/signal/windowing.py +90 -0
- sigima/worker.py +79 -0
- sigima-1.0.0.dist-info/METADATA +233 -0
- sigima-1.0.0.dist-info/RECORD +262 -0
- {sigima-0.0.1.dev0.dist-info → sigima-1.0.0.dist-info}/licenses/LICENSE +29 -29
- sigima-0.0.1.dev0.dist-info/METADATA +0 -60
- sigima-0.0.1.dev0.dist-info/RECORD +0 -6
- {sigima-0.0.1.dev0.dist-info → sigima-1.0.0.dist-info}/WHEEL +0 -0
- {sigima-0.0.1.dev0.dist-info → sigima-1.0.0.dist-info}/top_level.txt +0 -0
sigima/io/signal/funcs.py
@@ -0,0 +1,723 @@

# Copyright (c) DataLab Platform Developers, BSD 3-Clause license, see LICENSE file.

"""
I/O signal functions
"""

# pylint: disable=invalid-name  # Allows short reference names like x, y, ...

from __future__ import annotations

import datetime
import re
import warnings
from dataclasses import dataclass
from typing import TextIO

import numpy as np
import pandas as pd
import scipy.interpolate

from sigima.io.common.textreader import count_lines, read_first_n_lines
from sigima.objects.signal.constants import (
    DATETIME_X_FORMAT_KEY,
    DATETIME_X_KEY,
    DEFAULT_DATETIME_FORMAT,
)
from sigima.worker import CallbackWorkerProtocol


def get_labels_units_from_dataframe(
    df: pd.DataFrame,
) -> tuple[str, list[str], str, list[str]]:
    """Get labels and units from a DataFrame.

    Args:
        df: DataFrame

    Returns:
        Tuple (xlabel, ylabels, xunit, yunits)
    """
    # Reading X,Y labels
    xlabel = str(df.columns[0])
    ylabels = [str(col) for col in df.columns[1:]]

    # Retrieving units from labels
    xunit = ""
    yunits = [""] * len(ylabels)
    pattern = r"([\S ]*) \(([\S]*)\)"
    match = re.match(pattern, xlabel)
    if match is not None:
        xlabel, xunit = match.groups()
    for i, ylabel in enumerate(ylabels):
        match = re.match(pattern, ylabel)
        if match is not None:
            ylabels[i], yunits[i] = match.groups()

    return xlabel, ylabels, xunit, yunits

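For reference, a minimal sketch of the `"label (unit)"` column-name convention parsed above; the import path is assumed from the wheel layout (`sigima/io/signal/funcs.py`):

```python
import pandas as pd

from sigima.io.signal.funcs import get_labels_units_from_dataframe

# Column names follow the "label (unit)" convention matched by the regex above
df = pd.DataFrame({"Time (s)": [0.0, 1.0], "Voltage (V)": [0.1, 0.2], "Flag": [1, 2]})
xlabel, ylabels, xunit, yunits = get_labels_units_from_dataframe(df)
# xlabel == "Time", xunit == "s"
# ylabels == ["Voltage", "Flag"], yunits == ["V", ""]
```
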
def read_csv_by_chunks(
    fname_or_fileobj: str | TextIO,
    nlines: int | None = None,
    worker: CallbackWorkerProtocol | None = None,
    decimal: str = ".",
    delimiter: str | None = None,
    header: int | None = "infer",
    skiprows: int | None = None,
    nrows: int | None = None,
    comment: str | None = None,
    chunksize: int = 1000,
) -> pd.DataFrame:
    """Read CSV data with primitive options, using pandas read_csv function defaults,
    and reading data in chunks, using the iterator interface.

    Args:
        fname_or_fileobj: CSV file name or text stream object
        nlines: Number of lines contained in file (this argument is mandatory if
            `fname_or_fileobj` is a text stream object: counting line numbers from a
            text stream is not efficient, especially if one already has access to the
            initial text content from which the text stream was made)
        worker: Callback worker object
        decimal: Decimal character
        delimiter: Delimiter
        header: Header line
        skiprows: Skip rows
        nrows: Number of rows to read
        comment: Comment character
        chunksize: Chunk size

    Returns:
        DataFrame
    """
    if isinstance(fname_or_fileobj, str):
        nlines = count_lines(fname_or_fileobj)
    elif nlines is None:
        raise ValueError("Argument `nlines` must be passed for text streams")
    # Read data in chunks, and concatenate them at the end, thus allowing to call the
    # progress callback function at each chunk read and to return an intermediate
    # result if the operation is canceled.
    chunks = []
    for chunk in pd.read_csv(
        fname_or_fileobj,
        decimal=decimal,
        delimiter=delimiter,
        header=header,
        skiprows=skiprows,
        nrows=nrows,
        comment=comment,
        chunksize=chunksize,
        encoding_errors="ignore",
    ):
        chunks.append(chunk)
        # Compute the progression based on the number of lines read so far
        if worker is not None:
            worker.set_progress(sum(len(chunk) for chunk in chunks) / nlines)
            if worker.was_canceled():
                break
    return pd.concat(chunks)

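A minimal sketch of the progress-reporting loop above; `SimpleWorker` is a hypothetical stand-in that only implements the two calls used by `read_csv_by_chunks` (`set_progress` and `was_canceled`), and the file name is illustrative:

```python
from sigima.io.signal.funcs import read_csv_by_chunks


class SimpleWorker:
    """Hypothetical worker: implements only the two methods used above."""

    def set_progress(self, value: float) -> None:
        print(f"progress: {value:.0%}")  # Called once per 1000-line chunk

    def was_canceled(self) -> bool:
        return False  # Never cancel in this sketch


df = read_csv_by_chunks("large_signal.csv", worker=SimpleWorker(), delimiter=",")
print(df.shape)
```
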
DATA_HEADERS = [
    "#DATA",  # Generic
    "START_OF_DATA",  # Various logging devices
    ">>>>>Begin Spectral Data<<<<<",  # Ocean Optics
    ">>>Begin Data<<<",  # Ocean Optics (alternative)
    ">>>Begin Spectrum Data<<<",  # Avantes
    "# Data Start",  # Andor, Horiba, Mass Spectrometry (Agilent, Thermo Fisher, ...)
    ">DATA START<",  # Mass Spectrometry, Chromatography
    "BEGIN DATA",  # Mass Spectrometry, Chromatography
    "<Data>",  # Mass Spectrometry (XML-based)
    "##Start Data",  # Bruker (X-ray, Raman, FTIR)
    "[DataStart]",  # PerkinElmer (FTIR, UV-Vis)
    "BEGIN SPECTRUM",  # PerkinElmer
    "%% Data Start %%",  # LabVIEW, MATLAB
    "---Begin Data---",  # General scientific instruments
    "===DATA START===",  # Industrial/scientific devices
]

def _read_df_without_header(
    filename: str, skiprows: int | None = None
) -> tuple[pd.DataFrame | None, str, str]:
    """Try to read a CSV file without header, testing various delimiters and decimal.

    Args:
        filename: CSV file name
        skiprows: Number of rows to skip at the beginning of the file

    Returns:
        A tuple (DataFrame if successful, None otherwise, decimal used, delimiter used)
    """
    for decimal in (".", ","):
        for delimiter in (",", ";", r"\s+"):
            try:
                df = pd.read_csv(
                    filename,
                    decimal=decimal,
                    delimiter=delimiter,
                    header=None,
                    comment="#",
                    nrows=1000,  # Read only the first 1000 lines
                    encoding_errors="ignore",
                    skiprows=skiprows,
                    dtype=float,  # Keep dtype to validate delimiter detection
                )
                break
            except (pd.errors.ParserError, ValueError):
                df = None
        if df is not None:
            break
    return df, decimal, delimiter

def _read_df_with_header(filename: str) -> tuple[pd.DataFrame | None, str, str]:
    """Try to read a CSV file with header, testing various delimiters and decimal.

    Args:
        filename: CSV file name

    Returns:
        A tuple (DataFrame if successful, None otherwise, decimal used, delimiter used)
    """
    for decimal in (".", ","):
        for delimiter in (",", ";", r"\s+"):
            # Headers are generally in the first 10 lines, so we try to skip the
            # minimum number of lines before reading the data:
            for skiprows in range(20):
                try:
                    df = pd.read_csv(
                        filename,
                        decimal=decimal,
                        delimiter=delimiter,
                        skiprows=skiprows,
                        comment="#",
                        nrows=1000,  # Read only the first 1000 lines
                        encoding_errors="ignore",
                    )
                    # Validate: CSV should have at least 2 columns (x and y)
                    # If only 1 column, likely wrong delimiter
                    if df.shape[1] >= 2:
                        break  # Good delimiter found
                    df = None  # Try next delimiter
                except (pd.errors.ParserError, ValueError):
                    df = None
            if df is not None:
                break
        if df is not None:
            break
    return df, decimal, delimiter

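Both sniffing helpers return the decimal/delimiter pair that parsed successfully, which the top-level reader reuses for the full read. A sketch (the European-style file name is hypothetical):

```python
from sigima.io.signal.funcs import _read_df_without_header

# Hypothetical semicolon-separated file using a comma as the decimal mark
df, decimal, delimiter = _read_df_without_header("european_style.csv")
if df is not None:
    print(decimal, delimiter)  # Expected to settle on "," and ";"
```
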
def _detect_metadata_cols(df: pd.DataFrame) -> tuple[pd.DataFrame, dict]:
    """Detect columns containing constant/single-value metadata.

    Columns with a single unique value (excluding NaN) across all rows are treated
    as metadata rather than data columns. These are typically instrument serial
    numbers, experiment IDs, or other constant identifiers.

    Args:
        df: Input DataFrame

    Returns:
        A tuple (DataFrame with metadata columns removed,
        dict of metadata key-value pairs)
    """
    metadata = {}
    cols_to_drop = []

    # Start from column 1 (skip X column) and check for constant-value columns
    for col_idx in range(1, df.shape[1]):
        col_data = df.iloc[:, col_idx]
        col_name = df.columns[col_idx]

        # Get unique non-NaN values
        unique_values = col_data.dropna().unique()

        # If column has exactly one unique value (excluding NaN), it's metadata
        if len(unique_values) == 1:
            # Store the metadata
            value = unique_values[0]
            # Try to convert to appropriate type (keep as string if necessary)
            try:
                # Try int first
                if float(value).is_integer():
                    value = int(float(value))
                else:
                    value = float(value)
            except (ValueError, TypeError):
                # Keep as string
                value = str(value)

            metadata[str(col_name)] = value
            cols_to_drop.append(col_name)  # Store column name, not index

    # Drop metadata columns from DataFrame
    if cols_to_drop:
        df = df.drop(columns=cols_to_drop)

    return df, metadata

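A sketch of the constant-column rule described in the docstring above: a Y column with a single unique non-NaN value is moved into the returned metadata dict:

```python
import pandas as pd

from sigima.io.signal.funcs import _detect_metadata_cols

df = pd.DataFrame(
    {
        "Time": [0.0, 1.0, 2.0],
        "Signal": [0.1, 0.4, 0.9],
        "SerialNumber": [42, 42, 42],  # Constant column -> treated as metadata
    }
)
cleaned, metadata = _detect_metadata_cols(df)
print(list(cleaned.columns))  # ["Time", "Signal"]
print(metadata)               # {"SerialNumber": 42}
```
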
def _detect_datetime_col(df: pd.DataFrame) -> tuple[pd.DataFrame, dict | None]:
    """Try to detect the presence of a datetime column in a DataFrame.

    Detect if the first or second column contains datetime values, and convert it to
    float timestamps if so.

    Args:
        df: Input DataFrame

    Returns:
        A tuple (DataFrame with datetime column converted, datetime metadata dict)
    """
    datetime_col_idx = None

    for col_idx in [0, 1]:  # Check first two columns
        col_data = df.iloc[:, col_idx]
        # Try to convert to datetime
        try:
            # Attempt to parse as datetime
            # Note: format="mixed" was causing failures in some pandas versions,
            # so we use warnings filter to suppress the UserWarning instead
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="Could not infer format",
                    category=UserWarning,
                )
                datetime_series = pd.to_datetime(col_data, errors="coerce")
            # Check if most values were successfully converted (>90%)
            valid_ratio = datetime_series.notna().sum() / len(datetime_series)

            # Skip if conversion ratio is too low
            if valid_ratio <= 0.9:
                continue

            # Check if values have reasonable variation and are actual dates
            unique_dates = datetime_series.dropna().nunique()
            if unique_dates <= 1:
                continue

            # Check date range - should be reasonable dates, not epoch times
            min_date = datetime_series.min()
            max_date = datetime_series.max()
            # Dates should be after 1900 and the range should be > 1 sec
            valid_datetime = (
                min_date.year >= 1900 and (max_date - min_date).total_seconds() > 1.0
            )

            if valid_datetime:
                # This is a datetime column!
                datetime_col_idx = col_idx
                break
        except (ValueError, TypeError, pd.errors.OutOfBoundsDatetime):
            # Not a datetime column, continue checking
            pass

    datetime_metadata = None

    if datetime_col_idx is not None:
        # Convert datetime column to float timestamps
        col_data = df.iloc[:, datetime_col_idx]
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", message="Could not infer format", category=UserWarning
            )
            datetime_series = pd.to_datetime(col_data, errors="coerce")
        x_float = datetime_series.astype(np.int64) / 1e9
        # Store datetime metadata (unit will be stored in xunit attribute)
        datetime_metadata = {
            DATETIME_X_KEY: True,
            DATETIME_X_FORMAT_KEY: DEFAULT_DATETIME_FORMAT,
        }

        # If datetime is in column 1 and column 0 looks like an index, drop column 0
        if datetime_col_idx == 1:
            try:
                # Try to convert first column to int - if sequential,
                # it's likely an index column
                first_col = pd.to_numeric(df.iloc[:, 0], errors="coerce")
                if first_col.notna().all():
                    # Check if it's a sequential index (1, 2, 3, ...)
                    diffs = first_col.diff().dropna()
                    if (diffs == 1).sum() / len(diffs) > 0.9:
                        # Drop the index column
                        df = df.iloc[:, 1:].copy()
                        datetime_col_idx = 0  # Now datetime is in position 0
            except (ValueError, TypeError):
                pass

        # Replace datetime column with float timestamps
        df.iloc[:, datetime_col_idx] = x_float

    return df, datetime_metadata

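As a sketch of the detection heuristic above: a column of parseable timestamps spanning more than one second is converted in place to float seconds since the Unix epoch, and the returned metadata (if any) flags the X axis as datetime-based:

```python
import pandas as pd

from sigima.io.signal.funcs import _detect_datetime_col

df = pd.DataFrame(
    {
        "timestamp": ["2024-01-01 00:00:00", "2024-01-01 00:00:10", "2024-01-01 00:00:20"],
        "value": [1.0, 2.0, 3.0],
    }
)
converted, dt_meta = _detect_datetime_col(df)
# converted["timestamp"] now holds float timestamps (seconds since the epoch)
# dt_meta is a small dict flagging the X column as datetime, or None if nothing matched
```
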
@dataclass
class CSVData:
    """Data structure for CSV file contents.

    This dataclass encapsulates all the data extracted from a CSV file,
    including the actual XY data, labels, units, and metadata.

    Attributes:
        xydata: Numpy array containing X and Y data columns
        xlabel: Label for the X axis
        xunit: Unit for the X axis
        ylabels: List of labels for Y columns
        yunits: List of units for Y columns
        header: Optional header text from the CSV file
        datetime_metadata: Optional dict with datetime conversion info
        column_metadata: Optional dict with constant-value column metadata
    """

    xydata: np.ndarray
    xlabel: str | None = None
    xunit: str | None = None
    ylabels: list[str] | None = None
    yunits: list[str] | None = None
    header: str | None = None
    datetime_metadata: dict | None = None
    column_metadata: dict | None = None

def read_csv(
    filename: str,
    worker: CallbackWorkerProtocol | None = None,
) -> CSVData:
    """Read CSV data and return parsed components including datetime metadata.

    Args:
        filename: CSV file name
        worker: Callback worker object

    Returns:
        CSVData object containing all parsed CSV components
    """
    xydata, xlabel, xunit, ylabels, yunits = None, None, None, None, None
    header, datetime_metadata, column_metadata = None, None, {}

    # The first attempt is to read the CSV file assuming it has no header because it
    # won't raise an error if the first line is data. If it fails, we try to read it
    # with a header, and if it fails again, we try to skip some lines before reading
    # the data.

    skiprows = None

    # Begin by reading the first 100 lines to search for a line that could mark the
    # beginning of the data after it (e.g., a line '#DATA' or other).
    first_100_lines = read_first_n_lines(filename, n=100).splitlines()
    for data_header in DATA_HEADERS:
        if data_header in first_100_lines:
            # Skip the lines before the data header
            skiprows = first_100_lines.index(data_header) + 1
            break

    # First attempt: no header (try to read with different delimiters)
    read_without_header = True
    df, decimal, delimiter = _read_df_without_header(filename, skiprows=skiprows)

    # Second attempt: with header
    if df is None:
        df, decimal, delimiter = _read_df_with_header(filename)

    if df is None:
        raise ValueError("Unable to read CSV file (format not supported)")

    # At this stage, we have a DataFrame with column names, but we don't know
    # if the first line is a header or data. We try to read the first line as
    # a header, and if it fails, we read it as data.
    try:
        # Try to convert columns to float - if first column is datetime, this will
        # fail and we know we have a header
        first_col_numeric = pd.to_numeric(df.columns[0], errors="coerce")
        if pd.notna(first_col_numeric):
            # First column name is numeric, might be data
            df.columns.astype(float)
            # This means the first line is data, so we re-read it, but
            # without the header:
            read_without_header = True
    except (ValueError, TypeError):  # TypeError can occur with pandas >= 2.2
        read_without_header = False
        # This means that the first line is a header, so we already have the data
        # without missing values.
        # However, it also means that there could be text information preceding
        # the header. Let's try to read it and put it in `header` variable.

        # 1. We read only the first 1000 lines to avoid reading the whole file
        # 2. We keep only the lines beginning with a comment character
        # 3. We join the lines to create a single string
        header = ""
        with open(filename, "r", encoding="utf-8") as file:
            for _ in range(1000):
                line = file.readline()
                if line.startswith("#"):
                    header += line
                else:
                    break
        # Remove the last line if it contains the column names:
        last_line = header.splitlines()[-1] if header.splitlines() else ""
        if str(df.columns[0]) in last_line:
            header = "\n".join(header.splitlines()[:-1])

    # Now we read the whole file with the correct options
    try:
        df = read_csv_by_chunks(
            filename,
            worker=worker,
            decimal=decimal,
            delimiter=delimiter,
            header=None if read_without_header else "infer",
            skiprows=skiprows,
            comment="#",
        )
    except pd.errors.ParserError:
        # If chunked reading fails (e.g., ragged CSV), try different approaches
        df = None
        # Try with python engine (more flexible)
        for skip in [skiprows, 0, 9, 10, 15, 20]:  # Try different skiprows values
            if df is not None:
                break
            try:
                df = pd.read_csv(
                    filename,
                    decimal=decimal,
                    delimiter=delimiter,
                    header=None if read_without_header else "infer",
                    skiprows=skip,
                    comment="#",
                    engine="python",
                    encoding_errors="ignore",
                )
                break  # Success!
            except (pd.errors.ParserError, ValueError):
                continue

        # If still failing, try auto-detect
        if df is None:
            try:
                df = pd.read_csv(
                    filename,
                    engine="python",
                    encoding_errors="ignore",
                    comment="#",
                )
            except (pd.errors.ParserError, ValueError) as e:
                raise ValueError(f"Unable to parse CSV file: {e}") from e

    # Remove rows and columns where all values are NaN in the DataFrame:
    df = df.dropna(axis=0, how="all").dropna(axis=1, how="all")

    # Check if first row contains header strings (non-numeric values in all columns)
    # This happens when header="infer" fails to detect the header
    if not df.empty and isinstance(df.columns[0], (int, np.integer)):
        # Columns are integers, not strings - header wasn't properly parsed
        first_row = df.iloc[0]
        # Count how many values in first row are non-numeric strings
        non_numeric_count = 0
        for val in first_row:
            try:
                float(val)
            except (ValueError, TypeError):
                if isinstance(val, str):
                    non_numeric_count += 1
        # If most of first row is non-numeric strings, it's likely a header row
        if non_numeric_count / len(first_row) > 0.5:
            # Use first row as column names
            df.columns = first_row.values
            # Drop the first row (header)
            df = df.iloc[1:].reset_index(drop=True)

    # Try to detect datetime columns - check first two columns
    # Often CSV files have an index column, then a datetime column
    if not df.empty and df.shape[1] >= 2:
        df, datetime_metadata = _detect_datetime_col(df)

    # Try to detect metadata columns (constant-value columns like serial numbers)
    # This must be done after datetime detection but before converting to numpy
    if not df.empty and df.shape[1] >= 2:
        df, column_metadata = _detect_metadata_cols(df)

    # Converting to NumPy array
    try:
        xydata = df.to_numpy(float)
    except (ValueError, TypeError):
        # If conversion fails, try converting each column individually
        # and dropping columns that can't be converted
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
        df = df.dropna(axis=1, how="all")
        xydata = df.to_numpy(float)

    if xydata.size == 0:
        raise ValueError(
            f"Unable to read CSV file (no supported data after cleaning): {filename}"
        )

    xlabel, ylabels, xunit, yunits = get_labels_units_from_dataframe(df)

    return CSVData(
        xydata=xydata,
        xlabel=xlabel,
        xunit=xunit,
        ylabels=ylabels,
        yunits=yunits,
        header=header,
        datetime_metadata=datetime_metadata,
        column_metadata=column_metadata,
    )

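Putting the pieces together, a typical call might look like the following sketch (the file path is illustrative; `oscilloscope.csv` in the test data listed above is one candidate):

```python
from sigima.io.signal.funcs import read_csv

csv_data = read_csv("oscilloscope.csv")   # Illustrative path
print(csv_data.xlabel, csv_data.xunit)    # e.g. "Time", "s"
print(csv_data.ylabels, csv_data.yunits)  # one label/unit per Y column
print(csv_data.xydata.shape)              # (n_rows, 1 + number of Y columns)
if csv_data.datetime_metadata:
    print("X column was parsed as datetime")
```
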
def write_csv(
    filename: str,
    xydata: np.ndarray,
    xlabel: str | None,
    xunit: str | None,
    ylabels: list[str] | None,
    yunits: list[str] | None,
    header: str | None,
) -> None:
    """Write CSV data.

    Args:
        filename: CSV file name
        xydata: XY data
        xlabel: X label
        xunit: X unit
        ylabels: Y labels
        yunits: Y units
        header: Header
    """
    labels = ""
    delimiter = ","
    if len(ylabels) == 1:
        ylabels = ["Y"] if not ylabels[0] else ylabels
    elif ylabels:
        ylabels = [
            f"Y{i + 1}" if not label else label for i, label in enumerate(ylabels)
        ]
    if yunits:
        ylabels = [
            f"{label} ({unit})" if unit else label
            for label, unit in zip(ylabels, yunits)
        ]
    if ylabels:
        xlabel = xlabel or "X"
        if xunit:
            xlabel += f" ({xunit})"
        labels = delimiter.join([xlabel] + ylabels)
    df = pd.DataFrame(xydata.T, columns=[xlabel] + ylabels)
    df.to_csv(filename, index=False, header=labels, sep=delimiter)
    # Add header if present
    if header:
        with open(filename, "r+", encoding="utf-8") as file:
            content = file.read()
            file.seek(0, 0)
            file.write(header + "\n" + content)

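A round-trip sketch: note that `xydata` is transposed internally (`xydata.T`), so the input array is expected with one row per column (row 0 is X, the following rows are the Y signals). Paths and values are illustrative:

```python
import numpy as np

from sigima.io.signal.funcs import write_csv

x = np.linspace(0.0, 1.0, 11)
y = np.sin(2 * np.pi * x)
write_csv(
    "sine.csv",                # Illustrative output path
    xydata=np.vstack([x, y]),  # Row 0: X, row 1: single Y signal
    xlabel="Time",
    xunit="s",
    ylabels=["Amplitude"],
    yunits=["V"],
    header="# generated by the write_csv sketch",
)
# Produces a "Time (s),Amplitude (V)" CSV that read_csv above can parse back
```
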
class MCAFile:
    """Class to handle MCA files."""

    def __init__(self, filename: str) -> None:
        self.filename = filename
        self.raw_data: str = ""
        self.xlabel: str | None = None
        self.x: np.ndarray | None = None
        self.y: np.ndarray | None = None
        self.metadata: dict[str, str] = {}

    def __try_decode(self, raw_bytes: bytes) -> str:
        """Try to decode raw bytes with the specified encoding."""
        encodings_to_try = ["utf-8", "utf-8-sig", "latin-1", "cp1252"]
        for enc in encodings_to_try:
            try:
                return raw_bytes.decode(enc)
            except UnicodeDecodeError:
                continue
        # If all attempts fail, use 'utf-8' with replacement
        warnings.warn("All decoding attempts failed. Used 'utf-8' with replacement.")
        return raw_bytes.decode("utf-8", errors="replace")

    def _read_raw_data(self) -> str:
        """Read the raw data from the MCA file, trying multiple encodings."""
        with open(self.filename, "rb") as file:
            raw_bytes = file.read()
        raw_data = self.__try_decode(raw_bytes)
        self.raw_data = raw_data.replace("\r\n", "\n").replace("\r", "\n")

    def _read_section(self, section: str) -> str | None:
        """Read a section from the raw data."""
        pattern = f"(?:.*)(^<<{section}>>$)(.*?)(?:<<.*>>)"
        match = re.search(pattern, self.raw_data, re.DOTALL + re.MULTILINE)
        if match:
            return match.group(2).strip()
        return None

    @staticmethod
    def _infer_string_value(value_str: str) -> str | float | int | datetime.datetime:
        """Infer the type of a string value and convert it accordingly."""
        # Try to convert the value to a number or datetime
        try:
            if value_str.isdigit():
                value = int(value_str)
            else:
                try:
                    value = float(value_str)
                except ValueError:
                    # Try to parse as datetime
                    try:
                        value = datetime.datetime.strptime(
                            value_str, "%m/%d/%Y %H:%M:%S"
                        )
                    except ValueError:
                        value = value_str  # Keep as string
        except ValueError:
            value = value_str
        return value

    def _extract_metadata_from_section(
        self, section: str
    ) -> dict[str, str | float | int | datetime.datetime]:
        """Extract metadata from a specific section."""
        section_contents = self._read_section(section)
        if section_contents is None:
            return {}
        metadata = {}
        patterns = (r"(.*?) - (.*?)$", r"(.*?)\s*: \s*(.*)$", r"(.*?)\s*=\s*(.*);")
        for line in section_contents.splitlines():
            for pattern in patterns:
                match = re.match(pattern, line)
                if match:
                    key, value_str = match.groups()
                    metadata[key.strip()] = self._infer_string_value(value_str.strip())
                    break
        return metadata

    def read(self) -> None:
        """Read the MCA file and extract data and metadata."""
        self._read_raw_data()
        self.metadata = self._extract_metadata_from_section("PMCA SPECTRUM")
        additional_metadata = self._extract_metadata_from_section("DPP STATUS")
        self.metadata.update(additional_metadata)
        data_section = self._read_section("DATA")
        self.y = np.fromstring(data_section, sep=" ") if data_section else None
        if self.y is not None:
            self.x = np.arange(len(self.y))
        cal_section = self._read_section("CALIBRATION")
        if cal_section:
            cal_metadata = self._extract_metadata_from_section(cal_section)
            self.xlabel = cal_metadata.get("LABEL")
            cal_data = np.array(
                [
                    [float(v) for v in val.split(" ")]
                    for val in cal_section.splitlines()[1:]
                ]
            )
            self.x = scipy.interpolate.interp1d(
                cal_data[:, 0],
                cal_data[:, 1],
                bounds_error=False,
                fill_value="extrapolate",
            )(self.x)
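Finally, a short usage sketch for the MCA reader (the test data listed above ships a `spectrum.mca` sample; the path here is illustrative):

```python
from sigima.io.signal.funcs import MCAFile

mca = MCAFile("spectrum.mca")    # Illustrative path
mca.read()
print(mca.metadata)              # Key/value pairs from the <<PMCA SPECTRUM>> / <<DPP STATUS>> sections
print(mca.x.shape, mca.y.shape)  # Channel (or calibrated) axis and counts
```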