sigima 0.0.1.dev0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (264)
  1. sigima/__init__.py +142 -2
  2. sigima/client/__init__.py +105 -0
  3. sigima/client/base.py +780 -0
  4. sigima/client/remote.py +469 -0
  5. sigima/client/stub.py +814 -0
  6. sigima/client/utils.py +90 -0
  7. sigima/config.py +444 -0
  8. sigima/data/logo/Sigima.svg +135 -0
  9. sigima/data/tests/annotations.json +798 -0
  10. sigima/data/tests/curve_fitting/exponential_fit.txt +511 -0
  11. sigima/data/tests/curve_fitting/gaussian_fit.txt +100 -0
  12. sigima/data/tests/curve_fitting/piecewiseexponential_fit.txt +1022 -0
  13. sigima/data/tests/curve_fitting/polynomial_fit.txt +100 -0
  14. sigima/data/tests/curve_fitting/twohalfgaussian_fit.txt +1000 -0
  15. sigima/data/tests/curve_formats/bandwidth.txt +201 -0
  16. sigima/data/tests/curve_formats/boxcar.npy +0 -0
  17. sigima/data/tests/curve_formats/datetime.txt +1001 -0
  18. sigima/data/tests/curve_formats/dynamic_parameters.txt +4000 -0
  19. sigima/data/tests/curve_formats/fw1e2.txt +301 -0
  20. sigima/data/tests/curve_formats/fwhm.txt +319 -0
  21. sigima/data/tests/curve_formats/multiple_curves.csv +29 -0
  22. sigima/data/tests/curve_formats/noised_saw.mat +0 -0
  23. sigima/data/tests/curve_formats/oscilloscope.csv +111 -0
  24. sigima/data/tests/curve_formats/other/other2/recursive2.txt +5 -0
  25. sigima/data/tests/curve_formats/other/recursive1.txt +5 -0
  26. sigima/data/tests/curve_formats/paracetamol.npy +0 -0
  27. sigima/data/tests/curve_formats/paracetamol.txt +1010 -0
  28. sigima/data/tests/curve_formats/paracetamol_dx_dy.csv +1000 -0
  29. sigima/data/tests/curve_formats/paracetamol_dy.csv +1001 -0
  30. sigima/data/tests/curve_formats/pulse1.npy +0 -0
  31. sigima/data/tests/curve_formats/pulse2.npy +0 -0
  32. sigima/data/tests/curve_formats/simple.txt +5 -0
  33. sigima/data/tests/curve_formats/spectrum.mca +2139 -0
  34. sigima/data/tests/curve_formats/square2.npy +0 -0
  35. sigima/data/tests/curve_formats/step.npy +0 -0
  36. sigima/data/tests/fabry-perot1.jpg +0 -0
  37. sigima/data/tests/fabry-perot2.jpg +0 -0
  38. sigima/data/tests/flower.npy +0 -0
  39. sigima/data/tests/image_formats/NF 180338201.scor-data +11003 -0
  40. sigima/data/tests/image_formats/binary_image.npy +0 -0
  41. sigima/data/tests/image_formats/binary_image.png +0 -0
  42. sigima/data/tests/image_formats/centroid_test.npy +0 -0
  43. sigima/data/tests/image_formats/coordinated_text/complex_image.txt +10011 -0
  44. sigima/data/tests/image_formats/coordinated_text/complex_ref_image.txt +10010 -0
  45. sigima/data/tests/image_formats/coordinated_text/image.txt +15 -0
  46. sigima/data/tests/image_formats/coordinated_text/image2.txt +14 -0
  47. sigima/data/tests/image_formats/coordinated_text/image_no_unit_no_label.txt +14 -0
  48. sigima/data/tests/image_formats/coordinated_text/image_with_nan.txt +15 -0
  49. sigima/data/tests/image_formats/coordinated_text/image_with_unit.txt +14 -0
  50. sigima/data/tests/image_formats/fiber.csv +480 -0
  51. sigima/data/tests/image_formats/fiber.jpg +0 -0
  52. sigima/data/tests/image_formats/fiber.png +0 -0
  53. sigima/data/tests/image_formats/fiber.txt +480 -0
  54. sigima/data/tests/image_formats/gaussian_spot_with_noise.npy +0 -0
  55. sigima/data/tests/image_formats/mr-brain.dcm +0 -0
  56. sigima/data/tests/image_formats/noised_gaussian.mat +0 -0
  57. sigima/data/tests/image_formats/sif_reader/nd_lum_image_no_glue.sif +0 -0
  58. sigima/data/tests/image_formats/sif_reader/raman1.sif +0 -0
  59. sigima/data/tests/image_formats/tiling.txt +10 -0
  60. sigima/data/tests/image_formats/uint16.tiff +0 -0
  61. sigima/data/tests/image_formats/uint8.tiff +0 -0
  62. sigima/data/tests/laser_beam/TEM00_z_13.jpg +0 -0
  63. sigima/data/tests/laser_beam/TEM00_z_18.jpg +0 -0
  64. sigima/data/tests/laser_beam/TEM00_z_23.jpg +0 -0
  65. sigima/data/tests/laser_beam/TEM00_z_30.jpg +0 -0
  66. sigima/data/tests/laser_beam/TEM00_z_35.jpg +0 -0
  67. sigima/data/tests/laser_beam/TEM00_z_40.jpg +0 -0
  68. sigima/data/tests/laser_beam/TEM00_z_45.jpg +0 -0
  69. sigima/data/tests/laser_beam/TEM00_z_50.jpg +0 -0
  70. sigima/data/tests/laser_beam/TEM00_z_55.jpg +0 -0
  71. sigima/data/tests/laser_beam/TEM00_z_60.jpg +0 -0
  72. sigima/data/tests/laser_beam/TEM00_z_65.jpg +0 -0
  73. sigima/data/tests/laser_beam/TEM00_z_70.jpg +0 -0
  74. sigima/data/tests/laser_beam/TEM00_z_75.jpg +0 -0
  75. sigima/data/tests/laser_beam/TEM00_z_80.jpg +0 -0
  76. sigima/enums.py +195 -0
  77. sigima/io/__init__.py +123 -0
  78. sigima/io/base.py +311 -0
  79. sigima/io/common/__init__.py +5 -0
  80. sigima/io/common/basename.py +164 -0
  81. sigima/io/common/converters.py +189 -0
  82. sigima/io/common/objmeta.py +181 -0
  83. sigima/io/common/textreader.py +58 -0
  84. sigima/io/convenience.py +157 -0
  85. sigima/io/enums.py +17 -0
  86. sigima/io/ftlab.py +395 -0
  87. sigima/io/image/__init__.py +9 -0
  88. sigima/io/image/base.py +177 -0
  89. sigima/io/image/formats.py +1016 -0
  90. sigima/io/image/funcs.py +414 -0
  91. sigima/io/signal/__init__.py +9 -0
  92. sigima/io/signal/base.py +129 -0
  93. sigima/io/signal/formats.py +290 -0
  94. sigima/io/signal/funcs.py +723 -0
  95. sigima/objects/__init__.py +260 -0
  96. sigima/objects/base.py +937 -0
  97. sigima/objects/image/__init__.py +88 -0
  98. sigima/objects/image/creation.py +556 -0
  99. sigima/objects/image/object.py +524 -0
  100. sigima/objects/image/roi.py +904 -0
  101. sigima/objects/scalar/__init__.py +57 -0
  102. sigima/objects/scalar/common.py +215 -0
  103. sigima/objects/scalar/geometry.py +502 -0
  104. sigima/objects/scalar/table.py +784 -0
  105. sigima/objects/shape.py +290 -0
  106. sigima/objects/signal/__init__.py +133 -0
  107. sigima/objects/signal/constants.py +27 -0
  108. sigima/objects/signal/creation.py +1428 -0
  109. sigima/objects/signal/object.py +444 -0
  110. sigima/objects/signal/roi.py +274 -0
  111. sigima/params.py +405 -0
  112. sigima/proc/__init__.py +96 -0
  113. sigima/proc/base.py +381 -0
  114. sigima/proc/decorator.py +330 -0
  115. sigima/proc/image/__init__.py +513 -0
  116. sigima/proc/image/arithmetic.py +335 -0
  117. sigima/proc/image/base.py +260 -0
  118. sigima/proc/image/detection.py +519 -0
  119. sigima/proc/image/edges.py +329 -0
  120. sigima/proc/image/exposure.py +406 -0
  121. sigima/proc/image/extraction.py +458 -0
  122. sigima/proc/image/filtering.py +219 -0
  123. sigima/proc/image/fourier.py +147 -0
  124. sigima/proc/image/geometry.py +661 -0
  125. sigima/proc/image/mathops.py +340 -0
  126. sigima/proc/image/measurement.py +195 -0
  127. sigima/proc/image/morphology.py +155 -0
  128. sigima/proc/image/noise.py +107 -0
  129. sigima/proc/image/preprocessing.py +182 -0
  130. sigima/proc/image/restoration.py +235 -0
  131. sigima/proc/image/threshold.py +217 -0
  132. sigima/proc/image/transformations.py +393 -0
  133. sigima/proc/signal/__init__.py +376 -0
  134. sigima/proc/signal/analysis.py +206 -0
  135. sigima/proc/signal/arithmetic.py +551 -0
  136. sigima/proc/signal/base.py +262 -0
  137. sigima/proc/signal/extraction.py +60 -0
  138. sigima/proc/signal/features.py +310 -0
  139. sigima/proc/signal/filtering.py +484 -0
  140. sigima/proc/signal/fitting.py +276 -0
  141. sigima/proc/signal/fourier.py +259 -0
  142. sigima/proc/signal/mathops.py +420 -0
  143. sigima/proc/signal/processing.py +580 -0
  144. sigima/proc/signal/stability.py +175 -0
  145. sigima/proc/title_formatting.py +227 -0
  146. sigima/proc/validation.py +272 -0
  147. sigima/tests/__init__.py +7 -0
  148. sigima/tests/common/__init__.py +0 -0
  149. sigima/tests/common/arithmeticparam_unit_test.py +26 -0
  150. sigima/tests/common/basename_unit_test.py +126 -0
  151. sigima/tests/common/client_unit_test.py +412 -0
  152. sigima/tests/common/converters_unit_test.py +77 -0
  153. sigima/tests/common/decorator_unit_test.py +176 -0
  154. sigima/tests/common/examples_unit_test.py +104 -0
  155. sigima/tests/common/kernel_normalization_unit_test.py +242 -0
  156. sigima/tests/common/roi_basic_unit_test.py +73 -0
  157. sigima/tests/common/roi_geometry_unit_test.py +171 -0
  158. sigima/tests/common/scalar_builder_unit_test.py +142 -0
  159. sigima/tests/common/scalar_unit_test.py +991 -0
  160. sigima/tests/common/shape_unit_test.py +183 -0
  161. sigima/tests/common/stat_unit_test.py +138 -0
  162. sigima/tests/common/title_formatting_unit_test.py +338 -0
  163. sigima/tests/common/tools_coordinates_unit_test.py +60 -0
  164. sigima/tests/common/transformations_unit_test.py +178 -0
  165. sigima/tests/common/validation_unit_test.py +205 -0
  166. sigima/tests/conftest.py +129 -0
  167. sigima/tests/data.py +998 -0
  168. sigima/tests/env.py +280 -0
  169. sigima/tests/guiutils.py +163 -0
  170. sigima/tests/helpers.py +532 -0
  171. sigima/tests/image/__init__.py +28 -0
  172. sigima/tests/image/binning_unit_test.py +128 -0
  173. sigima/tests/image/blob_detection_unit_test.py +312 -0
  174. sigima/tests/image/centroid_unit_test.py +170 -0
  175. sigima/tests/image/check_2d_array_unit_test.py +63 -0
  176. sigima/tests/image/contour_unit_test.py +172 -0
  177. sigima/tests/image/convolution_unit_test.py +178 -0
  178. sigima/tests/image/datatype_unit_test.py +67 -0
  179. sigima/tests/image/edges_unit_test.py +155 -0
  180. sigima/tests/image/enclosingcircle_unit_test.py +88 -0
  181. sigima/tests/image/exposure_unit_test.py +223 -0
  182. sigima/tests/image/fft2d_unit_test.py +189 -0
  183. sigima/tests/image/filtering_unit_test.py +166 -0
  184. sigima/tests/image/geometry_unit_test.py +654 -0
  185. sigima/tests/image/hough_circle_unit_test.py +147 -0
  186. sigima/tests/image/imageobj_unit_test.py +737 -0
  187. sigima/tests/image/morphology_unit_test.py +71 -0
  188. sigima/tests/image/noise_unit_test.py +57 -0
  189. sigima/tests/image/offset_correction_unit_test.py +72 -0
  190. sigima/tests/image/operation_unit_test.py +518 -0
  191. sigima/tests/image/peak2d_limits_unit_test.py +41 -0
  192. sigima/tests/image/peak2d_unit_test.py +133 -0
  193. sigima/tests/image/profile_unit_test.py +159 -0
  194. sigima/tests/image/projections_unit_test.py +121 -0
  195. sigima/tests/image/restoration_unit_test.py +141 -0
  196. sigima/tests/image/roi2dparam_unit_test.py +53 -0
  197. sigima/tests/image/roi_advanced_unit_test.py +588 -0
  198. sigima/tests/image/roi_grid_unit_test.py +279 -0
  199. sigima/tests/image/spectrum2d_unit_test.py +40 -0
  200. sigima/tests/image/threshold_unit_test.py +91 -0
  201. sigima/tests/io/__init__.py +0 -0
  202. sigima/tests/io/addnewformat_unit_test.py +125 -0
  203. sigima/tests/io/convenience_funcs_unit_test.py +470 -0
  204. sigima/tests/io/coordinated_text_format_unit_test.py +495 -0
  205. sigima/tests/io/datetime_csv_unit_test.py +198 -0
  206. sigima/tests/io/imageio_formats_test.py +41 -0
  207. sigima/tests/io/ioregistry_unit_test.py +69 -0
  208. sigima/tests/io/objmeta_unit_test.py +87 -0
  209. sigima/tests/io/readobj_unit_test.py +130 -0
  210. sigima/tests/io/readwriteobj_unit_test.py +67 -0
  211. sigima/tests/signal/__init__.py +0 -0
  212. sigima/tests/signal/analysis_unit_test.py +135 -0
  213. sigima/tests/signal/check_1d_arrays_unit_test.py +169 -0
  214. sigima/tests/signal/convolution_unit_test.py +404 -0
  215. sigima/tests/signal/datetime_unit_test.py +176 -0
  216. sigima/tests/signal/fft1d_unit_test.py +303 -0
  217. sigima/tests/signal/filters_unit_test.py +403 -0
  218. sigima/tests/signal/fitting_unit_test.py +929 -0
  219. sigima/tests/signal/fwhm_unit_test.py +111 -0
  220. sigima/tests/signal/noise_unit_test.py +128 -0
  221. sigima/tests/signal/offset_correction_unit_test.py +34 -0
  222. sigima/tests/signal/operation_unit_test.py +489 -0
  223. sigima/tests/signal/peakdetection_unit_test.py +145 -0
  224. sigima/tests/signal/processing_unit_test.py +657 -0
  225. sigima/tests/signal/pulse/__init__.py +112 -0
  226. sigima/tests/signal/pulse/crossing_times_unit_test.py +123 -0
  227. sigima/tests/signal/pulse/plateau_detection_unit_test.py +102 -0
  228. sigima/tests/signal/pulse/pulse_unit_test.py +1824 -0
  229. sigima/tests/signal/roi_advanced_unit_test.py +392 -0
  230. sigima/tests/signal/signalobj_unit_test.py +603 -0
  231. sigima/tests/signal/stability_unit_test.py +431 -0
  232. sigima/tests/signal/uncertainty_unit_test.py +611 -0
  233. sigima/tests/vistools.py +1030 -0
  234. sigima/tools/__init__.py +59 -0
  235. sigima/tools/checks.py +290 -0
  236. sigima/tools/coordinates.py +308 -0
  237. sigima/tools/datatypes.py +26 -0
  238. sigima/tools/image/__init__.py +97 -0
  239. sigima/tools/image/detection.py +451 -0
  240. sigima/tools/image/exposure.py +77 -0
  241. sigima/tools/image/extraction.py +48 -0
  242. sigima/tools/image/fourier.py +260 -0
  243. sigima/tools/image/geometry.py +190 -0
  244. sigima/tools/image/preprocessing.py +165 -0
  245. sigima/tools/signal/__init__.py +86 -0
  246. sigima/tools/signal/dynamic.py +254 -0
  247. sigima/tools/signal/features.py +135 -0
  248. sigima/tools/signal/filtering.py +171 -0
  249. sigima/tools/signal/fitting.py +1171 -0
  250. sigima/tools/signal/fourier.py +466 -0
  251. sigima/tools/signal/interpolation.py +70 -0
  252. sigima/tools/signal/peakdetection.py +126 -0
  253. sigima/tools/signal/pulse.py +1626 -0
  254. sigima/tools/signal/scaling.py +50 -0
  255. sigima/tools/signal/stability.py +258 -0
  256. sigima/tools/signal/windowing.py +90 -0
  257. sigima/worker.py +79 -0
  258. sigima-1.0.0.dist-info/METADATA +233 -0
  259. sigima-1.0.0.dist-info/RECORD +262 -0
  260. {sigima-0.0.1.dev0.dist-info → sigima-1.0.0.dist-info}/licenses/LICENSE +29 -29
  261. sigima-0.0.1.dev0.dist-info/METADATA +0 -60
  262. sigima-0.0.1.dev0.dist-info/RECORD +0 -6
  263. {sigima-0.0.1.dev0.dist-info → sigima-1.0.0.dist-info}/WHEEL +0 -0
  264. {sigima-0.0.1.dev0.dist-info → sigima-1.0.0.dist-info}/top_level.txt +0 -0
sigima/io/signal/funcs.py
@@ -0,0 +1,723 @@
+ # Copyright (c) DataLab Platform Developers, BSD 3-Clause license, see LICENSE file.
+
+ """
+ I/O signal functions
+ """
+
+ # pylint: disable=invalid-name  # Allows short reference names like x, y, ...
+
+ from __future__ import annotations
+
+ import datetime
+ import re
+ import warnings
+ from dataclasses import dataclass
+ from typing import TextIO
+
+ import numpy as np
+ import pandas as pd
+ import scipy.interpolate
+
+ from sigima.io.common.textreader import count_lines, read_first_n_lines
+ from sigima.objects.signal.constants import (
+     DATETIME_X_FORMAT_KEY,
+     DATETIME_X_KEY,
+     DEFAULT_DATETIME_FORMAT,
+ )
+ from sigima.worker import CallbackWorkerProtocol
+
+
+ def get_labels_units_from_dataframe(
+     df: pd.DataFrame,
+ ) -> tuple[str, list[str], str, list[str]]:
+     """Get labels and units from a DataFrame.
+
+     Args:
+         df: DataFrame
+
+     Returns:
+         Tuple (xlabel, ylabels, xunit, yunits)
+     """
+     # Reading X,Y labels
+     xlabel = str(df.columns[0])
+     ylabels = [str(col) for col in df.columns[1:]]
+
+     # Retrieving units from labels
+     xunit = ""
+     yunits = [""] * len(ylabels)
+     pattern = r"([\S ]*) \(([\S]*)\)"
+     match = re.match(pattern, xlabel)
+     if match is not None:
+         xlabel, xunit = match.groups()
+     for i, ylabel in enumerate(ylabels):
+         match = re.match(pattern, ylabel)
+         if match is not None:
+             ylabels[i], yunits[i] = match.groups()
+
+     return xlabel, ylabels, xunit, yunits
+
+
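As an aside, here is a minimal sketch of what the label/unit pattern extracts; the column names below are made up for illustration:

import pandas as pd

df = pd.DataFrame({"Time (s)": [0.0, 1.0], "Voltage (V)": [0.1, 0.2]})
xlabel, ylabels, xunit, yunits = get_labels_units_from_dataframe(df)
# xlabel == "Time", xunit == "s", ylabels == ["Voltage"], yunits == ["V"]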
+ def read_csv_by_chunks(
+     fname_or_fileobj: str | TextIO,
+     nlines: int | None = None,
+     worker: CallbackWorkerProtocol | None = None,
+     decimal: str = ".",
+     delimiter: str | None = None,
+     header: int | str | None = "infer",
+     skiprows: int | None = None,
+     nrows: int | None = None,
+     comment: str | None = None,
+     chunksize: int = 1000,
+ ) -> pd.DataFrame:
+     """Read CSV data with primitive options, using pandas `read_csv` defaults,
+     and reading the data in chunks through the iterator interface.
+
+     Args:
+         fname_or_fileobj: CSV file name or text stream object
+         nlines: Number of lines contained in the file (this argument is mandatory
+             if `fname_or_fileobj` is a text stream object: counting lines from a
+             text stream is not efficient, especially if one already has access to
+             the initial text content from which the text stream was made)
+         worker: Callback worker object
+         decimal: Decimal character
+         delimiter: Delimiter
+         header: Header line
+         skiprows: Skip rows
+         nrows: Number of rows to read
+         comment: Comment character
+         chunksize: Chunk size
+
+     Returns:
+         DataFrame
+     """
+     if isinstance(fname_or_fileobj, str):
+         nlines = count_lines(fname_or_fileobj)
+     elif nlines is None:
+         raise ValueError("Argument `nlines` must be passed for text streams")
+     # Read data in chunks and concatenate them at the end, so that the progress
+     # callback can be invoked after each chunk and an intermediate result can be
+     # returned if the operation is canceled.
+     chunks = []
+     for chunk in pd.read_csv(
+         fname_or_fileobj,
+         decimal=decimal,
+         delimiter=delimiter,
+         header=header,
+         skiprows=skiprows,
+         nrows=nrows,
+         comment=comment,
+         chunksize=chunksize,
+         encoding_errors="ignore",
+     ):
+         chunks.append(chunk)
+         # Compute the progression based on the number of lines read so far
+         if worker is not None:
+             worker.set_progress(sum(len(chunk) for chunk in chunks) / nlines)
+             if worker.was_canceled():
+                 break
+     return pd.concat(chunks)
+
+
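For context, a usage sketch with a minimal worker; `PrintWorker` is a hypothetical stand-in that only implements the two `CallbackWorkerProtocol` methods used above, and the file name is made up:

class PrintWorker:
    """Hypothetical worker reporting progress to stdout."""

    def set_progress(self, value: float) -> None:
        print(f"read: {value:.0%}")

    def was_canceled(self) -> bool:
        return False

df = read_csv_by_chunks("data.csv", worker=PrintWorker(), chunksize=500)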
+ DATA_HEADERS = [
+     "#DATA",  # Generic
+     "START_OF_DATA",  # Various logging devices
+     ">>>>>Begin Spectral Data<<<<<",  # Ocean Optics
+     ">>>Begin Data<<<",  # Ocean Optics (alternative)
+     ">>>Begin Spectrum Data<<<",  # Avantes
+     "# Data Start",  # Andor, Horiba, Mass Spectrometry (Agilent, Thermo Fisher, ...)
+     ">DATA START<",  # Mass Spectrometry, Chromatography
+     "BEGIN DATA",  # Mass Spectrometry, Chromatography
+     "<Data>",  # Mass Spectrometry (XML-based)
+     "##Start Data",  # Bruker (X-ray, Raman, FTIR)
+     "[DataStart]",  # PerkinElmer (FTIR, UV-Vis)
+     "BEGIN SPECTRUM",  # PerkinElmer
+     "%% Data Start %%",  # LabVIEW, MATLAB
+     "---Begin Data---",  # General scientific instruments
+     "===DATA START===",  # Industrial/scientific devices
+ ]
+
+
+ def _read_df_without_header(
+     filename: str, skiprows: int | None = None
+ ) -> tuple[pd.DataFrame | None, str, str]:
+     """Try to read a CSV file without header, testing various delimiters and
+     decimal characters.
+
+     Args:
+         filename: CSV file name
+         skiprows: Number of rows to skip at the beginning of the file
+
+     Returns:
+         A tuple (DataFrame if successful, None otherwise; decimal used;
+         delimiter used)
+     """
+     for decimal in (".", ","):
+         for delimiter in (",", ";", r"\s+"):
+             try:
+                 df = pd.read_csv(
+                     filename,
+                     decimal=decimal,
+                     delimiter=delimiter,
+                     header=None,
+                     comment="#",
+                     nrows=1000,  # Read only the first 1000 lines
+                     encoding_errors="ignore",
+                     skiprows=skiprows,
+                     dtype=float,  # Keep dtype to validate delimiter detection
+                 )
+                 break
+             except (pd.errors.ParserError, ValueError):
+                 df = None
+         if df is not None:
+             break
+     return df, decimal, delimiter
+
+
+ def _read_df_with_header(filename: str) -> tuple[pd.DataFrame | None, str, str]:
+     """Try to read a CSV file with header, testing various delimiters and
+     decimal characters.
+
+     Args:
+         filename: CSV file name
+
+     Returns:
+         A tuple (DataFrame if successful, None otherwise; decimal used;
+         delimiter used)
+     """
+     for decimal in (".", ","):
+         for delimiter in (",", ";", r"\s+"):
+             # Headers are generally in the first 10 lines, so we try to skip the
+             # minimum number of lines before reading the data:
+             for skiprows in range(20):
+                 try:
+                     df = pd.read_csv(
+                         filename,
+                         decimal=decimal,
+                         delimiter=delimiter,
+                         skiprows=skiprows,
+                         comment="#",
+                         nrows=1000,  # Read only the first 1000 lines
+                         encoding_errors="ignore",
+                     )
+                     # Validate: the CSV should have at least 2 columns (x and y);
+                     # if there is only 1 column, the delimiter is likely wrong
+                     if df.shape[1] >= 2:
+                         break  # Good delimiter found
+                     df = None  # Try next delimiter
+                 except (pd.errors.ParserError, ValueError):
+                     df = None
+             if df is not None:
+                 break
+         if df is not None:
+             break
+     return df, decimal, delimiter
+
+
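To illustrate the sniffing strategy, consider a semicolon-separated file with comma decimals (a common European locale convention); the file name and contents are hypothetical:

# Suppose "eu.csv" contains lines such as "0,0;1,5" and "1,0;2,5" (no header).
df, decimal, delimiter = _read_df_without_header("eu.csv")
# On success: decimal == "," and delimiter == ";", and df holds two float columns.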
+ def _detect_metadata_cols(df: pd.DataFrame) -> tuple[pd.DataFrame, dict]:
+     """Detect columns containing constant/single-value metadata.
+
+     Columns with a single unique value (excluding NaN) across all rows are treated
+     as metadata rather than data columns. These are typically instrument serial
+     numbers, experiment IDs, or other constant identifiers.
+
+     Args:
+         df: Input DataFrame
+
+     Returns:
+         A tuple (DataFrame with metadata columns removed,
+         dict of metadata key-value pairs)
+     """
+     metadata = {}
+     cols_to_drop = []
+
+     # Start from column 1 (skip X column) and check for constant-value columns
+     for col_idx in range(1, df.shape[1]):
+         col_data = df.iloc[:, col_idx]
+         col_name = df.columns[col_idx]
+
+         # Get unique non-NaN values
+         unique_values = col_data.dropna().unique()
+
+         # If the column has exactly one unique value (excluding NaN), it's metadata
+         if len(unique_values) == 1:
+             # Store the metadata
+             value = unique_values[0]
+             # Try to convert to an appropriate type (keep as string if necessary)
+             try:
+                 # Try int first
+                 if float(value).is_integer():
+                     value = int(float(value))
+                 else:
+                     value = float(value)
+             except (ValueError, TypeError):
+                 # Keep as string
+                 value = str(value)
+
+             metadata[str(col_name)] = value
+             cols_to_drop.append(col_name)  # Store column name, not index
+
+     # Drop metadata columns from the DataFrame
+     if cols_to_drop:
+         df = df.drop(columns=cols_to_drop)
+
+     return df, metadata
+
+
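A short sketch of the constant-column heuristic (the frame contents are hypothetical):

df = pd.DataFrame({"t": [0, 1, 2], "y": [3.0, 4.5, 5.1], "serial": [42, 42, 42]})
df, metadata = _detect_metadata_cols(df)
# metadata == {"serial": 42}; the "serial" column is removed from df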
+ def _detect_datetime_col(df: pd.DataFrame) -> tuple[pd.DataFrame, dict | None]:
+     """Try to detect the presence of a datetime column in a DataFrame.
+
+     Detect if the first or second column contains datetime values, and convert it
+     to float timestamps if so.
+
+     Args:
+         df: Input DataFrame
+
+     Returns:
+         A tuple (DataFrame with datetime column converted, datetime metadata dict)
+     """
+     datetime_col_idx = None
+
+     for col_idx in [0, 1]:  # Check first two columns
+         col_data = df.iloc[:, col_idx]
+         # Try to convert to datetime
+         try:
+             # Attempt to parse as datetime.
+             # Note: format="mixed" was causing failures in some pandas versions,
+             # so we use a warnings filter to suppress the UserWarning instead
+             with warnings.catch_warnings():
+                 warnings.filterwarnings(
+                     "ignore",
+                     message="Could not infer format",
+                     category=UserWarning,
+                 )
+                 datetime_series = pd.to_datetime(col_data, errors="coerce")
+             # Check if most values were successfully converted (>90%)
+             valid_ratio = datetime_series.notna().sum() / len(datetime_series)
+
+             # Skip if the conversion ratio is too low
+             if valid_ratio <= 0.9:
+                 continue
+
+             # Check if values have reasonable variation and are actual dates
+             unique_dates = datetime_series.dropna().nunique()
+             if unique_dates <= 1:
+                 continue
+
+             # Check the date range - these should be reasonable dates, not epoch times
+             min_date = datetime_series.min()
+             max_date = datetime_series.max()
+             # Dates should be after 1900 and the range should be > 1 sec
+             valid_datetime = (
+                 min_date.year >= 1900 and (max_date - min_date).total_seconds() > 1.0
+             )
+
+             if valid_datetime:
+                 # This is a datetime column!
+                 datetime_col_idx = col_idx
+                 break
+         except (ValueError, TypeError, pd.errors.OutOfBoundsDatetime):
+             # Not a datetime column, continue checking
+             pass
+
+     datetime_metadata = None
+
+     if datetime_col_idx is not None:
+         # Convert the datetime column to float timestamps
+         col_data = df.iloc[:, datetime_col_idx]
+         with warnings.catch_warnings():
+             warnings.filterwarnings(
+                 "ignore", message="Could not infer format", category=UserWarning
+             )
+             datetime_series = pd.to_datetime(col_data, errors="coerce")
+         x_float = datetime_series.astype(np.int64) / 1e9
+         # Store datetime metadata (the unit will be stored in the xunit attribute)
+         datetime_metadata = {
+             DATETIME_X_KEY: True,
+             DATETIME_X_FORMAT_KEY: DEFAULT_DATETIME_FORMAT,
+         }
+
+         # If datetime is in column 1 and column 0 looks like an index, drop column 0
+         if datetime_col_idx == 1:
+             try:
+                 # Try to convert the first column to numbers - if sequential,
+                 # it's likely an index column
+                 first_col = pd.to_numeric(df.iloc[:, 0], errors="coerce")
+                 if first_col.notna().all():
+                     # Check if it's a sequential index (1, 2, 3, ...)
+                     diffs = first_col.diff().dropna()
+                     if (diffs == 1).sum() / len(diffs) > 0.9:
+                         # Drop the index column
+                         df = df.iloc[:, 1:].copy()
+                         datetime_col_idx = 0  # Now datetime is in position 0
+             except (ValueError, TypeError):
+                 pass
+
+         # Replace the datetime column with float timestamps
+         df.iloc[:, datetime_col_idx] = x_float
+
+     return df, datetime_metadata
+
+
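A sketch of the detection path for an index column followed by timestamps; the values are hypothetical:

df = pd.DataFrame(
    {
        "idx": [1, 2, 3],
        "when": ["2024-01-01 00:00:00", "2024-01-01 00:00:01", "2024-01-01 00:00:02"],
    }
)
df, dt_meta = _detect_datetime_col(df)
# "idx" is dropped as a sequential index, "when" is replaced by float epoch
# seconds, and dt_meta flags the X axis as datetime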
+ @dataclass
+ class CSVData:
+     """Data structure for CSV file contents.
+
+     This dataclass encapsulates all the data extracted from a CSV file,
+     including the actual XY data, labels, units, and metadata.
+
+     Attributes:
+         xydata: NumPy array containing X and Y data columns
+         xlabel: Label for the X axis
+         xunit: Unit for the X axis
+         ylabels: List of labels for Y columns
+         yunits: List of units for Y columns
+         header: Optional header text from the CSV file
+         datetime_metadata: Optional dict with datetime conversion info
+         column_metadata: Optional dict with constant-value column metadata
+     """
+
+     xydata: np.ndarray
+     xlabel: str | None = None
+     xunit: str | None = None
+     ylabels: list[str] | None = None
+     yunits: list[str] | None = None
+     header: str | None = None
+     datetime_metadata: dict | None = None
+     column_metadata: dict | None = None
+
+
+ def read_csv(
+     filename: str,
+     worker: CallbackWorkerProtocol | None = None,
+ ) -> CSVData:
+     """Read CSV data and return parsed components including datetime metadata.
+
+     Args:
+         filename: CSV file name
+         worker: Callback worker object
+
+     Returns:
+         CSVData object containing all parsed CSV components
+     """
+     xydata, xlabel, xunit, ylabels, yunits = None, None, None, None, None
+     header, datetime_metadata, column_metadata = None, None, {}
+
+     # The first attempt is to read the CSV file assuming it has no header, because
+     # this won't raise an error if the first line is data. If it fails, we try to
+     # read the file with a header, and if that fails again, we try skipping some
+     # lines before reading the data.
+
+     skiprows = None
+
+     # Begin by reading the first 100 lines to search for a line that could mark
+     # the beginning of the data after it (e.g. a '#DATA' line or similar).
+     first_100_lines = read_first_n_lines(filename, n=100).splitlines()
+     for data_header in DATA_HEADERS:
+         if data_header in first_100_lines:
+             # Skip the lines before the data header
+             skiprows = first_100_lines.index(data_header) + 1
+             break
+
+     # First attempt: no header (try to read with different delimiters)
+     read_without_header = True
+     df, decimal, delimiter = _read_df_without_header(filename, skiprows=skiprows)
+
+     # Second attempt: with header
+     if df is None:
+         df, decimal, delimiter = _read_df_with_header(filename)
+
+     if df is None:
+         raise ValueError("Unable to read CSV file (format not supported)")
+
+     # At this stage, we have a DataFrame with column names, but we don't know
+     # if the first line is a header or data. We try to read the first line as
+     # a header, and if that fails, we read it as data.
+     try:
+         # Try to convert the column names to float - if the first column is
+         # datetime, this will fail and we know we have a header
+         first_col_numeric = pd.to_numeric(df.columns[0], errors="coerce")
+         if pd.notna(first_col_numeric):
+             # First column name is numeric, might be data
+             df.columns.astype(float)
+             # This means the first line is data, so we re-read it, but
+             # without the header:
+             read_without_header = True
+     except (ValueError, TypeError):  # TypeError can occur with pandas >= 2.2
+         read_without_header = False
+         # This means that the first line is a header, so we already have the data
+         # without missing values.
+         # However, it also means that there could be text information preceding
+         # the header. Let's try to read it and put it in the `header` variable.
+
+         # 1. We read only the first 1000 lines to avoid reading the whole file
+         # 2. We keep only the lines beginning with a comment character
+         # 3. We join the lines to create a single string
+         header = ""
+         with open(filename, "r", encoding="utf-8") as file:
+             for _ in range(1000):
+                 line = file.readline()
+                 if line.startswith("#"):
+                     header += line
+                 else:
+                     break
+         # Remove the last line if it contains the column names:
+         last_line = header.splitlines()[-1] if header.splitlines() else ""
+         if str(df.columns[0]) in last_line:
+             header = "\n".join(header.splitlines()[:-1])
+
+     # Now we read the whole file with the correct options
+     try:
+         df = read_csv_by_chunks(
+             filename,
+             worker=worker,
+             decimal=decimal,
+             delimiter=delimiter,
+             header=None if read_without_header else "infer",
+             skiprows=skiprows,
+             comment="#",
+         )
+     except pd.errors.ParserError:
+         # If chunked reading fails (e.g. ragged CSV), try different approaches
+         df = None
+         # Try with the Python engine (more flexible)
+         for skip in [skiprows, 0, 9, 10, 15, 20]:  # Try different skiprows values
+             if df is not None:
+                 break
+             try:
+                 df = pd.read_csv(
+                     filename,
+                     decimal=decimal,
+                     delimiter=delimiter,
+                     header=None if read_without_header else "infer",
+                     skiprows=skip,
+                     comment="#",
+                     engine="python",
+                     encoding_errors="ignore",
+                 )
+                 break  # Success!
+             except (pd.errors.ParserError, ValueError):
+                 continue
+
+         # If still failing, fall back on pandas' own auto-detection
+         if df is None:
+             try:
+                 df = pd.read_csv(
+                     filename,
+                     engine="python",
+                     encoding_errors="ignore",
+                     comment="#",
+                 )
+             except (pd.errors.ParserError, ValueError) as e:
+                 raise ValueError(f"Unable to parse CSV file: {e}") from e
+
+     # Remove rows and columns where all values are NaN in the DataFrame:
+     df = df.dropna(axis=0, how="all").dropna(axis=1, how="all")
+
+     # Check if the first row contains header strings (non-numeric values in all
+     # columns). This happens when header="infer" fails to detect the header.
+     if not df.empty and isinstance(df.columns[0], (int, np.integer)):
+         # Columns are integers, not strings - the header wasn't properly parsed
+         first_row = df.iloc[0]
+         # Count how many values in the first row are non-numeric strings
+         non_numeric_count = 0
+         for val in first_row:
+             try:
+                 float(val)
+             except (ValueError, TypeError):
+                 if isinstance(val, str):
+                     non_numeric_count += 1
+         # If most of the first row is non-numeric strings, it's likely a header row
+         if non_numeric_count / len(first_row) > 0.5:
+             # Use the first row as column names
+             df.columns = first_row.values
+             # Drop the first row (header)
+             df = df.iloc[1:].reset_index(drop=True)
+
+     # Try to detect datetime columns - check the first two columns.
+     # CSV files often have an index column, then a datetime column.
+     if not df.empty and df.shape[1] >= 2:
+         df, datetime_metadata = _detect_datetime_col(df)
+
+     # Try to detect metadata columns (constant-value columns like serial numbers).
+     # This must be done after datetime detection but before converting to NumPy.
+     if not df.empty and df.shape[1] >= 2:
+         df, column_metadata = _detect_metadata_cols(df)
+
+     # Converting to NumPy array
+     try:
+         xydata = df.to_numpy(float)
+     except (ValueError, TypeError):
+         # If conversion fails, try converting each column individually
+         # and dropping columns that can't be converted
+         for col in df.columns:
+             df[col] = pd.to_numeric(df[col], errors="coerce")
+         df = df.dropna(axis=1, how="all")
+         xydata = df.to_numpy(float)
+
+     if xydata.size == 0:
+         raise ValueError(
+             f"Unable to read CSV file (no supported data after cleaning): {filename}"
+         )
+
+     xlabel, ylabels, xunit, yunits = get_labels_units_from_dataframe(df)
+
+     return CSVData(
+         xydata=xydata,
+         xlabel=xlabel,
+         xunit=xunit,
+         ylabels=ylabels,
+         yunits=yunits,
+         header=header,
+         datetime_metadata=datetime_metadata,
+         column_metadata=column_metadata,
+     )
+
+
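Putting it together, a minimal usage sketch (the file name is hypothetical):

csv_data = read_csv("measurements.csv")
x, y = csv_data.xydata[:, 0], csv_data.xydata[:, 1]
print(csv_data.xlabel, csv_data.xunit, csv_data.ylabels, csv_data.yunits)
if csv_data.datetime_metadata:
    print("X axis holds datetime values converted to epoch seconds")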
+ def write_csv(
+     filename: str,
+     xydata: np.ndarray,
+     xlabel: str | None,
+     xunit: str | None,
+     ylabels: list[str] | None,
+     yunits: list[str] | None,
+     header: str | None,
+ ) -> None:
+     """Write CSV data.
+
+     Args:
+         filename: CSV file name
+         xydata: XY data
+         xlabel: X label
+         xunit: X unit
+         ylabels: Y labels
+         yunits: Y units
+         header: Header
+     """
+     labels = ""
+     delimiter = ","
+     if len(ylabels) == 1:
+         ylabels = ["Y"] if not ylabels[0] else ylabels
+     elif ylabels:
+         ylabels = [
+             f"Y{i + 1}" if not label else label for i, label in enumerate(ylabels)
+         ]
+     if yunits:
+         ylabels = [
+             f"{label} ({unit})" if unit else label
+             for label, unit in zip(ylabels, yunits)
+         ]
+     if ylabels:
+         xlabel = xlabel or "X"
+         if xunit:
+             xlabel += f" ({xunit})"
+         labels = delimiter.join([xlabel] + ylabels)
+     df = pd.DataFrame(xydata.T, columns=[xlabel] + ylabels)
+     df.to_csv(filename, index=False, header=labels, sep=delimiter)
+     # Add the header if present
+     if header:
+         with open(filename, "r+", encoding="utf-8") as file:
+             content = file.read()
+             file.seek(0, 0)
+             file.write(header + "\n" + content)
+
+
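A round-trip sketch; note that `write_csv` expects one row per signal (the array is transposed before writing). The file name is hypothetical:

t = np.linspace(0, 1, 50)
xydata = np.vstack([t, np.sin(t), np.cos(t)])  # shape (3, 50): X row + two Y rows
write_csv("out.csv", xydata, "Time", "s", ["sin", "cos"], ["V", "V"], header=None)
roundtrip = read_csv("out.csv")  # labels and units recovered from "Time (s)", etc.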
+ class MCAFile:
+     """Class to handle MCA files."""
+
+     def __init__(self, filename: str) -> None:
+         self.filename = filename
+         self.raw_data: str = ""
+         self.xlabel: str | None = None
+         self.x: np.ndarray | None = None
+         self.y: np.ndarray | None = None
+         self.metadata: dict[str, str] = {}
+
+     def __try_decode(self, raw_bytes: bytes) -> str:
+         """Try to decode raw bytes, testing several encodings."""
+         encodings_to_try = ["utf-8", "utf-8-sig", "latin-1", "cp1252"]
+         for enc in encodings_to_try:
+             try:
+                 return raw_bytes.decode(enc)
+             except UnicodeDecodeError:
+                 continue
+         # If all attempts fail, use 'utf-8' with replacement
+         warnings.warn("All decoding attempts failed. Used 'utf-8' with replacement.")
+         return raw_bytes.decode("utf-8", errors="replace")
+
+     def _read_raw_data(self) -> None:
+         """Read the raw data from the MCA file, trying multiple encodings."""
+         with open(self.filename, "rb") as file:
+             raw_bytes = file.read()
+         raw_data = self.__try_decode(raw_bytes)
+         self.raw_data = raw_data.replace("\r\n", "\n").replace("\r", "\n")
+
+     def _read_section(self, section: str) -> str | None:
+         """Read a section from the raw data."""
+         pattern = f"(?:.*)(^<<{section}>>$)(.*?)(?:<<.*>>)"
+         match = re.search(pattern, self.raw_data, re.DOTALL + re.MULTILINE)
+         if match:
+             return match.group(2).strip()
+         return None
+
+     @staticmethod
+     def _infer_string_value(value_str: str) -> str | float | int | datetime.datetime:
+         """Infer the type of a string value and convert it accordingly."""
+         # Try to convert the value to a number or datetime
+         try:
+             if value_str.isdigit():
+                 value = int(value_str)
+             else:
+                 try:
+                     value = float(value_str)
+                 except ValueError:
+                     # Try to parse as datetime
+                     try:
+                         value = datetime.datetime.strptime(
+                             value_str, "%m/%d/%Y %H:%M:%S"
+                         )
+                     except ValueError:
+                         value = value_str  # Keep as string
+         except ValueError:
+             value = value_str
+         return value
+
+     def _extract_metadata_from_section(
+         self, section: str
+     ) -> dict[str, str | float | int | datetime.datetime]:
+         """Extract metadata from a specific section."""
+         section_contents = self._read_section(section)
+         if section_contents is None:
+             return {}
+         metadata = {}
+         patterns = (r"(.*?) - (.*?)$", r"(.*?)\s*: \s*(.*)$", r"(.*?)\s*=\s*(.*);")
+         for line in section_contents.splitlines():
+             for pattern in patterns:
+                 match = re.match(pattern, line)
+                 if match:
+                     key, value_str = match.groups()
+                     metadata[key.strip()] = self._infer_string_value(value_str.strip())
+                     break
+         return metadata
+
+     def read(self) -> None:
+         """Read the MCA file and extract data and metadata."""
+         self._read_raw_data()
+         self.metadata = self._extract_metadata_from_section("PMCA SPECTRUM")
+         additional_metadata = self._extract_metadata_from_section("DPP STATUS")
+         self.metadata.update(additional_metadata)
+         data_section = self._read_section("DATA")
+         self.y = np.fromstring(data_section, sep=" ") if data_section else None
+         if self.y is not None:
+             self.x = np.arange(len(self.y))
+             cal_section = self._read_section("CALIBRATION")
+             if cal_section:
+                 cal_metadata = self._extract_metadata_from_section(cal_section)
+                 self.xlabel = cal_metadata.get("LABEL")
+                 cal_data = np.array(
+                     [
+                         [float(v) for v in val.split(" ")]
+                         for val in cal_section.splitlines()[1:]
+                     ]
+                 )
+                 self.x = scipy.interpolate.interp1d(
+                     cal_data[:, 0],
+                     cal_data[:, 1],
+                     bounds_error=False,
+                     fill_value="extrapolate",
+                 )(self.x)
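Finally, a usage sketch for the MCA reader (the file name is hypothetical):

mca = MCAFile("spectrum.mca")
mca.read()
if mca.y is not None:
    print(len(mca.metadata), "metadata entries")
    print(mca.x[:5], mca.y[:5])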