tsp 1.8.1-py3-none-any.whl → 1.10.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. tsp/__init__.py +11 -11
  2. tsp/__meta__.py +1 -1
  3. tsp/concatenation.py +159 -153
  4. tsp/core.py +1306 -1162
  5. tsp/data/2023-01-06_755-test-Dataset_2031-Constant_Over_Interval-Hourly-Ground_Temperature-Thermistor_Automated.timeserie.csv +4 -4
  6. tsp/data/2023-01-06_755-test.metadata.txt +208 -208
  7. tsp/data/NTGS_example_csv.csv +6 -6
  8. tsp/data/NTGS_example_slash_dates.csv +6 -6
  9. tsp/data/NTGS_gtr_example_excel.xlsx +0 -0
  10. tsp/data/example_geotop.csv +5240 -5240
  11. tsp/data/example_gtnp.csv +1298 -1298
  12. tsp/data/example_permos.csv +7 -7
  13. tsp/data/ntgs-db-multi.txt +3872 -0
  14. tsp/data/ntgs-db-single.txt +2251 -0
  15. tsp/data/test_geotop_has_space.txt +5 -5
  16. tsp/data/tsp_format_long.csv +10 -0
  17. tsp/data/tsp_format_wide_1.csv +7 -0
  18. tsp/data/tsp_format_wide_2.csv +7 -0
  19. tsp/dataloggers/AbstractReader.py +43 -43
  20. tsp/dataloggers/FG2.py +110 -110
  21. tsp/dataloggers/GP5W.py +114 -114
  22. tsp/dataloggers/Geoprecision.py +34 -34
  23. tsp/dataloggers/HOBO.py +930 -914
  24. tsp/dataloggers/RBRXL800.py +190 -190
  25. tsp/dataloggers/RBRXR420.py +371 -308
  26. tsp/dataloggers/Vemco.py +84 -0
  27. tsp/dataloggers/__init__.py +15 -15
  28. tsp/dataloggers/logr.py +196 -115
  29. tsp/dataloggers/test_files/004448.DAT +2543 -2543
  30. tsp/dataloggers/test_files/004531.DAT +17106 -17106
  31. tsp/dataloggers/test_files/004531.HEX +3587 -3587
  32. tsp/dataloggers/test_files/004534.HEX +3587 -3587
  33. tsp/dataloggers/test_files/010252.dat +1731 -1731
  34. tsp/dataloggers/test_files/010252.hex +1739 -1739
  35. tsp/dataloggers/test_files/010274.hex +1291 -1291
  36. tsp/dataloggers/test_files/010278.hex +3544 -3544
  37. tsp/dataloggers/test_files/012064.dat +1286 -1286
  38. tsp/dataloggers/test_files/012064.hex +1294 -1294
  39. tsp/dataloggers/test_files/012064_modified_start.hex +1294 -0
  40. tsp/dataloggers/test_files/012081.hex +3532 -3532
  41. tsp/dataloggers/test_files/013138_recovery_stamp.hex +1123 -0
  42. tsp/dataloggers/test_files/014037-2007.hex +95 -0
  43. tsp/dataloggers/test_files/019360_20160918_1146_SlumpIslandTopofHill.hex +11253 -0
  44. tsp/dataloggers/test_files/019360_20160918_1146_SlumpIslandTopofHill.xls +0 -0
  45. tsp/dataloggers/test_files/07B1592.DAT +1483 -1483
  46. tsp/dataloggers/test_files/07B1592.HEX +1806 -1806
  47. tsp/dataloggers/test_files/07B4450.DAT +2234 -2234
  48. tsp/dataloggers/test_files/07B4450.HEX +2559 -2559
  49. tsp/dataloggers/test_files/2022018_2025-09-18T22-16-16.txt +36 -0
  50. tsp/dataloggers/test_files/2022018_2025-09-18T22-16-16_raw.csv +2074 -0
  51. tsp/dataloggers/test_files/2022018_2025-09-18T22-16-16_temp.csv +2074 -0
  52. tsp/dataloggers/test_files/2025004_2025-12-02T17-07-28_cfg.txt +30 -0
  53. tsp/dataloggers/test_files/2025004_2025-12-02T17-07-28_raw.csv +35 -0
  54. tsp/dataloggers/test_files/2025004_2025-12-02T17-07-28_temp.csv +35 -0
  55. tsp/dataloggers/test_files/204087.xlsx +0 -0
  56. tsp/dataloggers/test_files/Asc-1455As02.000 +2982 -0
  57. tsp/dataloggers/test_files/Asc-1456As02.000 +2992 -0
  58. tsp/dataloggers/test_files/Asc-1457As02.000 +2917 -0
  59. tsp/dataloggers/test_files/BGC_BH15_019362_20140610_1253.hex +1729 -0
  60. tsp/dataloggers/test_files/Bin2944.csv +759 -0
  61. tsp/dataloggers/test_files/Bin5494.csv +2972 -0
  62. tsp/dataloggers/test_files/Bin6786.csv +272 -0
  63. tsp/dataloggers/test_files/FG2_399.csv +9881 -9881
  64. tsp/dataloggers/test_files/GP5W.csv +1121 -1121
  65. tsp/dataloggers/test_files/GP5W_260.csv +1884 -1884
  66. tsp/dataloggers/test_files/GP5W_270.csv +2210 -2210
  67. tsp/dataloggers/test_files/H08-030-08_HOBOware.csv +998 -998
  68. tsp/dataloggers/test_files/Minilog-II-T_350763_20190711_1.csv +2075 -0
  69. tsp/dataloggers/test_files/Minilog-II-T_350769_20190921_1.csv +6384 -0
  70. tsp/dataloggers/test_files/Minilog-II-T_354284_20190921_1.csv +4712 -0
  71. tsp/dataloggers/test_files/Minilog-T_7943_20140920_1.csv +5826 -0
  72. tsp/dataloggers/test_files/Minilog-T_8979_20140806_1.csv +2954 -0
  73. tsp/dataloggers/test_files/Minilog-T_975_20110824_1.csv +4343 -0
  74. tsp/dataloggers/test_files/RBR_01.dat +1046 -1046
  75. tsp/dataloggers/test_files/RBR_02.dat +2426 -2426
  76. tsp/dataloggers/test_files/RI03b_062831_20240905_1801.rsk +0 -0
  77. tsp/dataloggers/test_files/RI03b_062831_20240905_1801.xlsx +0 -0
  78. tsp/dataloggers/test_files/RSTDT2055.csv +2152 -2152
  79. tsp/dataloggers/test_files/U23-001_HOBOware.csv +1001 -1001
  80. tsp/dataloggers/test_files/hobo-negative-2.txt +6396 -6396
  81. tsp/dataloggers/test_files/hobo-negative-3.txt +5593 -5593
  82. tsp/dataloggers/test_files/hobo-positive-number-1.txt +1000 -1000
  83. tsp/dataloggers/test_files/hobo-positive-number-2.csv +1003 -1003
  84. tsp/dataloggers/test_files/hobo-positive-number-3.csv +1133 -1133
  85. tsp/dataloggers/test_files/hobo-positive-number-4.csv +1209 -1209
  86. tsp/dataloggers/test_files/hobo2.csv +8702 -8702
  87. tsp/dataloggers/test_files/hobo_1_AB.csv +21732 -21732
  88. tsp/dataloggers/test_files/hobo_1_AB_Details.txt +133 -133
  89. tsp/dataloggers/test_files/hobo_1_AB_classic.csv +4373 -4373
  90. tsp/dataloggers/test_files/hobo_1_AB_defaults.csv +21732 -21732
  91. tsp/dataloggers/test_files/hobo_1_AB_minimal.txt +1358 -1358
  92. tsp/dataloggers/test_files/hobo_1_AB_var2.csv +3189 -3189
  93. tsp/dataloggers/test_files/hobo_1_AB_var3.csv +2458 -2458
  94. tsp/dataloggers/test_files/logR_ULogC16-32_1.csv +106 -106
  95. tsp/dataloggers/test_files/logR_ULogC16-32_2.csv +100 -100
  96. tsp/dataloggers/test_files/mon_3_Ta_2010-08-18_2013-02-08.txt +21724 -21724
  97. tsp/dataloggers/test_files/rbr_001.dat +1133 -1133
  98. tsp/dataloggers/test_files/rbr_001.hex +1139 -1139
  99. tsp/dataloggers/test_files/rbr_001_no_comment.dat +1132 -1132
  100. tsp/dataloggers/test_files/rbr_001_no_comment.hex +1138 -1138
  101. tsp/dataloggers/test_files/rbr_002.dat +1179 -1179
  102. tsp/dataloggers/test_files/rbr_002.hex +1185 -1185
  103. tsp/dataloggers/test_files/rbr_003.hex +1292 -1292
  104. tsp/dataloggers/test_files/rbr_xl_001.DAT +1105 -1105
  105. tsp/dataloggers/test_files/rbr_xl_002.DAT +1126 -1126
  106. tsp/dataloggers/test_files/rbr_xl_003.DAT +4622 -4622
  107. tsp/dataloggers/test_files/rbr_xl_003.HEX +3587 -3587
  108. tsp/gtnp.py +148 -148
  109. tsp/labels.py +3 -3
  110. tsp/misc.py +90 -90
  111. tsp/physics.py +101 -101
  112. tsp/plots/static.py +388 -374
  113. tsp/readers.py +829 -548
  114. tsp/standardization/__init__.py +0 -0
  115. tsp/standardization/metadata.py +95 -0
  116. tsp/standardization/metadata_ref.py +0 -0
  117. tsp/standardization/validator.py +535 -0
  118. tsp/time.py +45 -45
  119. tsp/tspwarnings.py +27 -15
  120. tsp/utils.py +131 -101
  121. tsp/version.py +1 -1
  122. {tsp-1.8.1.dist-info → tsp-1.10.2.dist-info}/METADATA +95 -86
  123. tsp-1.10.2.dist-info/RECORD +132 -0
  124. {tsp-1.8.1.dist-info → tsp-1.10.2.dist-info}/licenses/LICENSE +674 -674
  125. {tsp-1.8.1.dist-info → tsp-1.10.2.dist-info}/top_level.txt +1 -0
  126. tsp-1.8.1.dist-info/RECORD +0 -94
  127. {tsp-1.8.1.dist-info → tsp-1.10.2.dist-info}/WHEEL +0 -0
tsp/standardization/metadata.py (new file)
@@ -0,0 +1,95 @@
+"""
+Metadata keys described in tsp data format standard. These are prefixed with an underscore
+in the TSP object dictionary
+"""
+standardized_keys = {
+    '_latitude': 'latitude of the site',
+    '_longitude': 'longitude of the site',
+    '_site_id': 'identifier for the site'
+}
+
+
+"""
+Additional keys used by TSP software but not described in tsp format standard.
+May or may not be prefixed with underscore.
+"""
+additional_keys = {
+    '_source_file': 'path to the source data file',
+    'CF': 'dictionary of CF-compliant metadata, able to be used in netCDF files',
+}
+
+
+def dict_to_metadata(d, parent_key='') -> list[str]:
+    lines = []
+    for key, value in d.items():
+        full_key = f"{parent_key}.{key}" if parent_key else key
+        if isinstance(value, dict):
+            lines.extend(dict_to_metadata(value, full_key))
+        else:
+            lines.append(f"# {full_key}={value}")
+    return lines
+
+
+def metadata_to_dict(lines) -> dict:
+    """
+    Convert metadata lines to nested dictionary.
+
+    Args:
+        lines: List of strings like "# key=value" or "# key.subkey=value"
+
+    Returns:
+        Nested dictionary
+    """
+    result = {}
+
+    for line in lines:
+        line = line.strip()
+        if not line.startswith('#'):
+            continue
+
+        line = line[1:].strip()
+
+        if '=' not in line:
+            continue
+
+        key_path, value = line.split('=', 1)
+        key_path = key_path.strip()
+        value = value.strip()
+
+        value = _parse_value(value)
+
+        keys = key_path.split('.')
+
+        current = result
+        for key in keys[:-1]:
+            if key not in current:
+                current[key] = {}
+            current = current[key]
+
+        current[keys[-1]] = value
+
+    return result
+
+
+def _parse_value(value):
+    """Try to parse value as int, float, bool, or leave as string."""
+    # Try boolean
+    if value.lower() in ('true', 'yes'):
+        return True
+    if value.lower() in ('false', 'no'):
+        return False
+
+    # Try int
+    try:
+        return int(value)
+    except ValueError:
+        pass
+
+    # Try float
+    try:
+        return float(value)
+    except ValueError:
+        pass
+
+    # Return as string
+    return value
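For reference, a minimal usage sketch of these helpers (illustrative only, not part of the package diff; it assumes the module is importable as tsp.standardization.metadata and uses made-up metadata values):

from tsp.standardization.metadata import dict_to_metadata, metadata_to_dict

meta = {'latitude': 68.3, 'site': {'id': 'BH-01', 'name': 'test'}}  # hypothetical example values

lines = dict_to_metadata(meta)
# ['# latitude=68.3', '# site.id=BH-01', '# site.name=test']

restored = metadata_to_dict(lines)
# round-trips to {'latitude': 68.3, 'site': {'id': 'BH-01', 'name': 'test'}}
# (_parse_value turns '68.3' back into a float; the other values stay strings)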
tsp/standardization/validator.py (new file)
@@ -0,0 +1,535 @@
+import re
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+
+class Severity(Enum):
+    ERROR = "error"
+    WARNING = "warning"
+
+
+@dataclass
+class ValidationIssue:
+    severity: Severity
+    line_num: Optional[int]
+    message: str
+
+    def __str__(self):
+        line_info = f"Line {self.line_num}: " if self.line_num else ""
+        return f"[{self.severity.value.upper()}] {line_info}{self.message}"
+
+
+class TSPValidator:
+    def __init__(self, filepath: Path):
+        self.filepath = filepath
+        self.issues: list[ValidationIssue] = []
+        self.lines: list[str] = []
+        self.metadata_lines: list[str] = []
+        self.header_line: Optional[str] = None
+        self.header_idx: Optional[int] = None
+        self.data_lines: list[str] = []
+
+    def validate(self) -> list[ValidationIssue]:
+        self._validate_file_extension()
+        self._read_file()
+        self._parse_structure()
+        self._validate_encoding()
+        self._validate_metadata()
+        self._validate_header()
+        self._validate_data_format()
+        self._validate_timestamps()
+        self._validate_missing_data()
+        return self.issues
+
+    def _error(self, message: str, line_num: Optional[int] = None):
+        self.issues.append(ValidationIssue(Severity.ERROR, line_num, message))
+
+    def _warning(self, message: str, line_num: Optional[int] = None):
+        self.issues.append(ValidationIssue(Severity.WARNING, line_num, message))
+
+    def _validate_file_extension(self):
+        if self.filepath.suffix.lower() != '.csv':
+            self._warning(f"File should use .csv extension (found: {self.filepath.suffix})")
+
+    def _read_file(self):
+        try:
+            with open(self.filepath, 'r', encoding='utf-8') as f:
+                self.lines = f.read().splitlines()
+        except UnicodeDecodeError:
+            self._error("File must use UTF-8 encoding")
+            return
+        except Exception as e:
+            self._error(f"Could not read file: {e}")
+
+    def _parse_structure(self):
+        if not self.lines:
+            return
+
+        for idx, line in enumerate(self.lines):
+            if line.startswith('#'):
+                self.metadata_lines.append(line)
+            elif self.header_line is None:
+                self.header_line = line
+                self.header_idx = idx
+            else:
+                if line.strip() == '':
+                    self._error("File must not have blank lines", idx)
+                else:
+                    self.data_lines.append(line)
+
+        if self.header_line is None:
+            self._error("File must contain a header line")
+
+    def _validate_encoding(self):
+        try:
+            with open(self.filepath, 'rb') as f:
+                raw = f.read(3)
+                if raw.startswith(b'\xef\xbb\xbf'):
+                    self._error("File must use UTF-8 without BOM")
+        except Exception:
+            pass
+
+    def _validate_metadata(self):
+        key_pattern = re.compile(r'^[a-z][a-z0-9_]*$')
+
+        for line in self.metadata_lines:
+            if '=' in line:
+                cleaned = line.lstrip('#').strip()
+                if '=' in cleaned:
+                    parts = cleaned.split('=', 1)
+                    key = parts[0].strip()
+                    if not key_pattern.match(key):
+                        self._warning(f"Metadata key should be lowercase with underscores: {key}")
+
+    def _validate_header(self):
+        if not self.header_line:
+            return
+
+        if ',' not in self.header_line and self.header_line.strip():
+            self._error("Header must use comma separator", self.header_idx)
+            return
+
+        headers = self.header_line.split(',')
+
+        if headers[0] != 'timestamp':
+            self._error("First column must be 'timestamp'", self.header_idx)
+
+        if any(h.strip() == '' for h in headers):
+            self._error("Header values must not be blank", self.header_idx)
+
+        if self.header_line.rstrip().endswith(','):
+            self._error("Header must not have trailing comma", self.header_idx)
+
+        if len(headers) != len(set(headers)):
+            self._error("Header values must be unique", self.header_idx)
+
+        column_name_pattern = re.compile(r'^[a-z][a-z0-9_]*$')
+        for h in headers[1:]:
+            if not self._is_depth_value(h) and not column_name_pattern.match(h):
+                self._error(f"Invalid column name (must start with letter, use lowercase/underscore): {h}", self.header_idx)
+
+    def _is_depth_value(self, value: str) -> bool:
+        try:
+            float(value)
+            return bool(re.match(r'^-?[0-9]+(\.[0-9]+)?$', value))
+        except ValueError:
+            return False
+
+    def _is_numeric(self, value: str) -> bool:
+        if value.strip() == '':
+            return True
+        try:
+            float(value)
+            return True
+        except ValueError:
+            return False
+
+    def _normalize_timestamp(self, ts: str) -> str:
+        """Normalize timestamp for comparison by padding to full ISO format"""
+        ts = ts.strip()
+
+        if 'T' not in ts:
+            ts = ts + 'T00:00:00'
+
+        # Check if timezone info is present at the end
+        has_tz = ts.endswith('Z') or re.search(r'[+-]\d{2}:\d{2}$', ts)
+        if not has_tz:
+            ts = ts + 'Z'
+
+        return ts
+
+    def _extract_timezone(self, ts: str) -> Optional[str]:
+        """Extract timezone from timestamp, returns None if no timezone"""
+        ts = ts.strip()
+        if ts.endswith('Z'):
+            return 'Z'
+        match = re.search(r'([+-]\d{2}:\d{2})$', ts)
+        if match:
+            return match.group(1)
+        return None
+
+    def _validate_wide_format(self, headers: list[str]):
+        depths = []
+        for h in headers[1:]:
+            try:
+                depths.append(float(h))
+            except ValueError:
+                self._error(f"Invalid depth value in header: {h}", self.header_idx)
+
+        if depths and depths != sorted(depths):
+            self._error("Depth values must be in ascending order", self.header_idx)
+
+        if len(depths) != len(set(depths)):
+            self._error("Depth values must be unique", self.header_idx)
+
+        timestamps = []
+        timezones = []
+
+        for idx, line in enumerate(self.data_lines, start=self.header_idx + 1):
+            values = line.split(',')
+
+            if len(values) != len(headers):
+                self._error(f"Row has {len(values)} values but header has {len(headers)} columns", idx)
+                continue
+
+            timestamp = values[0]
+            timestamps.append(timestamp)
+
+            tz = self._extract_timezone(timestamp)
+            if tz:
+                timezones.append(tz)
+
+            for i, val in enumerate(values[1:], start=1):
+                if not self._is_numeric(val):
+                    self._error(f"Temperature value must be numeric or empty: '{val}'", idx)
+
+        normalized = [self._normalize_timestamp(ts) for ts in timestamps]
+        if normalized != sorted(normalized):
+            self._error("Timestamps must be in chronological order")
+
+        if len(timestamps) != len(set(timestamps)):
+            self._error("Timestamps must be unique in wide format")
+
+        self._check_timezone_consistency(timezones)
+
+    def _check_timezone_consistency(self, timezones: list[str]):
+        """Check that timezones are consistent and preferably UTC"""
+        if not timezones:
+            return
+
+        unique_tzs = set(timezones)
+        if len(unique_tzs) > 1:
+            self._warning(f"Mixed timezones found: {unique_tzs}. Timestamps should use consistent timezone.")
+
+        non_utc_count = sum(1 for tz in timezones if tz != 'Z')
+        if non_utc_count > 0 and non_utc_count == len(timezones):
+            self._warning(f"Timestamps with timezone should preferably use UTC (Z)")
+
+    def _validate_data_format(self):
+        if not self.header_line or not self.data_lines:
+            return
+
+        headers = self.header_line.split(',')
+
+        if self._is_wide_format(headers):
+            self._validate_wide_format(headers)
+        elif self._is_long_format(headers):
+            self._validate_long_format(headers)
+        else:
+            self._error("Could not determine format (wide or long)")
+
+    def _is_wide_format(self, headers: list[str]) -> bool:
+        return len(headers) > 1 and all(self._is_depth_value(h) for h in headers[1:])
+
+    def _is_long_format(self, headers: list[str]) -> bool:
+        return 'depth' in headers or ('depth_from' in headers and 'depth_to' in headers)
+
+    def _validate_long_format(self, headers: list[str]):
+        has_temp = 'temperature' in headers
+        has_depth = 'depth' in headers
+        has_depth_from = 'depth_from' in headers
+        has_depth_to = 'depth_to' in headers
+        has_site_id = 'site_id' in headers
+
+        if has_depth and (has_depth_from or has_depth_to):
+            self._error("Cannot have both 'depth' and 'depth_from'/'depth_to' columns")
+
+        if not has_depth and not (has_depth_from and has_depth_to):
+            self._error("Long format must have 'depth' or 'depth_from'/'depth_to' columns")
+
+        if has_depth_from != has_depth_to:
+            self._error("Both 'depth_from' and 'depth_to' required for intervals")
+
+        is_extended = has_depth_from and has_depth_to
+
+        if not has_temp and not is_extended:
+            self._error("Long format must include 'temperature' column")
+        elif not has_temp and is_extended:
+            measurement_cols = [h for h in headers if h not in
+                                ['timestamp', 'depth', 'depth_from', 'depth_to', 'site_id']
+                                and not h.endswith('_flag') and not h.endswith('_id')]
+            if not measurement_cols:
+                self._error("Extended format must have at least one measurement column when temperature is omitted")
+
+        seen_combinations = set()
+        site_groups = {}
+        timezones = []
+
+        for idx, line in enumerate(self.data_lines, start=self.header_idx + 1):
+            values = line.split(',')
+
+            if len(values) > len(headers):
+                if not (len(values) == len(headers) + 1 and values[-1].strip() == ''):
+                    self._error(f"Row has {len(values)} values but header has {len(headers)} columns", idx)
+            elif len(values) < len(headers):
+                self._error(f"Row has {len(values)} values but header has {len(headers)} columns", idx)
+                continue
+
+            row_dict = dict(zip(headers, values[:len(headers)]))
+
+            timestamp = row_dict.get('timestamp', '')
+            site_id = row_dict.get('site_id', '')
+
+            tz = self._extract_timezone(timestamp)
+            if tz:
+                timezones.append(tz)
+
+            if has_depth:
+                depth = row_dict.get('depth', '')
+                if depth.strip() == '':
+                    self._error("Depth values must not be missing", idx)
+                elif not self._is_numeric(depth):
+                    self._error(f"Depth must be numeric: '{depth}'", idx)
+
+                combo_key = (timestamp, depth, site_id) if has_site_id else (timestamp, depth)
+            else:
+                depth_from = row_dict.get('depth_from', '')
+                depth_to = row_dict.get('depth_to', '')
+
+                if depth_from.strip() == '' or depth_to.strip() == '':
+                    self._error("Depth interval values must not be missing", idx)
+                    continue
+
+                if not self._is_numeric(depth_from) or not self._is_numeric(depth_to):
+                    self._error(f"Depth values must be numeric", idx)
+                    continue
+
+                try:
+                    df = float(depth_from)
+                    dt = float(depth_to)
+                    if df > dt:
+                        self._error(f"depth_from ({df}) must be <= depth_to ({dt})", idx)
+                except ValueError:
+                    pass
+
+                combo_key = (timestamp, depth_from, depth_to, site_id) if has_site_id else (timestamp, depth_from, depth_to)
+
+            if combo_key in seen_combinations:
+                self._error(f"Duplicate combination found", idx)
+            seen_combinations.add(combo_key)
+
+            group_key = site_id if has_site_id else '_default'
+            if group_key not in site_groups:
+                site_groups[group_key] = []
+
+            # Store both timestamp and depth for ordering validation
+            if has_depth:
+                depth_value = row_dict.get('depth', '')
+                site_groups[group_key].append((timestamp, depth_value, idx))
+            else:
+                depth_from = row_dict.get('depth_from', '')
+                site_groups[group_key].append((timestamp, depth_from, idx))
+
+            if has_temp:
+                temp = row_dict.get('temperature', '')
+                if temp.strip() != '' and not self._is_numeric(temp):
+                    self._error(f"Temperature must be numeric or empty: '{temp}'", idx)
+
+        # Validate ordering
+        for group_key, entries in site_groups.items():
+            if has_site_id:
+                self._validate_site_group_ordering(entries, group_key)
+            else:
+                # Without site_id, just check chronological order
+                timestamps = [self._normalize_timestamp(ts) for ts, _, _ in entries]
+                if timestamps != sorted(timestamps):
+                    self._error("Timestamps must be in chronological order")
+
+        if has_depth_from and has_depth_to:
+            self._check_interval_overlaps(headers)
+
+        self._check_timezone_consistency(timezones)
+
+    def _validate_site_group_ordering(self, entries: list[tuple[str, str, int]], site_id: str):
+        """
+        Validate that entries within a site_id group are ordered either:
+        (a) by depth, with timestamps chronological within each depth, or
+        (b) by timestamp, with depths ascending within each timestamp
+        """
+        if len(entries) <= 1:
+            return
+
+        # Try to determine which ordering pattern is used
+        # Pattern (a): grouped by depth
+        depth_groups = {}
+        for timestamp, depth, idx in entries:
+            try:
+                depth_val = float(depth)
+                if depth_val not in depth_groups:
+                    depth_groups[depth_val] = []
+                depth_groups[depth_val].append((timestamp, idx))
+            except ValueError:
+                continue
+
+        # Check if pattern (a) is valid: depths in order, timestamps chronological within depths
+        pattern_a_valid = True
+        sorted_depths = sorted(depth_groups.keys())
+        current_pos = 0
+
+        for depth in sorted_depths:
+            timestamps_at_depth = depth_groups[depth]
+            # Check if this depth group appears contiguously in the data
+            depth_positions = [idx for ts, idx in timestamps_at_depth]
+            if depth_positions != list(range(min(depth_positions), max(depth_positions) + 1)):
+                pattern_a_valid = False
+                break
+
+            # Check if timestamps are chronological within this depth
+            timestamps = [self._normalize_timestamp(ts) for ts, _ in timestamps_at_depth]
+            if timestamps != sorted(timestamps):
+                pattern_a_valid = False
+                break
+
+            # Check that this depth group comes after previous depths
+            if min(depth_positions) < current_pos:
+                pattern_a_valid = False
+                break
+            current_pos = max(depth_positions) + 1
+
+        # Pattern (b): grouped by timestamp
+        timestamp_groups = {}
+        for timestamp, depth, idx in entries:
+            norm_ts = self._normalize_timestamp(timestamp)
+            if norm_ts not in timestamp_groups:
+                timestamp_groups[norm_ts] = []
+            try:
+                depth_val = float(depth)
+                timestamp_groups[norm_ts].append((depth_val, idx))
+            except ValueError:
+                continue
+
+        # Check if pattern (b) is valid: timestamps in order, depths ascending within timestamps
+        pattern_b_valid = True
+        sorted_timestamps = sorted(timestamp_groups.keys())
+        current_pos = 0
+
+        for timestamp in sorted_timestamps:
+            depths_at_timestamp = timestamp_groups[timestamp]
+            # Check if this timestamp group appears contiguously in the data
+            ts_positions = [idx for depth, idx in depths_at_timestamp]
+            if ts_positions != list(range(min(ts_positions), max(ts_positions) + 1)):
+                pattern_b_valid = False
+                break
+
+            # Check if depths are ascending within this timestamp
+            depths = [d for d, _ in depths_at_timestamp]
+            if depths != sorted(depths):
+                pattern_b_valid = False
+                break
+
+            # Check that this timestamp group comes after previous timestamps
+            if min(ts_positions) < current_pos:
+                pattern_b_valid = False
+                break
+            current_pos = max(ts_positions) + 1
+
+        if not pattern_a_valid and not pattern_b_valid:
+            self._error(
+                f"Rows within site_id '{site_id}' must be ordered either: "
+                "(a) by depth with timestamps chronological within each depth, or "
+                "(b) by timestamp with depths ascending within each timestamp"
+            )
+
+    def _check_interval_overlaps(self, headers: list[str]):
+        has_site_id = 'site_id' in headers
+        intervals_by_timestamp = {}
+
+        for line in self.data_lines:
+            values = line.split(',')
+            row_dict = dict(zip(headers, values[:len(headers)]))
+
+            timestamp = row_dict.get('timestamp', '')
+            site_id = row_dict.get('site_id', '') if has_site_id else '_default'
+            depth_from = row_dict.get('depth_from', '')
+            depth_to = row_dict.get('depth_to', '')
+
+            if not depth_from or not depth_to:
+                continue
+
+            try:
+                df = float(depth_from)
+                dt = float(depth_to)
+            except ValueError:
+                continue
+
+            key = (timestamp, site_id)
+            if key not in intervals_by_timestamp:
+                intervals_by_timestamp[key] = []
+            intervals_by_timestamp[key].append((df, dt))
+
+        for key, intervals in intervals_by_timestamp.items():
+            sorted_intervals = sorted(intervals)
+            for i in range(len(sorted_intervals) - 1):
+                curr_start, curr_end = sorted_intervals[i]
+                next_start, next_end = sorted_intervals[i + 1]
+                if curr_end > next_start:
+                    self._error(f"Overlapping intervals at timestamp {key[0]}: [{curr_start}, {curr_end}] and [{next_start}, {next_end}]")
+
+    def _validate_timestamps(self):
+        iso_pattern = re.compile(
+            r'^\d{4}-\d{2}-\d{2}(T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})?)?$'
+        )
+
+        for idx, line in enumerate(self.data_lines, start=self.header_idx + 1):
+            timestamp = line.split(',')[0]
+            if timestamp.strip() == '':
+                self._error("Timestamp values must not be missing", idx)
+            elif not iso_pattern.match(timestamp):
+                self._error(f"Invalid ISO 8601 timestamp: '{timestamp}'", idx)
+
+    def _validate_missing_data(self):
+        placeholders = ['NA', 'NaN', 'NULL', '-999', 'na', 'nan', 'null', '-9999']
+
+        for idx, line in enumerate(self.data_lines, start=self.header_idx + 1):
+            values = line.split(',')
+            for val in values:
+                if val.strip() in placeholders:
+                    self._error(f"Placeholder values not allowed (use empty string): '{val}'", idx)
+
+
+def validate_tsp_file(filepath: str | Path) -> list[ValidationIssue]:
+    validator = TSPValidator(Path(filepath))
+    return validator.validate()
+
+
+if __name__ == '__main__':
+    import sys
+
+    if len(sys.argv) < 2:
+        print("Usage: python tsp_validator.py <file.csv>")
+        sys.exit(1)
+
+    issues = validate_tsp_file(sys.argv[1])
+
+    if not issues:
+        print("✓ File is valid")
+        sys.exit(0)
+
+    for issue in issues:
+        print(issue)
+
+    error_count = sum(1 for i in issues if i.severity == Severity.ERROR)
+    sys.exit(1 if error_count > 0 else 0)
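For reference, a usage sketch of the new validator called from Python rather than the command line shown above (illustrative only, not part of the package diff; the file name and contents below are hypothetical):

from tsp.standardization.validator import validate_tsp_file, Severity

issues = validate_tsp_file("ground_temperature.csv")  # hypothetical input file
for issue in issues:
    print(issue)  # formatted by ValidationIssue.__str__, e.g. "[ERROR] Line 12: Depth must be numeric: 'x'"
has_errors = any(i.severity == Severity.ERROR for i in issues)

# A minimal wide-format file that the checks above should accept: '#' metadata lines,
# a 'timestamp' header followed by ascending numeric depths, ISO 8601 UTC timestamps,
# and numeric or empty temperature values.
#   # site_id=BH-01
#   timestamp,0.5,1.0,2.0
#   2022-01-01T00:00:00Z,-1.2,-0.8,-0.5
#   2022-01-02T00:00:00Z,-1.4,-0.9,-0.6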