tsp 1.8.1__py3-none-any.whl → 1.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsp/__init__.py +11 -11
- tsp/__meta__.py +1 -1
- tsp/concatenation.py +159 -153
- tsp/core.py +1306 -1162
- tsp/data/2023-01-06_755-test-Dataset_2031-Constant_Over_Interval-Hourly-Ground_Temperature-Thermistor_Automated.timeserie.csv +4 -4
- tsp/data/2023-01-06_755-test.metadata.txt +208 -208
- tsp/data/NTGS_example_csv.csv +6 -6
- tsp/data/NTGS_example_slash_dates.csv +6 -6
- tsp/data/NTGS_gtr_example_excel.xlsx +0 -0
- tsp/data/example_geotop.csv +5240 -5240
- tsp/data/example_gtnp.csv +1298 -1298
- tsp/data/example_permos.csv +7 -7
- tsp/data/ntgs-db-multi.txt +3872 -0
- tsp/data/ntgs-db-single.txt +2251 -0
- tsp/data/test_geotop_has_space.txt +5 -5
- tsp/data/tsp_format_long.csv +10 -0
- tsp/data/tsp_format_wide_1.csv +7 -0
- tsp/data/tsp_format_wide_2.csv +7 -0
- tsp/dataloggers/AbstractReader.py +43 -43
- tsp/dataloggers/FG2.py +110 -110
- tsp/dataloggers/GP5W.py +114 -114
- tsp/dataloggers/Geoprecision.py +34 -34
- tsp/dataloggers/HOBO.py +930 -914
- tsp/dataloggers/RBRXL800.py +190 -190
- tsp/dataloggers/RBRXR420.py +371 -308
- tsp/dataloggers/Vemco.py +84 -0
- tsp/dataloggers/__init__.py +15 -15
- tsp/dataloggers/logr.py +196 -115
- tsp/dataloggers/test_files/004448.DAT +2543 -2543
- tsp/dataloggers/test_files/004531.DAT +17106 -17106
- tsp/dataloggers/test_files/004531.HEX +3587 -3587
- tsp/dataloggers/test_files/004534.HEX +3587 -3587
- tsp/dataloggers/test_files/010252.dat +1731 -1731
- tsp/dataloggers/test_files/010252.hex +1739 -1739
- tsp/dataloggers/test_files/010274.hex +1291 -1291
- tsp/dataloggers/test_files/010278.hex +3544 -3544
- tsp/dataloggers/test_files/012064.dat +1286 -1286
- tsp/dataloggers/test_files/012064.hex +1294 -1294
- tsp/dataloggers/test_files/012064_modified_start.hex +1294 -0
- tsp/dataloggers/test_files/012081.hex +3532 -3532
- tsp/dataloggers/test_files/013138_recovery_stamp.hex +1123 -0
- tsp/dataloggers/test_files/014037-2007.hex +95 -0
- tsp/dataloggers/test_files/019360_20160918_1146_SlumpIslandTopofHill.hex +11253 -0
- tsp/dataloggers/test_files/019360_20160918_1146_SlumpIslandTopofHill.xls +0 -0
- tsp/dataloggers/test_files/07B1592.DAT +1483 -1483
- tsp/dataloggers/test_files/07B1592.HEX +1806 -1806
- tsp/dataloggers/test_files/07B4450.DAT +2234 -2234
- tsp/dataloggers/test_files/07B4450.HEX +2559 -2559
- tsp/dataloggers/test_files/2022018_2025-09-18T22-16-16.txt +36 -0
- tsp/dataloggers/test_files/2022018_2025-09-18T22-16-16_raw.csv +2074 -0
- tsp/dataloggers/test_files/2022018_2025-09-18T22-16-16_temp.csv +2074 -0
- tsp/dataloggers/test_files/2025004_2025-12-02T17-07-28_cfg.txt +30 -0
- tsp/dataloggers/test_files/2025004_2025-12-02T17-07-28_raw.csv +35 -0
- tsp/dataloggers/test_files/2025004_2025-12-02T17-07-28_temp.csv +35 -0
- tsp/dataloggers/test_files/204087.xlsx +0 -0
- tsp/dataloggers/test_files/Asc-1455As02.000 +2982 -0
- tsp/dataloggers/test_files/Asc-1456As02.000 +2992 -0
- tsp/dataloggers/test_files/Asc-1457As02.000 +2917 -0
- tsp/dataloggers/test_files/BGC_BH15_019362_20140610_1253.hex +1729 -0
- tsp/dataloggers/test_files/Bin2944.csv +759 -0
- tsp/dataloggers/test_files/Bin5494.csv +2972 -0
- tsp/dataloggers/test_files/Bin6786.csv +272 -0
- tsp/dataloggers/test_files/FG2_399.csv +9881 -9881
- tsp/dataloggers/test_files/GP5W.csv +1121 -1121
- tsp/dataloggers/test_files/GP5W_260.csv +1884 -1884
- tsp/dataloggers/test_files/GP5W_270.csv +2210 -2210
- tsp/dataloggers/test_files/H08-030-08_HOBOware.csv +998 -998
- tsp/dataloggers/test_files/Minilog-II-T_350763_20190711_1.csv +2075 -0
- tsp/dataloggers/test_files/Minilog-II-T_350769_20190921_1.csv +6384 -0
- tsp/dataloggers/test_files/Minilog-II-T_354284_20190921_1.csv +4712 -0
- tsp/dataloggers/test_files/Minilog-T_7943_20140920_1.csv +5826 -0
- tsp/dataloggers/test_files/Minilog-T_8979_20140806_1.csv +2954 -0
- tsp/dataloggers/test_files/Minilog-T_975_20110824_1.csv +4343 -0
- tsp/dataloggers/test_files/RBR_01.dat +1046 -1046
- tsp/dataloggers/test_files/RBR_02.dat +2426 -2426
- tsp/dataloggers/test_files/RI03b_062831_20240905_1801.rsk +0 -0
- tsp/dataloggers/test_files/RI03b_062831_20240905_1801.xlsx +0 -0
- tsp/dataloggers/test_files/RSTDT2055.csv +2152 -2152
- tsp/dataloggers/test_files/U23-001_HOBOware.csv +1001 -1001
- tsp/dataloggers/test_files/hobo-negative-2.txt +6396 -6396
- tsp/dataloggers/test_files/hobo-negative-3.txt +5593 -5593
- tsp/dataloggers/test_files/hobo-positive-number-1.txt +1000 -1000
- tsp/dataloggers/test_files/hobo-positive-number-2.csv +1003 -1003
- tsp/dataloggers/test_files/hobo-positive-number-3.csv +1133 -1133
- tsp/dataloggers/test_files/hobo-positive-number-4.csv +1209 -1209
- tsp/dataloggers/test_files/hobo2.csv +8702 -8702
- tsp/dataloggers/test_files/hobo_1_AB.csv +21732 -21732
- tsp/dataloggers/test_files/hobo_1_AB_Details.txt +133 -133
- tsp/dataloggers/test_files/hobo_1_AB_classic.csv +4373 -4373
- tsp/dataloggers/test_files/hobo_1_AB_defaults.csv +21732 -21732
- tsp/dataloggers/test_files/hobo_1_AB_minimal.txt +1358 -1358
- tsp/dataloggers/test_files/hobo_1_AB_var2.csv +3189 -3189
- tsp/dataloggers/test_files/hobo_1_AB_var3.csv +2458 -2458
- tsp/dataloggers/test_files/logR_ULogC16-32_1.csv +106 -106
- tsp/dataloggers/test_files/logR_ULogC16-32_2.csv +100 -100
- tsp/dataloggers/test_files/mon_3_Ta_2010-08-18_2013-02-08.txt +21724 -21724
- tsp/dataloggers/test_files/rbr_001.dat +1133 -1133
- tsp/dataloggers/test_files/rbr_001.hex +1139 -1139
- tsp/dataloggers/test_files/rbr_001_no_comment.dat +1132 -1132
- tsp/dataloggers/test_files/rbr_001_no_comment.hex +1138 -1138
- tsp/dataloggers/test_files/rbr_002.dat +1179 -1179
- tsp/dataloggers/test_files/rbr_002.hex +1185 -1185
- tsp/dataloggers/test_files/rbr_003.hex +1292 -1292
- tsp/dataloggers/test_files/rbr_xl_001.DAT +1105 -1105
- tsp/dataloggers/test_files/rbr_xl_002.DAT +1126 -1126
- tsp/dataloggers/test_files/rbr_xl_003.DAT +4622 -4622
- tsp/dataloggers/test_files/rbr_xl_003.HEX +3587 -3587
- tsp/gtnp.py +148 -148
- tsp/labels.py +3 -3
- tsp/misc.py +90 -90
- tsp/physics.py +101 -101
- tsp/plots/static.py +388 -374
- tsp/readers.py +829 -548
- tsp/standardization/__init__.py +0 -0
- tsp/standardization/metadata.py +95 -0
- tsp/standardization/metadata_ref.py +0 -0
- tsp/standardization/validator.py +535 -0
- tsp/time.py +45 -45
- tsp/tspwarnings.py +27 -15
- tsp/utils.py +131 -101
- tsp/version.py +1 -1
- {tsp-1.8.1.dist-info → tsp-1.10.2.dist-info}/METADATA +95 -86
- tsp-1.10.2.dist-info/RECORD +132 -0
- {tsp-1.8.1.dist-info → tsp-1.10.2.dist-info}/licenses/LICENSE +674 -674
- {tsp-1.8.1.dist-info → tsp-1.10.2.dist-info}/top_level.txt +1 -0
- tsp-1.8.1.dist-info/RECORD +0 -94
- {tsp-1.8.1.dist-info → tsp-1.10.2.dist-info}/WHEEL +0 -0
tsp/standardization/__init__.py
File without changes
tsp/standardization/metadata.py
@@ -0,0 +1,95 @@
+"""
+Metadata keys described in tsp data format standard. These are prefixed with an underscore
+in the TSP object dictionary
+"""
+standardized_keys = {
+    '_latitude': 'latitude of the site',
+    '_longitude': 'longitude of the site',
+    '_site_id': 'identifier for the site'
+}
+
+
+"""
+Additional keys used by TSP software but not described in tsp format standard.
+May or may not be prefixed with underscore.
+"""
+additional_keys = {
+    '_source_file': 'path to the source data file',
+    'CF': 'dictionary of CF-compliant metadata, able to be used in netCDF files',
+}
+
+
+def dict_to_metadata(d, parent_key='') -> list[str]:
+    lines = []
+    for key, value in d.items():
+        full_key = f"{parent_key}.{key}" if parent_key else key
+        if isinstance(value, dict):
+            lines.extend(dict_to_metadata(value, full_key))
+        else:
+            lines.append(f"# {full_key}={value}")
+    return lines
+
+
+def metadata_to_dict(lines) -> dict:
+    """
+    Convert metadata lines to nested dictionary.
+
+    Args:
+        lines: List of strings like "# key=value" or "# key.subkey=value"
+
+    Returns:
+        Nested dictionary
+    """
+    result = {}
+
+    for line in lines:
+        line = line.strip()
+        if not line.startswith('#'):
+            continue
+
+        line = line[1:].strip()
+
+        if '=' not in line:
+            continue
+
+        key_path, value = line.split('=', 1)
+        key_path = key_path.strip()
+        value = value.strip()
+
+        value = _parse_value(value)
+
+        keys = key_path.split('.')
+
+        current = result
+        for key in keys[:-1]:
+            if key not in current:
+                current[key] = {}
+            current = current[key]
+
+        current[keys[-1]] = value
+
+    return result
+
+
+def _parse_value(value):
+    """Try to parse value as int, float, bool, or leave as string."""
+    # Try boolean
+    if value.lower() in ('true', 'yes'):
+        return True
+    if value.lower() in ('false', 'no'):
+        return False
+
+    # Try int
+    try:
+        return int(value)
+    except ValueError:
+        pass
+
+    # Try float
+    try:
+        return float(value)
+    except ValueError:
+        pass
+
+    # Return as string
+    return value
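The two helpers above round-trip nested metadata to and from the commented `# key=value` lines used at the top of a tsp-format file. A minimal usage sketch, not part of the diff itself; the dictionary contents are invented for illustration and the import path assumes the module is importable as `tsp.standardization.metadata`:

```python
from tsp.standardization.metadata import dict_to_metadata, metadata_to_dict

# Illustrative values only
meta = {"site_id": "example_site",
        "location": {"latitude": 62.45, "longitude": -114.37}}

lines = dict_to_metadata(meta)
# ['# site_id=example_site', '# location.latitude=62.45', '# location.longitude=-114.37']

restored = metadata_to_dict(lines)
# {'site_id': 'example_site', 'location': {'latitude': 62.45, 'longitude': -114.37}}
```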
tsp/standardization/metadata_ref.py
File without changes
tsp/standardization/validator.py
@@ -0,0 +1,535 @@
+import re
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+
+class Severity(Enum):
+    ERROR = "error"
+    WARNING = "warning"
+
+
+@dataclass
+class ValidationIssue:
+    severity: Severity
+    line_num: Optional[int]
+    message: str
+
+    def __str__(self):
+        line_info = f"Line {self.line_num}: " if self.line_num else ""
+        return f"[{self.severity.value.upper()}] {line_info}{self.message}"
+
+
+class TSPValidator:
+    def __init__(self, filepath: Path):
+        self.filepath = filepath
+        self.issues: list[ValidationIssue] = []
+        self.lines: list[str] = []
+        self.metadata_lines: list[str] = []
+        self.header_line: Optional[str] = None
+        self.header_idx: Optional[int] = None
+        self.data_lines: list[str] = []
+
+    def validate(self) -> list[ValidationIssue]:
+        self._validate_file_extension()
+        self._read_file()
+        self._parse_structure()
+        self._validate_encoding()
+        self._validate_metadata()
+        self._validate_header()
+        self._validate_data_format()
+        self._validate_timestamps()
+        self._validate_missing_data()
+        return self.issues
+
+    def _error(self, message: str, line_num: Optional[int] = None):
+        self.issues.append(ValidationIssue(Severity.ERROR, line_num, message))
+
+    def _warning(self, message: str, line_num: Optional[int] = None):
+        self.issues.append(ValidationIssue(Severity.WARNING, line_num, message))
+
+    def _validate_file_extension(self):
+        if self.filepath.suffix.lower() != '.csv':
+            self._warning(f"File should use .csv extension (found: {self.filepath.suffix})")
+
+    def _read_file(self):
+        try:
+            with open(self.filepath, 'r', encoding='utf-8') as f:
+                self.lines = f.read().splitlines()
+        except UnicodeDecodeError:
+            self._error("File must use UTF-8 encoding")
+            return
+        except Exception as e:
+            self._error(f"Could not read file: {e}")
+
+    def _parse_structure(self):
+        if not self.lines:
+            return
+
+        for idx, line in enumerate(self.lines):
+            if line.startswith('#'):
+                self.metadata_lines.append(line)
+            elif self.header_line is None:
+                self.header_line = line
+                self.header_idx = idx
+            else:
+                if line.strip() == '':
+                    self._error("File must not have blank lines", idx)
+                else:
+                    self.data_lines.append(line)
+
+        if self.header_line is None:
+            self._error("File must contain a header line")
+
+    def _validate_encoding(self):
+        try:
+            with open(self.filepath, 'rb') as f:
+                raw = f.read(3)
+                if raw.startswith(b'\xef\xbb\xbf'):
+                    self._error("File must use UTF-8 without BOM")
+        except Exception:
+            pass
+
+    def _validate_metadata(self):
+        key_pattern = re.compile(r'^[a-z][a-z0-9_]*$')
+
+        for line in self.metadata_lines:
+            if '=' in line:
+                cleaned = line.lstrip('#').strip()
+                if '=' in cleaned:
+                    parts = cleaned.split('=', 1)
+                    key = parts[0].strip()
+                    if not key_pattern.match(key):
+                        self._warning(f"Metadata key should be lowercase with underscores: {key}")
+
+    def _validate_header(self):
+        if not self.header_line:
+            return
+
+        if ',' not in self.header_line and self.header_line.strip():
+            self._error("Header must use comma separator", self.header_idx)
+            return
+
+        headers = self.header_line.split(',')
+
+        if headers[0] != 'timestamp':
+            self._error("First column must be 'timestamp'", self.header_idx)
+
+        if any(h.strip() == '' for h in headers):
+            self._error("Header values must not be blank", self.header_idx)
+
+        if self.header_line.rstrip().endswith(','):
+            self._error("Header must not have trailing comma", self.header_idx)
+
+        if len(headers) != len(set(headers)):
+            self._error("Header values must be unique", self.header_idx)
+
+        column_name_pattern = re.compile(r'^[a-z][a-z0-9_]*$')
+        for h in headers[1:]:
+            if not self._is_depth_value(h) and not column_name_pattern.match(h):
+                self._error(f"Invalid column name (must start with letter, use lowercase/underscore): {h}", self.header_idx)
+
+    def _is_depth_value(self, value: str) -> bool:
+        try:
+            float(value)
+            return bool(re.match(r'^-?[0-9]+(\.[0-9]+)?$', value))
+        except ValueError:
+            return False
+
+    def _is_numeric(self, value: str) -> bool:
+        if value.strip() == '':
+            return True
+        try:
+            float(value)
+            return True
+        except ValueError:
+            return False
+
+    def _normalize_timestamp(self, ts: str) -> str:
+        """Normalize timestamp for comparison by padding to full ISO format"""
+        ts = ts.strip()
+
+        if 'T' not in ts:
+            ts = ts + 'T00:00:00'
+
+        # Check if timezone info is present at the end
+        has_tz = ts.endswith('Z') or re.search(r'[+-]\d{2}:\d{2}$', ts)
+        if not has_tz:
+            ts = ts + 'Z'
+
+        return ts
+
+    def _extract_timezone(self, ts: str) -> Optional[str]:
+        """Extract timezone from timestamp, returns None if no timezone"""
+        ts = ts.strip()
+        if ts.endswith('Z'):
+            return 'Z'
+        match = re.search(r'([+-]\d{2}:\d{2})$', ts)
+        if match:
+            return match.group(1)
+        return None
+
+    def _validate_wide_format(self, headers: list[str]):
+        depths = []
+        for h in headers[1:]:
+            try:
+                depths.append(float(h))
+            except ValueError:
+                self._error(f"Invalid depth value in header: {h}", self.header_idx)
+
+        if depths and depths != sorted(depths):
+            self._error("Depth values must be in ascending order", self.header_idx)
+
+        if len(depths) != len(set(depths)):
+            self._error("Depth values must be unique", self.header_idx)
+
+        timestamps = []
+        timezones = []
+
+        for idx, line in enumerate(self.data_lines, start=self.header_idx + 1):
+            values = line.split(',')
+
+            if len(values) != len(headers):
+                self._error(f"Row has {len(values)} values but header has {len(headers)} columns", idx)
+                continue
+
+            timestamp = values[0]
+            timestamps.append(timestamp)
+
+            tz = self._extract_timezone(timestamp)
+            if tz:
+                timezones.append(tz)
+
+            for i, val in enumerate(values[1:], start=1):
+                if not self._is_numeric(val):
+                    self._error(f"Temperature value must be numeric or empty: '{val}'", idx)
+
+        normalized = [self._normalize_timestamp(ts) for ts in timestamps]
+        if normalized != sorted(normalized):
+            self._error("Timestamps must be in chronological order")
+
+        if len(timestamps) != len(set(timestamps)):
+            self._error("Timestamps must be unique in wide format")
+
+        self._check_timezone_consistency(timezones)
+
+    def _check_timezone_consistency(self, timezones: list[str]):
+        """Check that timezones are consistent and preferably UTC"""
+        if not timezones:
+            return
+
+        unique_tzs = set(timezones)
+        if len(unique_tzs) > 1:
+            self._warning(f"Mixed timezones found: {unique_tzs}. Timestamps should use consistent timezone.")
+
+        non_utc_count = sum(1 for tz in timezones if tz != 'Z')
+        if non_utc_count > 0 and non_utc_count == len(timezones):
+            self._warning(f"Timestamps with timezone should preferably use UTC (Z)")
+
+    def _validate_data_format(self):
+        if not self.header_line or not self.data_lines:
+            return
+
+        headers = self.header_line.split(',')
+
+        if self._is_wide_format(headers):
+            self._validate_wide_format(headers)
+        elif self._is_long_format(headers):
+            self._validate_long_format(headers)
+        else:
+            self._error("Could not determine format (wide or long)")
+
+    def _is_wide_format(self, headers: list[str]) -> bool:
+        return len(headers) > 1 and all(self._is_depth_value(h) for h in headers[1:])
+
+    def _is_long_format(self, headers: list[str]) -> bool:
+        return 'depth' in headers or ('depth_from' in headers and 'depth_to' in headers)
+
+    def _validate_long_format(self, headers: list[str]):
+        has_temp = 'temperature' in headers
+        has_depth = 'depth' in headers
+        has_depth_from = 'depth_from' in headers
+        has_depth_to = 'depth_to' in headers
+        has_site_id = 'site_id' in headers
+
+        if has_depth and (has_depth_from or has_depth_to):
+            self._error("Cannot have both 'depth' and 'depth_from'/'depth_to' columns")
+
+        if not has_depth and not (has_depth_from and has_depth_to):
+            self._error("Long format must have 'depth' or 'depth_from'/'depth_to' columns")
+
+        if has_depth_from != has_depth_to:
+            self._error("Both 'depth_from' and 'depth_to' required for intervals")
+
+        is_extended = has_depth_from and has_depth_to
+
+        if not has_temp and not is_extended:
+            self._error("Long format must include 'temperature' column")
+        elif not has_temp and is_extended:
+            measurement_cols = [h for h in headers if h not in
+                                ['timestamp', 'depth', 'depth_from', 'depth_to', 'site_id']
+                                and not h.endswith('_flag') and not h.endswith('_id')]
+            if not measurement_cols:
+                self._error("Extended format must have at least one measurement column when temperature is omitted")
+
+        seen_combinations = set()
+        site_groups = {}
+        timezones = []
+
+        for idx, line in enumerate(self.data_lines, start=self.header_idx + 1):
+            values = line.split(',')
+
+            if len(values) > len(headers):
+                if not (len(values) == len(headers) + 1 and values[-1].strip() == ''):
+                    self._error(f"Row has {len(values)} values but header has {len(headers)} columns", idx)
+            elif len(values) < len(headers):
+                self._error(f"Row has {len(values)} values but header has {len(headers)} columns", idx)
+                continue
+
+            row_dict = dict(zip(headers, values[:len(headers)]))
+
+            timestamp = row_dict.get('timestamp', '')
+            site_id = row_dict.get('site_id', '')
+
+            tz = self._extract_timezone(timestamp)
+            if tz:
+                timezones.append(tz)
+
+            if has_depth:
+                depth = row_dict.get('depth', '')
+                if depth.strip() == '':
+                    self._error("Depth values must not be missing", idx)
+                elif not self._is_numeric(depth):
+                    self._error(f"Depth must be numeric: '{depth}'", idx)
+
+                combo_key = (timestamp, depth, site_id) if has_site_id else (timestamp, depth)
+            else:
+                depth_from = row_dict.get('depth_from', '')
+                depth_to = row_dict.get('depth_to', '')
+
+                if depth_from.strip() == '' or depth_to.strip() == '':
+                    self._error("Depth interval values must not be missing", idx)
+                    continue
+
+                if not self._is_numeric(depth_from) or not self._is_numeric(depth_to):
+                    self._error(f"Depth values must be numeric", idx)
+                    continue
+
+                try:
+                    df = float(depth_from)
+                    dt = float(depth_to)
+                    if df > dt:
+                        self._error(f"depth_from ({df}) must be <= depth_to ({dt})", idx)
+                except ValueError:
+                    pass
+
+                combo_key = (timestamp, depth_from, depth_to, site_id) if has_site_id else (timestamp, depth_from, depth_to)
+
+            if combo_key in seen_combinations:
+                self._error(f"Duplicate combination found", idx)
+            seen_combinations.add(combo_key)
+
+            group_key = site_id if has_site_id else '_default'
+            if group_key not in site_groups:
+                site_groups[group_key] = []
+
+            # Store both timestamp and depth for ordering validation
+            if has_depth:
+                depth_value = row_dict.get('depth', '')
+                site_groups[group_key].append((timestamp, depth_value, idx))
+            else:
+                depth_from = row_dict.get('depth_from', '')
+                site_groups[group_key].append((timestamp, depth_from, idx))
+
+            if has_temp:
+                temp = row_dict.get('temperature', '')
+                if temp.strip() != '' and not self._is_numeric(temp):
+                    self._error(f"Temperature must be numeric or empty: '{temp}'", idx)
+
+        # Validate ordering
+        for group_key, entries in site_groups.items():
+            if has_site_id:
+                self._validate_site_group_ordering(entries, group_key)
+            else:
+                # Without site_id, just check chronological order
+                timestamps = [self._normalize_timestamp(ts) for ts, _, _ in entries]
+                if timestamps != sorted(timestamps):
+                    self._error("Timestamps must be in chronological order")
+
+        if has_depth_from and has_depth_to:
+            self._check_interval_overlaps(headers)
+
+        self._check_timezone_consistency(timezones)
+
+    def _validate_site_group_ordering(self, entries: list[tuple[str, str, int]], site_id: str):
+        """
+        Validate that entries within a site_id group are ordered either:
+        (a) by depth, with timestamps chronological within each depth, or
+        (b) by timestamp, with depths ascending within each timestamp
+        """
+        if len(entries) <= 1:
+            return
+
+        # Try to determine which ordering pattern is used
+        # Pattern (a): grouped by depth
+        depth_groups = {}
+        for timestamp, depth, idx in entries:
+            try:
+                depth_val = float(depth)
+                if depth_val not in depth_groups:
+                    depth_groups[depth_val] = []
+                depth_groups[depth_val].append((timestamp, idx))
+            except ValueError:
+                continue
+
+        # Check if pattern (a) is valid: depths in order, timestamps chronological within depths
+        pattern_a_valid = True
+        sorted_depths = sorted(depth_groups.keys())
+        current_pos = 0
+
+        for depth in sorted_depths:
+            timestamps_at_depth = depth_groups[depth]
+            # Check if this depth group appears contiguously in the data
+            depth_positions = [idx for ts, idx in timestamps_at_depth]
+            if depth_positions != list(range(min(depth_positions), max(depth_positions) + 1)):
+                pattern_a_valid = False
+                break
+
+            # Check if timestamps are chronological within this depth
+            timestamps = [self._normalize_timestamp(ts) for ts, _ in timestamps_at_depth]
+            if timestamps != sorted(timestamps):
+                pattern_a_valid = False
+                break
+
+            # Check that this depth group comes after previous depths
+            if min(depth_positions) < current_pos:
+                pattern_a_valid = False
+                break
+            current_pos = max(depth_positions) + 1
+
+        # Pattern (b): grouped by timestamp
+        timestamp_groups = {}
+        for timestamp, depth, idx in entries:
+            norm_ts = self._normalize_timestamp(timestamp)
+            if norm_ts not in timestamp_groups:
+                timestamp_groups[norm_ts] = []
+            try:
+                depth_val = float(depth)
+                timestamp_groups[norm_ts].append((depth_val, idx))
+            except ValueError:
+                continue
+
+        # Check if pattern (b) is valid: timestamps in order, depths ascending within timestamps
+        pattern_b_valid = True
+        sorted_timestamps = sorted(timestamp_groups.keys())
+        current_pos = 0
+
+        for timestamp in sorted_timestamps:
+            depths_at_timestamp = timestamp_groups[timestamp]
+            # Check if this timestamp group appears contiguously in the data
+            ts_positions = [idx for depth, idx in depths_at_timestamp]
+            if ts_positions != list(range(min(ts_positions), max(ts_positions) + 1)):
+                pattern_b_valid = False
+                break
+
+            # Check if depths are ascending within this timestamp
+            depths = [d for d, _ in depths_at_timestamp]
+            if depths != sorted(depths):
+                pattern_b_valid = False
+                break
+
+            # Check that this timestamp group comes after previous timestamps
+            if min(ts_positions) < current_pos:
+                pattern_b_valid = False
+                break
+            current_pos = max(ts_positions) + 1
+
+        if not pattern_a_valid and not pattern_b_valid:
+            self._error(
+                f"Rows within site_id '{site_id}' must be ordered either: "
+                "(a) by depth with timestamps chronological within each depth, or "
+                "(b) by timestamp with depths ascending within each timestamp"
+            )
+
+    def _check_interval_overlaps(self, headers: list[str]):
+        has_site_id = 'site_id' in headers
+        intervals_by_timestamp = {}
+
+        for line in self.data_lines:
+            values = line.split(',')
+            row_dict = dict(zip(headers, values[:len(headers)]))
+
+            timestamp = row_dict.get('timestamp', '')
+            site_id = row_dict.get('site_id', '') if has_site_id else '_default'
+            depth_from = row_dict.get('depth_from', '')
+            depth_to = row_dict.get('depth_to', '')
+
+            if not depth_from or not depth_to:
+                continue
+
+            try:
+                df = float(depth_from)
+                dt = float(depth_to)
+            except ValueError:
+                continue
+
+            key = (timestamp, site_id)
+            if key not in intervals_by_timestamp:
+                intervals_by_timestamp[key] = []
+            intervals_by_timestamp[key].append((df, dt))
+
+        for key, intervals in intervals_by_timestamp.items():
+            sorted_intervals = sorted(intervals)
+            for i in range(len(sorted_intervals) - 1):
+                curr_start, curr_end = sorted_intervals[i]
+                next_start, next_end = sorted_intervals[i + 1]
+                if curr_end > next_start:
+                    self._error(f"Overlapping intervals at timestamp {key[0]}: [{curr_start}, {curr_end}] and [{next_start}, {next_end}]")
+
+    def _validate_timestamps(self):
+        iso_pattern = re.compile(
+            r'^\d{4}-\d{2}-\d{2}(T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})?)?$'
+        )
+
+        for idx, line in enumerate(self.data_lines, start=self.header_idx + 1):
+            timestamp = line.split(',')[0]
+            if timestamp.strip() == '':
+                self._error("Timestamp values must not be missing", idx)
+            elif not iso_pattern.match(timestamp):
+                self._error(f"Invalid ISO 8601 timestamp: '{timestamp}'", idx)
+
+    def _validate_missing_data(self):
+        placeholders = ['NA', 'NaN', 'NULL', '-999', 'na', 'nan', 'null', '-9999']
+
+        for idx, line in enumerate(self.data_lines, start=self.header_idx + 1):
+            values = line.split(',')
+            for val in values:
+                if val.strip() in placeholders:
+                    self._error(f"Placeholder values not allowed (use empty string): '{val}'", idx)
+
+
+def validate_tsp_file(filepath: str | Path) -> list[ValidationIssue]:
+    validator = TSPValidator(Path(filepath))
+    return validator.validate()
+
+
+if __name__ == '__main__':
+    import sys
+
+    if len(sys.argv) < 2:
+        print("Usage: python tsp_validator.py <file.csv>")
+        sys.exit(1)
+
+    issues = validate_tsp_file(sys.argv[1])
+
+    if not issues:
+        print("✓ File is valid")
+        sys.exit(0)
+
+    for issue in issues:
+        print(issue)
+
+    error_count = sum(1 for i in issues if i.severity == Severity.ERROR)
+    sys.exit(1 if error_count > 0 else 0)
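Besides the command-line entry point shown in the diff, the module exposes `validate_tsp_file()` for programmatic use. A minimal sketch, not part of the package diff: it writes a small wide-format file that satisfies the checks above (first column `timestamp`, ascending numeric depth headers, ISO 8601 timestamps in UTC, `# key=value` metadata) and then validates it. The file name and values are invented for illustration, and the import path assumes the module is importable as `tsp.standardization.validator`:

```python
from pathlib import Path
from tsp.standardization.validator import validate_tsp_file

# Illustrative wide-format content: metadata lines, header, two data rows
sample = "\n".join([
    "# site_id=example_site",
    "# latitude=62.45",
    "timestamp,0.5,1.0,2.0",
    "2022-09-01T00:00:00Z,4.1,3.2,1.0",
    "2022-09-02T00:00:00Z,4.0,3.1,0.9",
])
Path("example.csv").write_text(sample, encoding="utf-8")

issues = validate_tsp_file("example.csv")  # expect an empty list for this file
for issue in issues:
    print(issue)  # each issue prints as "[ERROR] Line N: ..." or "[WARNING] ..."
```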