pypromice 1.5.3__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (67)
  1. pypromice/__init__.py +2 -0
  2. pypromice/{qc → core/qc}/github_data_issues.py +22 -13
  3. pypromice/{qc → core/qc}/percentiles/compute_thresholds.py +2 -2
  4. pypromice/{qc → core/qc}/persistence.py +22 -29
  5. pypromice/{process → core/qc}/value_clipping.py +3 -3
  6. pypromice/core/resampling.py +142 -0
  7. pypromice/core/variables/__init__.py +1 -0
  8. pypromice/core/variables/air_temperature.py +64 -0
  9. pypromice/core/variables/gps.py +221 -0
  10. pypromice/core/variables/humidity.py +111 -0
  11. pypromice/core/variables/precipitation.py +108 -0
  12. pypromice/core/variables/pressure_transducer_depth.py +79 -0
  13. pypromice/core/variables/radiation.py +422 -0
  14. pypromice/core/variables/station_boom_height.py +75 -0
  15. pypromice/core/variables/station_pose.py +375 -0
  16. pypromice/io/bufr/__init__.py +0 -0
  17. pypromice/{postprocess → io/bufr}/bufr_to_csv.py +1 -1
  18. pypromice/{postprocess → io/bufr}/create_bufr_files.py +2 -2
  19. pypromice/{postprocess → io/bufr}/get_bufr.py +6 -6
  20. pypromice/{postprocess → io/bufr}/real_time_utilities.py +3 -3
  21. pypromice/io/ingest/__init__.py +0 -0
  22. pypromice/{utilities → io/ingest}/git.py +1 -3
  23. pypromice/io/ingest/l0.py +294 -0
  24. pypromice/io/ingest/l0_repository.py +103 -0
  25. pypromice/io/ingest/toa5.py +87 -0
  26. pypromice/{process → io}/write.py +1 -1
  27. pypromice/pipeline/L0toL1.py +291 -0
  28. pypromice/pipeline/L1toL2.py +233 -0
  29. pypromice/{process → pipeline}/L2toL3.py +113 -118
  30. pypromice/pipeline/__init__.py +4 -0
  31. pypromice/{process → pipeline}/aws.py +10 -82
  32. pypromice/{process → pipeline}/get_l2.py +2 -2
  33. pypromice/{process → pipeline}/get_l2tol3.py +19 -22
  34. pypromice/{process → pipeline}/join_l2.py +31 -32
  35. pypromice/{process → pipeline}/join_l3.py +16 -14
  36. pypromice/{process → pipeline}/resample.py +75 -51
  37. pypromice/{process → pipeline}/utilities.py +0 -22
  38. pypromice/resources/file_attributes.csv +4 -4
  39. pypromice/resources/variable_aliases_GC-Net.csv +2 -2
  40. pypromice/resources/variables.csv +27 -24
  41. {pypromice-1.5.3.dist-info → pypromice-1.7.0.dist-info}/METADATA +1 -2
  42. pypromice-1.7.0.dist-info/RECORD +65 -0
  43. pypromice-1.7.0.dist-info/entry_points.txt +12 -0
  44. pypromice/get/__init__.py +0 -1
  45. pypromice/get/get.py +0 -211
  46. pypromice/get/get_promice_data.py +0 -56
  47. pypromice/process/L0toL1.py +0 -564
  48. pypromice/process/L1toL2.py +0 -824
  49. pypromice/process/__init__.py +0 -4
  50. pypromice/process/load.py +0 -161
  51. pypromice-1.5.3.dist-info/RECORD +0 -54
  52. pypromice-1.5.3.dist-info/entry_points.txt +0 -13
  53. /pypromice/{postprocess → core}/__init__.py +0 -0
  54. /pypromice/{utilities → core}/dependency_graph.py +0 -0
  55. /pypromice/{qc → core/qc}/__init__.py +0 -0
  56. /pypromice/{qc → core/qc}/percentiles/__init__.py +0 -0
  57. /pypromice/{qc → core/qc}/percentiles/outlier_detector.py +0 -0
  58. /pypromice/{qc → core/qc}/percentiles/thresholds.csv +0 -0
  59. /pypromice/{process → core/variables}/wind.py +0 -0
  60. /pypromice/{utilities → io}/__init__.py +0 -0
  61. /pypromice/{postprocess → io/bufr}/bufr_utilities.py +0 -0
  62. /pypromice/{postprocess → io/bufr}/positions_seed.csv +0 -0
  63. /pypromice/{station_configuration.py → io/bufr/station_configuration.py} +0 -0
  64. /pypromice/{postprocess → io}/make_metadata_csv.py +0 -0
  65. {pypromice-1.5.3.dist-info → pypromice-1.7.0.dist-info}/WHEEL +0 -0
  66. {pypromice-1.5.3.dist-info → pypromice-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
  67. {pypromice-1.5.3.dist-info → pypromice-1.7.0.dist-info}/top_level.txt +0 -0
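Most of the changes above are a package reorganisation: process → pipeline, qc → core/qc, postprocess → io/bufr, utilities → io, plus the new core/variables and io/ingest subpackages. A minimal sketch of what that means for downstream imports, based only on the renames listed above (whether every module keeps the same public symbols is an assumption):

    # pypromice 1.5.3 layout
    from pypromice.process.resample import resample_dataset
    from pypromice.qc import persistence
    from pypromice.postprocess import get_bufr

    # pypromice 1.7.0 layout
    from pypromice.pipeline.resample import resample_dataset
    from pypromice.core.qc import persistence
    from pypromice.io.bufr import get_bufr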
pypromice/io/ingest/l0.py (new)
@@ -0,0 +1,294 @@
+ """
+ Module for handling configuration loading and parsing of L0 data files.
+
+ This module provides the functionalities to interpret configuration files,
+ detect file types for data parsing, and process L0 data into xarray.Dataset
+ objects with associated metadata.
+
+ The module implements explicit input file type detection and parsing logic
+ for different data file types including `csv_v1`, `toa5`, and `csv_default`.
+ Additionally, it supports post-processing for time offsets and metadata
+ enrichment.
+
+ Functions
+ ---------
+ - load_data_files: Reads and processes multiple data files given a configuration dictionary.
+ - load_config: Parses a TOML configuration file and produces a dictionary of configurations.
+ """
+ import logging
+ import os
+ import re
+ from datetime import timedelta
+ from pathlib import Path
+ from typing import Dict, List, Optional, Sequence
+
+ import pandas as pd
+ import toml
+ import xarray as xr
+
+ from . import toa5
+
+ __all__ = [
+     "load_data_files",
+     "load_config",
+ ]
+
+ logger = logging.getLogger(__name__)
+
+ DELIMITER = ","
+ COMMENT = "#"
+
+
+ # ---------------------------------------------------------------------
+ # Explicit input file type detection
+ # ---------------------------------------------------------------------
+
+
+ def _detect_file_type(conf: Dict) -> str:
+     """Classify input file type explicitly.
+
+     Returns one of:
+     - 'csv_v1' : legacy layout with year, doy, hhmm columns
+     - 'toa5' : Campbell Scientific TOA5
+     - 'csv_default' : default CSV-like with timestamp in first column
+     """
+     infile = conf["file"]
+
+     # 1) Respect explicit version hint from config
+     file_version = conf.get("file_version", -1)
+     if file_version == 1:
+         return "csv_v1"
+
+     # 2) Peek file header to detect TOA5
+     try:
+         with open(infile, "r", encoding="utf-8", errors="ignore") as f:
+             # Read a handful of lines to detect markers
+             header_lines = []
+             for _ in range(10):
+                 line = f.readline()
+                 if not line:
+                     break
+                 header_lines.append(line.strip())
+     except Exception as e:
+         logger.debug(f"Failed reading header for detection from {infile}: {e}")
+         # Fall back to default if we cannot read
+         return "csv_default"
+
+     # Normalize: skip blank lines
+     header_nonblank = [ln for ln in header_lines if ln]
+
+     if header_nonblank:
+         first = header_nonblank[0]
+
+         # TOA5 files have a first line starting with 'TOA5'
+         if re.match(r'^["]?TOA5', first):
+             return "toa5"
+
+     # Default CSV-like parser as a safe fallback
+     return "csv_default"
+
+
+ def _parse_csv_v1(conf) -> pd.DataFrame:
+     df = pd.read_csv(
+         conf["file"],
+         comment=COMMENT,
+         parse_dates=True,
+         na_values=conf["nodata"],
+         names=conf["columns"],
+         sep=DELIMITER,
+         skiprows=conf["skiprows"],
+         skip_blank_lines=True,
+         usecols=range(len(conf["columns"])),
+         low_memory=False,
+     )
+     df["time"] = pd.to_datetime(
+         df.year.astype(str)
+         + df.doy.astype(str).str.zfill(3)
+         + df.hhmm.astype(str).str.zfill(4),
+         format="%Y%j%H%M",
+     )
+     return df.set_index("time")
+
+
+ def _parse_csv_default(conf) -> pd.DataFrame:
+     df = pd.read_csv(
+         conf["file"],
+         comment=COMMENT,
+         index_col=0,
+         parse_dates=True,
+         na_values=conf["nodata"],
+         names=conf["columns"],
+         sep=DELIMITER,
+         skiprows=conf["skiprows"],
+         skip_blank_lines=True,
+         usecols=range(len(conf["columns"])),
+         low_memory=False,
+     )
+     try:
+         df.index = pd.to_datetime(df.index)
+     except ValueError as e:
+         logger.info("\n" + conf["file"])
+         logger.info("\nValueError:")
+         logger.info(e)
+         logger.info("\t\t> Trying pd.to_datetime with format=mixed")
+         try:
+             df.index = pd.to_datetime(df.index, format="mixed")
+         except Exception as e:
+             logger.info("\nDateParseError:")
+             logger.info(e)
+             logger.info(
+                 "\t\t> Trying again removing apostrophes in timestamp (old files format)"
+             )
+             df.index = pd.to_datetime(df.index.str.replace('"', ""))
+
+     return df
+
+
+ def _parse_toa5(conf) -> pd.DataFrame:
+     df = _parse_csv_default(conf)
+     # TODO: Convert to xr.DataSet to allow for metadata enrichment
+     try:
+         meta = toa5.read_metadata(conf["file"])
+         tao5_attrs = meta.get("attrs", {})
+         tao5_attrs["file_format"] = tao5_attrs.pop("format")
+     except Exception as e:
+         logger.warning(f"Failed to enrich TOA5 metadata for {conf['file']}: {e}")
+     return df
+
+
+ def load_data_file(conf: Dict) -> xr.Dataset:
+     """Read L0 data file to xarray.Dataset using config dictionary and
+     populate with initial metadata. The file type is detected automatically.
+
+     Parameters
+     ----------
+     conf : dict
+         Configuration parameters
+     delimiter : str
+     comment: str
+
+     Returns
+     -------
+     ds : xr.Dataset
+         L0 data
+     """
+     file_type = _detect_file_type(conf)
+     logger.info(f"Detected L0 file type '{file_type}' for {conf.get('file')}")
+
+     if file_type == "csv_v1":
+         df = _parse_csv_v1(conf)
+     elif file_type == "csv_default":
+         df = _parse_csv_default(conf)
+     elif file_type == "toa5":
+         df = _parse_toa5(conf)
+     else:
+         raise ValueError(f"Unknown file type: {file_type}")
+
+     df = _postprocess_dataframe(df, time_offset=conf.get("time_offset"))
+
+     # Carry relevant metadata with ds
+     ds = xr.Dataset.from_dataframe(df)
+     ds.attrs["level"] = "L0"
+     ds.attrs["detected_file_type"] = file_type
+     ds.attrs["filename"] = Path(conf["file"]).name
+
+     # populate meta with config keys
+     skip = ["columns", "skiprows", "modem", "file", "conf", "nodata"]
+     for k, v in conf.items():
+         if k not in skip:
+             ds.attrs[k] = v
+
+     return ds
+
+
+ def load_data_files(config: Dict[str, Dict]) -> List[xr.Dataset]:
+     """Load level 0 (L0) data from config mapping file names to configuration.
+
+     Tries read_l0_file() using the config with msg_lat & msg_lon appended.
+     If a pandas.errors.ParserError occurs due to mismatched columns, removes
+     msg_lat & msg_lon from the config and tries again.
+
+     Parameters
+     ----------
+     config : Dict[str, Dict]
+         Configuration dictionary as returned by pypromice.io.load.getConfig
+
+     Returns
+     -------
+     List[xr.Dataset]
+         List of L0 datasets
+     """
+     ds_list: List[xr.Dataset] = []
+     for k in config.keys():
+         target = config[k]
+         try:
+             ds_list.append(load_data_file(target))
+         except pd.errors.ParserError:
+             for item in ["msg_lat", "msg_lon"]:
+                 if item in target["columns"]:
+                     target["columns"].remove(item)
+             ds_list.append(load_data_file(target))
+         logger.info(f"L0 data successfully loaded from {k}")
+     return ds_list
+
+
+ def _postprocess_dataframe(
+     df: pd.DataFrame, time_offset: Optional[float] = None
+ ) -> pd.DataFrame:
+     """Apply common post-processing to parsed L0 dataframe."""
+     if time_offset is not None:
+         df.index = df.index + timedelta(hours=time_offset)
+     # Drop SKIP columns
+     for c in list(df.columns):
+         if c.startswith("SKIP"):
+             df.drop(columns=c, inplace=True)
+     return df
+
+
+ def load_config(
+     config_file: str | Path,
+     inpath: str | Path,
+     default_columns: Sequence[str] = ("msg_lat", "msg_lon"),
+ ):
+     """Load configuration from .toml file. PROMICE .toml files support defining
+     features at the top level which apply to all nested properties, but do not
+     overwrite nested properties if they are defined
+
+     Parameters
+     ----------
+     config_file
+         TOML file path
+     inpath
+         Input folder directory where L0 files can be found
+
+     Returns
+     -------
+     conf : dict
+         Configuration dictionary
+     """
+     config_file = Path(config_file)
+     inpath = Path(inpath)
+
+     conf = toml.load(config_file) # Move all top level keys to nested properties,
+     top = [
+         _ for _ in conf.keys() if not type(conf[_]) is dict
+     ] # if they are not already defined in the nested properties
+     subs = [
+         _ for _ in conf.keys() if type(conf[_]) is dict
+     ] # Insert the section name (config_file) as a file property and config file
+     for s in subs:
+         for t in top:
+             if t not in conf[s].keys():
+                 conf[s][t] = conf[t]
+
+         conf[s]["conf"] = config_file.as_posix()
+         conf[s]["file"] = os.path.join(inpath, s)
+         conf[s]["columns"].extend(default_columns)
+
+     for t in top:
+         conf.pop(t) # Delete all top level keys beause each file
+         # should carry all properties with it
+     for k in conf.keys(): # Check required fields are present
+         for field in ["columns", "station_id", "format", "skiprows"]:
+             assert field in conf[k].keys(), field + " not in config keys"
+     return conf
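For orientation, a minimal usage sketch of the new ingest entry points (the TOML and data paths are hypothetical; the function names and attrs keys come from the code above):

    from pypromice.io.ingest.l0 import load_config, load_data_files

    # Hypothetical paths: each TOML section name must match an L0 file under inpath
    config = load_config("config/qas_l.toml", "data/qas_l/tx/")
    datasets = load_data_files(config)  # list of xr.Dataset, one per L0 file
    for ds in datasets:
        print(ds.attrs["filename"], ds.attrs["detected_file_type"], ds.attrs["level"])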
pypromice/io/ingest/l0_repository.py (new)
@@ -0,0 +1,103 @@
+ """
+ Module for managing Level 0 data repositories for station-based datasets.
+
+ This module provides an abstraction for interacting with Level 0 (L0) datasets through
+ a repository interface. Two implementations are detailed: the `L0Repository` protocol
+ defines the interface, and `L0RepositoryFS` implements the interface using a file system-based
+ repository structure. This is intended for managing both raw and transformed datasets, along
+ with their configurations, for multiple stations.
+
+ Classes:
+     L0Repository: Protocol interface for accessing L0 datasets and metadata.
+     L0RepositoryFS: File system-based implementation of the `L0Repository` protocol.
+
+ Functions and attributes exposed:
+     - Methods to query and manage raw and transformed datasets.
+     - Mechanisms to verify dataset presence and access configuration paths.
+
+ """
+
+ import dataclasses
+ from pathlib import Path
+ from typing import List, Protocol, Iterable
+
+ import xarray as xr
+
+ __all__ = [
+     "L0Repository",
+     "L0RepositoryFS",
+ ]
+
+ from .l0 import load_config, load_data_files
+
+
+ class L0Repository(Protocol):
+     def get_tx(self, station_id: str) -> Iterable[xr.Dataset]: ...
+     def get_raw(self, station_id: str) -> Iterable[xr.Dataset]: ...
+     def get_available_stations(self) -> Iterable[str]: ...
+     def contains_tx(self, station_id: str) -> bool: ...
+     def contains_raw(self, station_id: str) -> bool: ...
+
+
+ @dataclasses.dataclass(slots=True)
+ class L0RepositoryFS:
+     root: Path
+
+     template_tx_config = "tx/config/{station_id}.toml"
+     template_tx_data_root = "tx/"
+     template_raw_config = "raw/config/{station_id}.toml"
+     template_row_data_root = "raw/{station_id}/"
+
+     def get_tx_config_path(self, station_id: str) -> Path:
+         return self.root / self.template_tx_config.format(station_id=station_id)
+
+     def get_tx_data_root(self, station_id: str) -> Path:
+         return self.root / self.template_tx_data_root.format(station_id=station_id)
+
+     def get_raw_config_path(self, station_id: str) -> Path:
+         return self.root / self.template_raw_config.format(station_id=station_id)
+
+     def get_raw_data_root(self, station_id: str) -> Path:
+         return self.root / self.template_row_data_root.format(station_id=station_id)
+
+     def contains_tx(self, station_id: str) -> bool:
+         return self.get_tx_config_path(station_id).exists()
+
+     def contains_raw(self, station_id: str) -> bool:
+         return self.get_raw_config_path(station_id).exists()
+
+     def get_tx(self, station_id: str) -> List[xr.Dataset]:
+         return load_data_files(self.get_tx_config(station_id))
+
+     def get_tx_config(self, station_id):
+         return load_config(
+             self.get_tx_config_path(station_id),
+             self.get_tx_data_root(station_id),
+         )
+
+     def get_raw(self, station_id: str) -> List[xr.Dataset]:
+         return load_data_files(self.get_raw_config(station_id))
+
+     def get_raw_config(self, station_id):
+         return load_config(
+             self.get_raw_config_path(station_id),
+             self.get_raw_data_root(station_id),
+         )
+
+     def get_available_stations(self) -> List[str]:
+         """
+         Iterate over all available station configuration files
+
+         """
+         tx_pattern = self.get_tx_config_path("*")
+         raw_pattern = self.get_raw_config_path("*")
+
+         station_ids = {
+             p.stem
+             for p in [
+                 *tx_pattern.parent.glob(tx_pattern.name),
+                 *raw_pattern.parent.glob(raw_pattern.name),
+             ]
+         }
+
+         return sorted(station_ids)
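A short sketch of how the file-system repository is used, assuming an L0 root laid out according to the tx/ and raw/ path templates defined on the class (the root path itself is made up):

    from pathlib import Path
    from pypromice.io.ingest.l0_repository import L0RepositoryFS

    repo = L0RepositoryFS(root=Path("/data/aws-l0"))  # hypothetical root directory
    for station_id in repo.get_available_stations():
        if repo.contains_tx(station_id):
            tx_datasets = repo.get_tx(station_id)  # loads via load_config + load_data_files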
pypromice/io/ingest/toa5.py (new)
@@ -0,0 +1,87 @@
+ """
+ This module provides functionality to read and convert Campbell Scientific TOA5 files into xarray
+ datasets. It extracts metadata, variable names, units, and statistical types, and formats the
+ data for further analysis.
+ """
+ from pathlib import Path
+ from typing import Dict
+
+ import pandas as pd
+ import xarray as xr
+
+
+ def read_metadata(filepath: Path|str, raise_exception_on_error: bool = False) -> Dict | None:
+     # 1) Read the first four lines manually
+     with open(filepath, 'r', encoding='utf-8') as f:
+         # strip quotes and newline
+         meta_vals = next(f).strip().replace('"', '').split(',')
+         names = next(f).strip().replace('"', '').split(',')
+         units = next(f).strip().replace('"', '').split(',')
+         stats = next(f).strip().replace('"', '').split(',')
+
+     # Verify the format
+     if meta_vals[0] != 'TOA5':
+         if raise_exception_on_error:
+             raise ValueError(f"Unsupported file format: {meta_vals[0]}")
+         else:
+             return None
+
+     # 2) Map the first-line values to a set of metadata keys
+     attrs = {
+         "format"           : meta_vals[0],  # e.g. TOA5
+         "station_name"     : meta_vals[1],  # e.g. qas_l_21_correct
+         "datalogger"       : meta_vals[2],  # e.g. CR1000
+         "serial_number"    : meta_vals[3],  # e.g. E6745
+         "os_version"       : meta_vals[4],  # e.g. CR1000.Std.16
+         "program_name"     : meta_vals[5],  # e.g. Promice2015e.CR1
+         "program_signature": meta_vals[6],  # e.g. 65241
+         "table_name"       : meta_vals[7],  # e.g. SlimTableMem
+     }
+
+     return dict(
+         names=names,
+         units=units,
+         stats=stats,
+         attrs=attrs,
+     )
+
+
+ def read(filepath: Path, **kwargs) -> xr.DataArray | None:
+     """
+     Read a Campbell TOA5 file and return as an xarray.Dataset.
+
+     - Line 1 → dataset.attrs (metadata)
+     - Line 2 → variable names
+     - Line 3 → variable units
+     - Line 4 → statistic/type (e.g. Avg, Smp)
+     - Remaining lines → data (with TIMESTAMP parsed as datetime index)
+     """
+
+     metadata = read_metadata(filepath, **kwargs)
+     if metadata is None:
+         return None
+
+
+     # 3) Read the rest of the file into a DataFrame
+     df = pd.read_csv(
+         filepath,
+         skiprows=4,
+         header=None,
+         names=metadata['names'],
+         parse_dates=["TIMESTAMP"],
+         index_col="TIMESTAMP",
+         na_values=('NAN', '')
+     )
+
+     # 4) Build an xarray.Dataset
+     ds = xr.Dataset.from_dataframe(df)
+     ds.attrs.update(metadata['attrs'])
+
+     # 5) Attach per-variable attributes
+     for name, unit, stat in zip(metadata['names'], metadata['units'], metadata['stats']):
+         # skip if the column wasn't read (e.g. extra blank columns)
+         if name in ds:
+             ds[name].attrs["units"] = unit
+             ds[name].attrs["statistic"] = stat
+
+     return ds
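And a minimal sketch of reading a Campbell TOA5 logger file with the new module (the file name is a placeholder; the attribute keys are the ones set by read_metadata above):

    from pypromice.io.ingest import toa5

    ds = toa5.read("station_table.dat")  # returns None if the first header field is not "TOA5"
    if ds is not None:
        print(ds.attrs["station_name"], ds.attrs["datalogger"])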
@@ -10,7 +10,7 @@ from pathlib import Path
 
  import numpy as np
  import pandas as pd
- from pypromice.process.resample import resample_dataset
+ from pypromice.pipeline.resample import resample_dataset
  import pypromice.resources
 
  logger = logging.getLogger(__name__)