pypromice 1.5.3__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (67)
  1. pypromice/__init__.py +2 -0
  2. pypromice/{qc → core/qc}/github_data_issues.py +22 -13
  3. pypromice/{qc → core/qc}/percentiles/compute_thresholds.py +2 -2
  4. pypromice/{qc → core/qc}/persistence.py +22 -29
  5. pypromice/{process → core/qc}/value_clipping.py +3 -3
  6. pypromice/core/resampling.py +142 -0
  7. pypromice/core/variables/__init__.py +1 -0
  8. pypromice/core/variables/air_temperature.py +64 -0
  9. pypromice/core/variables/gps.py +221 -0
  10. pypromice/core/variables/humidity.py +111 -0
  11. pypromice/core/variables/precipitation.py +108 -0
  12. pypromice/core/variables/pressure_transducer_depth.py +79 -0
  13. pypromice/core/variables/radiation.py +422 -0
  14. pypromice/core/variables/station_boom_height.py +75 -0
  15. pypromice/core/variables/station_pose.py +375 -0
  16. pypromice/io/bufr/__init__.py +0 -0
  17. pypromice/{postprocess → io/bufr}/bufr_to_csv.py +1 -1
  18. pypromice/{postprocess → io/bufr}/create_bufr_files.py +2 -2
  19. pypromice/{postprocess → io/bufr}/get_bufr.py +6 -6
  20. pypromice/{postprocess → io/bufr}/real_time_utilities.py +3 -3
  21. pypromice/io/ingest/__init__.py +0 -0
  22. pypromice/{utilities → io/ingest}/git.py +1 -3
  23. pypromice/io/ingest/l0.py +294 -0
  24. pypromice/io/ingest/l0_repository.py +103 -0
  25. pypromice/io/ingest/toa5.py +87 -0
  26. pypromice/{process → io}/write.py +1 -1
  27. pypromice/pipeline/L0toL1.py +291 -0
  28. pypromice/pipeline/L1toL2.py +233 -0
  29. pypromice/{process → pipeline}/L2toL3.py +113 -118
  30. pypromice/pipeline/__init__.py +4 -0
  31. pypromice/{process → pipeline}/aws.py +10 -82
  32. pypromice/{process → pipeline}/get_l2.py +2 -2
  33. pypromice/{process → pipeline}/get_l2tol3.py +19 -22
  34. pypromice/{process → pipeline}/join_l2.py +31 -32
  35. pypromice/{process → pipeline}/join_l3.py +16 -14
  36. pypromice/{process → pipeline}/resample.py +75 -51
  37. pypromice/{process → pipeline}/utilities.py +0 -22
  38. pypromice/resources/file_attributes.csv +4 -4
  39. pypromice/resources/variable_aliases_GC-Net.csv +2 -2
  40. pypromice/resources/variables.csv +27 -24
  41. {pypromice-1.5.3.dist-info → pypromice-1.7.0.dist-info}/METADATA +1 -2
  42. pypromice-1.7.0.dist-info/RECORD +65 -0
  43. pypromice-1.7.0.dist-info/entry_points.txt +12 -0
  44. pypromice/get/__init__.py +0 -1
  45. pypromice/get/get.py +0 -211
  46. pypromice/get/get_promice_data.py +0 -56
  47. pypromice/process/L0toL1.py +0 -564
  48. pypromice/process/L1toL2.py +0 -824
  49. pypromice/process/__init__.py +0 -4
  50. pypromice/process/load.py +0 -161
  51. pypromice-1.5.3.dist-info/RECORD +0 -54
  52. pypromice-1.5.3.dist-info/entry_points.txt +0 -13
  53. /pypromice/{postprocess → core}/__init__.py +0 -0
  54. /pypromice/{utilities → core}/dependency_graph.py +0 -0
  55. /pypromice/{qc → core/qc}/__init__.py +0 -0
  56. /pypromice/{qc → core/qc}/percentiles/__init__.py +0 -0
  57. /pypromice/{qc → core/qc}/percentiles/outlier_detector.py +0 -0
  58. /pypromice/{qc → core/qc}/percentiles/thresholds.csv +0 -0
  59. /pypromice/{process → core/variables}/wind.py +0 -0
  60. /pypromice/{utilities → io}/__init__.py +0 -0
  61. /pypromice/{postprocess → io/bufr}/bufr_utilities.py +0 -0
  62. /pypromice/{postprocess → io/bufr}/positions_seed.csv +0 -0
  63. /pypromice/{station_configuration.py → io/bufr/station_configuration.py} +0 -0
  64. /pypromice/{postprocess → io}/make_metadata_csv.py +0 -0
  65. {pypromice-1.5.3.dist-info → pypromice-1.7.0.dist-info}/WHEEL +0 -0
  66. {pypromice-1.5.3.dist-info → pypromice-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
  67. {pypromice-1.5.3.dist-info → pypromice-1.7.0.dist-info}/top_level.txt +0 -0
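Most of the changes above are a package reorganisation: process → pipeline, qc → core/qc, postprocess → io/bufr, utilities → io, plus the new core/variables and io/ingest subpackages. A minimal sketch of what that means for downstream imports, based only on the renames listed above (whether every module keeps the same public symbols is an assumption):

    # pypromice 1.5.3 layout
    from pypromice.process.resample import resample_dataset
    from pypromice.qc import persistence
    from pypromice.postprocess import get_bufr

    # pypromice 1.7.0 layout
    from pypromice.pipeline.resample import resample_dataset
    from pypromice.core.qc import persistence
    from pypromice.io.bufr import get_bufr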
pypromice/io/ingest/l0.py (new)
@@ -0,0 +1,294 @@
+ """
+ Module for handling configuration loading and parsing of L0 data files.
+
+ This module provides the functionalities to interpret configuration files,
+ detect file types for data parsing, and process L0 data into xarray.Dataset
+ objects with associated metadata.
+
+ The module implements explicit input file type detection and parsing logic
+ for different data file types including `csv_v1`, `toa5`, and `csv_default`.
+ Additionally, it supports post-processing for time offsets and metadata
+ enrichment.
+
+ Functions
+ ---------
+ - load_data_files: Reads and processes multiple data files given a configuration dictionary.
+ - load_config: Parses a TOML configuration file and produces a dictionary of configurations.
+ """
+ import logging
+ import os
+ import re
+ from datetime import timedelta
+ from pathlib import Path
+ from typing import Dict, List, Optional, Sequence
+
+ import pandas as pd
+ import toml
+ import xarray as xr
+
+ from . import toa5
+
+ __all__ = [
+     "load_data_files",
+     "load_config",
+ ]
+
+ logger = logging.getLogger(__name__)
+
+ DELIMITER = ","
+ COMMENT = "#"
+
+
+ # ---------------------------------------------------------------------
+ # Explicit input file type detection
+ # ---------------------------------------------------------------------
+
+
+ def _detect_file_type(conf: Dict) -> str:
+     """Classify input file type explicitly.
+
+     Returns one of:
+     - 'csv_v1' : legacy layout with year, doy, hhmm columns
+     - 'toa5' : Campbell Scientific TOA5
+     - 'csv_default' : default CSV-like with timestamp in first column
+     """
+     infile = conf["file"]
+
+     # 1) Respect explicit version hint from config
+     file_version = conf.get("file_version", -1)
+     if file_version == 1:
+         return "csv_v1"
+
+     # 2) Peek file header to detect TOA5
+     try:
+         with open(infile, "r", encoding="utf-8", errors="ignore") as f:
+             # Read a handful of lines to detect markers
+             header_lines = []
+             for _ in range(10):
+                 line = f.readline()
+                 if not line:
+                     break
+                 header_lines.append(line.strip())
+     except Exception as e:
+         logger.debug(f"Failed reading header for detection from {infile}: {e}")
+         # Fall back to default if we cannot read
+         return "csv_default"
+
+     # Normalize: skip blank lines
+     header_nonblank = [ln for ln in header_lines if ln]
+
+     if header_nonblank:
+         first = header_nonblank[0]
+
+         # TOA5 files have a first line starting with 'TOA5'
+         if re.match(r'^["]?TOA5', first):
+             return "toa5"
+
+     # Default CSV-like parser as a safe fallback
+     return "csv_default"
+
+
+ def _parse_csv_v1(conf) -> pd.DataFrame:
+     df = pd.read_csv(
+         conf["file"],
+         comment=COMMENT,
+         parse_dates=True,
+         na_values=conf["nodata"],
+         names=conf["columns"],
+         sep=DELIMITER,
+         skiprows=conf["skiprows"],
+         skip_blank_lines=True,
+         usecols=range(len(conf["columns"])),
+         low_memory=False,
+     )
+     df["time"] = pd.to_datetime(
+         df.year.astype(str)
+         + df.doy.astype(str).str.zfill(3)
+         + df.hhmm.astype(str).str.zfill(4),
+         format="%Y%j%H%M",
+     )
+     return df.set_index("time")
+
+
+ def _parse_csv_default(conf) -> pd.DataFrame:
+     df = pd.read_csv(
+         conf["file"],
+         comment=COMMENT,
+         index_col=0,
+         parse_dates=True,
+         na_values=conf["nodata"],
+         names=conf["columns"],
+         sep=DELIMITER,
+         skiprows=conf["skiprows"],
+         skip_blank_lines=True,
+         usecols=range(len(conf["columns"])),
+         low_memory=False,
+     )
+     try:
+         df.index = pd.to_datetime(df.index)
+     except ValueError as e:
+         logger.info("\n" + conf["file"])
+         logger.info("\nValueError:")
+         logger.info(e)
+         logger.info("\t\t> Trying pd.to_datetime with format=mixed")
+         try:
+             df.index = pd.to_datetime(df.index, format="mixed")
+         except Exception as e:
+             logger.info("\nDateParseError:")
+             logger.info(e)
+             logger.info(
+                 "\t\t> Trying again removing apostrophes in timestamp (old files format)"
+             )
+             df.index = pd.to_datetime(df.index.str.replace('"', ""))
+
+     return df
+
+
+ def _parse_toa5(conf) -> pd.DataFrame:
+     df = _parse_csv_default(conf)
+     # TODO: Convert to xr.DataSet to allow for metadata enrichment
+     try:
+         meta = toa5.read_metadata(conf["file"])
+         tao5_attrs = meta.get("attrs", {})
+         tao5_attrs["file_format"] = tao5_attrs.pop("format")
+     except Exception as e:
+         logger.warning(f"Failed to enrich TOA5 metadata for {conf['file']}: {e}")
+     return df
+
+
+ def load_data_file(conf: Dict) -> xr.Dataset:
+     """Read L0 data file to xarray.Dataset using config dictionary and
+     populate with initial metadata. The file type is detected automatically.
+
+     Parameters
+     ----------
+     conf : dict
+         Configuration parameters
+     delimiter : str
+     comment: str
+
+     Returns
+     -------
+     ds : xr.Dataset
+         L0 data
+     """
+     file_type = _detect_file_type(conf)
+     logger.info(f"Detected L0 file type '{file_type}' for {conf.get('file')}")
+
+     if file_type == "csv_v1":
+         df = _parse_csv_v1(conf)
+     elif file_type == "csv_default":
+         df = _parse_csv_default(conf)
+     elif file_type == "toa5":
+         df = _parse_toa5(conf)
+     else:
+         raise ValueError(f"Unknown file type: {file_type}")
+
+     df = _postprocess_dataframe(df, time_offset=conf.get("time_offset"))
+
+     # Carry relevant metadata with ds
+     ds = xr.Dataset.from_dataframe(df)
+     ds.attrs["level"] = "L0"
+     ds.attrs["detected_file_type"] = file_type
+     ds.attrs["filename"] = Path(conf["file"]).name
+
+     # populate meta with config keys
+     skip = ["columns", "skiprows", "modem", "file", "conf", "nodata"]
+     for k, v in conf.items():
+         if k not in skip:
+             ds.attrs[k] = v
+
+     return ds
+
+
+ def load_data_files(config: Dict[str, Dict]) -> List[xr.Dataset]:
+     """Load level 0 (L0) data from config mapping file names to configuration.
+
+     Tries read_l0_file() using the config with msg_lat & msg_lon appended.
+     If a pandas.errors.ParserError occurs due to mismatched columns, removes
+     msg_lat & msg_lon from the config and tries again.
+
+     Parameters
+     ----------
+     config : Dict[str, Dict]
+         Configuration dictionary as returned by pypromice.io.load.getConfig
+
+     Returns
+     -------
+     List[xr.Dataset]
+         List of L0 datasets
+     """
+     ds_list: List[xr.Dataset] = []
+     for k in config.keys():
+         target = config[k]
+         try:
+             ds_list.append(load_data_file(target))
+         except pd.errors.ParserError:
+             for item in ["msg_lat", "msg_lon"]:
+                 if item in target["columns"]:
+                     target["columns"].remove(item)
+             ds_list.append(load_data_file(target))
+         logger.info(f"L0 data successfully loaded from {k}")
+     return ds_list
+
+
+ def _postprocess_dataframe(
+     df: pd.DataFrame, time_offset: Optional[float] = None
+ ) -> pd.DataFrame:
+     """Apply common post-processing to parsed L0 dataframe."""
+     if time_offset is not None:
+         df.index = df.index + timedelta(hours=time_offset)
+     # Drop SKIP columns
+     for c in list(df.columns):
+         if c.startswith("SKIP"):
+             df.drop(columns=c, inplace=True)
+     return df
+
+
+ def load_config(
+     config_file: str | Path,
+     inpath: str | Path,
+     default_columns: Sequence[str] = ("msg_lat", "msg_lon"),
+ ):
+     """Load configuration from .toml file. PROMICE .toml files support defining
+     features at the top level which apply to all nested properties, but do not
+     overwrite nested properties if they are defined
+
+     Parameters
+     ----------
+     config_file
+         TOML file path
+     inpath
+         Input folder directory where L0 files can be found
+
+     Returns
+     -------
+     conf : dict
+         Configuration dictionary
+     """
+     config_file = Path(config_file)
+     inpath = Path(inpath)
+
+     conf = toml.load(config_file) # Move all top level keys to nested properties,
+     top = [
+         _ for _ in conf.keys() if not type(conf[_]) is dict
+     ] # if they are not already defined in the nested properties
+     subs = [
+         _ for _ in conf.keys() if type(conf[_]) is dict
+     ] # Insert the section name (config_file) as a file property and config file
+     for s in subs:
+         for t in top:
+             if t not in conf[s].keys():
+                 conf[s][t] = conf[t]
+
+         conf[s]["conf"] = config_file.as_posix()
+         conf[s]["file"] = os.path.join(inpath, s)
+         conf[s]["columns"].extend(default_columns)
+
+     for t in top:
+         conf.pop(t) # Delete all top level keys beause each file
+         # should carry all properties with it
+     for k in conf.keys(): # Check required fields are present
+         for field in ["columns", "station_id", "format", "skiprows"]:
+             assert field in conf[k].keys(), field + " not in config keys"
+     return conf
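For orientation, a minimal usage sketch of the new ingest entry points (the TOML and data paths are hypothetical; the function names and attrs keys come from the code above):

    from pypromice.io.ingest.l0 import load_config, load_data_files

    # Hypothetical paths: each TOML section name must match an L0 file under inpath
    config = load_config("config/qas_l.toml", "data/qas_l/tx/")
    datasets = load_data_files(config)  # list of xr.Dataset, one per L0 file
    for ds in datasets:
        print(ds.attrs["filename"], ds.attrs["detected_file_type"], ds.attrs["level"])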
pypromice/io/ingest/l0_repository.py (new)
@@ -0,0 +1,103 @@
+ """
+ Module for managing Level 0 data repositories for station-based datasets.
+
+ This module provides an abstraction for interacting with Level 0 (L0) datasets through
+ a repository interface. Two implementations are detailed: the `L0Repository` protocol
+ defines the interface, and `L0RepositoryFS` implements the interface using a file system-based
+ repository structure. This is intended for managing both raw and transformed datasets, along
+ with their configurations, for multiple stations.
+
+ Classes:
+     L0Repository: Protocol interface for accessing L0 datasets and metadata.
+     L0RepositoryFS: File system-based implementation of the `L0Repository` protocol.
+
+ Functions and attributes exposed:
+     - Methods to query and manage raw and transformed datasets.
+     - Mechanisms to verify dataset presence and access configuration paths.
+
+ """
+
+ import dataclasses
+ from pathlib import Path
+ from typing import List, Protocol, Iterable
+
+ import xarray as xr
+
+ __all__ = [
+     "L0Repository",
+     "L0RepositoryFS",
+ ]
+
+ from .l0 import load_config, load_data_files
+
+
+ class L0Repository(Protocol):
+     def get_tx(self, station_id: str) -> Iterable[xr.Dataset]: ...
+     def get_raw(self, station_id: str) -> Iterable[xr.Dataset]: ...
+     def get_available_stations(self) -> Iterable[str]: ...
+     def contains_tx(self, station_id: str) -> bool: ...
+     def contains_raw(self, station_id: str) -> bool: ...
+
+
+ @dataclasses.dataclass(slots=True)
+ class L0RepositoryFS:
+     root: Path
+
+     template_tx_config = "tx/config/{station_id}.toml"
+     template_tx_data_root = "tx/"
+     template_raw_config = "raw/config/{station_id}.toml"
+     template_row_data_root = "raw/{station_id}/"
+
+     def get_tx_config_path(self, station_id: str) -> Path:
+         return self.root / self.template_tx_config.format(station_id=station_id)
+
+     def get_tx_data_root(self, station_id: str) -> Path:
+         return self.root / self.template_tx_data_root.format(station_id=station_id)
+
+     def get_raw_config_path(self, station_id: str) -> Path:
+         return self.root / self.template_raw_config.format(station_id=station_id)
+
+     def get_raw_data_root(self, station_id: str) -> Path:
+         return self.root / self.template_row_data_root.format(station_id=station_id)
+
+     def contains_tx(self, station_id: str) -> bool:
+         return self.get_tx_config_path(station_id).exists()
+
+     def contains_raw(self, station_id: str) -> bool:
+         return self.get_raw_config_path(station_id).exists()
+
+     def get_tx(self, station_id: str) -> List[xr.Dataset]:
+         return load_data_files(self.get_tx_config(station_id))
+
+     def get_tx_config(self, station_id):
+         return load_config(
+             self.get_tx_config_path(station_id),
+             self.get_tx_data_root(station_id),
+         )
+
+     def get_raw(self, station_id: str) -> List[xr.Dataset]:
+         return load_data_files(self.get_raw_config(station_id))
+
+     def get_raw_config(self, station_id):
+         return load_config(
+             self.get_raw_config_path(station_id),
+             self.get_raw_data_root(station_id),
+         )
+
+     def get_available_stations(self) -> List[str]:
+         """
+         Iterate over all available station configuration files
+
+         """
+         tx_pattern = self.get_tx_config_path("*")
+         raw_pattern = self.get_raw_config_path("*")
+
+         station_ids = {
+             p.stem
+             for p in [
+                 *tx_pattern.parent.glob(tx_pattern.name),
+                 *raw_pattern.parent.glob(raw_pattern.name),
+             ]
+         }
+
+         return sorted(station_ids)
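A short sketch of how the file-system repository is used, assuming an L0 root laid out according to the tx/ and raw/ path templates defined on the class (the root path itself is made up):

    from pathlib import Path
    from pypromice.io.ingest.l0_repository import L0RepositoryFS

    repo = L0RepositoryFS(root=Path("/data/aws-l0"))  # hypothetical root directory
    for station_id in repo.get_available_stations():
        if repo.contains_tx(station_id):
            tx_datasets = repo.get_tx(station_id)  # loads via load_config + load_data_files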
pypromice/io/ingest/toa5.py (new)
@@ -0,0 +1,87 @@
+ """
+ This module provides functionality to read and convert Campbell Scientific TOA5 files into xarray
+ datasets. It extracts metadata, variable names, units, and statistical types, and formats the
+ data for further analysis.
+ """
+ from pathlib import Path
+ from typing import Dict
+
+ import pandas as pd
+ import xarray as xr
+
+
+ def read_metadata(filepath: Path|str, raise_exception_on_error: bool = False) -> Dict | None:
+     # 1) Read the first four lines manually
+     with open(filepath, 'r', encoding='utf-8') as f:
+         # strip quotes and newline
+         meta_vals = next(f).strip().replace('"', '').split(',')
+         names = next(f).strip().replace('"', '').split(',')
+         units = next(f).strip().replace('"', '').split(',')
+         stats = next(f).strip().replace('"', '').split(',')
+
+     # Verify the format
+     if meta_vals[0] != 'TOA5':
+         if raise_exception_on_error:
+             raise ValueError(f"Unsupported file format: {meta_vals[0]}")
+         else:
+             return None
+
+     # 2) Map the first-line values to a set of metadata keys
+     attrs = {
+         "format"           : meta_vals[0],  # e.g. TOA5
+         "station_name"     : meta_vals[1],  # e.g. qas_l_21_correct
+         "datalogger"       : meta_vals[2],  # e.g. CR1000
+         "serial_number"    : meta_vals[3],  # e.g. E6745
+         "os_version"       : meta_vals[4],  # e.g. CR1000.Std.16
+         "program_name"     : meta_vals[5],  # e.g. Promice2015e.CR1
+         "program_signature": meta_vals[6],  # e.g. 65241
+         "table_name"       : meta_vals[7],  # e.g. SlimTableMem
+     }
+
+     return dict(
+         names=names,
+         units=units,
+         stats=stats,
+         attrs=attrs,
+     )
+
+
+ def read(filepath: Path, **kwargs) -> xr.DataArray | None:
+     """
+     Read a Campbell TOA5 file and return as an xarray.Dataset.
+
+     - Line 1 → dataset.attrs (metadata)
+     - Line 2 → variable names
+     - Line 3 → variable units
+     - Line 4 → statistic/type (e.g. Avg, Smp)
+     - Remaining lines → data (with TIMESTAMP parsed as datetime index)
+     """
+
+     metadata = read_metadata(filepath, **kwargs)
+     if metadata is None:
+         return None
+
+
+     # 3) Read the rest of the file into a DataFrame
+     df = pd.read_csv(
+         filepath,
+         skiprows=4,
+         header=None,
+         names=metadata['names'],
+         parse_dates=["TIMESTAMP"],
+         index_col="TIMESTAMP",
+         na_values=('NAN', '')
+     )
+
+     # 4) Build an xarray.Dataset
+     ds = xr.Dataset.from_dataframe(df)
+     ds.attrs.update(metadata['attrs'])
+
+     # 5) Attach per-variable attributes
+     for name, unit, stat in zip(metadata['names'], metadata['units'], metadata['stats']):
+         # skip if the column wasn't read (e.g. extra blank columns)
+         if name in ds:
+             ds[name].attrs["units"] = unit
+             ds[name].attrs["statistic"] = stat
+
+     return ds
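And a minimal sketch of reading a Campbell TOA5 logger file with the new module (the file name is a placeholder; the attribute keys are the ones set by read_metadata above):

    from pypromice.io.ingest import toa5

    ds = toa5.read("station_table.dat")  # returns None if the first header field is not "TOA5"
    if ds is not None:
        print(ds.attrs["station_name"], ds.attrs["datalogger"])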
@@ -10,7 +10,7 @@ from pathlib import Path
 
  import numpy as np
  import pandas as pd
- from pypromice.process.resample import resample_dataset
+ from pypromice.pipeline.resample import resample_dataset
  import pypromice.resources
 
  logger = logging.getLogger(__name__)