PyPI - toolbox-utils - Versions diffs - 5.0.8__py3-none-any.whl - Mend

toolbox-utils 5.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

toolbox_utils/__init__.py +0 -0
toolbox_utils/readers/__init__.py +0 -0
toolbox_utils/readers/hbn.py +392 -0
toolbox_utils/readers/plotgen.py +55 -0
toolbox_utils/readers/wdm.py +233 -0
toolbox_utils/tssplit/setup.py +38 -0
toolbox_utils/tssplit/tssplit/__init__.py +1 -0
toolbox_utils/tssplit/tssplit/tssplit.py +45 -0
toolbox_utils/tsutils.py +2638 -0
toolbox_utils-5.0.8.dist-info/LICENSE.txt +27 -0
toolbox_utils-5.0.8.dist-info/METADATA +117 -0
toolbox_utils-5.0.8.dist-info/RECORD +14 -0
toolbox_utils-5.0.8.dist-info/WHEEL +5 -0
toolbox_utils-5.0.8.dist-info/top_level.txt +1 -0

toolbox_utils/__init__.py ADDED Viewed

File without changes

toolbox_utils/readers/__init__.py ADDED Viewed

File without changes

toolbox_utils/readers/hbn.py ADDED Viewed

@@ -0,0 +1,392 @@
+"""hspfbintoolbox to read HSPF binary files."""
+import datetime
+import struct
+import sys
+try:
+    from typing import Literal
+except ImportError:
+    from typing import Literal
+import pandas as pd
+from .. import tsutils
+# Output level codes found in the binary file and the interval name used for
+# each of them.
+code2intervalmap = {
+    5: "yearly",
+    4: "monthly",
+    3: "daily",
+    2: "bivl",
+}
+# Reverse lookup (interval name -> level code), derived from the map above.
+interval2codemap = {name: code for code, name in code2intervalmap.items()}
+# Frequency alias handed to pd.Period for each level code; "bivl" (2) has no
+# fixed alias and is worked out from the data instead.
+code2freqmap = {
+    5: "A",
+    4: "M",
+    3: "D",
+    2: None,
+}
+# Shared docstring fragments keyed by parameter name.
+_LOCAL_DOCSTRINGS = {
+    "hbnfilename": """hbnfilename: str
+        The HSPF binary output file.  This file must have been created from
+        a completed model run."""
+}
+def tuple_match(findme, hay):
+    """Tell whether two sequences agree position by position.
+
+    The sequences must have the same length.  A ``None`` on either side is
+    a wildcard that matches anything in the other sequence.
+    See http://stackoverflow.com/a/4559604
+    """
+    if len(findme) != len(hay):
+        return False
+    for wanted, found in zip(findme, hay):
+        if wanted is None or found is None:
+            # wildcard position - always compatible
+            continue
+        if not wanted == found:
+            return False
+    return True
+def tuple_combine(findme, hay):
+    """Merge two matched sequences, filling wildcards from the other side.
+
+    For every position the value from ``findme`` is kept unless it is
+    ``None``, in which case the value from ``hay`` is used.  The test is
+    on ``None`` specifically, so falsy values such as ``0`` or ``""`` in
+    ``hay`` are carried through instead of being dropped.
+    See http://stackoverflow.com/a/4559604
+
+    Returns a tuple as long as the shorter of the two inputs.
+    """
+    return tuple(j if i is None else i for i, j in zip(findme, hay))
+def tuple_search(findme, haystack):
+    """Partial ordered matching with 'None' as wildcard.
+
+    Walks ``haystack`` and, for every entry that ``tuple_match`` accepts,
+    records a pair of the entry's position and the ``tuple_combine``
+    result for it.  The list of pairs is returned in haystack order.
+    See http://stackoverflow.com/a/4559604
+    """
+    hits = []
+    for position, candidate in enumerate(haystack):
+        if not tuple_match(findme, candidate):
+            continue
+        hits.append((position, tuple_combine(findme, candidate)))
+    return hits
+def _get_data(binfilename, interval="daily", labels=None, catalog_only=True):
+    """Underlying function to read from the binary file.  Used by
+    'extract', 'catalog'.
+
+    Parameters
+    ----------
+    binfilename
+        Path of the HSPF binary output file to read.
+    interval
+        One of the keys of ``interval2codemap``; anything without a
+        ``lower`` method (for example ``None``) selects every level.
+    labels
+        Iterable of "operation,lue_number,group,variable" strings or
+        4-item sequences.  Empty or "None" fields are wildcards.  Defaults
+        to a single all-wildcard label.
+    catalog_only
+        When False the matching values are collected; otherwise only the
+        period covered is reported for each matching series.
+
+    Returns
+    -------
+    (ndates, collect_dict)
+        ``ndates`` is the sorted list of datetimes of matching records.
+        ``collect_dict`` is keyed by (operation, lue, group, variable,
+        level); the values are lists of floats when ``catalog_only`` is
+        False, otherwise (start, end) ``pd.Period`` pairs.
+
+    Raises
+    ------
+    ValueError
+        For malformed labels, a file without the leading 0xFD byte, or
+        when nothing in the file matches the labels.
+    """
+    if labels is None:
+        labels = [",,,"]
+    # Valid variable groups per operation type.  The trailing "" entry is
+    # sliced off when the groups are listed in an error message.
+    testem = {
+        "PERLND": [
+            "ATEMP",
+            "SNOW",
+            "PWATER",
+            "SEDMNT",
+            "PSTEMP",
+            "PWTGAS",
+            "PQUAL",
+            "MSTLAY",
+            "PEST",
+            "NITR",
+            "PHOS",
+            "TRACER",
+            "",
+        ],
+        "IMPLND": ["ATEMP", "SNOW", "IWATER", "SOLIDS", "IWTGAS", "IQUAL", ""],
+        "RCHRES": [
+            "HYDR",
+            "CONS",
+            "HTRCH",
+            "SEDTRN",
+            "GQUAL",
+            "OXRX",
+            "NUTRX",
+            "PLANK",
+            "PHCARB",
+            "INFLOW",
+            "OFLOW",
+            "ROFLOW",
+            "",
+        ],
+        "BMPRAC": [""],
+        "": [""],
+    }
+    collect_dict = {}
+    lablist = []
+    # Normalize interval code
+    try:
+        intervalcode = interval2codemap[interval.lower()]
+    except AttributeError:
+        intervalcode = None
+    # convert label tuples to lists
+    labels = list(labels)
+    # turn into a list of lists
+    nlabels = []
+    for label in labels:
+        if isinstance(label, str):
+            nlabels.append(label.split(","))
+        else:
+            nlabels.append(label)
+    labels = nlabels
+    # Check the list members for valid values
+    for label in labels:
+        if len(label) != 4:
+            raise ValueError(
+                tsutils.error_wrapper(
+                    f"""The label '{label}' has the wrong number of entries.
+                    """
+                )
+            )
+        # replace empty fields with None
+        # operation,lue_number,group,variable
+        words = [None if (i in ("", "None")) else i for i in label]
+        # first word must be a valid operation type or None
+        if words[0] is not None:
+            # force uppercase before comparison
+            words[0] = words[0].upper()
+            if words[0] not in testem:
+                raise ValueError(
+                    tsutils.error_wrapper(
+                        f"""Operation type must be one of 'PERLND', 'IMPLND',
+                        'RCHRES', or 'BMPRAC', or missing (to get all) instead
+                        of {words[0]}.
+                        """
+                    )
+                )
+        # second word must be integer 1-999 or None or range to parse
+        if words[1] is not None:
+            try:
+                words[1] = int(words[1])
+                luelist = [words[1]]
+            except ValueError:
+                luelist = tsutils.range_to_numlist(words[1])
+            for luenum in luelist:
+                if luenum < 1 or luenum > 999:
+                    raise ValueError(
+                        tsutils.error_wrapper(
+                            f"""The land use element must be an integer from
+                            1 to 999 inclusive, instead of {luenum}.
+                            """
+                        )
+                    )
+        else:
+            luelist = [None]
+        # third word must be a valid group name or None
+        if words[2] is not None:
+            words[2] = words[2].upper()
+            if words[0] is None:
+                # No operation type was given, so there is no single group
+                # list to look up (testem has no None key).  Accept any
+                # group that is valid for at least one operation type.
+                allgroups = sorted(
+                    {grp for grps in testem.values() for grp in grps if grp}
+                )
+                if words[2] not in allgroups:
+                    raise ValueError(
+                        tsutils.error_wrapper(
+                            f"""The variable group must be one of
+                            {allgroups} when no operation type is given,
+                            instead you gave {words[2]}.
+                            """
+                        )
+                    )
+            elif words[2] not in testem[words[0]]:
+                raise ValueError(
+                    tsutils.error_wrapper(
+                        f"""The {words[0]} operation type only allows the
+                        variable groups: {testem[words[0]][:-1]}, instead you
+                        gave {words[2]}.
+                        """
+                    )
+                )
+        # fourth word is currently not checked - assumed to be a variable name
+        # if not, it will simply never be found in the file, so ok
+        # but no warning for the user - add check?
+        # add interval code as fifth word in list
+        words.append(intervalcode)
+        # add to new list of checked and expanded lists
+        for luenum in luelist:
+            words[1] = luenum
+            lablist.append(list(words))
+    # Now read through the binary file and collect the data matching the labels
+    with open(binfilename, "rb") as binfp:
+        labeltest = set()
+        vnames = {}
+        ndates = set()
+        # read first byte - must be hex FD (decimal 253) for valid file.
+        magicbyte = binfp.read(1)
+        if magicbyte != b"\xfd":
+            # not a valid HSPF binary file
+            raise ValueError(
+                tsutils.error_wrapper(
+                    f"""{binfilename} is not a valid HSPF binary output file
+                    (.hbn),  The first byte must be FD hexadecimal, but it was
+                    {magicbyte}.
+                    """
+                )
+            )
+        # loop through each record
+        while True:
+            # reinitialize counter for record length - used to compute skip at
+            # end
+            recpos = 0
+            # read first four bytes to get record length bitfield
+            try:
+                reclen1, reclen2, reclen3, reclen = struct.unpack("4B", binfp.read(4))
+                recpos += 4
+            except struct.error:
+                # End of file.
+                break
+            # get record leader - next 24 bytes
+            rectype, optype, lue, group = struct.unpack("I8sI8s", binfp.read(24))
+            recpos += 24
+            # clean up
+            rectype = int(rectype)
+            lue = int(lue)
+            optype = optype.strip()
+            group = group.strip()
+            if rectype == 0:
+                # header record - collect variable names for this
+                # operation and group
+                # parse reclen bitfield to get actual remaining length
+                # the " - 24 " subtracts the 24 bytes already read
+                reclen1 = int(reclen1 / 4)
+                reclen2 = reclen2 * 64 + reclen1
+                reclen3 = reclen3 * 16384 + reclen2
+                reclen = reclen * 4194304 + reclen3 - 24
+                # loop through rest of record
+                slen = 0
+                while slen < reclen:
+                    # read single 4B word for length of next variable name
+                    length = struct.unpack("I", binfp.read(4))[0]
+                    # read the variable name
+                    variable_name = struct.unpack(f"{length}s", binfp.read(length))[0]
+                    # append the name to the list kept for this (lue, group);
+                    # the list order is what pairs names with the values of
+                    # the data records below
+                    vnames.setdefault((lue, group), []).append(variable_name)
+                    # update how far along the record we are
+                    slen += length + 4
+                    recpos += length + 4
+            elif rectype == 1:
+                # Data record
+                # record should contain a value for each variable name for this
+                # operation and group
+                numvals = len(vnames[(lue, group)])
+                (_, level, year, month, day, hour, minute) = struct.unpack(
+                    "7I", binfp.read(28)
+                )
+                recpos += 28
+                vals = struct.unpack(f"{numvals}f", binfp.read(4 * numvals))
+                recpos += 4 * numvals
+                # NOTE(review): hour 24 is mapped to 00:00 of the same day,
+                # with no roll-over to the next day - confirm this is the
+                # intended time stamp convention.
+                if hour == 24:
+                    hour = 0
+                ndate = datetime.datetime(year, month, day, hour, minute)
+                #  Go through labels to see if these values need to be
+                #  collected
+                for i, vname in enumerate(vnames[(lue, group)]):
+                    tmpkey = (
+                        optype.decode("ascii"),
+                        lue,
+                        group.decode("ascii"),
+                        vname.decode("ascii"),
+                        level,
+                    )
+                    for lbl in lablist:
+                        res = tuple_search(tmpkey, [lbl])
+                        if not res:
+                            continue
+                        labeltest.add(tuple(lbl))
+                        nres = res[0][1]
+                        ndates.add(ndate)
+                        if catalog_only is False:
+                            if intervalcode == level:
+                                collect_dict.setdefault(nres, []).append(vals[i])
+                        else:
+                            collect_dict[nres] = level
+            else:
+                # there was a problem with unexpected record length
+                # back up almost all the way and try again
+                binfp.seek(-31, 1)
+            # calculate and skip to the end of the variable-length back pointer
+            reccnt = recpos * 4 + 1
+            if reccnt >= 256**2:
+                skbytes = 3
+            elif reccnt >= 256:
+                skbytes = 2
+            else:
+                skbytes = 1
+            binfp.read(skbytes)
+    if not collect_dict:
+        raise ValueError(
+            tsutils.error_wrapper(
+                f"""The label specifications below matched no records in the
+                binary file.
+                {lablist}
+                """
+            )
+        )
+    ndates = sorted(ndates)
+    if catalog_only is False:
+        # warn about every label that never matched a record
+        for lbl in lablist:
+            if tuple(lbl) not in labeltest:
+                sys.stderr.write(
+                    tsutils.error_wrapper(
+                        f"""Warning: The label '{lbl}' matched no records in
+                        the binary file.
+                        """
+                    )
+                )
+    else:
+        # catalog mode: replace the stored level by the (start, end) periods
+        for key in collect_dict:
+            # level 2 ("bivl") has no fixed alias, use the first time step
+            delta = ndates[1] - ndates[0] if key[4] == 2 else code2freqmap[key[4]]
+            collect_dict[key] = (
+                pd.Period(ndates[0], freq=delta),
+                pd.Period(ndates[-1], freq=delta),
+            )
+    return ndates, collect_dict
+def hbn_extract(
+    hbnfilename: str,
+    interval: Literal["yearly", "monthly", "daily", "bivl"],
+    *labels,
+    sort_columns: bool = False,
+):
+    """Read the labelled series of an HSPF binary output file into a DataFrame.
+
+    ``interval`` is compared case-insensitively and must be "bivl", "daily",
+    "monthly" or "yearly", otherwise ValueError is raised.  ``labels`` are
+    passed on to ``_get_data``.  With ``sort_columns`` the columns are
+    ordered by everything in the series key after the operation type.
+    The result has one column per series, named
+    "<operation>_<lue>_<variable>" with blanks replaced by "-", and a
+    period index called "Datetime".
+    """
+    interval = interval.lower()
+    if interval not in ("bivl", "daily", "monthly", "yearly"):
+        raise ValueError(
+            tsutils.error_wrapper(
+                f"""The "interval" argument must be one of "bivl", "daily",
+                "monthly", or "yearly".  You supplied "{interval}".
+                """
+            )
+        )
+    dates, collected = _get_data(hbnfilename, interval, labels, catalog_only=False)
+    ordered_keys = list(collected.keys())
+    if sort_columns:
+        ordered_keys.sort(key=lambda key: key[1:])
+    # one Series per collected key, all sharing the record dates as index
+    columns_data = [pd.Series(collected[key], index=dates) for key in ordered_keys]
+    combined = pd.concat(columns_data, sort=False, axis=1)
+    result = pd.DataFrame(combined.reindex(pd.Index(dates)))
+    result.columns = [
+        f"{key[0]}_{key[1]}_{key[3]}".replace(" ", "-") for key in ordered_keys
+    ]
+    if interval == "bivl":
+        # "bivl" has no fixed frequency; take it from the first time step
+        step = result.index[1] - result.index[0]
+        result.index = result.index.to_period(step)
+    else:
+        result.index = result.index.to_period()
+    result.index.name = "Datetime"
+    return result

toolbox_utils/readers/plotgen.py ADDED Viewed

@@ -0,0 +1,55 @@
+"""For reading HSPF plotgen files."""
+import datetime
+import pandas as pd
+# Zero-based index of the line that separates header from data; that line
+# itself is neither scanned for column names nor parsed as data.
+_END_OF_HEADER = 25
+def plotgen_extract(filename):
+    """Read an HSPF PLTGEN file into a DataFrame indexed by "Datetime".
+
+    Column names are taken from the header lines that follow the line
+    containing "LINTYP"; collection stops at the first blank name or at a
+    line whose text from column 5 starts with "Time series".  Every line
+    after the header supplies one row: the date fields from columns 4-21
+    and the values from column 23 onward.  An hour of 24 becomes midnight
+    of the following day.
+    """
+    in_column_list = False
+    names = []
+    rows = []
+    with open(filename, encoding="ascii") as handle:
+        for lineno, text in enumerate(handle):
+            if lineno == _END_OF_HEADER:
+                continue
+            if lineno < _END_OF_HEADER:
+                if "LINTYP" in text:
+                    in_column_list = True
+                elif text[5:].startswith("Time series"):
+                    in_column_list = False
+                elif in_column_list:
+                    label = text[4:30].strip()
+                    if label:
+                        names.append(label)
+                    else:
+                        in_column_list = False
+                continue
+            year, month, day, hour, minute = text[4:22].split()
+            if int(hour) == 24:
+                # datetime() rejects hour 24: use 00:00 of the next day
+                stamp = datetime.datetime(
+                    int(year), int(month), int(day), tzinfo=None
+                ) + datetime.timedelta(days=1)
+            else:
+                stamp = datetime.datetime(
+                    int(year),
+                    int(month),
+                    int(day),
+                    int(hour),
+                    int(minute),
+                    tzinfo=None,
+                )
+            values = [float(token) for token in text[23:].split()]
+            rows.append([stamp] + values)
+    pgdf = pd.DataFrame(rows)
+    pgdf.columns = ["Datetime"] + names
+    pgdf = pgdf.set_index(["Datetime"])
+    pgdf.index = pd.DatetimeIndex(pgdf.index)
+    return pgdf

toolbox_utils/readers/wdm.py ADDED Viewed

@@ -0,0 +1,233 @@
+"""
+Pure python WDM file reader.
+"""
+from datetime import datetime
+import numpy as np
+import pandas as pd
+# look up attributes NAME, data type (Integer; Real; String) and data length by attribute number
+# Each value is (NAME, type letter, length).  wdm_extract reads "I" entries
+# from the int32 view, "R" entries from the float32 view, and decodes
+# length // 4 consecutive words (4 characters per word) for every other
+# type letter.
+attrinfo = {
+    1: ("TSTYPE", "S", 4),
+    2: ("STAID", "S", 16),
+    11: ("DAREA", "R", 1),
+    17: ("TCODE", "I", 1),
+    27: ("TSBYR", "I", 1),
+    28: ("TSBMO", "I", 1),
+    29: ("TSBDY", "I", 1),
+    30: ("TSBHR", "I", 1),
+    32: ("TFILL", "R", 1),
+    33: ("TSSTEP", "I", 1),
+    34: ("TGROUP", "I", 1),
+    45: ("STNAM", "S", 48),
+    83: ("COMPFG", "I", 1),
+    84: ("TSFORM", "I", 1),
+    85: ("VBTIME", "I", 1),
+    444: ("A444", "S", 12),
+    443: ("A443", "S", 12),
+    22: ("DCODE", "I", 1),
+    10: ("DESCRP", "S", 80),
+    7: ("ELEV", "R", 1),
+    8: ("LATDEG", "R", 1),
+    9: ("LNGDEG", "R", 1),
+    288: ("SCENARIO", "S", 8),
+    289: ("CONSTITUENT", "S", 8),
+    290: ("LOCATION", "S", 8),
+}
+# Time-unit code -> pandas frequency alias.  wdm_extract indexes this table
+# with both the TGROUP and the TCODE attribute of a data set.
+freq = {
+    7: "100YS",
+    6: "YS",
+    5: "MS",
+    4: "D",
+    3: "H",
+    2: "min",
+    1: "S",
+}  # pandas date_range() frequency by TCODE, TGROUP
+def wdm_extract(wdmfile, *idsn):
+    """Extract DSN from WDM file.
+
+    Parameters
+    ----------
+    wdmfile
+        The WDM file, handed to ``np.fromfile``.
+    *idsn
+        Data set numbers to extract; each is converted with ``int``.
+
+    Returns
+    -------
+    pandas.DataFrame
+        One column per extracted data set, named "<wdmfile>_<dsn>", outer
+        joined on the time index.  Values equal to the data set's TFILL
+        attribute are dropped.  Empty when no requested data set holds
+        data.
+
+    Raises
+    ------
+    ValueError
+        If the first 32-bit word of the file is not -998.
+    """
+    idsn = [int(i) for i in idsn]
+    iarray = np.fromfile(wdmfile, dtype=np.int32)
+    # The same bytes seen as float32 - a view, so the file is read only once.
+    farray = iarray.view(np.float32)
+    if iarray[0] != -998:
+        raise ValueError("Not a WDM file, magic number is not -998. Stopping!")
+    nrecords = iarray[28]  # first record is File Definition Record
+    ntimeseries = iarray[31]
+    # Start index of every 512-word record that looks like the first record
+    # of a time series data set.
+    dsnlist = [
+        index
+        for index in range(512, nrecords * 512, 512)
+        if (
+            not (
+                iarray[index] == iarray[index + 1] == iarray[index + 2] == 0
+                and iarray[index + 3]
+            )
+            and iarray[index + 5] == 1
+        )
+    ]
+    if len(dsnlist) != ntimeseries:
+        print("PROGRAM ERROR, wrong number of DSN records found")
+    retdf = pd.DataFrame()
+    for index in dsnlist:
+        # get layout information for TimeSeries Dataset frame
+        dsn = iarray[index + 4]
+        if dsn not in idsn:
+            continue
+        psa = iarray[index + 9]
+        # With no attribute block (psa <= 0) there is nothing to read; a
+        # count of zero keeps the attribute loop below from running with an
+        # unbound or stale value.
+        sacnt = iarray[index + psa - 1] if psa > 0 else 0
+        pdat = iarray[index + 10]
+        pdatv = iarray[index + 11]
+        # get attributes
+        dattr = {
+            "TSBDY": 1,
+            "TSBHR": 1,
+            "TSBMO": 1,
+            "TSBYR": 1900,
+            "TFILL": -999.0,
+        }  # preset defaults
+        # attributes are stored as (attribute number, pointer) word pairs
+        for i in range(psa + 1, psa + 1 + 2 * sacnt, 2):
+            iarray_id = iarray[index + i]
+            ptr = iarray[index + i + 1] - 1 + index
+            if iarray_id not in attrinfo:
+                print(
+                    "PROGRAM ERROR: ATTRIBUTE INDEX not found",
+                    iarray_id,
+                    "Attribute pointer",
+                    iarray[index + i + 1],
+                )
+                continue
+            name, atype, length = attrinfo[iarray_id]
+            if atype == "I":
+                dattr[name] = iarray[ptr]
+            elif atype == "R":
+                dattr[name] = farray[ptr]
+            else:
+                dattr[name] = "".join(
+                    [itostr(iarray[k]) for k in range(ptr, ptr + length // 4)]
+                ).strip()
+        # Get timeseries timebase data
+        records = []
+        for i in range(pdat + 1, pdatv - 1):
+            if a_record := iarray[index + i]:
+                records.append(splitposition(a_record))
+        if not records:
+            continue  # WDM preallocated, but nothing saved here yet
+        srec, soffset = records[0]
+        start = splitdate(iarray[srec * 512 + soffset])
+        # calculate number of data points in each group, tindex is final index
+        # for storage
+        tgroup = dattr["TGROUP"]
+        tstep = dattr["TSSTEP"]
+        tcode = dattr["TCODE"]
+        cindex = pd.date_range(start=start, periods=len(records) + 1, freq=freq[tgroup])
+        tindex = pd.date_range(
+            start=start, end=cindex[-1], freq=str(tstep) + freq[tcode]
+        )
+        counts = np.diff(np.searchsorted(tindex, cindex))
+        # Get timeseries data
+        floats = np.zeros(sum(counts), dtype=np.float32)
+        findex = 0
+        for (rec, offset), count in zip(records, counts):
+            findex = getfloats(iarray, farray, floats, findex, rec, offset, count)
+        series = pd.DataFrame(floats[:findex], index=tindex[:findex])
+        series = series[series[0] != dattr["TFILL"]]
+        series.columns = [f"{wdmfile}_{dsn}"]
+        retdf = retdf.join(series, how="outer")
+    return retdf
+def todatetime(year=1900, month=1, day=1, hour=0):
+    """Turn year, month, day and hour into a point in time.
+
+    An hour of 24 is accepted and yields midnight at the start of the
+    following day; every other hour is passed to ``datetime`` unchanged.
+    """
+    if hour == 24:
+        # datetime() rejects hour 24, so go one hour past 23:00 instead
+        return datetime(year, month, day, 23) + pd.Timedelta(1, "h")
+    return datetime(year, month, day, hour)
+def splitdate(datwrd):
+    """Unpack a WDM compressed date word (int32 DATWRD).
+
+    The word is decoded as year * 16384 + month * 1024 + day * 32 + hour
+    and the four fields are handed to ``todatetime``, whose result is
+    returned.
+    """
+    # (divisor, modulus) for year, month and day, in that order
+    layout = ((16384, 131072), (1024, 16), (32, 32))
+    year, month, day = (int((datwrd / scale) % span) for scale, span in layout)
+    hour = int(datwrd % 32)
+    return todatetime(year, month, day, hour)
+def splitposition(recoffset):
+    """Split a packed int32 pointer into a zero-based (record, offset) pair.
+
+    The record number is held above the low 9 bits and the offset in the
+    low 9 bits; both are reduced by one for Python indexing.
+    """
+    record = recoffset >> 9
+    offset = recoffset & 511
+    return (record - 1, offset - 1)
+def itostr(i):
+    """Decode the four bytes of an integer into a 4-character string.
+
+    The lowest byte supplies the first character and the highest byte the
+    last one.
+    """
+    return "".join(chr(i >> shift & 255) for shift in (0, 8, 16, 24))
+def getfloats(iarray, farray, floats, findex, rec, offset, count):
+    """Copy the values of one group of data blocks into ``floats``.
+
+    Reading starts one word after ``rec * 512 + offset``.  Each block
+    begins with a control word taken from ``iarray``: its upper 16 bits
+    give the number of values and bits 5-6 flag a compressed block.  A
+    compressed block stores a single value in ``farray`` that is repeated;
+    an uncompressed block stores its values one after another.  When the
+    end of a 512-word record is reached, reading continues in the record
+    named by that record's forward data pointer.
+
+    Values are written to ``floats`` from position ``findex`` on, until at
+    least ``count`` values were handled or ``floats`` is full.  Returns
+    the position after the last value written.
+    """
+    capacity = len(floats)
+    cursor = rec * 512 + offset + 1
+    record_end = (rec + 1) * 512
+    handled = 0
+    while handled < count and findex < capacity:
+        if cursor >= record_end:
+            # word 3 is the forward data pointer, -1 for python indexing
+            rec = iarray[rec * 512 + 3] - 1
+            # data of the follow-on record starts at word 4
+            cursor = rec * 512 + 4
+            record_end = (rec + 1) * 512
+        block_word = iarray[cursor]
+        cursor += 1
+        nval = block_word >> 16
+        compressed = bool(block_word >> 5 & 0x3)
+        for k in range(nval):
+            if findex >= capacity:
+                return findex
+            floats[findex] = farray[cursor] if compressed else farray[cursor + k]
+            findex += 1
+        # a compressed block occupies one data word, otherwise nval words
+        cursor += 1 if compressed else nval
+        handled += nval
+    return findex