jupyter_analysis_tools-1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
+ # -*- coding: utf-8 -*-
+ # __init__.py
+
+ __version__ = "1.7.0"
+
+ from .binning import reBin
+ from .git import checkRepo, isNBstripoutActivated, isNBstripoutInstalled, isRepo
+ from .plotting import createFigure, plotPDH
+ from .readdata import readdata, readPDH, readPDHmeta, readSSF, readSSFZ
+ from .utils import naturalKey, setLocaleUTF8
+ from .widgets import PathSelector, showBoolStatus
+
+ setLocaleUTF8()
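
The initializer re-exports the package's public helpers, so downstream notebooks can import everything from the top-level namespace. A minimal sketch of that import style (assuming the installed module is named `jupyter_analysis_tools` and that `naturalKey` produces natural-order sort keys, as its name suggests):

    # import helpers re-exported by __init__.py
    from jupyter_analysis_tools import naturalKey, reBin

    # natural-order sorting: 'run2' sorts before 'run10'
    print(sorted(["run10.dat", "run2.dat"], key=naturalKey))
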
@@ -0,0 +1,47 @@
+ # -*- coding: utf-8 -*-
+ # analysis.py
+
+ import numpy as np
+
+
+ # from https://stackoverflow.com/a/22357811
+ # and https://github.com/joferkington/oost_paper_code/blob/master/utilities.py#L167
+ # (code with MIT License)
+ def getModZScore(points):
+     """
+     Returns the modified z-score of each observation, based on the median
+     absolute deviation (MAD). Compare the result against a threshold
+     (classically 3.5 on the 0.6745-scaled score) to obtain a boolean
+     outlier mask.
+     **Note**:
+     Similar to https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.zscore.html
+     but uses the median instead of the mean.
+
+     :param points: A numobservations by numdimensions array of observations.
+
+     Returns
+     -------
+     score: numpy array
+         A numobservations-length array of modified z-scores.
+
+     References
+     ----------
+     Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
+     Handle Outliers", The ASQC Basic References in Quality Control:
+     Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
+     """
+     if len(points.shape) == 1:
+         points = points[:, None]
+     median = np.median(points, axis=0)
+     diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
+     med_abs_deviation = np.median(diff)
+
+     # the classic modified z-score scales by 0.6745, the standard normal
+     # quantile function at 0.75, see also:
+     # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.median_abs_deviation.html
+     # modified_z_score = 0.6745 * diff / med_abs_deviation
+     # omitting that factor makes this indicator =1 at the MAD itself,
+     # which is more intuitive to read
+     modified_z_score = diff / med_abs_deviation
+
+     return modified_z_score
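
Since getModZScore returns the raw diff/MAD ratio rather than a boolean mask, callers apply their own cutoff. A minimal sketch (the sample data are hypothetical; 3.5 is the Iglewicz-Hoaglin cutoff for the 0.6745-scaled score):

    import numpy as np

    # assuming the module is importable as jupyter_analysis_tools.analysis
    from jupyter_analysis_tools.analysis import getModZScore

    data = np.array([1.0, 1.1, 0.9, 1.05, 0.95, 8.0])
    score = getModZScore(data)
    # rescale by 0.6745 to compare against the classic 3.5 cutoff
    outliers = 0.6745 * score > 3.5
    print(data[outliers])  # [8.]
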
@@ -0,0 +1,443 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # binning.py
+
+ """
+ Overview
+ ========
+ 1D rebinning.
+ Takes an input file, reads, rebins and writes the result.
+ Rebins to logarithmic bins by default.
+ """
+
+ # __author__ = "Brian R. Pauw"
+ # __contact__ = "brian@stack.nl"
+ # __license__ = "GPLv3+"
+ # __date__ = "2015/01/09"
+ # __status__ = "beta"
+
+ import argparse
+ import itertools
+ import os
+ import sys
+
+ import numpy as np
+ import pandas
+ from numpy import argsort, log10, reshape, shape, size, sqrt, zeros
+
+
+ def argparser():
+     parser = argparse.ArgumentParser(
+         description="""
+         Re-binning function, reads three-column ASCII input files,
+         and outputs re-binned three-column ASCII files"""
+     )
+     # binning options
+     parser.add_argument("-n", "--numBins", type=int, default=50, help="Number of bins to use")
+     parser.add_argument(
+         "-q",
+         "--qMin",
+         type=float,
+         default=0.0,
+         help="Minimum Q to clip from original data",
+     )
+     parser.add_argument(
+         "-Q",
+         "--qMax",
+         type=float,
+         default=np.inf,
+         help="Maximum Q to clip from original data",
+     )
+     parser.add_argument(
+         "-e",
+         "--minE",
+         type=float,
+         default=0.01,
+         help="Minimum error is at least this times the intensity value.",
+     )
+     parser.add_argument(
+         "-s",
+         "--scaling",
+         type=str,
+         action="store",
+         default="logarithmic",
+         help="q-axis scaling for binning, can be linear or logarithmic",
+     )
+     # csv / datafile options
+     parser.add_argument(
+         "-d",
+         "--delimiter",
+         type=str,
+         action="store",
+         default=",",
+         help="Delimiter in original file. '\\t' is tab. (with quotes)",
+     )
+     parser.add_argument(
+         "-H",
+         "--headerLines",
+         type=int,
+         default=0,
+         help="Number of header lines to skip",
+     )
+     parser.add_argument(
+         "-D",
+         "--outputDelimiter",
+         type=str,
+         action="store",
+         default=None,
+         help="Delimiter in final file (defaults to input delimiter)",
+     )
+     parser.add_argument(
+         "-c",
+         "--cleanEmpty",
+         action="store_true",
+         default=True,
+         help="Removes empty bins before writing",
+     )
+     parser.add_argument(
+         "-i",
+         "--iScale",
+         type=float,
+         default=1.0,
+         help="Intensity (and error) scaled by this factor on output.",
+     )
+     # program options
+     parser.add_argument(
+         "-v",
+         "--verbose",
+         action="store_true",
+         help="Be verbose about the steps",
+     )
+     parser.add_argument(
+         "-t",
+         "--test",
+         action="store_true",
+         help="Do not save output files, test run only",
+     )
+     parser.add_argument(
+         "-N",
+         "--noBin",
+         action="store_true",
+         help="Do not bin, just input -> output (for translation and scaling)",
+     )
+     parser.add_argument(
+         "fnames",
+         nargs="*",
+         metavar="FILENAME",
+         action="store",
+         help="One or more data files to rebin",
+     )
+     # show help if no files were provided, i.e. no arguments at all
+     args = parser.parse_args()
+     if len(args.fnames):
+         return args
+     parser.print_help(sys.stderr)
+     sys.exit(1)
+
+
+ class reBin(object):
+     """All kinds of binning-related functions."""
+
+     # set defaults for file reading:
+     pandasArgs = {
+         "skipinitialspace": True,
+         "skip_blank_lines": True,
+         "engine": "python",
+         "header": None,
+     }
+     # set defaults for kwargs, in case this is not called from command line:
+     reBinArgs = {
+         "delimiter": ";",
+         "outputDelimiter": ";",
+         "headerLines": 0,
+         "fnames": "",
+         "verbose": False,
+         "qMin": -np.inf,
+         "qMax": np.inf,
+         "numBins": 100,
+         "scaling": "logarithmic",
+         "cleanEmpty": False,
+         "minE": 0.01,
+         "noBin": False,
+         "test": False,  # referenced in __init__, needed without command line
+         "iScale": 1.0,  # referenced in writeFile, needed without command line
+     }
+
+     def __init__(self, **kwargs):
+         # process defaults:
+         for kw in self.reBinArgs:
+             setattr(self, kw, self.reBinArgs[kw])
+         # process kwargs:
+         if "verbose" in kwargs:
+             self.verbose = kwargs.pop("verbose")
+         for kw in kwargs:
+             if self.verbose:
+                 print("Processing input argument {}: {}".format(kw, kwargs[kw]))
+             setattr(self, kw, kwargs[kw])
+
+         # process delimiter options
+         # decode is no longer necessary in Python 3
+         if sys.version_info <= (3, 0):
+             self.delimiter = self.delimiter.decode("string-escape")
+         if self.outputDelimiter is None:
+             self.outputDelimiter = self.delimiter
+         else:
+             if sys.version_info <= (3, 0):
+                 self.outputDelimiter = self.outputDelimiter.decode("string-escape")
+
+         self.pandasArgs.update({"delimiter": self.delimiter, "skiprows": self.headerLines})
+         # process files individually:
+         for filename in self.fnames:
+             self.readFile(filename)
+             self.validate()
+             self.defineBinEdges()
+             self.binning1D()
+             if self.cleanEmpty:
+                 # removes bins with no intensity or error
+                 self.cleanup()
+
+             if not self.test:
+                 # generate output file name
+                 ofname = self.outputFilename(filename)
+                 # write binned data to file name
+                 self.writeFile(ofname)
+
+     def cleanup(self):
+         # removes unwanted bin values
+         # cannot use lists, because:
+         # http://unspecified.wordpress.com/2009/02/12/thou-shalt-not-modify-a-list-during-iteration
+         validi = True ^ np.isnan(self.IBin)
+         validi[np.argwhere(self.binMask > 0)] = False
+         self.QBin = self.QBin[validi]
+         self.IBin = self.IBin[validi]
+         self.EBin = self.EBin[validi]
+         self.QEBin = self.QEBin[validi]
+         if self.verbose:
+             print("valid bins: {} of {}".format(validi.sum(), len(validi)))
+
+     def outputFilename(self, filename):
+         """Returns an output filename based on the input filename."""
+         of = filename.strip()
+         # split at extension
+         ob, oe = of.rsplit(".", 1)
+         # add rebin tag and reassemble
+         ofname = "{}_reBin.{}".format(ob, oe)
+         if self.verbose:
+             print("output filename: {}".format(ofname))
+         return ofname
+
+     def readFile(self, filename):
+         if self.verbose:
+             print("reading file: {} with settings: {}".format(filename, self.pandasArgs))
+         dval = pandas.read_csv(filename, **self.pandasArgs).values
+         assert isinstance(dval, np.ndarray)  # no problems reading?
+         assert size(dval, axis=1) >= 3  # Q, I and E can be extracted
+         if self.verbose:
+             print("data read: {}".format(dval))
+         self.Q = np.float32(dval[:, 0])
+         self.I = np.float32(dval[:, 1])
+         self.E = np.maximum(self.minE * self.I, np.float32(dval[:, 2]))
+         numChanged = (self.minE * self.I > dval[:, 2]).sum()
+         if self.verbose:
+             print(
+                 "Minimum uncertainty set for {} out of {} ({} %) datapoints".format(
+                     numChanged, size(self.Q), 100.0 * numChanged / size(self.Q)
+                 )
+             )
+
+     # writer modified from imp2/modules/Write1D
+     def writeFile(self, ofname, hstrs=None, append=False):
+         sep = self.outputDelimiter
+         # scale if necessary
+         iterData = itertools.zip_longest(
+             self.QBin,
+             self.IBin * float(self.iScale),
+             self.EBin * float(self.iScale),
+         )
+
+         def writeLine(filename, line=None, append=True):
+             if append:
+                 openarg = "a"
+             else:
+                 openarg = "w"
+             with open(filename, openarg) as fh:
+                 if isinstance(line, str):
+                     fh.write(line)
+                 else:
+                     # iterable object containing multiple lines
+                     fh.writelines(line)
+
+         # truncate file if it exists (i.e. discard)
+         if os.path.exists(ofname) and (not append):
+             os.remove(ofname)
+
+         # write header and data:
+         if hstrs is not None:
+             writeLine(ofname, hstrs)
+
+         # store in file
+         moreData = True
+         while moreData:
+             try:
+                 # generate formatted datastring containing column data
+                 wstr = sep.join(["{}".format(k) for k in next(iterData)]) + "\n"
+             except StopIteration:
+                 # end of data reached
+                 moreData = False
+                 break
+             writeLine(ofname, wstr)
+
+     def validate(self):
+         """Applies limits to the data."""
+         mask = zeros(shape(self.Q), dtype="bool")
+         # apply integration limits:
+         iind = np.array(((self.Q < self.qMin) + (self.Q > self.qMax)), dtype=bool)
+         mask[iind] = True
+
+         # define binning limits
+         (qmin, qmax) = (
+             np.abs(self.Q[True ^ mask]).min(),
+             np.abs(self.Q[True ^ mask]).max(),
+         )
+         self.iqMin = np.maximum(qmin, self.qMin)
+         self.iqMax = np.minimum(qmax, self.qMax)
+         self.Q = self.Q[True ^ mask]
+         self.I = self.I[True ^ mask]
+         self.E = self.E[True ^ mask]
+         if self.verbose:
+             print(
+                 "data Q-range: {}, integration Q-range: {}, masked: {} of {} ({}%)".format(
+                     (self.Q.min(), self.Q.max()),
+                     (self.iqMin, self.iqMax),
+                     mask.sum(),
+                     mask.size,
+                     100.0 * mask.sum() / mask.size,
+                 )
+             )
+
+     def defineBinEdges(self):
+         """Defines the bin edges, spaced linearly or logarithmically."""
+         if self.scaling.lower() in ("linear", "lin"):
+             qEdges = np.linspace(self.iqMin, self.iqMax, self.numBins + 1)
+         else:
+             qEdges = np.logspace(log10(self.iqMin), log10(self.iqMax), self.numBins + 1)
+         self.qEdges = qEdges
+         if self.verbose:
+             print("Bin edges used: {}".format(self.qEdges))
+
+     def binning1D(self, qError=None):
+         """An unweighted binning routine.
+         imp-version of binning, taking the q-bin edges in which binning takes
+         place, and calculating the mean q uncertainty in each bin from the
+         relative Q uncertainties provided.
+
+         The intensities are sorted into the bins. If no error is provided,
+         the standard deviation of the intensities in each bin is computed
+         instead.
+         """
+         # no binning requested, just input -> output
+         if self.noBin:
+             self.QBin = self.Q.copy()
+             self.IBin = self.I.copy()
+             self.EBin = self.E.copy()
+             self.QEBin = np.zeros(np.shape(self.I))
+             return
+
+         # set values:
+         q = self.Q.copy()
+         intensity = self.I.copy()
+         error = self.E.copy()
+         numBins = self.numBins
+         qEdges = self.qEdges
+
+         # flatten q, intensity and error
+         q = reshape(q, size(q))
+         intensity = reshape(intensity, size(intensity))
+
+         # sort q, let intensity and error follow the sort
+         sortInd = argsort(q, axis=None)
+         q = q[sortInd]
+         intensity = intensity[sortInd]
+
+         # initialise storage:
+         numBins = len(qEdges) - 1
+         ibin = zeros(numBins)
+         qbin = zeros(numBins)
+         sdbin = zeros(numBins)
+         sebin = zeros(numBins)
+         qebin = zeros(numBins)
+         binMask = zeros(numBins)  # set to one for masked bin values
+         if error is not None:
+             error = reshape(error, size(error))
+             error = error[sortInd]
+         if qError is not None:
+             qError = reshape(qError, size(qError))
+             qError = qError[sortInd]
+
+         # now we can fill the bins
+         for bini in range(numBins):
+             # limit ourselves to only the bits we're interested in:
+             limMask = (q >= qEdges[bini]) & (q <= qEdges[bini + 1])
+
+             iToBin = intensity[limMask]
+             # sum the intensities in one bin and normalize by number of pixels
+             if limMask.sum() == 0:
+                 # no pixels in bin, mark the bin values as NaN
+                 (ibin[bini], sebin[bini], qebin[bini], qbin[bini]) = (
+                     np.nan,
+                     np.nan,
+                     np.nan,
+                     np.nan,
+                 )
+                 binMask[bini] = 1
+                 continue
+
+             elif limMask.sum() == 1:
+                 ibin[bini] = iToBin.mean()
+                 qbin[bini] = q[limMask].mean()
+                 if error is not None:
+                     sebin[bini] = error[limMask][0]
+                 if qError is not None:
+                     qebin[bini] = qError[limMask][0]
+
+             else:
+                 ibin[bini] = iToBin.mean()
+                 qbin[bini] = q[limMask].mean()
+                 if error is not None:
+                     sebin[bini] = np.sqrt((error[limMask] ** 2).sum()) / limMask.sum()
+                 # now we deal with the errors:
+                 # calculate the standard deviation of the intensity in the bin
+                 # according to the definition of the sample standard deviation
+                 sdbin = iToBin.std(ddof=1)
+                 # what we want is the "standard error of the mean":
+                 sdbin = sdbin / sqrt(1.0 * np.size(iToBin))
+                 # maximum of standard error and Poisson statistics
+                 sebin[bini] = np.maximum(sebin[bini], sdbin)
+                 # qebin is the mean error of the q-values in the bin, should
+                 # probably be superseded by the bin width
+                 qe = 0.0
+                 if qError is not None:
+                     qe = np.sqrt((qError[limMask] ** 2).sum())
+                 # sample standard deviation of q in the bin:
+                 qs = np.std(q[limMask], ddof=1)
+                 qebin[bini] = np.maximum(qe, qs)
+
+         self.QBin = qbin.copy()
+         self.IBin = ibin.copy()
+         self.EBin = sebin.copy()
+         self.QEBin = qebin.copy()
+         self.binMask = binMask.copy()
+         if self.verbose:
+             print("qbin: {}".format(qbin))
+             print("ibin: {}".format(ibin))
+             print("sebin: {}".format(sebin))
+             print("qebin: {}".format(qebin))
+             print("binMask: {}".format(binMask))
+
+
+ if __name__ == "__main__":
+     # process input arguments
+     adict = argparser()
+     # transmogrify into a kwargs dict
+     adict = vars(adict)
+     # run the reBin program
+     reBin(**adict)
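
Besides the command-line entry point, reBin can be instantiated directly with keyword arguments mirroring the parser options. A minimal sketch, assuming a hypothetical three-column (Q, I, E) comma-separated file 'data.csv' with strictly positive Q values (required for logarithmic bin edges):

    # reBin is re-exported by the package __init__
    from jupyter_analysis_tools import reBin

    reBin(
        fnames=["data.csv"],
        delimiter=",",
        numBins=30,
        scaling="logarithmic",
        cleanEmpty=True,
        verbose=True,
    )
    # the result is written alongside the input as 'data_reBin.csv'
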
@@ -0,0 +1,128 @@
+ # -*- coding: utf-8 -*-
+ # datalocations.py
+
+ import glob
+ import os
+ import shutil
+ import tempfile
+ from pathlib import Path
+
+ from .utils import indent, isList
+
+
+ def getWorkDir(workDir=None, skip=False):
+     """Find a local work dir for temporary files created during analysis.
+     The default is *$HOME/data*."""
+     if skip:  # stay in the current directory if desired
+         return os.path.abspath(".")
+     if not workDir or not len(workDir):
+         workDir = Path.home() / "data"
+     else:
+         workDir = Path(workDir).resolve()
+     if not workDir.is_dir():
+         os.mkdir(workDir)
+     print("Using '{}' as working directory.".format(workDir))
+     return workDir
+
+
+ def prepareWorkDir(workDir, srcDir, useExisting=False):
+     """Create a temporary working directory and copy
+     the input data (series) to it if not already present."""
+     # the source dir has to exist
+     if not os.path.isdir(srcDir):
+         raise RuntimeError("Provided source directory '{}' not found!".format(srcDir))
+     srcDir = os.path.realpath(srcDir)
+     # no separate work dir requested?
+     if os.path.samefile(workDir, os.getcwd()):
+         print("Working in current directory '{}'.".format(os.getcwd()))
+         return srcDir  # nothing to do
+     prefix = os.path.basename(srcDir) + "_"
+     if useExisting:  # use an existing work dir, avoid copying
+         dirs = glob.glob(os.path.join(workDir, prefix + "*"))
+         if len(dirs):
+             return dirs[0]  # use the first match
+         print("No existing work dir found, creating a new one.")
+     # copy all data from the src dir to a newly created work dir
+     workDir = tempfile.mkdtemp(dir=workDir, prefix=prefix)
+     print("Copying data to {}:".format(workDir))
+     for dn in os.listdir(srcDir):
+         srcPath = os.path.join(srcDir, dn)
+         dstPath = os.path.join(workDir, dn)
+         if os.path.isdir(srcPath):
+             shutil.copytree(srcPath, dstPath)
+             print(indent, dn)
+         if os.path.isfile(srcPath):
+             shutil.copy(srcPath, dstPath)
+             print(indent, dn)
+     print("Done preparing work dir.")
+     return workDir
+
+
+ def printFileList(fnlst, numParts=2, limit=20):
+     def printlst(lst):
+         return [print(indent, fn) for fn in lst]
+
+     def shorten(lst):
+         return [os.path.join(*Path(fn).parts[-numParts:]) for fn in lst]
+
+     if len(fnlst) > limit:
+         printlst(shorten(fnlst[:3]))
+         print(indent, "[...]")
+         printlst(shorten(fnlst[-3:]))
+     else:
+         printlst(shorten(fnlst))
+
+
+ def getDataDirs(dataDir, noWorkDir=False, reuseWorkDir=True, workDir=None):
+     """Create a local work dir with a copy of the input data and for storing the results.
+     (The data might reside in synced folders, which creates massive traffic once batch
+     processing results get replaced repeatedly.)
+
+     Parameters
+     ----------
+     noWorkDir: bool
+         False: Copy input data to a new working dir (default),
+         True: otherwise, use the data where it is.
+     reuseWorkDir: bool
+         False: Create a new working dir each time,
+         True: reuse the work dir if it exists already (default).
+
+     Returns
+     -------
+     A list of absolute directory paths.
+     """
+     basedir = getWorkDir(workDir=workDir, skip=noWorkDir)
+     workDir = prepareWorkDir(basedir, dataDir, useExisting=reuseWorkDir)
+     print("Entering '{}':".format(workDir))
+     dirs = sorted([dn for dn in Path(workDir).iterdir() if dn.is_dir()])
+     dirs.append(Path(workDir))
+     # [print(os.path.join(*dn.parts[-2:])) for dn in dirs]
+     printFileList(dirs, numParts=1)
+     return dirs
+
+
+ def getDataFiles(dataDirs, include=None, exclude=None):
+     """Return absolute file paths from the given directories."""
+
+     def getFiles(dn, include=None):
+         if not include:
+             include = "*"
+         if not isList(include):
+             include = (include,)
+         return [path for inc in include for path in glob.glob(os.path.join(dn, inc))]
+
+     if not exclude:
+         exclude = ()
+     if not isList(exclude):
+         exclude = (exclude,)
+     if not isList(dataDirs):
+         dataDirs = (dataDirs,)
+
+     files = [
+         fn
+         for dn in dataDirs
+         for fn in getFiles(dn, include)
+         if not any([(ex in fn) for ex in exclude])
+     ]
+     print("{} files to be analyzed in subdirectories.".format(len(files)))
+     return sorted(files)
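
Together, these helpers stage the input data into a scratch copy and enumerate the files to process. A minimal sketch (the module path, data path and glob patterns are hypothetical):

    from jupyter_analysis_tools.datalocations import getDataDirs, getDataFiles

    # copy '/path/to/measurement' into the work dir (reusing it on re-runs),
    # then collect matching files from all subdirectories
    dirs = getDataDirs("/path/to/measurement", reuseWorkDir=True)
    files = getDataFiles(dirs, include="*.pdh", exclude="_old")
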