jupyter_analysis_tools-1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
+ # -*- coding: utf-8 -*-
+ # __init__.py
+
+ __version__ = "1.7.0"
+
+ from .binning import reBin
+ from .git import checkRepo, isNBstripoutActivated, isNBstripoutInstalled, isRepo
+ from .plotting import createFigure, plotPDH
+ from .readdata import readdata, readPDH, readPDHmeta, readSSF, readSSFZ
+ from .utils import naturalKey, setLocaleUTF8
+ from .widgets import PathSelector, showBoolStatus
+
+ setLocaleUTF8()
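
The initializer re-exports the package's public helpers, so downstream notebooks can import everything from the top-level namespace. A minimal sketch of that import style (assuming the installed module is named `jupyter_analysis_tools` and that `naturalKey` produces natural-order sort keys, as its name suggests):

    # import helpers re-exported by __init__.py
    from jupyter_analysis_tools import naturalKey, reBin

    # natural-order sorting: 'run2' sorts before 'run10'
    print(sorted(["run10.dat", "run2.dat"], key=naturalKey))
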
@@ -0,0 +1,47 @@
+ # -*- coding: utf-8 -*-
+ # analysis.py
+
+ import numpy as np
+
+
+ # from https://stackoverflow.com/a/22357811
+ # and https://github.com/joferkington/oost_paper_code/blob/master/utilities.py#L167
+ # (code with MIT License)
+ def getModZScore(points):
+     """
+     Returns the modified z-score of each observation, based on the median
+     absolute deviation (MAD). Compare the result against a threshold
+     (classically 3.5 on the 0.6745-scaled score) to obtain a boolean
+     outlier mask.
+     **Note**:
+     Similar to https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.zscore.html
+     but uses the median instead of the mean.
+
+     :param points: A numobservations by numdimensions array of observations.
+
+     Returns
+     -------
+     score: numpy array
+         A numobservations-length array of modified z-scores.
+
+     References
+     ----------
+     Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
+     Handle Outliers", The ASQC Basic References in Quality Control:
+     Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
+     """
+     if len(points.shape) == 1:
+         points = points[:, None]
+     median = np.median(points, axis=0)
+     diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
+     med_abs_deviation = np.median(diff)
+
+     # the classic modified z-score scales by 0.6745, the standard normal
+     # quantile function at 0.75, see also:
+     # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.median_abs_deviation.html
+     # modified_z_score = 0.6745 * diff / med_abs_deviation
+     # omitting that factor makes this indicator =1 at the MAD itself,
+     # which is more intuitive to read
+     modified_z_score = diff / med_abs_deviation
+
+     return modified_z_score
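
Since getModZScore returns the raw diff/MAD ratio rather than a boolean mask, callers apply their own cutoff. A minimal sketch (the sample data are hypothetical; 3.5 is the Iglewicz-Hoaglin cutoff for the 0.6745-scaled score):

    import numpy as np

    # assuming the module is importable as jupyter_analysis_tools.analysis
    from jupyter_analysis_tools.analysis import getModZScore

    data = np.array([1.0, 1.1, 0.9, 1.05, 0.95, 8.0])
    score = getModZScore(data)
    # rescale by 0.6745 to compare against the classic 3.5 cutoff
    outliers = 0.6745 * score > 3.5
    print(data[outliers])  # [8.]
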
@@ -0,0 +1,443 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # binning.py
+
+ """
+ Overview
+ ========
+ 1D rebinning.
+ Takes an input file, reads, rebins and writes the result.
+ Rebins to logarithmic bins by default.
+ """
+
+ # __author__ = "Brian R. Pauw"
+ # __contact__ = "brian@stack.nl"
+ # __license__ = "GPLv3+"
+ # __date__ = "2015/01/09"
+ # __status__ = "beta"
+
+ import argparse
+ import itertools
+ import os
+ import sys
+
+ import numpy as np
+ import pandas
+ from numpy import argsort, log10, reshape, shape, size, sqrt, zeros
+
+
+ def argparser():
+     parser = argparse.ArgumentParser(
+         description="""
+         Re-binning function, reads three-column ASCII input files,
+         and outputs re-binned three-column ASCII files"""
+     )
+     # binning options
+     parser.add_argument("-n", "--numBins", type=int, default=50, help="Number of bins to use")
+     parser.add_argument(
+         "-q",
+         "--qMin",
+         type=float,
+         default=0.0,
+         help="Minimum Q to clip from original data",
+     )
+     parser.add_argument(
+         "-Q",
+         "--qMax",
+         type=float,
+         default=np.inf,
+         help="Maximum Q to clip from original data",
+     )
+     parser.add_argument(
+         "-e",
+         "--minE",
+         type=float,
+         default=0.01,
+         help="Minimum error is at least this times the intensity value.",
+     )
+     parser.add_argument(
+         "-s",
+         "--scaling",
+         type=str,
+         action="store",
+         default="logarithmic",
+         help="q-axis scaling for binning, can be linear or logarithmic",
+     )
+     # csv / datafile options
+     parser.add_argument(
+         "-d",
+         "--delimiter",
+         type=str,
+         action="store",
+         default=",",
+         help="Delimiter in original file. '\\t' is tab. (with quotes)",
+     )
+     parser.add_argument(
+         "-H",
+         "--headerLines",
+         type=int,
+         default=0,
+         help="Number of header lines to skip",
+     )
+     parser.add_argument(
+         "-D",
+         "--outputDelimiter",
+         type=str,
+         action="store",
+         default=None,
+         help="Delimiter in final file (defaults to input delimiter)",
+     )
+     parser.add_argument(
+         "-c",
+         "--cleanEmpty",
+         action="store_true",
+         default=True,
+         help="Removes empty bins before writing",
+     )
+     parser.add_argument(
+         "-i",
+         "--iScale",
+         type=float,
+         default=1.0,
+         help="Intensity (and error) scaled by this factor on output.",
+     )
+     # program options
+     parser.add_argument(
+         "-v",
+         "--verbose",
+         action="store_true",
+         help="Be verbose about the steps",
+     )
+     parser.add_argument(
+         "-t",
+         "--test",
+         action="store_true",
+         help="Do not save output files, test run only",
+     )
+     parser.add_argument(
+         "-N",
+         "--noBin",
+         action="store_true",
+         help="Do not bin, just input -> output (for translation and scaling)",
+     )
+     parser.add_argument(
+         "fnames",
+         nargs="*",
+         metavar="FILENAME",
+         action="store",
+         help="One or more data files to rebin",
+     )
+     # show help if no files were provided, i.e. no arguments at all
+     args = parser.parse_args()
+     if len(args.fnames):
+         return args
+     parser.print_help(sys.stderr)
+     sys.exit(1)
+
+
+ class reBin(object):
+     """All kinds of binning-related functions."""
+
+     # set defaults for file reading:
+     pandasArgs = {
+         "skipinitialspace": True,
+         "skip_blank_lines": True,
+         "engine": "python",
+         "header": None,
+     }
+     # set defaults for kwargs, in case this is not called from command line:
+     reBinArgs = {
+         "delimiter": ";",
+         "outputDelimiter": ";",
+         "headerLines": 0,
+         "fnames": "",
+         "verbose": False,
+         "qMin": -np.inf,
+         "qMax": np.inf,
+         "numBins": 100,
+         "scaling": "logarithmic",
+         "cleanEmpty": False,
+         "minE": 0.01,
+         "noBin": False,
+         "test": False,  # referenced in __init__, needed without command line
+         "iScale": 1.0,  # referenced in writeFile, needed without command line
+     }
+
+     def __init__(self, **kwargs):
+         # process defaults:
+         for kw in self.reBinArgs:
+             setattr(self, kw, self.reBinArgs[kw])
+         # process kwargs:
+         if "verbose" in kwargs:
+             self.verbose = kwargs.pop("verbose")
+         for kw in kwargs:
+             if self.verbose:
+                 print("Processing input argument {}: {}".format(kw, kwargs[kw]))
+             setattr(self, kw, kwargs[kw])
+
+         # process delimiter options
+         # decode is no longer necessary in Python 3
+         if sys.version_info <= (3, 0):
+             self.delimiter = self.delimiter.decode("string-escape")
+         if self.outputDelimiter is None:
+             self.outputDelimiter = self.delimiter
+         else:
+             if sys.version_info <= (3, 0):
+                 self.outputDelimiter = self.outputDelimiter.decode("string-escape")
+
+         self.pandasArgs.update({"delimiter": self.delimiter, "skiprows": self.headerLines})
+         # process files individually:
+         for filename in self.fnames:
+             self.readFile(filename)
+             self.validate()
+             self.defineBinEdges()
+             self.binning1D()
+             if self.cleanEmpty:
+                 # removes bins with no intensity or error
+                 self.cleanup()
+
+             if not self.test:
+                 # generate output file name
+                 ofname = self.outputFilename(filename)
+                 # write binned data to file name
+                 self.writeFile(ofname)
+
+     def cleanup(self):
+         # removes unwanted bin values
+         # cannot use lists, because:
+         # http://unspecified.wordpress.com/2009/02/12/thou-shalt-not-modify-a-list-during-iteration
+         validi = True ^ np.isnan(self.IBin)
+         validi[np.argwhere(self.binMask > 0)] = False
+         self.QBin = self.QBin[validi]
+         self.IBin = self.IBin[validi]
+         self.EBin = self.EBin[validi]
+         self.QEBin = self.QEBin[validi]
+         if self.verbose:
+             print("valid bins: {} of {}".format(validi.sum(), len(validi)))
+
+     def outputFilename(self, filename):
+         """Returns an output filename based on the input filename."""
+         of = filename.strip()
+         # split at extension
+         ob, oe = of.rsplit(".", 1)
+         # add rebin tag and reassemble
+         ofname = "{}_reBin.{}".format(ob, oe)
+         if self.verbose:
+             print("output filename: {}".format(ofname))
+         return ofname
+
+     def readFile(self, filename):
+         if self.verbose:
+             print("reading file: {} with settings: {}".format(filename, self.pandasArgs))
+         dval = pandas.read_csv(filename, **self.pandasArgs).values
+         assert isinstance(dval, np.ndarray)  # no problems reading?
+         assert size(dval, axis=1) >= 3  # Q, I and E can be extracted
+         if self.verbose:
+             print("data read: {}".format(dval))
+         self.Q = np.float32(dval[:, 0])
+         self.I = np.float32(dval[:, 1])
+         self.E = np.maximum(self.minE * self.I, np.float32(dval[:, 2]))
+         numChanged = (self.minE * self.I > dval[:, 2]).sum()
+         if self.verbose:
+             print(
+                 "Minimum uncertainty set for {} out of {} ({} %) datapoints".format(
+                     numChanged, size(self.Q), 100.0 * numChanged / size(self.Q)
+                 )
+             )
+
+     # writer modified from imp2/modules/Write1D
+     def writeFile(self, ofname, hstrs=None, append=False):
+         sep = self.outputDelimiter
+         # scale if necessary
+         iterData = itertools.zip_longest(
+             self.QBin,
+             self.IBin * float(self.iScale),
+             self.EBin * float(self.iScale),
+         )
+
+         def writeLine(filename, line=None, append=True):
+             if append:
+                 openarg = "a"
+             else:
+                 openarg = "w"
+             with open(filename, openarg) as fh:
+                 if isinstance(line, str):
+                     fh.write(line)
+                 else:
+                     # iterable object containing multiple lines
+                     fh.writelines(line)
+
+         # truncate file if it exists (i.e. discard)
+         if os.path.exists(ofname) and (not append):
+             os.remove(ofname)
+
+         # write header and data:
+         if hstrs is not None:
+             writeLine(ofname, hstrs)
+
+         # store in file
+         moreData = True
+         while moreData:
+             try:
+                 # generate formatted datastring containing column data
+                 wstr = sep.join(["{}".format(k) for k in next(iterData)]) + "\n"
+             except StopIteration:
+                 # end of data reached
+                 moreData = False
+                 break
+             writeLine(ofname, wstr)
+
+     def validate(self):
+         """Applies limits to the data."""
+         mask = zeros(shape(self.Q), dtype="bool")
+         # apply integration limits:
+         iind = np.array(((self.Q < self.qMin) + (self.Q > self.qMax)), dtype=bool)
+         mask[iind] = True
+
+         # define binning limits
+         (qmin, qmax) = (
+             np.abs(self.Q[True ^ mask]).min(),
+             np.abs(self.Q[True ^ mask]).max(),
+         )
+         self.iqMin = np.maximum(qmin, self.qMin)
+         self.iqMax = np.minimum(qmax, self.qMax)
+         self.Q = self.Q[True ^ mask]
+         self.I = self.I[True ^ mask]
+         self.E = self.E[True ^ mask]
+         if self.verbose:
+             print(
+                 "data Q-range: {}, integration Q-range: {}, masked: {} of {} ({}%)".format(
+                     (self.Q.min(), self.Q.max()),
+                     (self.iqMin, self.iqMax),
+                     mask.sum(),
+                     mask.size,
+                     100.0 * mask.sum() / mask.size,
+                 )
+             )
+
+     def defineBinEdges(self):
+         """Defines the bin edges, spaced linearly or logarithmically."""
+         if self.scaling.lower() in ("linear", "lin"):
+             qEdges = np.linspace(self.iqMin, self.iqMax, self.numBins + 1)
+         else:
+             qEdges = np.logspace(log10(self.iqMin), log10(self.iqMax), self.numBins + 1)
+         self.qEdges = qEdges
+         if self.verbose:
+             print("Bin edges used: {}".format(self.qEdges))
+
+     def binning1D(self, qError=None):
+         """An unweighted binning routine.
+         imp-version of binning, taking the q-bin edges in which binning takes
+         place, and calculating the mean q uncertainty in each bin from the
+         relative Q uncertainties provided.
+
+         The intensities are sorted into the bins. If no error is provided,
+         the standard deviation of the intensities in each bin is computed
+         instead.
+         """
+         # no binning requested, just input -> output
+         if self.noBin:
+             self.QBin = self.Q.copy()
+             self.IBin = self.I.copy()
+             self.EBin = self.E.copy()
+             self.QEBin = np.zeros(np.shape(self.I))
+             return
+
+         # set values:
+         q = self.Q.copy()
+         intensity = self.I.copy()
+         error = self.E.copy()
+         numBins = self.numBins
+         qEdges = self.qEdges
+
+         # flatten q, intensity and error
+         q = reshape(q, size(q))
+         intensity = reshape(intensity, size(intensity))
+
+         # sort q, let intensity and error follow the sort
+         sortInd = argsort(q, axis=None)
+         q = q[sortInd]
+         intensity = intensity[sortInd]
+
+         # initialise storage:
+         numBins = len(qEdges) - 1
+         ibin = zeros(numBins)
+         qbin = zeros(numBins)
+         sdbin = zeros(numBins)
+         sebin = zeros(numBins)
+         qebin = zeros(numBins)
+         binMask = zeros(numBins)  # set to one for masked bin values
+         if error is not None:
+             error = reshape(error, size(error))
+             error = error[sortInd]
+         if qError is not None:
+             qError = reshape(qError, size(qError))
+             qError = qError[sortInd]
+
+         # now we can fill the bins
+         for bini in range(numBins):
+             # limit ourselves to only the bits we're interested in:
+             limMask = (q >= qEdges[bini]) & (q <= qEdges[bini + 1])
+
+             iToBin = intensity[limMask]
+             # sum the intensities in one bin and normalize by number of pixels
+             if limMask.sum() == 0:
+                 # no pixels in bin, mark the bin values as NaN
+                 (ibin[bini], sebin[bini], qebin[bini], qbin[bini]) = (
+                     np.nan,
+                     np.nan,
+                     np.nan,
+                     np.nan,
+                 )
+                 binMask[bini] = 1
+                 continue
+
+             elif limMask.sum() == 1:
+                 ibin[bini] = iToBin.mean()
+                 qbin[bini] = q[limMask].mean()
+                 if error is not None:
+                     sebin[bini] = error[limMask][0]
+                 if qError is not None:
+                     qebin[bini] = qError[limMask][0]
+
+             else:
+                 ibin[bini] = iToBin.mean()
+                 qbin[bini] = q[limMask].mean()
+                 if error is not None:
+                     sebin[bini] = np.sqrt((error[limMask] ** 2).sum()) / limMask.sum()
+                 # now we deal with the errors:
+                 # calculate the standard deviation of the intensity in the bin
+                 # according to the definition of the sample standard deviation
+                 sdbin = iToBin.std(ddof=1)
+                 # what we want is the "standard error of the mean":
+                 sdbin = sdbin / sqrt(1.0 * np.size(iToBin))
+                 # maximum of standard error and Poisson statistics
+                 sebin[bini] = np.maximum(sebin[bini], sdbin)
+                 # qebin is the mean error of the q-values in the bin, should
+                 # probably be superseded by the bin width
+                 qe = 0.0
+                 if qError is not None:
+                     qe = np.sqrt((qError[limMask] ** 2).sum())
+                 # sample standard deviation of q in the bin:
+                 qs = np.std(q[limMask], ddof=1)
+                 qebin[bini] = np.maximum(qe, qs)
+
+         self.QBin = qbin.copy()
+         self.IBin = ibin.copy()
+         self.EBin = sebin.copy()
+         self.QEBin = qebin.copy()
+         self.binMask = binMask.copy()
+         if self.verbose:
+             print("qbin: {}".format(qbin))
+             print("ibin: {}".format(ibin))
+             print("sebin: {}".format(sebin))
+             print("qebin: {}".format(qebin))
+             print("binMask: {}".format(binMask))
+
+
+ if __name__ == "__main__":
+     # process input arguments
+     adict = argparser()
+     # transmogrify into a kwargs dict
+     adict = vars(adict)
+     # run the reBin program
+     reBin(**adict)
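
Besides the command-line entry point, reBin can be instantiated directly with keyword arguments mirroring the parser options. A minimal sketch, assuming a hypothetical three-column (Q, I, E) comma-separated file 'data.csv' with strictly positive Q values (required for logarithmic bin edges):

    # reBin is re-exported by the package __init__
    from jupyter_analysis_tools import reBin

    reBin(
        fnames=["data.csv"],
        delimiter=",",
        numBins=30,
        scaling="logarithmic",
        cleanEmpty=True,
        verbose=True,
    )
    # the result is written alongside the input as 'data_reBin.csv'
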
@@ -0,0 +1,128 @@
+ # -*- coding: utf-8 -*-
+ # datalocations.py
+
+ import glob
+ import os
+ import shutil
+ import tempfile
+ from pathlib import Path
+
+ from .utils import indent, isList
+
+
+ def getWorkDir(workDir=None, skip=False):
+     """Find a local work dir for temporary files created during analysis.
+     The default is *$HOME/data*."""
+     if skip:  # stay in the current directory if desired
+         return os.path.abspath(".")
+     if not workDir or not len(workDir):
+         workDir = Path.home() / "data"
+     else:
+         workDir = Path(workDir).resolve()
+     if not workDir.is_dir():
+         os.mkdir(workDir)
+     print("Using '{}' as working directory.".format(workDir))
+     return workDir
+
+
+ def prepareWorkDir(workDir, srcDir, useExisting=False):
+     """Create a temporary working directory and copy
+     the input data (series) to it if not already present."""
+     # the source dir has to exist
+     if not os.path.isdir(srcDir):
+         raise RuntimeError("Provided source directory '{}' not found!".format(srcDir))
+     srcDir = os.path.realpath(srcDir)
+     # no separate work dir requested?
+     if os.path.samefile(workDir, os.getcwd()):
+         print("Working in current directory '{}'.".format(os.getcwd()))
+         return srcDir  # nothing to do
+     prefix = os.path.basename(srcDir) + "_"
+     if useExisting:  # use an existing work dir, avoid copying
+         dirs = glob.glob(os.path.join(workDir, prefix + "*"))
+         if len(dirs):
+             return dirs[0]  # use the first match
+         print("No existing work dir found, creating a new one.")
+     # copy all data from the src dir to a newly created work dir
+     workDir = tempfile.mkdtemp(dir=workDir, prefix=prefix)
+     print("Copying data to {}:".format(workDir))
+     for dn in os.listdir(srcDir):
+         srcPath = os.path.join(srcDir, dn)
+         dstPath = os.path.join(workDir, dn)
+         if os.path.isdir(srcPath):
+             shutil.copytree(srcPath, dstPath)
+             print(indent, dn)
+         if os.path.isfile(srcPath):
+             shutil.copy(srcPath, dstPath)
+             print(indent, dn)
+     print("Done preparing work dir.")
+     return workDir
+
+
+ def printFileList(fnlst, numParts=2, limit=20):
+     def printlst(lst):
+         return [print(indent, fn) for fn in lst]
+
+     def shorten(lst):
+         return [os.path.join(*Path(fn).parts[-numParts:]) for fn in lst]
+
+     if len(fnlst) > limit:
+         printlst(shorten(fnlst[:3]))
+         print(indent, "[...]")
+         printlst(shorten(fnlst[-3:]))
+     else:
+         printlst(shorten(fnlst))
+
+
+ def getDataDirs(dataDir, noWorkDir=False, reuseWorkDir=True, workDir=None):
+     """Create a local work dir with a copy of the input data and for storing the results.
+     (The data might reside in synced folders, which creates massive traffic once batch
+     processing results get replaced repeatedly.)
+
+     Parameters
+     ----------
+     noWorkDir: bool
+         False: Copy input data to a new working dir (default),
+         True: otherwise, use the data where it is.
+     reuseWorkDir: bool
+         False: Create a new working dir each time,
+         True: reuse the work dir if it exists already (default).
+
+     Returns
+     -------
+     A list of absolute directory paths.
+     """
+     basedir = getWorkDir(workDir=workDir, skip=noWorkDir)
+     workDir = prepareWorkDir(basedir, dataDir, useExisting=reuseWorkDir)
+     print("Entering '{}':".format(workDir))
+     dirs = sorted([dn for dn in Path(workDir).iterdir() if dn.is_dir()])
+     dirs.append(Path(workDir))
+     # [print(os.path.join(*dn.parts[-2:])) for dn in dirs]
+     printFileList(dirs, numParts=1)
+     return dirs
+
+
+ def getDataFiles(dataDirs, include=None, exclude=None):
+     """Return absolute file paths from the given directories."""
+
+     def getFiles(dn, include=None):
+         if not include:
+             include = "*"
+         if not isList(include):
+             include = (include,)
+         return [path for inc in include for path in glob.glob(os.path.join(dn, inc))]
+
+     if not exclude:
+         exclude = ()
+     if not isList(exclude):
+         exclude = (exclude,)
+     if not isList(dataDirs):
+         dataDirs = (dataDirs,)
+
+     files = [
+         fn
+         for dn in dataDirs
+         for fn in getFiles(dn, include)
+         if not any([(ex in fn) for ex in exclude])
+     ]
+     print("{} files to be analyzed in subdirectories.".format(len(files)))
+     return sorted(files)
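
Together, these helpers stage the input data into a scratch copy and enumerate the files to process. A minimal sketch (the module path, data path and glob patterns are hypothetical):

    from jupyter_analysis_tools.datalocations import getDataDirs, getDataFiles

    # copy '/path/to/measurement' into the work dir (reusing it on re-runs),
    # then collect matching files from all subdirectories
    dirs = getDataDirs("/path/to/measurement", reuseWorkDir=True)
    files = getDataFiles(dirs, include="*.pdh", exclude="_old")
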