linregmc-0.1.1.tar.gz

linregmc-0.1.1/LICENSE ADDED
@@ -0,0 +1,22 @@
+ MIT License
+
+ Copyright (c) 2025, Pär Söderhjelm
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
linregmc-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,25 @@
+ Metadata-Version: 2.3
+ Name: linregmc
+ Version: 0.1.1
+ Summary: Linear regression with Monte Carlo error analysis
+ License: MIT
+ Author: Pär Söderhjelm
+ Requires-Python: >=3.9,<4.0
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: numpy (>=1.23,<3.0)
+ Description-Content-Type: text/markdown
+
+ # linregmc package (Linear regression with Monte Carlo error analysis)
+
+ This package is a single module which contains four functions:<br>
+ addnoise - introduce noise in the data based on known experimental precision<br>
+ linreg - perform multiple linear regressions and calculate goodness-of-fit<br>
+ confidence - get confidence intervals, plot distributions etc.<br>
+ linconf - do all the steps in succession, including possible transformation of the data<br>
+
linregmc-0.1.1/README.md ADDED
@@ -0,0 +1,7 @@
+ # linregmc package (Linear regression with Monte Carlo error analysis)
+
+ This package is a single module which contains four functions:<br>
+ addnoise - introduce noise in the data based on known experimental precision<br>
+ linreg - perform multiple linear regressions and calculate goodness-of-fit<br>
+ confidence - get confidence intervals, plot distributions etc.<br>
+ linconf - do all the steps in succession, including possible transformation of the data<br>
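
For orientation, here is a minimal sketch of how these four functions chain together. The numbers are illustrative, adapted from the docstring examples in the module source shown below; no additional API is assumed.

```python
import numpy as np
from linregmc import addnoise, linreg, confidence

x = np.array([0.1, 0.2, 0.3])
y = np.array([1.2, 2.3, 3.7])

y_mc = addnoise(y, 0.1)                      # 10000 noisy copies of y (stdev 0.1)
pp, psig, pchi2, pmc = linreg(x, y_mc)       # one fit per noisy copy
err, confint = confidence(pmc, level=0.95)   # 95% intervals for intercept and slope
```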
linregmc-0.1.1/pyproject.toml ADDED
@@ -0,0 +1,27 @@
+ [tool.poetry]
+ name = "linregmc"
+ version = "0.1.1"
+ description = "Linear regression with Monte Carlo error analysis"
+ authors = ["Pär Söderhjelm"]
+ license = "MIT"
+ readme = "README.md"
+ packages = [
+     { include = "linregmc", from = "src" }
+ ]
+
+
+ [tool.poetry.dependencies]
+ python = "^3.9"
+ numpy = ">=1.23,<3.0"
+
+ [tool.poetry.group.dev.dependencies]
+ myst-nb = "^1.3.0"
+ sphinx-autoapi = "^3.6.0"
+
+ [tool.poetry.group.docs.dependencies]
+ sphinx = "^7.0"
+ sphinx-rtd-theme = "^3.0.2"
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
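
The `packages` entry above declares a src layout, so after installation the import path is simply `linregmc`; the `__init__.py` shown further down re-exports the module's public functions. A quick check (assuming the package is installed):

```python
import linregmc
print(linregmc.linreg.__doc__.splitlines()[0])  # first line of the linreg docstring
```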
@@ -0,0 +1 @@
+ 521 cd linregmc_python2025/linregmc/src/linregmc/
linregmc-0.1.1/src/linregmc/__init__.py ADDED
@@ -0,0 +1 @@
+ from .linregmc import *
linregmc-0.1.1/src/linregmc/linregmc.py ADDED
@@ -0,0 +1,345 @@
+ import numpy as np
+ import logging
+
+ def addnoise(yinp, ysiginp, nmc=10000, distrib='normal'):
+     """Adds noise to an array of data points (or a single value).
+
+     Creates a matrix of nmc vectors with the mean values of yinp but with
+     added random noise of standard deviation ysiginp.
+     You might want to initialize the random number generator beforehand.
+
+     Parameters
+     ----------
+     yinp : array
+         data vector
+     ysiginp : array
+         standard deviation vector (same length as yinp, or a single value)
+     nmc : int, default 10000
+         number of Monte Carlo copies
+     distrib : str, default 'normal'
+         'norm'/'normal' gives a normal distribution
+         'lognorm'/'lognormal' gives a lognormal distribution (useful, for example, if negative results are unphysical)
+
+     Returns
+     -------
+     array
+         the data vector with added noise
+         (always two-dimensional, but only one column if yinp was a single value)
+
+     Examples
+     --------
+     >>> y = np.array([1.2, 2.3, 3.7])
+     >>> ysig = np.array([0.1, 0.3, 0.2])
+     >>> y1 = addnoise(y, ysig)  # different stdev for the three points, normal distribution
+     >>> y1.shape  # (10000, 3)
+     >>> y2 = addnoise(y, 0.2, 1000, distrib='lognorm')  # same stdev for the three points, lognormal distribution
+     >>> y2.shape  # (1000, 3)
+     >>> y3 = addnoise(1.5, 0.2)  # only one point
+     >>> y3.shape  # (10000, 1)
+     """
+
+     yinp = np.asarray(yinp)
+     ysiginp = np.asarray(ysiginp)
+     if np.ndim(yinp) > 1 or np.ndim(ysiginp) > 1:
+         raise ValueError('y and ysig must not have higher dimension than 1.')
+     if np.size(ysiginp) == 1:
+         ysiginp = ysiginp*np.ones(np.size(yinp))  # if ysiginp is a scalar, turn it into a vector with identical elements
+     if np.size(yinp) != np.size(ysiginp):
+         raise ValueError('y and ysig must have the same length.')
+
+     n = np.size(yinp)
+     y = yinp.reshape((1, n))
+     ysig = ysiginp.reshape((1, n))
+     if distrib.lower() in ('norm', 'normal'):
+         # broadcast the (1,n) mean and stdev rows against an (nmc,n) standard-normal draw
+         return y + ysig*np.random.standard_normal((nmc, n))
+     elif distrib.lower() in ('lognorm', 'lognormal'):
+         mu = np.log(y**2/np.sqrt(ysig**2 + y**2))  # mu of lognormal dist
+         sigma = np.sqrt(np.log(ysig**2/y**2 + 1))  # sigma of lognormal dist
+         return np.exp(np.random.standard_normal((nmc, n))*sigma + mu)
+     else:
+         raise ValueError('Distribution named "' + distrib + '" is not recognized.')
+
+
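
The `mu`/`sigma` expressions in the lognormal branch are the standard moment-matching parametrization: they choose the lognormal parameters so that the sampled distribution reproduces the requested mean and standard deviation. A quick numerical check of that identity (the values 2.0 and 0.5 are arbitrary assumptions):

```python
import numpy as np

y, ysig = 2.0, 0.5                                    # assumed mean and stdev
mu = np.log(y**2 / np.sqrt(ysig**2 + y**2))
sigma = np.sqrt(np.log(ysig**2 / y**2 + 1))
samples = np.exp(np.random.standard_normal(1_000_000) * sigma + mu)
print(samples.mean(), samples.std())                  # ~2.0 and ~0.5
```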
+ def linreg(xinp, yinp, ndeg=1, fitfunc=None, weighted=True, plot=False):
+     """Performs linear fitting ax+b=y with error analysis using a Monte Carlo approach.
+
+     Parameters
+     ----------
+     xinp : array
+         an NX x N matrix: the NX data sets of x values (N data points)
+     yinp : array
+         an NY x N matrix: the NY data sets of y values (N data points)
+         NX and NY need not be the same. In particular, one may use a
+         single data set (without added noise) for one of them.
+         The number of fits equals NM = max(NX, NY), and if there are fewer data
+         sets for one of x or y, they are just cyclically reused.
+     ndeg : int, default 1
+         the degree of the polynomial used for fitting
+         (the ordering of the obtained coefficients is from 0 to ndeg, i.e.
+         corresponds to numpy.polynomial.polynomial.polyfit)
+     fitfunc : list, optional
+         a list of functions of x that are used as basis functions instead of a polynomial
+         (ndeg and fitfunc cannot both be specified)
+     weighted : boolean, default True
+         weight the importance of each data point by 1/stdev (recommended)
+     plot : boolean, default False
+         whether to plot the chi2 distribution to visualize the "goodness-of-fit"
+
+     Returns
+     -------
+     pp : array (ndeg+1 elements)
+         single-fit value of each parameter (can be used as the result)
+     psig : array (ndeg+1 elements)
+         standard deviation of each parameter
+     pchi2 : float
+         goodness-of-fit, i.e. the probability of chi2 exceeding the observed value
+         Note: not well-defined
+     pmc : array
+         an NM x (ndeg+1) matrix: the fitted parameters for all data sets
+
+     Examples
+     --------
+     >>> x = np.array([0.1, 0.2, 0.3])
+     >>> y = np.array([1.2, 2.3, 3.7])
+     >>> y_mc = addnoise(y, 0.1)
+     >>> pp, psig, pchi2, pmc = linreg(x, y_mc)
+     >>> print(pp)  # [-0.102 12.517]
+     >>> print(pmc.shape)  # (10000, 2)
+     >>> pp, psig, pchi2, pmc = linreg(x, y_mc, fitfunc=[lambda x: np.exp(x), lambda x: 1])  # fit to y=a*e^x+b instead
+     """
+
+     if fitfunc is not None and ndeg == 1:  # assume that ndeg was not specified; there is no way to know...
+         ndeg = len(fitfunc) - 1
+     elif fitfunc is not None and ndeg != 1:
+         raise ValueError('Only one of ndeg and fitfunc can be specified.')
+     if np.ndim(xinp) == 1:
+         x = np.asarray(xinp).reshape((1, np.size(xinp)))
+     else:
+         x = xinp
+     if np.ndim(yinp) == 1:
+         y = np.asarray(yinp).reshape((1, np.size(yinp)))
+     else:
+         y = yinp
+     if np.size(x, 1) != np.size(y, 1):
+         raise ValueError('Number of columns in x and y must be equal.')
+     N = np.size(x, 1)
+     xn = np.size(x, 0)
+     yn = np.size(y, 0)
+
+     def buildmat(xx, ss=1.0):
+         if fitfunc is None:
+             return np.stack([np.ones(N)/ss] + [xx**k/ss for k in range(1, ndeg+1)], axis=1)
+         else:
+             return np.stack([(np.zeros(N) + func(xx))/ss for func in fitfunc], axis=1)
+
+     xs = np.median(x, axis=0)
+     ys = np.median(y, axis=0)  # reproduces the original data points independent of distribution
+     if weighted:
+         deltax = np.std(xs)/(N*100)  # gives deltax with the correct order of magnitude
+         pnow = np.linalg.lstsq(buildmat(xs), ys, rcond=None)[0]  # parameters if no weights are used
+         fprime = (np.matmul(buildmat(xs+deltax), pnow) - np.matmul(buildmat(xs-deltax), pnow))/(2*deltax)  # numerical derivative
+         sig = np.sqrt(np.var(y, axis=0) + fprime**2*np.var(x, axis=0))  # standard error propagation
+         # TODO: If weighting is important, the estimation of sig should be done iteratively,
+         # because fprime depends on the fit and thus on sig.
+         if xn == 1 and yn == 1:
+             sig = 1.0
+             logging.warning('Single data sets, using unweighted fit instead')
+         elif np.any(sig == 0):
+             sig = 1.0
+             logging.warning('Points with no variation encountered, using unweighted fit instead')
+     else:
+         sig = 1.0
+
+     # Perform a single fit to get the base chi2 value
+     Xt = buildmat(xs)
+     X = buildmat(xs, sig)
+     YS = ys/sig
+     pp = np.linalg.lstsq(X, YS, rcond=None)[0]
+     yfit = np.matmul(Xt, pp)  # y(xs) value according to the model
+     chi2 = np.sum((YS - np.matmul(X, pp))**2)
+
+     nmc = max(xn, yn)
+     pmc = np.zeros((nmc, ndeg+1))
+     chi2mc = np.zeros(nmc)
+     for i in range(nmc):
+         X = buildmat(x[i % xn, :], sig)
+         Y = (yfit + y[i % yn, :] - ys)/sig  # recenter the noisy data around the fitted model
+         p = np.linalg.lstsq(X, Y, rcond=None)[0]
+         pmc[i, :] = p
+         chi2mc[i] = np.sum((Y - np.matmul(X, p))**2)
+
+     psig = np.std(pmc, 0)
+     pchi2 = np.sum(chi2mc > chi2)/nmc  # fraction of MC samples having greater chi2 than the observation
+
+     if plot:
+         import matplotlib.pyplot as plt
+         fig, ax = plt.subplots(1, 1, figsize=(4, 2))
+         counts, *_ = ax.hist(chi2mc, bins=50)
+         ycent = 0.5*max(counts)
+         ax.plot([chi2, chi2], [0, ycent], 'r-')
+         ax.set_yticks([])
+         ax.set_xlabel(r"$\chi^2$")
+         plt.show()
+
+     return (pp, psig, pchi2, pmc)
+
+
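
One way to read `pchi2`: it is an empirical p-value, the fraction of Monte Carlo refits whose chi2 exceeds that of the single base fit, so values near zero suggest the model fits worse than the stated noise can explain. A sketch with invented data (not from the package docs):

```python
import numpy as np
from linregmc import addnoise, linreg

np.random.seed(0)
x = np.linspace(0.0, 1.0, 20)
y_obs = 1.0 + 2.5 * x + 0.05 * np.random.standard_normal(20)  # data consistent with a line

y_mc = addnoise(y_obs, 0.05)
pp, psig, pchi2, pmc = linreg(x, y_mc)
print(pp, psig, pchi2)  # pchi2 typically well above 0 here; near 0 would signal a poor fit
```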
+ def confidence(X, level=0.683, plot=False):
+     """Statistical analysis of the data in matrix X.
+
+     It is assumed that the number of data points is large; all properties
+     are calculated from the data itself.
+
+     Parameters
+     ----------
+     X : array
+         data matrix, with data in columns. For example, if X contains data
+         from two measurements, data for measurement 1 is in column 1
+         and measurement 2 in column 2.
+         If there is only one column, a 1d-array is also acceptable.
+     level : float, default 0.683
+         desired confidence level
+     plot : boolean, default False
+         whether to plot a histogram for each column,
+         where a general statistic is shown as a red errorbar (median +/- stdev)
+         and the confidence intervals are shown with black lines.
+         The red markers at the bottom show the simpler (median +/- err)
+         interval, which should normally coincide with the confidence interval
+         unless the distribution is skewed (in which case the confidence interval is more reliable).
+         If X has exactly two columns, a scatter plot showing possible correlation between
+         the two columns is also produced.
+
+     Returns
+     -------
+     err : array
+         estimated error in each column based on the selected confidence level
+         (a single float if the input X was a 1d-array)
+     confint : list
+         a list of tuples (low, high), the confidence interval for each input column
+         (level*100% of values are found within this interval around the median).
+         If the input X was a 1d-array, a single tuple is returned instead of a list.
+
+     Examples
+     --------
+     >>> x = np.array([0.1, 0.2, 0.3])
+     >>> y = np.array([1.2, 2.3, 3.7])
+     >>> y_mc = addnoise(y, 0.1)
+     >>> pp, psig, pchi2, pmc = linreg(x, y_mc)
+     >>> err, confint = confidence(pmc, 0.95)
+     >>> print(err)      # gives [0.296 1.364]
+     >>> print(confint)  # gives [(-0.401, 0.190), (11.16, 13.88)]
+     """
+
+     onedim = (np.ndim(X) == 1)
+
+     if onedim:  # convert to a matrix, then convert back to onedim at the end
+         X = X.reshape((np.size(X), 1))
+
+     if level <= 0 or level >= 1:
+         raise ValueError("level must satisfy 0 < level < 1.")
+
+     if np.size(X, 1) > np.size(X, 0):
+         logging.warning("It appears that your data is not placed column-wise.")
+
+     N = np.size(X, 0)  # number of data points
+     n = np.size(X, 1)  # number of dimensions (columns)
+
+     # GUM recommendation. ncut is the complement to the confidence level,
+     # i.e. the 1-level fraction of points:
+     # ncut = floor((N - floor(level*N+0.5) + 1)/2)
+
+     median = np.median(X, 0)
+     sig = np.std(X, 0)
+     absdiff = abs(X - np.mean(X, 0))  # absolute difference to the mean value
+     plow = np.zeros(n)
+     phigh = np.zeros(n)
+     err = np.zeros(n)
+     for j in range(n):
+         tmp = np.sort(X[:, j])
+         plow[j] = tmp[round(max(1, 0.5*(1-level)*N)) - 1]           # lower (1-level)/2 quantile
+         phigh[j] = tmp[round(min(N, (1 - 0.5*(1-level))*N)) - 1]    # upper 1-(1-level)/2 quantile
+         tmp = np.sort(absdiff[:, j])
+         err[j] = tmp[round(min(N, level*N)) - 1]
+
+     if plot:
+         import matplotlib.pyplot as plt
+         import matplotlib.gridspec as gridspec
+         nvar = np.size(X, 1)
+         if nvar == 2:  # exactly two parameters, so produce a scatter plot and histograms
+             fig = plt.figure(figsize=(8, 4.8))
+             gs = gridspec.GridSpec(2, 2, width_ratios=[1.5, 1], height_ratios=[1, 1])
+             # Left square spans both rows
+             ax_left = fig.add_subplot(gs[:, 0])
+             axes = [fig.add_subplot(gs[0, 1]), fig.add_subplot(gs[1, 1])]
+             ax_left.set_aspect('equal')
+             ax_left.scatter(X[:, 0], X[:, 1], s=0.1)
+             ax_left.set_xlabel(r'$c_0$')
+             ax_left.set_ylabel(r'$c_1$')
+             ax_left.plot([plow[0], plow[0]], [np.min(X[:, 1]), np.max(X[:, 1])], 'k--')
+             ax_left.plot([phigh[0], phigh[0]], [np.min(X[:, 1]), np.max(X[:, 1])], 'k--')
+             ax_left.plot([np.min(X[:, 0]), np.max(X[:, 0])], [plow[1], plow[1]], 'k--')
+             ax_left.plot([np.min(X[:, 0]), np.max(X[:, 0])], [phigh[1], phigh[1]], 'k--')
+             ax_left.set_aspect(1.0/ax_left.get_data_ratio(), adjustable='box')
+         else:  # only produce histograms
+             fig, axes = plt.subplots(nrows=nvar, ncols=1, figsize=(4, 2*nvar))
+             if nvar == 1: axes = [axes]  # plt.subplots returns a bare Axes for a single subplot; wrap it so axes is always a list
+
+         for i, ax in enumerate(axes):
+             counts, *_ = ax.hist(X[:, i], bins=50)
+             ycent = 0.5*max(counts)
+             ax.errorbar(median[i], ycent, xerr=sig[i], fmt='ro', capsize=5)
+             ax.plot([plow[i], plow[i]], [0, 0.8*ycent], 'k--')
+             ax.plot([phigh[i], phigh[i]], [0, 0.8*ycent], 'k--')
+             ax.plot([median[i]-err[i], median[i]-err[i]], [0, 0.1*ycent], 'r-')
+             ax.plot([median[i]+err[i], median[i]+err[i]], [0, 0.1*ycent], 'r-')
+             ax.set_xlabel(r'$c_{%d}$' % i)  # name the variables c0, c1, ...
+             ax.set_yticks([])
+
+         plt.tight_layout()
+         plt.show()
+
+     if onedim:
+         return (err[0], (plow[0], phigh[0]))  # simply return scalars
+     else:
+         return (err, list(zip(plow, phigh)))
+
+
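
As a sanity check on `confidence`, for normally distributed samples the 68.3% error estimate should approach the sample standard deviation, and the interval should bracket the median accordingly. A sketch with assumed numbers:

```python
import numpy as np
from linregmc import confidence

np.random.seed(1)
X = 3.0 + 0.2 * np.random.standard_normal(100_000)  # 1d input -> scalar err, single tuple
err, confint = confidence(X, level=0.683)
print(err)      # ~0.2
print(confint)  # ~(2.8, 3.2)
```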
+ def linconf(xinp, yinp, ysig, nmc=10000, distrib='normal', level=0.683, ytransform=None, restransform=None):
+     """Performs the full Monte Carlo linear regression with confidence calculation.
+
+     This is done by applying the following 5 steps in succession:
+     - addnoise to y values
+     - transform y values (skipped if ytransform is None)
+     - linreg(x, y)
+     - calculate a tuple of results from a, b (skipped if restransform is None)
+     - confidence for each result
+
+     For a detailed description of the parameters, see the previous functions.
+     Returns (reslist, pchi2), where reslist is a list of (result, error, confidenceinterval) for each calculated result.
+     """
+
+     ymc = addnoise(yinp, ysig, nmc, distrib)
+     if ytransform is not None:
+         ymc = ytransform(ymc)
+     pp, psig, pchi2, pmc = linreg(xinp, ymc)
+     if restransform is not None:
+         results = restransform(pp[0], pp[1])
+         results_mc = restransform(pmc[:, 0], pmc[:, 1])
+     else:
+         results = (pp[0], pp[1])
+         results_mc = (pmc[:, 0], pmc[:, 1])
+     rlist = []
+     for r, rmc in zip(results, results_mc):
+         perr, confint = confidence(rmc, level)
+         rlist.append((r, perr, confint))
+     return (rlist, pchi2)
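
To illustrate the two transform hooks together, here is a hypothetical end-to-end sketch (data and constants are invented): an exponential decay y = A*exp(-k*x) is linearized with `ytransform=np.log`, and `restransform` maps the fitted line coefficients (c0, c1) back to (A, k). The lognormal noise model keeps the samples positive before the log.

```python
import numpy as np
from linregmc import linconf

x = np.linspace(0.1, 2.0, 10)
y = 5.0 * np.exp(-1.3 * x)        # invented "measured" decay
ysig = 0.05 * y                   # assumed 5% relative precision

# ln(y) = ln(A) - k*x, so recover (A, k) from the line parameters (c0, c1).
reslist, pchi2 = linconf(x, y, ysig,
                         distrib='lognorm',
                         ytransform=np.log,
                         restransform=lambda c0, c1: (np.exp(c0), -c1))
for result, error, ci in reslist:
    print(result, error, ci)      # A then k, each with error and confidence interval
print('goodness-of-fit:', pchi2)
```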