poseigen-binmeths 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .poseigen_binmeths import *
|
|
@@ -0,0 +1,893 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import math
|
|
4
|
+
import os
|
|
5
|
+
import itertools
|
|
6
|
+
|
|
7
|
+
from collections import Counter
|
|
8
|
+
|
|
9
|
+
import poseigen_seaside.basics as se
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def BinParser(inp, uni, categorical = False, multi = False):
    """Find, for every bin in ``uni``, the positions of ``inp`` that belong to it.

    Parameters
    ----------
    inp : array, or an iterable of arrays when ``multi`` is not ``False``.
    uni : iterable of bins. Each bin is a label (categorical mode) or an
        indexable pair ``(low, high)``; both ends of the pair are inclusive.
    categorical : when truthy, membership is tested with ``==`` instead of
        the range test.
    multi : when ``False`` ``inp`` is treated as one array.

    Returns
    -------
    A list holding one index array per bin (the first element of
    ``np.where``). With ``multi`` set, a list of such lists, one per input.
    """

    def _members(values):
        # One index array per bin, in the order the bins appear in ``uni``.
        if categorical:
            return [np.where(values == label)[0] for label in uni]
        return [np.where(np.logical_and(values >= edge[0], values <= edge[1]))[0]
                for edge in uni]

    single = multi is False
    batches = [inp] if single else inp

    parsed = [_members(values) for values in batches]

    return parsed[0] if single else parsed
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def Binner(inp, uni = None):
    """Replace every value of ``inp`` by the index of the bin of ``uni`` it falls in.

    This is Binner_v2 of the original binstuff. It accepts input of any
    shape: the data is flattened, binned, and reshaped back.

    Parameters
    ----------
    inp : array-like of numbers or of strings.
    uni : the bins, as accepted by ``BinParser``: labels for string data,
        inclusive ``(low, high)`` pairs otherwise. Although it defaults to
        ``None`` it has to be supplied, because ``BinParser`` iterates it.

    Returns
    -------
    Integer array with the shape of ``inp``. Values that match no bin keep
    the initial value 0, and a value matching several bins gets the index of
    the last one (later bins overwrite earlier ones).
    """
    inp = np.asarray(inp)
    inpy = inp.reshape(-1)

    # Look at a flattened element: ``inp[0]`` is a whole row for multi-dim
    # input, so string arrays with ndim > 1 were never seen as categorical.
    categorical = inpy.size > 0 and isinstance(inpy[0], str)

    par = BinParser(inpy, uni, categorical = categorical)

    inpx = np.zeros(len(inpy), dtype = int)
    for ip, p in enumerate(par):
        inpx[p] = ip

    return inpx.reshape(inp.shape)
|
|
46
|
+
|
|
47
|
+
def HistUni(inp, bins = 10):
    """Build ``bins`` equal-width bins covering the range of ``inp``.

    ``bins + 1`` evenly spaced edges are laid out from the minimum to the
    maximum of ``inp`` and handed to ``se.ListWindower`` with ``win_size = 2``
    (presumably this pairs consecutive edges into (low, high) windows --
    confirm against poseigen_seaside).
    """
    values = np.array(inp)
    lowest, highest = values.min(), values.max()
    edges = np.linspace(lowest, highest, bins + 1)
    return se.ListWindower(edges, win_size = 2)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def HistBinning(inp, bins = 10):
    """Bin ``inp`` with the equal-width bins produced by ``HistUni``.

    Returns what ``Binner`` returns for those bins: an integer array of bin
    indices shaped like ``inp``.
    """
    return Binner(inp, uni = HistUni(inp, bins = bins))
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def HistWeights(x, bins = 1000, recip = False, interp = False):
    """Weight each value of ``x`` by the relative frequency of its histogram bin.

    Not the same as in the original implementation.

    Parameters
    ----------
    x : array-like of numbers.
    bins : number of equal-width histogram bins.
    recip : when truthy the weights are inverted (``1 / w``) and then
        normalised to sum to one.
    interp : when truthy the weight is linearly interpolated between the bin
        frequencies instead of looked up per bin.

    Returns
    -------
    Array of weights. It has the shape of ``x`` in the default mode and is
    flat (one entry per element of ``x``) when ``interp`` is set.
    """
    x = np.array(x)
    bin_of_x = HistBinning(x, bins = bins)

    counts = np.histogram(x.reshape(-1), bins = bins)[0]
    freqs = counts / counts.sum()

    if not interp:
        weis = np.zeros(x.shape)
        for bin_id, freq in enumerate(freqs):
            weis[bin_of_x == bin_id] = freq
    else:
        x_mi, x_ma = se.MinMax(x)
        # One grid point per bin, spread evenly over the data range.
        grid = np.linspace(x_mi, x_ma, len(freqs))
        weis = np.interp(x.reshape(-1), grid, freqs)

    if recip:
        inverted = 1 / weis
        weis = inverted / inverted.sum()

    return weis
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def Harpoon(bind, select,
            multi = False,
            repeat = 1, custidx = None):
    """Draw ``select`` observations spread as evenly as possible over the bins.

    ``bind`` is already binned; no binning happens here.

    Parameters
    ----------
    bind : 1-D array of bin labels, or with ``multi`` a 2-D array laid out
        as [length, type] in which every distinct row is one bin.
    select : number of observations to draw; a value below 1 is read as a
        fraction of ``len(bind)``.
    multi : treat rows of ``bind`` as the bin identity.
    repeat : how many independent draws to make with the same per-bin quota.
    custidx : optional custom indices; when given, the returned values are
        taken from it instead of being positions in ``bind``.

    Returns
    -------
    1-D index array when ``repeat == 1``, otherwise an array with one row
    per repetition.
    """
    if select < 1:
        select = np.round(len(bind) * select).astype(int)

    if multi:
        labels, remaining = np.unique(bind, axis = 0, return_counts = True)
        members = [np.where((bind == lab).all(axis = 1))[0] for lab in labels]
    else:
        labels, remaining = np.unique(bind, return_counts = True)
        members = [np.where(bind == lab)[0] for lab in labels]

    if custidx is not None:
        lookup = np.array(custidx)
        members = [lookup[m] for m in members]

    n_bins = len(labels)
    quota = np.zeros(n_bins, dtype = int)

    # Hand out one slot at a time, visiting the bins in a fresh random order
    # on every pass, until ``select`` slots are assigned or bins run dry.
    assigned = 0
    for _ in range(select):
        order = np.random.permutation(np.arange(n_bins))
        for b in order:
            if assigned == select:
                break
            if remaining[b] > 0:
                quota[b] += 1
                remaining[b] -= 1
                assigned += 1

    draws = []
    for _ in range(repeat):
        picked = [np.random.choice(m, replace = False, size = q)
                  for m, q in zip(members, quota)]
        draws.append(np.hstack(picked))
    idx = np.stack(draws, 0)

    if repeat == 1:
        return idx[0]
    return idx
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def BinFunc(inp, bind, uni = None,
            mode = [np.mean, {}]):
    """Apply a summary function to the values of ``inp`` inside each bin.

    Multi-output binning is not supported.

    Parameters
    ----------
    inp : array of values.
    bind : array of bin labels aligned with ``inp``.
    uni : labels to report, in order; defaults to ``np.unique(bind)``.
    mode : ``[function, kwargs]`` pair; the function is called with the
        values of one bin and the keyword arguments.

    Returns
    -------
    List with one result per label.
    """
    func, kwargs = mode[0], mode[1]
    labels = np.unique(bind) if uni is None else uni

    results = []
    for label in labels:
        results.append(func(inp[bind == label], **kwargs))

    return results
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def BinsOfBins(bind, uni = None):
    """Collapse multi-column bin labels into one combined bin id per row.

    Parameters
    ----------
    bind : 2-D array laid out as [length, type].
    uni : optional list with one entry per column (type) listing that
        column's possible labels. When given, the combined bins are the
        Cartesian product of these lists, in ``itertools.product`` order;
        otherwise they are the distinct rows found by ``np.unique``.

    Returns
    -------
    Integer array of length ``len(bind)``. Rows matching no combined bin
    keep the initial value 0.
    """
    if uni is None:
        combos = np.unique(bind, axis = 0)
    else:
        combos = [list(combo) for combo in itertools.product(*uni)]

    newbind = np.repeat(0, len(bind))
    for combo_id, combo in enumerate(combos):
        rows = np.where((bind == combo).all(axis = 1))[0]
        newbind[rows] = combo_id

    return newbind
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def BinCount(bind, uni = None, multi = False, prop = False):
    """Count how often each label occurs.

    Parameters
    ----------
    bind : array-like of labels, or with ``multi`` a list of such
        array-likes (they may differ in length).
    uni : labels to count, shared by all inputs; defaults to the sorted
        distinct labels over all inputs. Labels that occur in the data but
        not in ``uni`` are appended as extra columns.
    multi : when ``False`` ``bind`` is a single input.
    prop : when truthy the counts are divided by their overall sum.

    Returns
    -------
    Array of counts in ``uni`` order; with ``multi`` one row per input.
    An input without observations yields a row of zeros.
    """
    if multi is False: bind = [bind]
    bind = [np.array(m) for m in bind]

    if uni is None:
        # Concatenate first so that inputs of different lengths are accepted.
        uni = np.unique(np.concatenate([m.reshape(-1) for m in bind]))

    hs = [Counter(m.reshape(-1)) for m in bind]

    # Start every row from an all-zero template so absent labels count as 0.
    empty = dict.fromkeys(uni, 0)
    rere = [{**empty, **d} for d in hs]

    cnv = pd.DataFrame(rere).to_numpy()

    if multi is False: cnv = cnv[0]

    if prop: cnv = cnv / np.sum(cnv)

    return cnv
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
################## OBJECTIVE BINNING ##################################
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def FindScaleFactor(inp,
                    mode = [se.tanh, {}],
                    mode_rev = [se.revtanh, {}],

                    bins = 10,
                    maxrange = 0.3, minobs = None,
                    start = 1, increm = 0.1,
                    mima = None,
                    lowerbound = None, upperbound = None):
    """Search for the scale factor whose transformed histogram is most even.

    Starting at ``start`` and stepping by ``increm``, the clipped data is
    multiplied by the candidate factor, transformed with ``mode`` and put
    into ``bins`` equal-width bins. The bin edges are mapped back with
    ``mode_rev``. The search stops after the first candidate whose widest
    back-mapped bin exceeds ``maxrange`` (as a fraction of ``ma - mi``) or
    whose emptiest bin holds more than ``minobs`` observations; that last
    candidate is still considered.

    Parameters
    ----------
    inp : numpy array (any shape). It is not modified.
    mode, mode_rev : ``[function, kwargs]`` pairs for the forward transform
        and its inverse.
    bins : number of histogram bins.
    maxrange : widest allowed bin as a fraction of the range; ``None`` -> 1.
    minobs : observation threshold; ``None`` -> ``len(inp)``.
    start, increm : first candidate and step size.
    mima : ``None``, an int ``m`` (meaning the range ``(0, m)``), or a
        ``(min, max)`` pair whose entries may be ``None`` to use the data.
    lowerbound, upperbound : clipping limits; default to the range ends.

    Returns
    -------
    The candidate scale factor with the smallest standard deviation of bin
    counts.
    """
    if mima is None: mi, ma = None, None
    # Fixed: the upper end used to be bound to the ``int`` type itself.
    elif isinstance(mima, int): mi, ma = 0, mima
    else: mi, ma = mima

    if mi is None: mi = np.min(inp)
    if ma is None: ma = np.max(inp)

    lb = mi if lowerbound is None else lowerbound
    ub = ma if upperbound is None else upperbound

    # Clip a copy: ``inp.reshape(-1)`` can be a view, so clipping it in
    # place used to overwrite the caller's data.
    inpx = np.array(inp).reshape(-1)
    inpx[inpx < lb] = lb
    inpx[inpx > ub] = ub

    totz = ma - mi

    if maxrange is None: maxrange = 1
    if minobs is None: minobs = len(inp)

    spreads, scalefactors = [], []

    starto = start - increm
    lenz_max, mino_obs = 0, 0

    while lenz_max <= maxrange and mino_obs <= minobs:

        starto = np.round(starto + increm, 5)

        liny = mode[0](starto * inpx, **mode[1])
        hist, bin_edges = np.histogram(liny, bins = bins, density=False)
        mino_obs = np.min(hist)

        # Bin edges mapped back to the original scale of the data.
        rt = mode_rev[0](bin_edges, **mode_rev[1]) / starto
        lenz_max = np.max(np.diff(rt) / totz)

        spreads.append(np.std(hist))
        scalefactors.append(starto)

    besto = np.argmin(spreads)

    return scalefactors[besto]
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def FindNumBins(inp,
                mode = [se.tanh, {}],
                mode_rev = [se.revtanh, {}],

                maxrange = 0.3, minobs = 50,
                minbins = 5, maxbins = 1000,
                start = 1, increm = 0.1,
                mima = None, lowerbound = None, upperbound = None):
    """Find the largest bin count for which some scale factor is acceptable.

    For every bin count from ``minbins`` up to (not including) ``maxbins``,
    scale factors from ``start`` to 100 in steps of ``increm`` are tried. A
    factor is acceptable when the widest back-mapped bin is at most
    ``maxrange`` (fraction of ``ma - mi``) and the emptiest bin holds at
    least ``minobs`` observations. The scan over factors stops at the first
    unacceptable factor whose widest bin exceeds ``maxrange``.

    Parameters
    ----------
    inp : numpy array (any shape). It is not modified.
    mode, mode_rev : ``[function, kwargs]`` pairs for the forward transform
        and its inverse.
    mima : ``None``, an int ``m`` (meaning the range ``(0, m)``), or a
        ``(min, max)`` pair whose entries may be ``None`` to use the data.
    lowerbound, upperbound : clipping limits; default to the range ends.

    Returns
    -------
    The largest bin count that had an acceptable scale factor.

    Raises
    ------
    ValueError
        When no bin count in the range qualifies (this used to surface as
        an unbound-variable error).
    """
    if mima is None: mi, ma = None, None
    # Fixed: the upper end used to be bound to the ``int`` type itself.
    elif isinstance(mima, int): mi, ma = 0, mima
    else: mi, ma = mima

    if mi is None: mi = np.min(inp)
    if ma is None: ma = np.max(inp)

    lb = mi if lowerbound is None else lowerbound
    ub = ma if upperbound is None else upperbound

    totz = ma - mi

    # Clip a copy so the caller's array is left untouched.
    inpx = np.array(inp).reshape(-1)
    inpx[inpx < lb] = lb
    inpx[inpx > ub] = ub

    maxb = None

    for bxo in np.arange(minbins, maxbins):

        gxzo = None

        print(f'TRYING: {bxo}')

        for starto in np.arange(start, 100, increm):

            liny = mode[0](starto * inpx, **mode[1])
            hist, bin_edges = np.histogram(liny, bins = bxo, density=False)
            mino_obs = np.min(hist)

            # Bin edges mapped back to the original scale of the data.
            rt = mode_rev[0](bin_edges, **mode_rev[1]) / starto
            lenz_max = np.max(np.diff(rt) / totz)

            if lenz_max <= maxrange and mino_obs >= minobs:
                gxzo = starto
            elif lenz_max > maxrange:
                break

        if gxzo is not None:
            maxb = bxo

    if maxb is None:
        raise ValueError('no number of bins satisfies maxrange and minobs')

    return maxb
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def ObjUni(inp, scalefactor,
           mode = [se.tanh, {}], mode_rev = [se.revtanh, {}],
           bins = 10,
           open_ends = False,
           mima = None, lowerbound = None, upperbound = None):
    """Build bins that are equal-width after scaling and transforming ``inp``.

    The clipped data is multiplied by ``scalefactor``, transformed with
    ``mode`` and histogrammed into ``bins`` equal-width bins. The edges are
    mapped back with ``mode_rev``, the outermost two are replaced by ``mi``
    and ``ma``, and the result is passed to ``se.ListWindower`` with
    ``win_size = 2`` (presumably pairing consecutive edges -- confirm
    against poseigen_seaside).

    Parameters
    ----------
    inp : numpy array (any shape). It is not modified.
    scalefactor : multiplier applied before the transform.
    mode, mode_rev : ``[function, kwargs]`` pairs for the forward transform
        and its inverse.
    open_ends : accepted but not used by this function.
    mima : ``None``, an int ``m`` (meaning the range ``(0, m)``), or a
        ``(min, max)`` pair whose entries may be ``None`` to use the data.
    lowerbound, upperbound : clipping limits; default to the range ends.
    """
    if mima is None: mi, ma = None, None
    # Fixed: the upper end used to be bound to the ``int`` type itself.
    elif isinstance(mima, int): mi, ma = 0, mima
    else: mi, ma = mima

    if mi is None: mi = np.min(inp)
    if ma is None: ma = np.max(inp)

    lb = mi if lowerbound is None else lowerbound
    ub = ma if upperbound is None else upperbound

    # Clip a copy so the caller's array is left untouched.
    inpx = np.array(inp).reshape(-1)
    inpx[inpx < lb] = lb
    inpx[inpx > ub] = ub

    liny = mode[0](scalefactor * inpx, **mode[1])

    _, bin_edges = np.histogram(liny, bins = bins, density=False)

    rt = mode_rev[0](bin_edges, **mode_rev[1]) / scalefactor

    # Pin the outer edges to the requested range.
    rt[0] = mi
    rt[-1] = ma

    return se.ListWindower(rt, win_size = 2)
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
###########################################################
|
|
388
|
+
|
|
389
|
+
############# SPLIT TOOLS ####################
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def SplitExpander(X1, X2, Split, ma = None):
    """Repeat every part of ``Split`` with its indices shifted by multiples of ``ma``.

    Parameters
    ----------
    X1 : full length of the total split, as an int or a sized object.
    X2 : the multiplier source, as an int or a sized object. The number of
        copies is ``X2 // X1`` when ``X2 > X1`` and ``X2`` otherwise.
    Split : list of index arrays.
    ma : offset between consecutive copies; defaults to the summed length
        of the parts of ``Split``.

    Returns
    -------
    List with one concatenated index array per part of ``Split``.
    """
    if not isinstance(X1, int):
        X1 = len(X1)
    if not isinstance(X2, int):
        X2 = len(X2)

    if ma is None:
        ma = np.sum([len(part) for part in Split])

    if X2 > X1:
        copies = X2 // X1
    else:
        copies = X2

    expanded = []
    for part in Split:
        shifted = [part + (ma * k) for k in range(copies)]
        expanded.append(np.hstack(shifted))

    return expanded
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def SplitRepeater(split, reps):
    """Expand each observation index ``o`` into ``reps`` consecutive indices.

    Every ``o`` in a part becomes ``o*reps .. (o+1)*reps - 1``. Designed for
    data in which each observation is stored ``reps`` times in a row.

    Returns a list with one expanded index array per part of ``split``.
    """
    repeated = []
    for part in split:
        blocks = [np.arange(obs * reps, (obs + 1) * reps) for obs in part]
        repeated.append(np.hstack(blocks))
    return repeated
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def SplitData(data, split):
    """Return ``data`` indexed by each part of ``split``, as a list."""
    pieces = []
    for part in split:
        pieces.append(data[part])
    return pieces
|
|
423
|
+
|
|
424
|
+
def Data2Divided(inp, split):
    """Index every dataset of ``inp`` by each part of ``split``.

    Returns a list with one entry per part; each entry lists the matching
    slice of every dataset, in the order of ``inp``.
    """
    divided = []
    for part in split:
        divided.append([dataset[part] for dataset in inp])
    return divided
|
|
426
|
+
|
|
427
|
+
def Data2Grouped(inp, split = None):
    """Arrange data as one group per dataset.

    Without a split (``None`` or ``False``) ``inp`` is taken to be in
    divided form and is transposed: entry ``m`` of the result collects item
    ``m`` of every element of ``inp``. With a split, every dataset of
    ``inp`` is indexed by each part of ``split``.

    For a divided dictionary, select which split to use first (e.g.
    ``dict[0]``), since that has the same layout as the divided format.
    """
    if split is None or split is False:
        n_outer = len(inp)
        n_inner = len(inp[0])
        return [[inp[outer][inner] for outer in range(n_outer)]
                for inner in range(n_inner)]

    return [[dataset[part] for part in split] for dataset in inp]
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def Divided2NonDivided(divs):
    """Concatenate divided data and return the split that recovers the parts.

    Parameters
    ----------
    divs : list of arrays shaped (obs,) or (obs, features).

    Returns
    -------
    stacks : the parts joined along the observation axis (``hstack`` for
        1-D parts, ``vstack`` otherwise).
    split : list of index ranges, one per part, into ``stacks``.
    """
    if len(divs[0].shape) == 1:
        stacks = np.hstack(divs)
    else:
        stacks = np.vstack(divs)

    # Running totals of the part lengths give each part's start and stop.
    bounds = np.cumsum([0] + [len(part) for part in divs])
    split = [np.arange(begin, end) for begin, end in zip(bounds[:-1], bounds[1:])]

    return stacks, split
|
|
451
|
+
|
|
452
|
+
def Idx2Split(X, idx, props = [0.7, 0.3]):
    """Build one split of ``range(X)`` whose position is selected by ``idx``.

    Parameters
    ----------
    X : number of observations, as an int or a sized object.
    idx : position; a value below 1 is read as a fraction of ``X``.
    props : proportion of ``X`` assigned to each part.

    Returns
    -------
    List of index arrays, one per proportion. Each is obtained by applying
    ``se.Cutter`` twice (first with the summed size of the preceding parts,
    then with ``idx``) and keeping the leading entries. NOTE(review):
    ``se.Cutter`` presumably rotates the array -- confirm against
    poseigen_seaside.
    """
    if not isinstance(X, int):
        X = len(X)
    if idx < 1:
        idx = se.Round2Int(idx*X)

    positions = np.arange(X)
    sizes = [se.Round2Int(X*p) for p in props]
    offsets = [se.Round2Int(np.sum(sizes[:r])) for r in range(len(sizes))]

    parts = []
    for offset, size in zip(offsets, sizes):
        shifted = se.Cutter(se.Cutter(positions, offset), idx)
        parts.append(shifted[:size])

    return parts
|
|
463
|
+
|
|
464
|
+
def UngroupSplits(Splits, group):
    """Translate splits over group ids into splits over observation positions.

    Parameters
    ----------
    Splits : list of splits; every split is a list of arrays of group ids.
    group : group id of every observation.

    Returns
    -------
    The same nesting, with each array of group ids replaced by the
    positions of the observations belonging to any of those groups.
    """
    positions = np.arange(len(group))

    ungrouped = []
    for split in Splits:
        ungrouped.append([positions[np.isin(group, ids)] for ids in split])

    return ungrouped
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def SplitGen_Random(X, num_splits = 1, proportions = [0.7, 0.3], group = None):
    """Generate random splits of the observations (or of the groups).

    Parameters
    ----------
    X : number of observations, as an int or a sized object. Ignored for
        sizing when ``group`` is given.
    num_splits : how many splits to draw.
    proportions : share of the items for each part. Part sizes come from
        ``se.Round2Int``; when they sum to less than the item count the
        shortfall is added to the first part.
    group : optional group id per observation (from OverlapGroup or
        elsewhere). The split is then drawn over ``len(np.unique(group))``
        ids numbered from 0 and mapped back with ``UngroupSplits``.

    Returns
    -------
    A list of splits, each a list of index arrays; a single split when
    ``num_splits == 1``.
    """
    if group is None:
        li = X if isinstance(X, int) else len(X)
    else:
        li = len(np.unique(group))

    sizes = [se.Round2Int(li*p) for p in proportions]
    if np.sum(sizes) < li:
        sizes[0] = sizes[0] + (li - np.sum(sizes))
    ends = np.cumsum(sizes)
    idx = np.arange(li)

    Splits = []
    for _ in range(num_splits):
        shuffled = np.random.permutation(idx)
        # Consecutive chunks of the shuffled indices, one per part.
        Splits.append([shuffled[end - size:end] for size, end in zip(sizes, ends)])

    if group is not None:
        Splits = UngroupSplits(Splits, group)

    return Splits[0] if num_splits == 1 else Splits
|
|
498
|
+
|
|
499
|
+
def SplitGen_Window(X, num_splits = 3,
                    proportions = [0.7, 0.3],
                    everyother = None, start_at = None, end_at = None,
                    random = True, group = None):
    """Generate splits made of windows of consecutive indices.

    Parameters
    ----------
    X : number of observations, as an int or a sized object. When ``group``
        is given the item count is ``np.max(group)`` instead.
    num_splits : number of window positions to keep; with ``None`` (and no
        ``everyother``) all possible splits are returned.
    proportions : share of the items for each part (sizes via
        ``se.Round2Int``).
    everyother : keep every ``everyother``-th window position; overrides
        ``num_splits``.
    start_at, end_at : accepted but not used by this function.
    random : when ``True`` the kept positions are drawn without
        replacement, otherwise the first ``num_splits`` are used.
    group : optional group id per observation; splits are mapped back to
        observation positions with ``UngroupSplits``.

    Returns
    -------
    A list of splits, or a single split when exactly one results.
    NOTE(review): window construction relies on ``se.Cutter`` and
    ``se.ContinuousWindower``, whose behaviour is not visible here.
    """
    if group is None:
        li = X if isinstance(X, int) else len(X)
    else:
        li = np.max(group)

    sizes = [se.Round2Int(li*p) for p in proportions]
    offsets = [se.Round2Int(np.sum(sizes[:r])) for r in range(len(sizes))]

    idx = np.arange(li)
    idxs = [se.Cutter(idx, offset) for offset in offsets]

    contwin = [se.ContinuousWindower(idx, size) for size in sizes]

    if num_splits is not None or everyother is not None:
        if everyother is None:
            if random is True:
                rando = np.random.choice(idx, num_splits, replace = False)
            else:
                rando = np.arange(num_splits)
        else:
            rando = idx[::everyother]
            num_splits = None

        idxs = [cut[rando] for cut in idxs]
        contwin = [windows[chosen] for windows, chosen in zip(contwin, idxs)]

    num_splits = len(contwin[0])
    Splits = [[windows[i] for windows in contwin] for i in range(num_splits)]

    if group is not None:
        Splits = UngroupSplits(Splits, group)

    if num_splits == 1:
        Splits = Splits[0]

    return Splits
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def SubSplitGen(inp, split, onlyfirst = False,
                num_subsamples = 1, proportion = 0.3, group = None,
                subsample_mode = [se.SubSample_Random, {}]):
    """Subsample every part of ``split``.

    Parameters
    ----------
    inp : data indexed by the parts of ``split``.
    split : list of index arrays.
    onlyfirst : when truthy only the first part is subsampled; the other
        parts are kept whole.
    num_subsamples : number of subsamples to draw per part.
    proportion : a float applied to every part, or one value per part.
    group : optional group ids, indexed like ``inp``.
    subsample_mode : ``[function, kwargs]`` pair. The function receives
        ``inp[part]``, ``proportion``, ``num_subsamples`` and, when given,
        ``group``. The kwargs dict is not modified.

    Returns
    -------
    One sub-split (a list of index arrays into the original data, one per
    part) when ``num_subsamples == 1``, otherwise a list of them.
    """
    if isinstance(proportion, float) is True:
        proportion = [proportion] * len(split)

    # Build a fresh kwargs dict: updating ``subsample_mode[1]`` in place
    # used to write into the shared default and into the caller's dict.
    sampler = subsample_mode[0]
    sampler_args = {**subsample_mode[1], 'num_subsamples': num_subsamples}

    if group is None:
        ss = [sampler(inp[s], proportion = p, **sampler_args)
              for s,p in zip(split, proportion)]
    else:
        ss = [sampler(inp[s], group = group[s], proportion = p, **sampler_args)
              for s,p in zip(split, proportion)]

    if onlyfirst:
        # Every part after the first keeps all of its members.
        addon = [np.arange(len(s)) for s in split[1:]]
        if num_subsamples != 1: addon = [[s] * num_subsamples for s in addon]
        ss = [ss[0], *addon]

    if num_subsamples == 1: ss = [[j] for j in ss]

    # Positions inside each part -> indices into the original data.
    ss = [[s[h] for h in j] for s,j in zip(split, ss)]

    # Regroup from per-part lists to per-subsample lists.
    ss = [[j[i] for j in ss] for i in range(num_subsamples)]

    if num_subsamples == 1: ss = ss[0]
    return ss
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def SplitGen_Stratified(bind, num_splits = 1, proportions = [0.7, 0.3]):
    """Generate stratified random splits.

    Each bin is shuffled separately and cut into consecutive chunks whose
    sizes are the rounded ``proportion * bin count``; the chunks of all bins
    are then joined per part. Multi-output binning is not supported.

    Parameters
    ----------
    bind : 1-D array of bin labels.
    num_splits : how many splits to draw.
    proportions : share of every bin assigned to each part.

    Returns
    -------
    A list of splits, each a list of index arrays; a single split when
    ``num_splits == 1``.
    """
    uni = np.sort(np.unique(bind))
    members = [np.where(bind == label)[0] for label in uni]

    counts = BinCount(bind, uni=uni, multi = False)
    sizes = [[np.round(p * c).astype(int) for p in proportions] for c in counts]

    n_parts = len(proportions)

    Splits = []
    for _ in range(num_splits):
        per_bin = []
        for bin_members, bin_sizes in zip(members, sizes):
            shuffled = np.random.permutation(bin_members)
            ends = np.cumsum(bin_sizes)
            per_bin.append([shuffled[end - size:end]
                            for size, end in zip(bin_sizes, ends)])
        Splits.append([np.hstack([chunks[part] for chunks in per_bin])
                       for part in range(n_parts)])

    return Splits[0] if num_splits == 1 else Splits
|
|
589
|
+
|
|
590
|
+
def SubSample_Stratified(bind,
                         proportion = 0.3, num_subsamples = 1,
                         guarantee1 = False):
    """Draw stratified subsamples without replacement.

    Multi-output binning is not supported.

    Parameters
    ----------
    bind : 1-D array of bin labels.
    proportion : share of every bin to draw; a value above 1 is read as an
        absolute count and divided by ``len(bind)``.
    num_subsamples : how many subsamples to draw.
    guarantee1 : when truthy, a non-empty bin whose rounded size is 0
        contributes one observation.

    Returns
    -------
    An index array, or a list of them when ``num_subsamples != 1``.
    """
    total = len(bind)
    if proportion > 1:
        proportion = proportion / total

    uni = np.sort(np.unique(bind))
    members = [np.where(bind == label)[0] for label in uni]
    bin_sizes = np.array([len(m) for m in members])

    wanted = np.round(bin_sizes * proportion).astype(int)
    # No bin may ask for more than the total number of observations.
    wanted = [total if n > total else n for n in wanted]
    if guarantee1:
        wanted = [1 if n == 0 and size > 0 else n
                  for n, size in zip(wanted, bin_sizes)]

    ss = []
    for _ in range(num_subsamples):
        picks = [np.random.choice(m, n, replace = False)
                 for m, n in zip(members, wanted)]
        ss.append(np.hstack(picks))

    if num_subsamples == 1:
        return ss[0]
    return ss
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
###### AKIN SPLIT #############
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def Akin_Scorer(bind, idxs,

                uni = None, multi = False,

                closest = False,
                onlyidx = None, p = 2, reciprocal = False, pseudo = True, weight_bymem = False,
                summarize_mode = [se.Epsilon, {}]):
    """Score candidate splits by how alike the bin counts of their parts are.

    Parameters
    ----------
    bind : array of bin labels, or with ``multi`` an iterable of such arrays
        (one per output).
    idxs : list of splits; every split is a list of index arrays (or lists
        of indices) into ``bind``.
    uni : labels to count. When ``None`` it is derived from the first
        output and then reused for all later outputs.
    multi : when ``False`` ``bind`` is a single output.
    closest : when truthy the bin counts of the full data are passed to
        ``se.NormalizedDistances`` as its second argument, otherwise ``None``.
    onlyidx, p, reciprocal, pseudo, weight_bymem, summarize_mode :
        forwarded to ``se.NormalizedDistances``. ``summarize_mode`` is also a
        ``[function, kwargs]`` pair used to combine the scores of several
        outputs.

    Returns
    -------
    1-D array with one score per split.
    """
    outputs = [bind] if multi is False else bind

    dist_args = dict(onlyidx = onlyidx, p = p,
                     reciprocal = reciprocal, pseudo = pseudo,
                     weight_bymem = weight_bymem,
                     summarize_mode = summarize_mode)

    all_histo = []
    for bindx in outputs:

        if uni is None:
            uni = np.sort(np.unique(bindx))

        # Labels falling in each part of each split, then their bin counts.
        part_labels = [[bindx[part].reshape(-1) for part in split] for split in idxs]
        part_counts = [BinCount(labels, uni = uni, multi = True) for labels in part_labels]

        reference = BinCount(bindx, uni = uni).reshape(-1) if closest else None

        scores = [se.NormalizedDistances(counts, reference, **dist_args)
                  for counts in part_counts]

        all_histo.append(scores)

    # ALL HISTO SHAPE: (# SPLITS, # OF SCORES)
    all_histo = np.stack(all_histo, 1)

    if all_histo.shape[1] > 1:
        combine, combine_args = summarize_mode[0], summarize_mode[1]
        return np.array([combine(sc, **combine_args) for sc in all_histo])

    return all_histo[:, 0]
|
|
655
|
+
|
|
656
|
+
|
|
657
|
+
def SplitGen_Akin(bind,
                  proportions = [0.7, 0.3], num_splits = 1, rando = 100, window = False, group = None,

                  uni = None, multi = False,
                  closest = False,
                  onlyidx = None, p = 2, reciprocal = False, pseudo = True, weight_bymem = False,
                  summarize_mode = [se.Epsilon, {}],

                  atatime = 100, pathname = None, pickup = False):
    """Draw many candidate splits and keep those with the lowest Akin score.

    Candidates are produced and scored in rounds of ``atatime`` so that many
    of them can be tried with limited memory. After each round only the best
    ``num_splits`` found so far are kept.

    Parameters
    ----------
    bind : bin labels; with ``multi`` the number of observations is taken
        from ``len(bind.T)``, otherwise from ``len(bind)``.
    proportions, group : forwarded to the split generator.
    num_splits : number of best splits to return.
    rando : total number of candidate splits.
    window : use ``SplitGen_Window`` (random positions) instead of
        ``SplitGen_Random``.
    uni, multi, closest, onlyidx, p, reciprocal, pseudo, weight_bymem,
    summarize_mode : forwarded to ``Akin_Scorer``.
    atatime : candidates per round. A round with a single candidate is
        skipped.
    pathname : optional pickle path ('.p' is appended when missing). The
        current best result is written there after every round, and a
        temporary progress file is written next to it.
    pickup : resume from the temporary progress file when it exists.
        Ignored without ``pathname``.

    Returns
    -------
    The best split when ``num_splits == 1``, otherwise the list of best
    splits, ordered by score.

    Raises
    ------
    ValueError
        When no candidate was evaluated and nothing could be resumed.
    """
    rounds = np.arange(0, rando, atatime).tolist() + [rando]

    temppath = None
    if pathname is not None:
        if pathname.endswith('.p'):
            temppath = pathname[:-2] + '_temp' + '.p'
        else:
            pathname = pathname + '.p'
            temppath = pathname + '_temp' + '.p'
    else:
        pickup = False

    if pickup and os.path.isfile(temppath):
        # NOTE: this unpickles a file from disk; only resume from progress
        # files written by this function, never from untrusted sources.
        finround, past_splits, past_all_histo = se.PickleLoad(temppath)
    else: finround, past_splits, past_all_histo = 0, [], []

    lb = len(bind.T) if multi else len(bind)

    Splits2 = None
    n_rounds = len(rounds) - 1

    for ir in range(finround, n_rounds):
        print(f'round {ir + 1} of {len(rounds[:-1])}')

        randoz = rounds[ir+1] - rounds[ir]
        if randoz > 1:

            SplitGen_args = {'num_splits': randoz, 'proportions': proportions, 'group': group}

            if window: Splits = SplitGen_Window(lb, random = True, **SplitGen_args)
            else: Splits = SplitGen_Random(lb, **SplitGen_args)

            all_histo = Akin_Scorer(bind, Splits,

                                    uni = uni, multi = multi,

                                    closest = closest,
                                    onlyidx = onlyidx, p = p, reciprocal = reciprocal, pseudo = pseudo, weight_bymem = weight_bymem,
                                    summarize_mode = summarize_mode)

            mino = np.argsort(all_histo)[:num_splits]

            past_splits.extend([Splits[m] for m in mino])
            past_all_histo.extend([all_histo[m] for m in mino])

            mino2 = np.argsort(past_all_histo)[:num_splits]
            past_splits = [past_splits[m] for m in mino2] # just keep the best ones
            past_all_histo = [past_all_histo[m] for m in mino2]
            Splits2 = past_splits[0] if num_splits == 1 else past_splits

            print(f'newbest:{past_all_histo[0]}')

            if pathname is not None:
                se.PickleDump(Splits2,pathname)
                # Store the index of the NEXT round so that resuming does
                # not repeat the round that has just finished.
                se.PickleDump([ir + 1, past_splits, past_all_histo], temppath)

    if Splits2 is None:
        # No round ran in this call (everything resumed, or too few candidates).
        if not past_splits:
            raise ValueError('no candidate splits were evaluated; rando must be at least 2')
        Splits2 = past_splits[0] if num_splits == 1 else past_splits

    # The progress file is only needed while the search is running. This
    # used to be written as a bare annotation, which never executed.
    if temppath is not None and os.path.isfile(temppath):
        os.remove(temppath)

    return Splits2
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
#####################################
|
|
728
|
+
|
|
729
|
+
|
|
730
|
+
def SubSample_Akin(bind,
                   proportion = 0.3, num_subsamples = 1, rando = 10, group = None,

                   uni = None, multi = False,
                   closest = False,
                   onlyidx = None, p = 2, reciprocal = False, pseudo = True, weight_bymem = False,
                   summarize_mode = [se.Epsilon, {}]):
    """Draw random subsamples and keep those with the lowest Akin score.

    Parameters
    ----------
    bind : bin labels. The transpose ``bind.T`` is what gets scored.
    proportion, group : forwarded to ``se.SubSample_Random``.
    num_subsamples : number of best subsamples to return.
    rando : number of candidate subsamples to draw.
    uni, multi, closest, onlyidx, p, reciprocal, pseudo, weight_bymem,
    summarize_mode : forwarded to ``Akin_Scorer``.

    Returns
    -------
    The best subsample when ``num_subsamples == 1``, otherwise the list of
    best subsamples, ordered by score.
    """
    # ``se`` already is poseigen_seaside.basics; every other call in this
    # module reaches the sampler as ``se.SubSample_Random``.
    ss = se.SubSample_Random(bind, proportion = proportion, num_subsamples = rando, group = group)

    # The scorer expects a list of splits, so wrap each candidate as a
    # split with a single part.
    ss_l = [[s] for s in ss]

    all_histo = Akin_Scorer(bind.T, ss_l,

                            uni = uni, multi = multi,

                            closest = closest,
                            onlyidx = onlyidx, p = p, reciprocal = reciprocal, pseudo = pseudo,
                            weight_bymem = weight_bymem,
                            summarize_mode = summarize_mode)

    mino = np.argsort(all_histo)[:num_subsamples]

    ss = [ss[m] for m in mino]
    if num_subsamples == 1: ss = ss[0]

    return ss
|
|
757
|
+
|
|
758
|
+
|
|
759
|
+
|
|
760
|
+
|
|
761
|
+
def SimpleStratifiedSplit(obs_bind, props = [0.5, 0.25, 0.25], prints = False):
    """Assign every observation to a part at random, bin by bin.

    Within each bin every observation independently draws its part with the
    probabilities ``props`` (normalised to sum to one), so the part sizes
    follow the proportions only on average.

    Parameters
    ----------
    obs_bind : 1-D array of bin labels.
    props : relative size of each part.
    prints : when truthy, print the bin counts of every part.

    Returns
    -------
    List of index arrays, one per part.
    """
    uni = np.unique(obs_bind)
    weights = np.array(props)
    weights = weights / np.sum(weights)

    part_ids = np.arange(len(weights))
    collected = {part: [] for part in part_ids}

    for label in uni:
        members = np.where(obs_bind == label)[0]
        drawn = np.random.choice(part_ids, size = len(members), p = weights, replace = True)
        for part in part_ids:
            collected[part].append(members[drawn == part])

    splito = [np.concatenate(collected[part]) for part in part_ids]

    if prints:
        print([BinCount(obs_bind[s], uni = uni) for s in splito])

    return splito
|
|
780
|
+
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
class BinWeights:
    """Per-bin weights derived from how common each bin is.

    ``fit`` learns one weight per bin label; ``eval`` maps an array of
    labels to the learned weights.
    """

    def __init__(self, uni = None, multi = False, minus = True, newrange = True):
        """Store the settings.

        uni : labels to weight; derived from the data in ``fit`` when ``None``.
        multi : when ``False`` ``fit``/``eval`` take a single array,
            otherwise an iterable of arrays.
        minus : weights are ``1 - proportion`` when truthy, otherwise
            ``1 / proportion``.
        newrange : ``True`` divides the weights by their maximum, a tuple
            ``(new_min, new_max)`` rescales them linearly into that range,
            and ``None`` leaves them as they are.
        """
        self.uni, self.multi = uni, multi
        self.minus, self.newrange = minus, newrange

    def fit(self, bind):
        """Learn the weights from ``bind`` and return ``self``.

        Sets ``all_weights`` and ``unis``, each with one entry per input.
        When ``uni`` was not given it is taken from the first input and then
        reused for every later input.
        """
        uni = self.uni
        minus = self.minus
        newrange = self.newrange

        inputs = [bind] if self.multi is False else bind

        all_weights, unis = [], []
        for bindx in inputs:

            if uni is None:
                uni = np.sort(np.unique(bindx))
            unis.append(uni)

            counts = BinCount(bindx, uni = uni, multi = False)
            share = counts / np.sum(counts)

            if minus:
                weights = 1 - share
            else:
                weights = 1 / share

            if newrange is not None:
                # NOTE(review): presumably returns (min, max) of the weights;
                # other code in this module calls ``se.MinMax`` -- confirm
                # which name poseigen_seaside provides.
                mi, ma = se.MiMa(weights)
                if isinstance(newrange, tuple):
                    newmi, newma = newrange
                    weights = (((weights - mi) * (newma - newmi)) / (ma - mi)) + newmi
                elif newrange is True:
                    weights = weights / ma

            all_weights.append(weights)

        self.all_weights = all_weights
        self.unis = unis

        return self

    def eval(self, inp, dtype = np.float32):
        """Return the learned weight of every label in ``inp``.

        The output has the shape of ``inp`` and the given ``dtype``; labels
        that were not fitted get weight 0. With ``multi`` the per-input
        results are stacked.
        """
        inputs = inp if self.multi is not False else [inp]

        new_weights = []
        for labels, weights, uni in zip(inputs, self.all_weights, self.unis):
            mapped = np.zeros(labels.shape, dtype = dtype)
            for label, weight in zip(uni, weights):
                mapped[labels == label] = weight
            new_weights.append(mapped)

        if self.multi:
            return np.stack(new_weights)
        return new_weights[0]
|
|
842
|
+
|
|
843
|
+
|
|
844
|
+
def BinWeighter(bind, onlyidx = None, byaxis = None,
                uni = None, minus = True, newrange = True,
                dtype = np.float32):
    """Turn an array of bin labels into an array of bin weights.

    Parameters
    ----------
    bind : array-like of bin labels, any shape.
    onlyidx : optional index; when given, the weights are fitted on
        ``slice[onlyidx]`` only but are still evaluated for all entries.
    byaxis : optional axis. When given, every slice along that axis is
        weighted independently.
    uni, minus, newrange : forwarded to ``BinWeights``.
    dtype : dtype of the returned weights.

    Returns
    -------
    Array of weights with the shape of ``bind``.
    """
    bind = np.array(bind) #just in case

    origshape = bind.shape

    if byaxis is None:
        inps = [bind]
    else:
        inps = [bind.take(indices=a, axis=byaxis) for a in range(origshape[byaxis])]

    inpshape = inps[0].shape

    newei = []

    for j in inps:
        bine = j if onlyidx is None else j[onlyidx]

        bw = BinWeights(uni = uni, multi = False, minus = minus, newrange = newrange).fit(bind = bine.reshape(-1))
        wo = bw.eval(j.reshape(-1), dtype=dtype).reshape(inpshape)
        newei.append(wo)

    # Compare against None: the truthiness test used before treated axis 0
    # as "no axis" and returned only the first slice's weights.
    if byaxis is None:
        return newei[0]
    return np.stack(newei, axis = byaxis)
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
|
|
873
|
+
|
|
874
|
+
|
|
875
|
+
|
|
876
|
+
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
|
|
883
|
+
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
|
|
888
|
+
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
|
|
892
|
+
|
|
893
|
+
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2025 Husam Abdulnabi
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: poseigen_binmeths
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: The basics for Poseigen software
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Husam Abdulnabi
|
|
7
|
+
Author-email: husam.abdulnabi@gmail.com
|
|
8
|
+
Maintainer: Husam Abdulnabi
|
|
9
|
+
Maintainer-email: husam.abdulnabi@gmail.com
|
|
10
|
+
Requires-Python: >=3.12
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Topic :: Software Development
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Requires-Dist: numpy
|
|
16
|
+
Requires-Dist: pandas
|
|
17
|
+
Requires-Dist: poseigen_seaside
|
|
18
|
+
Project-URL: Homepage, https://github.com/husam94/poseigen_binmeths
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
Will do
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
poseigen_binmeths/__init__.py,sha256=GcDXBo2K_yq3xp9qTy1-Z1wJGC0D0VVIUlR1QSZL7Mg,32
|
|
2
|
+
poseigen_binmeths/poseigen_binmeths.py,sha256=1C9jK14B3hIRIZ-nQ2x4bNT4x4I9KGb86nilb8KlXJs,25843
|
|
3
|
+
poseigen_binmeths-0.0.1.dist-info/LICENSE.txt,sha256=k0BUnY6wRRyp1AeU0K8rK_DyvcwNc0dH1NJJOJsp1SA,1068
|
|
4
|
+
poseigen_binmeths-0.0.1.dist-info/METADATA,sha256=awpvbrVe05LkoIJXQ8cROgisknNhWJ_L752RnkRB1_o,646
|
|
5
|
+
poseigen_binmeths-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
6
|
+
poseigen_binmeths-0.0.1.dist-info/RECORD,,
|