poseigen-binmeths 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ Copyright 2025 Husam Abdulnabi
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.3
2
+ Name: poseigen_binmeths
3
+ Version: 0.0.1
4
+ Summary: The basics for Poseigen software
5
+ License: MIT
6
+ Author: Husam Abdulnabi
7
+ Author-email: husam.abdulnabi@gmail.com
8
+ Maintainer: Husam Abdulnabi
9
+ Maintainer-email: husam.abdulnabi@gmail.com
10
+ Requires-Python: >=3.12
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Topic :: Software Development
14
+ Classifier: Programming Language :: Python :: 3
15
+ Requires-Dist: numpy
16
+ Requires-Dist: pandas
17
+ Requires-Dist: poseigen_seaside
18
+ Project-URL: Homepage, https://github.com/husam94/poseigen_binmeths
19
+ Description-Content-Type: text/markdown
20
+
21
+ Will do
@@ -0,0 +1 @@
1
+ Will do
@@ -0,0 +1,57 @@
1
+
2
+
3
+ [build-system]
4
+
5
+ requires = ["setuptools",
6
+ "numpy",
7
+ "pandas",
8
+ "poseigen_seaside"
9
+ ]
10
+
11
+ [project]
12
+
13
+ name = "poseigen_binmeths"
14
+
15
+ version = "0.0.1"
16
+
17
+ requires-python = ">=3.12"
18
+
19
+ description = "The basics for Poseigen software"
20
+
21
+ readme = "README.md"
22
+
23
+ license = "MIT"
24
+
25
+ license-files = ["LICEN[CS]E.*"]
26
+
27
+ keywords = []
28
+
29
+ authors = [{ name = "Husam Abdulnabi", email = "husam.abdulnabi@gmail.com" }]
30
+
31
+ maintainers = [
32
+ { name = "Husam Abdulnabi", email = "husam.abdulnabi@gmail.com" },
33
+ ]
34
+
35
+ classifiers = [
36
+ # How mature is this project? Common values are
37
+ # 3 - Alpha
38
+ # 4 - Beta
39
+ # 5 - Production/Stable
40
+ "Development Status :: 3 - Alpha",
41
+
42
+ # Indicate who your project is intended for
43
+ "Intended Audience :: Developers",
44
+ "Topic :: Software Development",
45
+
46
+ "Programming Language :: Python :: 3",
47
+
48
+ ]
49
+
50
+ dependencies = [
51
+ "numpy",
52
+ "pandas",
53
+ "poseigen_seaside"
54
+ ]
55
+
56
+ [project.urls]
57
+ "Homepage" = "https://github.com/husam94/poseigen_binmeths"
@@ -0,0 +1 @@
1
+ from .poseigen_binmeths import *
@@ -0,0 +1,893 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import math
4
+ import os
5
+ import itertools
6
+
7
+ from collections import Counter
8
+
9
+ import poseigen_seaside.basics as se
10
+
11
+
12
def BinParser(inp, uni, categorical = False, multi = False):
    """Find, for every bin in ``uni``, the positions of ``inp`` that belong to it.

    Parameters
    ----------
    inp : array, or a sequence of arrays when ``multi`` is not False.
    uni : sequence of bins. Each bin is a single value when ``categorical``
        is truthy, otherwise an inclusive ``(low, high)`` pair.
    categorical : match by equality instead of by inclusive range.
    multi : when exactly ``False`` a single array is parsed and a single
        list is returned; otherwise one list per input array is returned.

    Returns
    -------
    A list (one entry per bin) of index arrays, or a list of such lists.
    """
    single = multi is False
    arrays = [inp] if single else inp

    def _members(arr, u):
        # Boolean membership mask for one bin, turned into positions.
        if categorical:
            mask = arr == u
        else:
            mask = np.logical_and(arr >= u[0], arr <= u[1])
        return np.where(mask)[0]

    parsed = [[_members(arr, u) for u in uni] for arr in arrays]

    return parsed[0] if single else parsed
25
+
26
+
27
def Binner(inp, uni = None):
    """Replace every value of ``inp`` by the index of the bin it falls in.

    This is Binner_v2 in the original binstuff.

    Parameters
    ----------
    inp : array-like of any shape (lists are converted to an array).
    uni : sequence of bins, forwarded to ``BinParser``. Single values for
        string data, inclusive ``(low, high)`` pairs otherwise. Despite the
        ``None`` default it must be provided: ``BinParser`` iterates over it.

    Returns
    -------
    Integer array with the same shape as ``inp``. Values matching no bin
    keep index 0; a value matching several bins gets the last matching one.
    """
    inp = np.asarray(inp)
    flat = inp.reshape(-1)

    # Look at a scalar element of the flattened data: for multi-dimensional
    # input ``inp[0]`` is a sub-array and would never be recognised as a string.
    categorical = flat.size > 0 and isinstance(flat[0], str)

    par = BinParser(flat, uni, categorical = categorical)

    binned = np.zeros(len(flat), dtype = int)
    for ip, p in enumerate(par):
        binned[p] = ip

    return binned.reshape(inp.shape)
46
+
47
def HistUni(inp, bins = 10):
    """Build the bin description for ``bins`` equal-width bins over ``inp``.

    ``bins + 1`` evenly spaced edges are laid between the minimum and the
    maximum of ``inp`` and handed to ``se.ListWindower`` with ``win_size = 2``
    (presumably yielding consecutive (low, high) edge pairs -- confirm
    against poseigen_seaside).
    """
    values = np.array(inp)
    lowest, highest = values.min(), values.max()
    edges = np.linspace(lowest, highest, bins + 1)
    return se.ListWindower(edges, win_size = 2)
52
+
53
+
54
def HistBinning(inp, bins = 10):
    """Bin ``inp`` into ``bins`` equal-width bins and return the bin indices.

    The bins come from ``HistUni`` and the assignment is done by ``Binner``.
    """
    equal_width_bins = HistUni(inp, bins = bins)
    binned = Binner(inp, uni = equal_width_bins)
    return binned
57
+
58
+
59
def HistWeights(x, bins = 1000, recip = False, interp = False):
    """Weight every value of ``x`` by the relative frequency of its histogram bin.

    Not the same as in the original.

    Parameters
    ----------
    x : array-like of values.
    bins : number of equal-width histogram bins.
    recip : when truthy the weights are inverted (``1 / w``) and rescaled to
        sum to 1. Zero weights are not guarded against.
    interp : when truthy the bin frequencies are linearly interpolated over
        an evenly spaced grid between the minimum and maximum instead of
        being looked up per bin. The result is then 1-D (flattened ``x``).

    Returns
    -------
    Array of weights: shape of ``x`` when ``interp`` is falsy, flattened
    otherwise.
    """
    x = np.array(x)

    counts, _ = np.histogram(x.reshape(-1), bins = bins)
    histoz_scaled = counts / counts.sum()

    if interp:
        # NOTE(review): expects se.MinMax to return (min, max); BinWeights uses
        # se.MiMa for the same purpose -- confirm which helper is intended.
        x_mi, x_ma = se.MinMax(x)

        oldr = np.linspace(x_mi, x_ma, len(histoz_scaled))
        weis = np.interp(x.reshape(-1), oldr, histoz_scaled)

    else:
        # The per-value bin index is only needed for the lookup branch, so the
        # (bins x n) binning pass is skipped entirely when interpolating.
        x_bind = HistBinning(x, bins = bins)

        weis = np.zeros(x.shape)
        for iz, z in enumerate(histoz_scaled):
            weis[x_bind == iz] = z

    if recip:
        weis = 1 / weis
        weis = weis / weis.sum()

    return weis
87
+
88
+
89
+
90
def Harpoon(bind, select,
            multi = False,
            repeat = 1, custidx = None):
    """Draw ``select`` observations spread as evenly as possible over the bins.

    ``bind`` is already binned (no binning happens here). With ``multi`` it
    has the format [LENGTH, TYPE]: observations as rows, one column per
    output, and each distinct row is a bin; otherwise it is a 1-D array.

    Parameters
    ----------
    bind : binned labels.
    select : number of observations to draw; a value below 1 is a fraction
        of ``len(bind)``.
    multi : treat each distinct row of ``bind`` as a bin.
    repeat : number of independent draws using the same per-bin quotas.
    custidx : optional custom indices; returned values are taken from it
        instead of being positions in ``bind``.

    Returns
    -------
    1-D index array when ``repeat == 1``, else an array of shape
    ``(repeat, n_selected)``.
    """
    if select < 1:
        select = np.round(len(bind) * select).astype(int)

    if multi:
        labels, remaining = np.unique(bind, axis = 0, return_counts = True)
        members = [np.where((bind == lab).all(axis = 1))[0] for lab in labels]
    else:
        labels, remaining = np.unique(bind, return_counts = True)
        members = [np.where(bind == lab)[0] for lab in labels]

    if custidx is not None:
        members = [np.array(custidx)[m] for m in members]

    n_labels = len(labels)
    quota = np.repeat(0, n_labels)

    # Hand out one slot at a time, visiting the bins in a fresh random order
    # on every pass, until ``select`` slots are assigned or the bins run dry.
    taken = 0
    for _ in range(select):
        for i in np.random.permutation(np.arange(n_labels)):
            if taken == select:
                break
            if remaining[i] > 0:
                quota[i] += 1
                remaining[i] -= 1
                taken += 1

    draws = []
    for _ in range(repeat):
        per_bin = [np.random.choice(m, replace = False, size = q)
                   for m, q in zip(members, quota)]
        draws.append(np.hstack(per_bin))
    idx = np.stack(draws, 0)

    return idx[0] if repeat == 1 else idx
131
+
132
+
133
def BinFunc(inp, bind, uni = None,
            mode = [np.mean, {}]):
    """Apply a summary function to the values of ``inp`` inside each bin.

    Does not support multi right now.

    Parameters
    ----------
    inp : array of values.
    bind : array of bin labels aligned with ``inp``.
    uni : bin labels to evaluate, in order; defaults to ``np.unique(bind)``.
    mode : ``[function, kwargs]``; the function is called on each bin's
        values with the keyword arguments.

    Returns
    -------
    List with one result per entry of ``uni``.
    """
    func, kwargs = mode[0], mode[1]
    labels = np.unique(bind) if uni is None else uni

    results = []
    for label in labels:
        results.append(func(inp[bind == label], **kwargs))

    return results
142
+
143
+
144
+
145
def BinsOfBins(bind, uni = None):
    """Collapse several bin columns into one label per distinct combination.

    ``bind`` has the format [LENGTH, TYPE]: one row per observation, one
    column per binning. ``uni`` is a list the length of TYPE giving, for each
    column, its possible values; the combinations are then the Cartesian
    product of those lists. Without ``uni`` the combinations are the distinct
    rows of ``bind``.

    Returns an integer array of length ``len(bind)`` holding the position of
    each row's combination. Rows matching no combination keep 0.
    """
    if uni is None:
        combos = np.unique(bind, axis = 0)
    else:
        combos = [list(c) for c in itertools.product(*uni)]

    row_masks = [(bind == combo).all(axis = 1) for combo in combos]

    newbind = np.repeat(0, len(bind))
    for code, mask in enumerate(row_masks):
        newbind[np.where(mask)[0]] = code

    return newbind
159
+
160
+
161
+
162
+
163
def BinCount(bind, uni = None, multi = False, prop = False):
    """Count how often each label of ``uni`` occurs.

    Parameters
    ----------
    bind : array-like of labels, or (when ``multi`` is not False) a list of
        such array-likes, which may have different lengths.
    uni : labels to count, shared by all inputs; the output columns follow
        this order. Defaults to the sorted distinct labels found in the data.
        Labels present in the data but absent from ``uni`` are not counted.
    multi : when exactly ``False`` a single input is counted and a 1-D array
        is returned; otherwise one row per input is returned.
    prop : when truthy the counts are divided by the grand total of all
        counts (for ``multi`` the total over every row, not per row).

    Returns
    -------
    Array of counts: shape ``(len(uni),)`` or ``(n_inputs, len(uni))``.
    """
    if multi is False: bind = [bind]
    arrays = [np.array(m).reshape(-1) for m in bind]

    hs = [Counter(a) for a in arrays]

    if uni is None:
        # Collect the labels from the counters so ragged inputs work too.
        uni = np.unique([key for d in hs for key in d])

    # One column per label of ``uni``; labels never seen count as 0.
    cnv = np.array([[d.get(u, 0) for u in uni] for d in hs])

    if multi is False: cnv = cnv[0]

    if prop: cnv = cnv / np.sum(cnv)

    return cnv
184
+
185
+
186
+
187
+ ################## OBJECTIVE BINNING ##################################
188
+
189
+
190
def FindScaleFactor(inp,
                    mode = [se.tanh, {}],
                    mode_rev = [se.revtanh, {}],

                    bins = 10,
                    maxrange = 0.3, minobs = None,
                    start = 1, increm = 0.1,
                    mima = None,
                    lowerbound = None, upperbound = None):
    """Search for the scale factor giving the most even transformed histogram.

    Scale factors ``start, start + increm, ...`` are tried in turn. For each,
    ``mode[0]`` is applied to ``factor * values``, the result is histogrammed
    into ``bins`` bins, and the bin edges are mapped back with ``mode_rev[0]``
    and divided by the factor. The search continues while the widest
    back-mapped bin (as a fraction of ``ma - mi``) is at most ``maxrange`` and
    the smallest bin count is at most ``minobs``. The factor whose bin counts
    have the smallest standard deviation is returned.

    Parameters
    ----------
    inp : array-like of values. It is not modified.
    mode, mode_rev : ``[function, kwargs]`` for the forward transform and its
        inverse.
    bins : number of histogram bins.
    maxrange : largest allowed bin width as a fraction of the total range;
        ``None`` means 1.
    minobs : bound on the smallest bin count; ``None`` means ``len(inp)``.
    start, increm : first scale factor and step.
    mima : ``None``, an int ``m`` (meaning the range ``(0, m)``), or a
        ``(min, max)`` pair whose ``None`` entries are taken from the data.
    lowerbound, upperbound : clipping limits applied before transforming;
        default to the range minimum and maximum.

    Returns
    -------
    The selected scale factor.
    """
    if mima is None: mi, ma = None, None
    elif isinstance(mima, int): mi, ma = 0, mima  # an int gives the upper end of a (0, mima) range
    else: mi, ma = mima

    if mi is None: mi = np.min(inp)
    if ma is None: ma = np.max(inp)

    lb = mi if lowerbound is None else lowerbound
    ub = ma if upperbound is None else upperbound

    # Work on a private copy: ``reshape`` alone may return a view, and the
    # clipping below would then write into the caller's array.
    inpx = np.array(inp).reshape(-1)
    inpx[inpx < lb] = lb
    inpx[inpx > ub] = ub

    totz = ma - mi

    if maxrange is None: maxrange = 1
    if minobs is None: minobs = len(inp)

    spreads, scalefactors = [], []
    lenz_max, mino_obs = 0, 0

    starto = start - increm

    while lenz_max <= maxrange and mino_obs <= minobs:

        starto = np.round(starto + increm, 5)

        liny = mode[0](starto * inpx, **mode[1])

        hist, bin_edges = np.histogram(liny, bins = bins, density=False)

        mino_obs = np.min(hist)

        # Bin edges expressed back in the original units.
        rt = mode_rev[0](bin_edges, **mode_rev[1]) / starto

        lenz_max = np.max((rt[1:] - rt[:-1]) / totz)

        # NOTE(review): the candidate that ends the loop is recorded as well,
        # so it takes part in the selection below.
        spreads.append(np.std(hist))
        scalefactors.append(starto)

    besto = np.argmin(spreads)

    return scalefactors[besto]
261
+
262
+
263
+
264
+
265
+
266
def FindNumBins(inp,
                mode = [se.tanh, {}],
                mode_rev = [se.revtanh, {}],

                maxrange = 0.3, minobs = 50,
                minbins = 5, maxbins = 1000,
                start = 1, increm = 0.1,
                mima = None, lowerbound = None, upperbound = None):
    """Find the largest bin count that can satisfy ``maxrange`` and ``minobs``.

    For every bin count in ``[minbins, maxbins)`` the scale factors
    ``np.arange(start, 100, increm)`` are tried: ``mode[0]`` is applied to
    ``factor * values``, the result is histogrammed, and the edges are mapped
    back with ``mode_rev[0]`` and divided by the factor. A bin count qualifies
    when some factor yields a widest back-mapped bin of at most ``maxrange``
    (as a fraction of ``ma - mi``) and a smallest bin count of at least
    ``minobs``. The scan over factors stops as soon as the width limit is
    exceeded. Progress is printed for every bin count.

    Parameters
    ----------
    inp : array-like of values. It is not modified.
    mode, mode_rev : ``[function, kwargs]`` for the transform and its inverse.
    maxrange, minobs : the two constraints described above.
    minbins, maxbins : range of bin counts to try (``maxbins`` excluded).
    start, increm : first scale factor and step.
    mima : ``None``, an int ``m`` (meaning the range ``(0, m)``), or a
        ``(min, max)`` pair whose ``None`` entries are taken from the data.
    lowerbound, upperbound : clipping limits; default to the range ends.

    Returns
    -------
    The largest qualifying bin count.

    Raises
    ------
    ValueError
        If no bin count in the range qualifies.
    """
    if mima is None: mi, ma = None, None
    elif isinstance(mima, int): mi, ma = 0, mima  # an int gives the upper end of a (0, mima) range
    else: mi, ma = mima

    if mi is None: mi = np.min(inp)
    if ma is None: ma = np.max(inp)

    lb = mi if lowerbound is None else lowerbound
    ub = ma if upperbound is None else upperbound

    totz = ma - mi

    # Private copy so that clipping never writes into the caller's array.
    inpx = np.array(inp).reshape(-1)
    inpx[inpx < lb] = lb
    inpx[inpx > ub] = ub

    maxb = None

    for bxo in np.arange(minbins, maxbins): #The maximum number of bins.

        gxzo = None

        print(f'TRYING: {bxo}')

        for incrz in np.arange(start, 100, increm):

            starto = incrz

            liny = mode[0](starto * inpx, **mode[1])

            hist, bin_edges = np.histogram(liny, bins = bxo, density=False)

            mino_obs = np.min(hist)

            rt = mode_rev[0](bin_edges, **mode_rev[1]) / starto

            lenz_max = np.max((rt[1:] - rt[:-1]) / totz)

            if lenz_max <= maxrange and mino_obs >= minobs:
                gxzo = starto
            elif lenz_max > maxrange:
                break

        if gxzo is not None:
            maxb = bxo

    if maxb is None:
        raise ValueError('no number of bins satisfies the maxrange and minobs constraints')

    return maxb
340
+
341
+
342
+
343
+
344
def ObjUni(inp, scalefactor,
           mode = [se.tanh, {}], mode_rev = [se.revtanh, {}],
           bins = 10,
           open_ends = False,
           mima = None, lowerbound = None, upperbound = None):
    """Build bins that are equal-width in a transformed space.

    ``mode[0]`` is applied to ``scalefactor * values``, the result is
    histogrammed into ``bins`` bins, and the bin edges are mapped back with
    ``mode_rev[0]`` and divided by ``scalefactor``. The first and last edge
    are then set to the range minimum and maximum, and the edges are passed
    to ``se.ListWindower`` with ``win_size = 2``.

    Parameters
    ----------
    inp : array-like of values. It is not modified.
    scalefactor : multiplier applied before the transform.
    mode, mode_rev : ``[function, kwargs]`` for the transform and its inverse.
    bins : number of bins.
    open_ends : accepted but not used by this function.
    mima : ``None``, an int ``m`` (meaning the range ``(0, m)``), or a
        ``(min, max)`` pair whose ``None`` entries are taken from the data.
    lowerbound, upperbound : clipping limits; default to the range ends.

    Returns
    -------
    Whatever ``se.ListWindower`` returns for the edge array.
    """
    if mima is None: mi, ma = None, None
    elif isinstance(mima, int): mi, ma = 0, mima  # an int gives the upper end of a (0, mima) range
    else: mi, ma = mima

    if mi is None: mi = np.min(inp)
    if ma is None: ma = np.max(inp)

    lb = mi if lowerbound is None else lowerbound
    ub = ma if upperbound is None else upperbound

    # Private copy so that clipping never writes into the caller's array.
    inpx = np.array(inp).reshape(-1)
    inpx[inpx < lb] = lb
    inpx[inpx > ub] = ub

    liny = mode[0](scalefactor * inpx, **mode[1])

    _, bin_edges = np.histogram(liny, bins = bins, density=False)

    rt = mode_rev[0](bin_edges, **mode_rev[1]) / scalefactor

    # Pin the outer edges to the requested range.
    rt[0] = mi
    rt[-1] = ma

    return se.ListWindower(rt, win_size = 2)
376
+
377
+
378
+
379
+
380
+
381
+
382
+
383
+
384
+
385
+
386
+
387
+ ###########################################################
388
+
389
+ ############# SPLIT TOOLS ####################
390
+
391
+
392
+
393
+
394
+
395
def SplitExpander(X1, X2, Split, ma = None):
    """Repeat every part of a split at regularly shifted offsets.

    Parameters
    ----------
    X1 : full length of the total split, as an int or as something with a
        length (modified to be able to expand subsplits).
    X2 : the multiplier, as an int or as something with a length. When it is
        larger than ``X1`` the number of copies is ``X2 // X1``, otherwise it
        is ``X2`` itself (June 1 mod).
    Split : list of index arrays.
    ma : offset between consecutive copies; defaults to the summed length of
        all parts.

    Returns
    -------
    List of index arrays, each part followed by its shifted copies.
    """
    base_len = X1 if isinstance(X1, int) else len(X1)

    stride = np.sum([len(part) for part in Split]) if ma is None else ma

    factor = X2 if isinstance(X2, int) else len(X2)
    copies = factor // base_len if factor > base_len else factor

    expanded = []
    for part in Split:
        shifted = [part + (stride * k) for k in range(copies)]
        expanded.append(np.hstack(shifted))

    return expanded
411
+
412
+
413
def SplitRepeater(split, reps):
    """Expand every observation index into ``reps`` consecutive indices.

    ``reps`` is the number of repeats: observation ``o`` becomes
    ``o*reps, ..., (o+1)*reps - 1``. Designed for btc days.

    Returns one index array per part of ``split``.
    """
    expanded = []
    for part in split:
        blocks = [np.arange(o * reps, (o + 1) * reps) for o in part]
        expanded.append(np.hstack(blocks))
    return expanded
419
+
420
+
421
+
422
def SplitData(data, split):
    """Index ``data`` with every part of ``split``; return the pieces as a list."""
    pieces = []
    for part in split:
        pieces.append(data[part])
    return pieces
423
+
424
def Data2Divided(inp, split):
    """Arrange several data sets split-first.

    ``inp`` is a list of data sets; each is indexed with every part of
    ``split``. The result has one entry per part, each holding that part's
    slice of every data set.
    """
    divided = []
    for part in split:
        divided.append([dataset[part] for dataset in inp])
    return divided
426
+
427
def Data2Grouped(inp, split = None):
    """Arrange data data-set-first ("grouped" format).

    Without a split (``None`` or ``False``) ``inp`` is assumed to be in divided
    format and its two outer levels are swapped; the inner length is taken
    from ``inp[0]``. For a Divided dictionary, select which split to use
    first, e.g. ``dict[0]``, as it is the same as divided format.

    With a split, every data set of ``inp`` is indexed with every part.
    """
    if split is None or split is False:
        n_outer = len(inp)
        n_inner = len(inp[0])
        return [[inp[n][m] for n in range(n_outer)] for m in range(n_inner)]

    return [[dataset[part] for part in split] for dataset in inp]
435
+
436
+
437
+
438
+
439
+
440
+
441
+
442
def Divided2NonDivided(divs):
    """Concatenate divided data and return the split that recovers the parts.

    ``divs`` is a list of arrays in the shape of (obs, features), or 1-D
    arrays of observations. 1-D parts are joined with ``np.hstack``, anything
    else with ``np.vstack`` (decided from the first part).

    Returns ``(stacks, split)`` where ``split[i]`` is the range of rows of
    ``stacks`` that came from ``divs[i]``.
    """
    one_dim = len(divs[0].shape) == 1
    stacks = np.hstack(divs) if one_dim else np.vstack(divs)

    bounds = np.cumsum([0] + [len(d) for d in divs])
    split = [np.arange(lo, hi) for lo, hi in zip(bounds[:-1], bounds[1:])]

    return stacks, split
451
+
452
def Idx2Split(X, idx, props = [0.7, 0.3]):
    """Build one split of ``range(X)`` positioned by ``idx``.

    Parameters
    ----------
    X : number of observations, as an int or as something with a length.
    idx : position passed to ``se.Cutter``; a value below 1 is a fraction of
        ``X`` and is converted with ``se.Round2Int``.
    props : proportion of ``X`` given to each part.

    Returns
    -------
    One index array per proportion. Each part is the leading
    ``Round2Int(X * p)`` entries of ``se.Cutter(se.Cutter(arange(X), start), idx)``
    where ``start`` is the summed size of the preceding parts (presumably
    ``Cutter`` rotates the array to begin at the given position -- confirm
    against poseigen_seaside).
    """
    total = X if isinstance(X, int) else len(X)
    offset = se.Round2Int(idx * total) if idx < 1 else idx

    positions = np.arange(total)
    sizes = [se.Round2Int(total * p) for p in props]
    starts = [se.Round2Int(np.sum(sizes[:r])) for r in range(len(sizes))]

    cut = [se.Cutter(se.Cutter(positions, begin), offset) for begin in starts]

    pieces = []
    for arranged, size in zip(cut, sizes):
        pieces.append(arranged[:size])
    return pieces
463
+
464
def UngroupSplits(Splits, group):
    """Translate splits of group labels into splits of observation positions.

    ``Splits`` is a list of splits; every part of a split lists group labels.
    ``group`` gives the group label of each observation. Each part is replaced
    by the positions of the observations whose group is in that part.
    """
    positions = np.arange(len(group))

    ungrouped = []
    for split in Splits:
        ungrouped.append([positions[np.isin(group, part)] for part in split])

    return ungrouped
471
+
472
+
473
+
474
def SplitGen_Random(X, num_splits = 1, proportions = [0.7, 0.3], group = None):
    """Generate random splits of the observations (or of the groups).

    Parameters
    ----------
    X : number of observations, as an int or as something with a length.
        Ignored for sizing when ``group`` is given.
    num_splits : number of independent splits to generate.
    proportions : share of the items given to each part. Part sizes come from
        ``se.Round2Int``; any shortfall against the total is added to the
        first part.
    group : the inp groups from OverlapGroup or wherever. When given, the
        distinct groups are split (as ``0 .. n_groups - 1``) and the result is
        mapped back to observation positions with ``UngroupSplits``.

    Returns
    -------
    A list of index arrays (one per proportion) when ``num_splits == 1``,
    otherwise a list of such lists.
    """
    if group is None:
        li = X if isinstance(X, int) else len(X)
    else:
        li = len(np.unique(group))

    #proportions = np.array(proportions) / np.sum(proportions)

    sizes = [se.Round2Int(li * p) for p in proportions]
    shortfall = li - np.sum(sizes)
    if shortfall > 0:
        sizes[0] = sizes[0] + shortfall

    ends = np.cumsum(sizes)
    pool = np.arange(li)

    Splits = []
    for _ in range(num_splits):
        shuffled = np.random.permutation(pool)
        Splits.append([shuffled[end - size:end] for size, end in zip(sizes, ends)])

    if group is not None:
        Splits = UngroupSplits(Splits, group)

    if num_splits == 1:
        Splits = Splits[0]

    return Splits
498
+
499
def SplitGen_Window(X, num_splits = 3,
                    proportions = [0.7, 0.3],
                    everyother = None, start_at = None, end_at = None,
                    random = True, group = None):
    """Generate splits made of windows over the observation (or group) order.

    Parameters
    ----------
    X : number of observations, as an int or as something with a length.
        Ignored for sizing when ``group`` is given.
    num_splits : number of window positions to use; ``None`` (with
        ``everyother`` also ``None``) keeps every position.
    proportions : share of the items given to each part; sizes come from
        ``se.Round2Int``.
    everyother : when given, every ``everyother``-th position is used and
        ``num_splits`` is ignored.
    start_at, end_at : accepted but not used by this function.
    random : when exactly ``True`` the positions are drawn at random without
        replacement, otherwise the first ``num_splits`` positions are used.
    group : optional group label per observation; the result is mapped back
        to observation positions with ``UngroupSplits``.

    Returns
    -------
    A list of splits (each a list with one entry per proportion), or a single
    split when exactly one position is used.
    """

    #if num_splits is None, it returns all possible splits

    # NOTE(review): SplitGen_Random sizes grouped input with
    # len(np.unique(group)); np.max(group) is one less than the number of
    # groups when they are labelled 0..k-1 -- confirm which is intended.
    if group is not None: li = np.max(group)
    else: li = X if isinstance(X, int) else len(X)

    #proportions = np.array(proportions) / np.sum(proportions)

    # ls: size of each part; lx: summed size of the parts preceding each part.
    ls = [se.Round2Int(li*p) for p in proportions]
    lx = [se.Round2Int(np.sum(ls[:r])) for r in range(len(ls))]

    idx = np.arange(li)
    # presumably the positions re-arranged to start at each part's offset -- TODO confirm se.Cutter
    idxs = [se.Cutter(idx, u) for u in lx]

    # presumably, per part, one window of that part's size for every start position -- TODO confirm
    contwin = [se.ContinuousWindower(idx, k) for k in ls] # GOOD

    if num_splits is not None or everyother is not None:
        if everyother is not None:
            rando = idx[::everyother]
            num_splits = None
        else:
            rando = np.random.choice(idx, num_splits, replace = False) if random is True else np.arange(num_splits)

        # Keep only the chosen positions, per part.
        idxs = [we[rando] for we in idxs]
        contwin = [coco[ra] for coco, ra in zip(contwin, idxs)]

    # From here on num_splits is the number of positions actually kept.
    num_splits = len(contwin[0])
    Splits = [[a[i] for a in contwin] for i in range(num_splits)]

    if group is not None: Splits = UngroupSplits(Splits, group)

    if num_splits == 1: Splits = Splits[0]

    return Splits
537
+
538
+
539
def SubSplitGen(inp, split, onlyfirst = False,
                num_subsamples = 1, proportion = 0.3, group = None,
                subsample_mode = [se.SubSample_Random, {}]):
    """Subsample every part of a split.

    Parameters
    ----------
    inp : data indexed by the parts of ``split``; ``inp[part]`` is handed to
        the subsampler.
    split : list of index arrays.
    onlyfirst : when truthy only the first part is subsampled; the other
        parts are kept whole.
    num_subsamples : number of subsamples to generate.
    proportion : a single number applied to every part, or one value per part.
    group : optional group labels, indexed like ``inp`` and forwarded to the
        subsampler as ``group``.
    subsample_mode : ``[function, kwargs]``. The function is called as
        ``function(data, proportion=..., num_subsamples=..., **kwargs)``.
        The kwargs dictionary is not modified.

    Returns
    -------
    A list with one index array per part (indices taken from ``split``) when
    ``num_subsamples == 1``, otherwise a list of such lists.
    """
    # Accept ints as well as floats as a single shared proportion.
    if isinstance(proportion, (int, float)):
        proportion = [proportion] * len(split)

    sampler = subsample_mode[0]
    # Build the keyword arguments locally instead of updating the caller's
    # (or the shared default) dictionary in place.
    sampler_kwargs = {**subsample_mode[1], 'num_subsamples': num_subsamples}

    if group is None:
        ss = [sampler(inp[s], proportion = p, **sampler_kwargs)
              for s, p in zip(split, proportion)]
    else:
        ss = [sampler(inp[s], group = group[s], proportion = p, **sampler_kwargs)
              for s, p in zip(split, proportion)]

    if onlyfirst:
        # Every later part keeps all of its positions.
        addon = [np.arange(len(s)) for s in split[1:]]
        if num_subsamples != 1: addon = [[s] * num_subsamples for s in addon]
        ss = [ss[0], *addon]

    if num_subsamples == 1: ss = [[j] for j in ss]

    # Positions within each part -> the part's own indices.
    ss = [[s[h] for h in j] for s, j in zip(split, ss)]

    # Regroup from part-first to subsample-first.
    ss = [[j[i] for j in ss] for i in range(num_subsamples)]

    if num_subsamples == 1: ss = ss[0]
    return ss
564
+
565
+
566
def SplitGen_Stratified(bind, num_splits = 1, proportions = [0.7, 0.3]):
    """Generate stratified random splits.

    ``bind`` is a 1d list of binned labels (does not do multi currently).
    Every bin is shuffled and divided according to ``proportions`` (sizes are
    the rounded products of proportion and bin count); the per-bin pieces of
    each part are then concatenated.

    Returns a list of index arrays (one per proportion) when
    ``num_splits == 1``, otherwise a list of such lists.
    """
    uni = np.sort(np.unique(bind))
    members = [np.where(bind == u)[0] for u in uni]

    counts = BinCount(bind, uni=uni, multi = False)
    quotas = [[np.round(p * c).astype(int) for p in proportions] for c in counts]

    n_parts = len(proportions)

    Splits = []
    for _ in range(num_splits):
        parts = [[] for _ in range(n_parts)]
        for bin_members, quota in zip(members, quotas):
            shuffled = np.random.permutation(bin_members)
            ends = np.cumsum(quota)
            for ip, (size, end) in enumerate(zip(quota, ends)):
                parts[ip].append(shuffled[end - size:end])
        Splits.append([np.hstack(pieces) for pieces in parts])

    if num_splits == 1:
        Splits = Splits[0]

    return Splits
589
+
590
def SubSample_Stratified(bind,
                         proportion = 0.3, num_subsamples = 1,
                         guarantee1 = False):
    """Draw stratified subsamples: the same share from every bin.

    Parameters
    ----------
    bind : 1d array-like of binned labels (does not do multi currently).
    proportion : share of each bin to draw; a value above 1 is read as a
        number of observations and divided by ``len(bind)``.
    num_subsamples : number of independent subsamples.
    guarantee1 : when truthy, a non-empty bin whose rounded share is 0 still
        contributes one observation.

    Returns
    -------
    An index array when ``num_subsamples == 1``, otherwise a list of them.
    """
    bind = np.asarray(bind)

    li = len(bind)
    if proportion > 1: proportion = proportion / li

    uni = np.unique(bind)

    idxs = [np.where(bind == u)[0] for u in uni]
    lidxs = np.array([len(x) for x in idxs])

    nums = np.round(lidxs * proportion).astype(int)
    # A bin can never give more than it holds: drawing without replacement
    # raises otherwise, so cap at the bin size (not at the total length).
    nums = [lx if n > lx else n for n, lx in zip(nums, lidxs)]
    if guarantee1: nums = [1 if n == 0 and lx > 0 else n for n, lx in zip(nums, lidxs)]

    ss = [np.hstack([np.random.choice(b, n, replace = False) for b, n in zip(idxs, nums)])
          for _ in range(num_subsamples)]

    if num_subsamples == 1: ss = ss[0]
    return ss
612
+
613
+
614
+ ###### AKIN SPLIT #############
615
+
616
+
617
def Akin_Scorer(bind, idxs,

                uni = None, multi = False,

                closest = False,
                onlyidx = None, p = 2, reciprocal = False, pseudo = True, weight_bymem = False,
                summarize_mode = [se.Epsilon, {}]):
    """Score candidate splits by the distance between their bin histograms.

    Parameters
    ----------
    bind : binned labels; with ``multi`` a sequence of such arrays, one per
        output.
    idxs : list of splits; each split is a list of 1D index arrays (or lists
        of indices) into the labels.
    uni : labels to count. When ``None`` it is taken from the first output
        processed and then reused for every later output.
    multi : when exactly ``False`` ``bind`` is a single output.
    closest : when truthy the histogram of the whole output is passed to
        ``se.NormalizedDistances`` as second argument, otherwise ``None``.
    onlyidx, p, reciprocal, pseudo, weight_bymem, summarize_mode : forwarded
        to ``se.NormalizedDistances``; ``summarize_mode`` also combines the
        per-output scores when there are several outputs.

    Returns
    -------
    1-D array with one score per split.
    """
    outputs = [bind] if multi is False else bind

    distance_kwargs = dict(onlyidx = onlyidx, p = p,
                           reciprocal = reciprocal, pseudo = pseudo,
                           weight_bymem = weight_bymem,
                           summarize_mode = summarize_mode)

    all_histo = []
    for bindx in outputs:

        if uni is None:
            uni = np.sort(np.unique(bindx))

        # Histogram of every part of every split.
        histograms = []
        for s in idxs:
            part_labels = [bindx[d].reshape(-1) for d in s]
            histograms.append(BinCount(part_labels, uni = uni, multi = True))

        reference = BinCount(bindx, uni = uni).reshape(-1) if closest else None

        scores = [se.NormalizedDistances(h, reference, **distance_kwargs)
                  for h in histograms]

        all_histo.append(scores)

    all_histo = np.stack(all_histo, 1) #ALL HISTO SHAPE: (# SPLITS, #OF SCORES)

    if all_histo.shape[1] > 1:
        summarize, summarize_kwargs = summarize_mode[0], summarize_mode[1]
        all_histo = np.array([summarize(sc, **summarize_kwargs) for sc in all_histo])
    else:
        all_histo = all_histo[:, 0]

    return all_histo #is a 1dim array
655
+
656
+
657
def SplitGen_Akin(bind,
                  proportions = [0.7, 0.3], num_splits = 1, rando = 100, window = False, group = None,

                  uni = None, multi = False,
                  closest = False,
                  onlyidx = None, p = 2, reciprocal = False, pseudo = True, weight_bymem = False,
                  summarize_mode = [se.Epsilon, {}],

                  atatime = 100, pathname = None, pickup = False):
    """Generate many candidate splits and keep the best-scoring ones.

    Candidates are produced by ``SplitGen_Window`` (``window`` truthy) or
    ``SplitGen_Random`` and scored with ``Akin_Scorer``; the ``num_splits``
    candidates with the lowest scores are kept.

    June 1 mod: candidates are generated in rounds of ``atatime`` so that many
    of them can be evaluated with less memory.

    Parameters
    ----------
    bind : binned labels; with ``multi`` an array whose transpose has one row
        per observation.
    proportions, group : forwarded to the split generator.
    num_splits : number of best splits to return.
    rando : total number of candidate splits.
    window : use windowed instead of random candidates.
    uni, multi, closest, onlyidx, p, reciprocal, pseudo, weight_bymem,
    summarize_mode : forwarded to ``Akin_Scorer``.
    atatime : number of candidates per round. A round with a single candidate
        is skipped.
    pathname : optional pickle path ('.p' is appended when missing). The
        current best result is written there after every round, together
        with a temporary checkpoint that is deleted on completion.
    pickup : resume from the checkpoint when one exists. Requires ``pathname``.

    Returns
    -------
    The best split when ``num_splits == 1``, otherwise the list of best splits.

    Raises
    ------
    ValueError
        If no candidate split was generated.
    """
    rounds = np.arange(0, rando, atatime).tolist() + [rando]

    temppath = None
    if pathname is not None:
        if pathname.endswith('.p'):
            temppath = pathname[:-2] + '_temp' + '.p'
        else:
            pathname = pathname + '.p'
            temppath = pathname + '_temp' + '.p'
    else:
        pickup = False

    if pickup and os.path.isfile(temppath):
        finround, past_splits, past_all_histo = se.PickleLoad(temppath)
    else: finround, past_splits, past_all_histo = 0, [], []

    lb = len(bind.T) if multi else len(bind)

    n_rounds = len(rounds) - 1

    for ir in range(finround, n_rounds):
        print(f'round {ir + 1} of {len(rounds[:-1])}')

        randoz = rounds[ir+1] - rounds[ir]
        if randoz > 1:

            SplitGen_args = {'num_splits': randoz, 'proportions': proportions, 'group': group}

            if window: Splits = SplitGen_Window(lb, random = True, **SplitGen_args)
            else: Splits = SplitGen_Random(lb, **SplitGen_args)

            all_histo = Akin_Scorer(bind, Splits,

                                    uni = uni, multi = multi,

                                    closest = closest,
                                    onlyidx = onlyidx, p = p, reciprocal = reciprocal, pseudo = pseudo, weight_bymem = weight_bymem,
                                    summarize_mode = summarize_mode)

            mino = np.argsort(all_histo)[:num_splits]

            past_splits.extend([Splits[m] for m in mino])
            past_all_histo.extend([all_histo[m] for m in mino])

            mino2 = np.argsort(past_all_histo)[:num_splits]
            past_splits = [past_splits[m] for m in mino2] #just keep the bests
            past_all_histo = [past_all_histo[m] for m in mino2]

            print(f'newbest:{past_all_histo[0]}')

            if pathname is not None:
                best_so_far = past_splits[0] if num_splits == 1 else past_splits
                se.PickleDump(best_so_far, pathname)
                # Store the index of the NEXT round so that a resumed run
                # does not repeat the round that just finished.
                se.PickleDump([ir + 1, past_splits, past_all_histo], temppath)

    if not past_splits:
        raise ValueError('no candidate splits were generated; rando and atatime must allow a round of at least 2 candidates')

    # Derived after the loop so it is also defined when a resumed run has no
    # rounds left to execute.
    Splits2 = past_splits[0] if num_splits == 1 else past_splits

    # The checkpoint is only useful while the search is running.
    if temppath is not None and os.path.isfile(temppath):
        os.remove(temppath)

    return Splits2
725
+
726
+
727
+ #####################################
728
+
729
+
730
def SubSample_Akin(bind,
                   proportion = 0.3, num_subsamples = 1, rando = 10, group = None,

                   uni = None, multi = False,
                   closest = False,
                   onlyidx = None, p = 2, reciprocal = False, pseudo = True, weight_bymem = False,
                   summarize_mode = [se.Epsilon, {}]):
    """Generate random subsamples and keep the best-scoring ones.

    ``rando`` candidate subsamples are drawn with ``se.SubSample_Random`` and
    scored with ``Akin_Scorer`` (each candidate as a one-part split, labels
    passed as ``bind.T``); the ``num_subsamples`` candidates with the lowest
    scores are kept.

    Parameters
    ----------
    bind : binned labels (an array; its transpose is what gets scored).
    proportion, group : forwarded to ``se.SubSample_Random``.
    num_subsamples : number of subsamples to return.
    rando : number of candidates to draw.
    uni, multi, closest, onlyidx, p, reciprocal, pseudo, weight_bymem,
    summarize_mode : forwarded to ``Akin_Scorer``.

    Returns
    -------
    The best subsample when ``num_subsamples == 1``, otherwise a list of them.
    """
    # ``se`` is already the basics module; the sampler is referenced the same
    # way as in SubSplitGen.
    ss = se.SubSample_Random(bind, proportion = proportion, num_subsamples = rando, group = group)
    ss_l = [[s] for s in ss]

    all_histo = Akin_Scorer(bind.T, ss_l,

                            uni = uni, multi = multi,

                            closest = closest,
                            onlyidx = onlyidx, p = p, reciprocal = reciprocal, pseudo = pseudo,
                            weight_bymem = weight_bymem,
                            summarize_mode = summarize_mode)

    mino = np.argsort(all_histo)[:num_subsamples]

    ss = [ss[m] for m in mino]
    if num_subsamples == 1: ss = ss[0]

    return ss
757
+
758
+
759
+
760
+
761
def SimpleStratifiedSplit(obs_bind, props = [0.5, 0.25, 0.25], prints = False):
    """Assign every observation to a part at random, bin by bin.

    Within each bin of ``obs_bind`` every observation independently picks a
    part with probabilities ``props`` (normalised to sum to 1), so part sizes
    follow the proportions only on average.

    Parameters
    ----------
    obs_bind : array of bin labels.
    props : relative weight of each part.
    prints : when truthy, print the per-part bin counts (via ``BinCount``).

    Returns
    -------
    List with one index array per part.
    """
    uni = np.unique(obs_bind)

    weights = np.array(props)
    weights = weights / np.sum(weights)

    part_ids = np.arange(len(weights))
    buckets = [[] for _ in part_ids]

    for label in uni:
        members = np.where(obs_bind == label)[0]
        assigned = np.random.choice(part_ids, size = len(members), p = weights, replace = True)
        for part in part_ids:
            buckets[part].append(members[assigned == part])

    splito = [np.concatenate(pieces) for pieces in buckets]

    if prints:
        print([BinCount(obs_bind[s], uni = uni) for s in splito])

    return splito
780
+
781
+
782
+
783
class BinWeights:
    """Per-bin weights derived from bin frequencies.

    ``fit`` computes one weight per bin label from the label counts and
    ``eval`` maps arrays of labels to those weights.

    Parameters
    ----------
    uni : labels to weight; when ``None`` the sorted distinct labels of each
        fitted output are used.
    multi : when exactly ``False`` a single output is handled, otherwise a
        sequence of outputs.
    minus : when truthy the weight is ``1 - proportion``, else
        ``1 / proportion``.
    newrange : ``True`` divides the weights by their maximum; a
        ``(new_min, new_max)`` tuple rescales them linearly to that range;
        any other value leaves them unchanged.
    """

    def __init__(self, uni = None, multi = False, minus = True, newrange = True):

        self.uni = uni
        self.multi = multi
        self.minus = minus
        self.newrange = newrange

    def fit(self, bind):
        """Compute the weights from ``bind`` and return ``self``.

        Sets ``all_weights`` and ``unis``: one weight array and one label
        array per output.
        """
        minus = self.minus
        newrange = self.newrange

        outputs = [bind] if self.multi is False else bind

        all_weights, unis = [], []
        for bindx in outputs:

            # Resolve the labels per output: a default derived from one
            # output must not leak into the next one.
            uni = np.sort(np.unique(bindx)) if self.uni is None else self.uni
            unis.append(uni)
            cnv = BinCount(bindx, uni = uni, multi = False)
            cnv_prop = cnv / np.sum(cnv)

            weights = 1 - cnv_prop if minus else 1 / cnv_prop

            if newrange is not None:
                mi, ma = se.MiMa(weights)
                if isinstance(newrange, tuple):

                    newmi, newma = newrange
                    weights = (((weights - mi) * (newma - newmi)) / (ma - mi)) + newmi

                elif newrange is True:
                    weights = weights / ma

            all_weights.append(weights)

        self.all_weights = all_weights
        self.unis = unis

        return self

    def eval(self, inp, dtype = np.float32):
        """Map labels to their fitted weights.

        ``inp`` is an array of labels (a sequence of arrays when ``multi``).
        Labels that were not fitted get weight 0. Returns an array of
        ``dtype`` shaped like ``inp``; for ``multi`` the per-output results
        are stacked.
        """
        outputs = inp if self.multi else None
        if self.multi is False: outputs = [inp]
        elif outputs is None: outputs = inp

        new_weights = []

        for inpx, weights, uni in zip(outputs, self.all_weights, self.unis):
            weix = np.zeros(inpx.shape, dtype = dtype)
            for u, w in zip(uni, weights):
                weix[inpx == u] = w
            new_weights.append(weix)

        new_weights = np.stack(new_weights) if self.multi else new_weights[0]

        return new_weights
842
+
843
+
844
def BinWeighter(bind, onlyidx = None, byaxis = None,
                uni = None, minus = True, newrange = True,
                dtype = np.float32):
    """Turn an array of bin labels into an array of bin weights.

    Parameters
    ----------
    bind : array-like of bin labels.
    onlyidx : optional index; when given the weights are fitted on
        ``labels[onlyidx]`` only but still evaluated for every label.
    byaxis : when given, the weights are fitted separately for each slice
        along this axis (axis 0 included) and the results are stacked back
        along it.
    uni, minus, newrange : forwarded to ``BinWeights``.
    dtype : dtype of the returned weights.

    Returns
    -------
    Array of weights with the shape of ``bind``.
    """
    bind = np.array(bind) #just in case

    origshape = bind.shape

    if byaxis is None:
        inps = [bind]
    else:
        inps = [bind.take(indices=a, axis=byaxis) for a in range(origshape[byaxis])]

    inpshape = inps[0].shape

    newei = []

    for j in inps:
        bine = j if onlyidx is None else j[onlyidx]

        bw = BinWeights(uni = uni, multi = False, minus = minus, newrange = newrange).fit(bind = bine.reshape(-1))
        wo = bw.eval(j.reshape(-1), dtype=dtype).reshape(inpshape)
        newei.append(wo)

    # Compare against None: axis 0 is a valid axis but is falsy.
    newei = np.stack(newei, axis = byaxis) if byaxis is not None else newei[0]

    return newei
868
+
869
+
870
+
871
+
872
+
873
+
874
+
875
+
876
+
877
+
878
+
879
+
880
+
881
+
882
+
883
+
884
+
885
+
886
+
887
+
888
+
889
+
890
+
891
+
892
+
893
+