data-manipulation-utilities 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: data_manipulation_utilities
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Description-Content-Type: text/markdown
5
5
  Requires-Dist: logzero
6
6
  Requires-Dist: PyYAML
7
7
  Requires-Dist: scipy
8
- Requires-Dist: awkward==2.4.6
8
+ Requires-Dist: awkward
9
9
  Requires-Dist: tqdm
10
10
  Requires-Dist: joblib
11
11
  Requires-Dist: scikit-learn
@@ -424,6 +424,10 @@ where the settings for the training go in a config dictionary, which when writte
424
424
 
425
425
  ```yaml
426
426
  dataset:
427
+ # Before training, new features can be defined as below
428
+ define :
429
+ x : v + w
430
+ y : v - w
427
431
  # If the key is found to be NaN, replace its value with the number provided
428
432
  # This will be used in the training.
429
433
  # Otherwise the entries with NaNs will be dropped
@@ -433,7 +437,7 @@ dataset:
433
437
  z : -999
434
438
  training :
435
439
  nfold : 10
436
- features : [w, x, y, z]
440
+ features : [x, y, z]
437
441
  hyper :
438
442
  loss : log_loss
439
443
  n_estimators : 100
@@ -493,7 +497,9 @@ When training on real data, several things might go wrong and the code will try
493
497
  will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
494
498
  entries will be removed before training.
495
499
 
496
- - **NaNs**: Entries with NaNs will break the training with the scikit GradientBoostClassifier base class. Thus, we also remove them from the training.
500
+ - **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
501
+ - Can use the `nan` section shown above to replace `NaN` values with something else
502
+ - For whatever remains we remove the entries from the training.
497
503
 
498
504
  ## Application
499
505
 
@@ -516,15 +522,24 @@ The picking process happens through the comparison of hashes between the samples
516
522
  The hashes of the training samples are stored in the pickled model itself; which therefore is a reimplementation of
517
523
  `GradientBoostClassifier`, here called `CVClassifier`.
518
524
 
519
- If a sample exist, that was used in the training of _every_ model, no model can be chosen for the prediction and an
525
+ If a sample exists that was used in the training of _every_ model, no model can be chosen for the prediction and a
520
526
  `CVSameData` exception will be raised.
521
527
 
528
+ During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
529
+ from that configuration and the user does not need to define extra columns.
530
+
522
531
  ### Caveats
523
532
 
524
533
  When evaluating the model with real data, problems might occur; we deal with them as follows:
525
534
 
526
535
  - **Repeated entries**: When there are repeated features in the dataset to be evaluated, we assign them the same probabilities; no filtering is used.
527
- - **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
536
+ - **NaNs**: Entries with NaNs will break the evaluation. These entries will be:
537
+ - Replaced by other values before evaluation IF a replacement was specified during training. The training configuration will be stored in the model
538
+ and can be accessed through:
539
+ ```python
540
+ model.cfg
541
+ ```
542
+ - Whatever entries are still NaN will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
528
543
  saved as -1. I.e. entries with NaNs will have probabilities of -1.
529
544
 
530
545
  # Pandas dataframes
@@ -1,10 +1,10 @@
1
- data_manipulation_utilities-0.2.1.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
1
+ data_manipulation_utilities-0.2.2.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
2
2
  dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
3
3
  dmu/generic/utilities.py,sha256=0Xnq9t35wuebAqKxbyAiMk1ISB7IcXK4cFH25MT1fgw,1741
4
4
  dmu/logging/log_store.py,sha256=umdvjNDuV3LdezbG26b0AiyTglbvkxST19CQu9QATbA,4184
5
5
  dmu/ml/cv_classifier.py,sha256=8Jwx6xMhJaRLktlRdq0tFl32v6t8i63KmpxrlnXlomU,3759
6
- dmu/ml/cv_predict.py,sha256=AhCsCnHWPWGIRVTdGS1NxA2m4yH7t2lV_OdALwQAcAE,4927
7
- dmu/ml/train_mva.py,sha256=xJCJZKaly4Mml7Dy-TWQxpB-VNftL7EjQ79QKxROWx0,16475
6
+ dmu/ml/cv_predict.py,sha256=4G7F_1yOvnLftsDC6zUpdvkxuHXGkPemhj0RsYySYDM,6708
7
+ dmu/ml/train_mva.py,sha256=SZ5cQHl7HBxn0c5Hh4HlN1aqMZaJUAlNmsfjnUSQrTg,16894
8
8
  dmu/ml/utilities.py,sha256=l348bufD95CuSYdIrHScQThIy2nKwGKXZn-FQg3CEwg,3930
9
9
  dmu/pdataframe/utilities.py,sha256=ypvLiFfJ82ga94qlW3t5dXnvEFwYOXnbtJb2zHwsbqk,987
10
10
  dmu/plotting/matrix.py,sha256=pXuUJn-LgOvrI9qGkZQw16BzLjOjeikYQ_ll2VIcIXU,4978
@@ -13,7 +13,7 @@ dmu/plotting/plotter_1d.py,sha256=g6H2xAgsL9a6vRkpbqHICb3qwV_qMiQPZxxw_oOSf9M,51
13
13
  dmu/plotting/plotter_2d.py,sha256=J-gKnagoHGfJFU7HBrhDFpGYH5Rxy0_zF5l8eE_7ZHE,2944
14
14
  dmu/plotting/utilities.py,sha256=SI9dvtZq2gr-PXVz71KE4o0i09rZOKgqJKD1jzf6KXk,1167
15
15
  dmu/rdataframe/atr_mgr.py,sha256=FdhaQWVpsm4OOe1IRbm7rfrq8VenTNdORyI-lZ2Bs1M,2386
16
- dmu/rdataframe/utilities.py,sha256=x8r379F2-vZPYzAdMFCn_V4Kx2Tx9t9pn_QHcZ1euew,2756
16
+ dmu/rdataframe/utilities.py,sha256=MDY3u_y0s-ANvHAWRzGyeuuZUKoaqilfmb8mqlgfrVc,2771
17
17
  dmu/rfile/rfprinter.py,sha256=mp5jd-oCJAnuokbdmGyL9i6tK2lY72jEfROuBIZ_ums,3941
18
18
  dmu/rfile/utilities.py,sha256=XuYY7HuSBj46iSu3c60UYBHtI6KIPoJU_oofuhb-be0,945
19
19
  dmu/stats/fitter.py,sha256=vHNZ16U3apoQyeyM8evq-if49doF48sKB3q9wmA96Fw,18387
@@ -47,8 +47,8 @@ dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki
47
47
  dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
48
48
  dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
49
49
  dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
50
- data_manipulation_utilities-0.2.1.dist-info/METADATA,sha256=ojD6P0bBj9GFohtPd7ULl7sDW80bIVD6JZ-bnNpHYmc,26649
51
- data_manipulation_utilities-0.2.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
52
- data_manipulation_utilities-0.2.1.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
53
- data_manipulation_utilities-0.2.1.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
54
- data_manipulation_utilities-0.2.1.dist-info/RECORD,,
50
+ data_manipulation_utilities-0.2.2.dist-info/METADATA,sha256=0QwhQmQML65qk2kaXf1znMZOVNuvaY3l35E7cXLRCZ8,27359
51
+ data_manipulation_utilities-0.2.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
52
+ data_manipulation_utilities-0.2.2.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
53
+ data_manipulation_utilities-0.2.2.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
54
+ data_manipulation_utilities-0.2.2.dist-info/RECORD,,
dmu/ml/cv_predict.py CHANGED
@@ -32,11 +32,56 @@ class CVPredict:
32
32
  if rdf is None:
33
33
  raise ValueError('No ROOT dataframe passed')
34
34
 
35
- self._l_model = models
36
- self._rdf = rdf
35
+ self._l_model = models
36
+ self._rdf = rdf
37
+ self._d_nan_rep : dict[str,str]
37
38
 
38
39
  self._arr_patch : numpy.ndarray
39
40
  # --------------------------------------------
41
+ def _initialize(self):
42
+ self._rdf = self._define_columns(self._rdf)
43
+ self._d_nan_rep = self._get_nan_replacements()
44
+ # --------------------------------------------
45
+ def _define_columns(self, rdf : RDataFrame) -> RDataFrame:
46
+ cfg = self._l_model[0].cfg
47
+
48
+ if 'define' not in cfg['dataset']:
49
+ log.debug('No define section found in config, will not define extra columns')
50
+ return self._rdf
51
+
52
+ d_def = cfg['dataset']['define']
53
+ log.debug(60 * '-')
54
+ log.info('Defining columns in RDF before evaluating classifier')
55
+ log.debug(60 * '-')
56
+ for name, expr in d_def.items():
57
+ log.debug(f'{name:<20}{"<---":20}{expr:<100}')
58
+ rdf = rdf.Define(name, expr)
59
+
60
+ return rdf
61
+ # --------------------------------------------
62
+ def _get_nan_replacements(self) -> dict[str,str]:
63
+ cfg = self._l_model[0].cfg
64
+
65
+ if 'nan' not in cfg['dataset']:
66
+ log.debug('No define section found in config, will not define extra columns')
67
+ return {}
68
+
69
+ return cfg['dataset']['nan']
70
+ # --------------------------------------------
71
+ def _replace_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
72
+ if len(self._d_nan_rep) == 0:
73
+ log.debug('Not doing any NaN replacement')
74
+ return df
75
+
76
+ log.debug(60 * '-')
77
+ log.info('Doing NaN replacements')
78
+ log.debug(60 * '-')
79
+ for var, val in self._d_nan_rep.items():
80
+ log.debug(f'{var:<20}{"--->":20}{val:<20.3f}')
81
+ df[var] = df[var].fillna(val)
82
+
83
+ return df
84
+ # --------------------------------------------
40
85
  def _get_df(self):
41
86
  '''
42
87
  Will make ROOT rdf into dataframe and return it
@@ -45,6 +90,7 @@ class CVPredict:
45
90
  l_ft = model.features
46
91
  d_data= self._rdf.AsNumpy(l_ft)
47
92
  df_ft = pnd.DataFrame(d_data)
93
+ df_ft = self._replace_nans(df_ft)
48
94
  df_ft = ut.patch_and_tag(df_ft)
49
95
 
50
96
  if 'patched_indices' in df_ft.attrs:
@@ -136,6 +182,8 @@ class CVPredict:
136
182
  '''
137
183
  Will return array of prediction probabilities for the signal category
138
184
  '''
185
+ self._initialize()
186
+
139
187
  df_ft = self._get_df()
140
188
  model = self._l_model[0]
141
189
 
dmu/ml/train_mva.py CHANGED
@@ -26,7 +26,7 @@ from dmu.plotting.matrix import MatrixPlotter
26
26
  from dmu.logging.log_store import LogStore
27
27
 
28
28
  npa = numpy.ndarray
29
- log = LogStore.add_logger('data_checks:train_mva')
29
+ log = LogStore.add_logger('dmu:ml:train_mva')
30
30
  # ---------------------------------------------
31
31
  class TrainMva:
32
32
  '''
@@ -334,10 +334,10 @@ class TrainMva:
334
334
  if 'max' in self._cfg['plotting']['roc']:
335
335
  [max_x, max_y] = self._cfg['plotting']['roc']['max']
336
336
 
337
- self._plot_probabilities(xval_ts, yval_ts, l_prb_ts)
338
-
339
337
  plt.plot(xval_ts, yval_ts, color='b', label=f'Test: {area_ts:.3f}')
340
338
  plt.plot(xval_tr, yval_tr, color='r', label=f'Train: {area_tr:.3f}')
339
+ self._plot_probabilities(xval_ts, yval_ts, l_prb_ts, l_lab_ts)
340
+
341
341
  plt.xlabel('Signal efficiency')
342
342
  plt.ylabel('Background rejection')
343
343
  plt.title(f'Fold: {ifold}')
@@ -351,13 +351,17 @@ class TrainMva:
351
351
  def _plot_probabilities(self,
352
352
  arr_seff: npa,
353
353
  arr_brej: npa,
354
- arr_sprb: npa) -> None:
354
+ arr_sprb: npa,
355
+ arr_labl: npa) -> None:
355
356
 
356
357
  roc_cfg = self._cfg['plotting']['roc']
357
358
  if 'annotate' not in roc_cfg:
358
359
  log.debug('Annotation section in the ROC curve config not found, skipping annotation')
359
360
  return
360
361
 
362
+ l_sprb = [ sprb for sprb, labl in zip(arr_sprb, arr_labl) if labl == 1 ]
363
+ arr_sprb = numpy.array(l_sprb)
364
+
361
365
  plt_cfg = roc_cfg['annotate']
362
366
  if 'sig_eff' not in plt_cfg:
363
367
  l_seff_target = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
@@ -366,17 +370,24 @@ class TrainMva:
366
370
  del plt_cfg['sig_eff']
367
371
 
368
372
  arr_seff_target = numpy.array(l_seff_target)
373
+ arr_quantile = 1 - arr_seff_target
369
374
 
370
- l_score = numpy.quantile(arr_sprb, 1 - arr_seff_target)
375
+ l_score = numpy.quantile(arr_sprb, arr_quantile)
371
376
  l_seff = []
372
377
  l_brej = []
373
- for seff_target in l_seff_target:
378
+
379
+ log.debug(60 * '-')
380
+ log.debug(f'{"SigEff":20}{"BkgRej":20}{"Score":20}')
381
+ log.debug(60 * '-')
382
+ for seff_target, score in zip(arr_seff_target, l_score):
374
383
  arr_diff = numpy.abs(arr_seff - seff_target)
375
384
  ind = numpy.argmin(arr_diff)
376
385
 
377
386
  seff = arr_seff[ind]
378
387
  brej = arr_brej[ind]
379
388
 
389
+ log.debug(f'{seff:<20.3f}{brej:<20.3f}{score:<20.2f}')
390
+
380
391
  l_seff.append(seff)
381
392
  l_brej.append(brej)
382
393
 
@@ -66,7 +66,7 @@ def add_column(rdf : RDataFrame, arr_val : Union[numpy.ndarray,None], name : str
66
66
  if arr_val.dtype == 'object':
67
67
  arr_val = arr_val.astype(float)
68
68
 
69
- d_data[name] = arr_val
69
+ d_data[name] = ak.from_numpy(arr_val)
70
70
 
71
71
  rdf = ak.to_rdataframe(d_data)
72
72