data-manipulation-utilities 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/METADATA +179 -10
  2. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/RECORD +31 -19
  3. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/WHEEL +1 -1
  4. dmu/generic/hashing.py +44 -0
  5. dmu/generic/utilities.py +14 -1
  6. dmu/generic/version_management.py +3 -5
  7. dmu/ml/cv_diagnostics.py +221 -0
  8. dmu/ml/train_mva.py +143 -46
  9. dmu/pdataframe/utilities.py +36 -3
  10. dmu/plotting/fwhm.py +64 -0
  11. dmu/plotting/plotter.py +2 -0
  12. dmu/plotting/plotter_1d.py +87 -6
  13. dmu/stats/fitter.py +1 -1
  14. dmu/stats/minimizers.py +40 -11
  15. dmu/stats/model_factory.py +248 -44
  16. dmu/stats/zfit_models.py +68 -0
  17. dmu/stats/zfit_plotter.py +29 -21
  18. dmu/testing/utilities.py +31 -4
  19. dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
  20. dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
  21. dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
  22. dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
  23. dmu_data/ml/tests/train_mva.yaml +19 -10
  24. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +82 -0
  25. dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
  26. dmu_data/plotting/tests/plug_stats.yaml +19 -0
  27. dmu_data/plotting/tests/simple.yaml +4 -3
  28. dmu_data/plotting/tests/styling.yaml +11 -0
  29. {data_manipulation_utilities-0.2.5.data → data_manipulation_utilities-0.2.7.data}/scripts/publish +0 -0
  30. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/entry_points.txt +0 -0
  31. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/top_level.txt +0 -0
@@ -1,20 +1,25 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: data_manipulation_utilities
3
- Version: 0.2.5
3
+ Version: 0.2.7
4
4
  Description-Content-Type: text/markdown
5
5
  Requires-Dist: logzero
6
6
  Requires-Dist: PyYAML
7
7
  Requires-Dist: scipy
8
8
  Requires-Dist: awkward
9
9
  Requires-Dist: tqdm
10
- Requires-Dist: joblib
11
- Requires-Dist: scikit-learn
10
+ Requires-Dist: numpy
12
11
  Requires-Dist: toml
13
12
  Requires-Dist: numpy
14
13
  Requires-Dist: matplotlib
15
14
  Requires-Dist: mplhep
16
15
  Requires-Dist: hist[plot]
17
16
  Requires-Dist: pandas
17
+ Provides-Extra: fit
18
+ Requires-Dist: zfit; extra == "fit"
19
+ Requires-Dist: tensorflow==2.18.0; extra == "fit"
20
+ Provides-Extra: ml
21
+ Requires-Dist: scikit-learn; extra == "ml"
22
+ Requires-Dist: joblib; extra == "ml"
18
23
  Provides-Extra: dev
19
24
  Requires-Dist: pytest; extra == "dev"
20
25
 
@@ -51,6 +56,25 @@ Then, for each remote it pushes the tags and the commits.
51
56
 
52
57
  This section describes generic tools that could not be put in a specific category, but tend to be useful.
53
58
 
59
+ ## Hashing
60
+
61
+ The snippet below:
62
+
63
+ ```python
64
+ from dmu.generic import hashing
65
+
66
+ obj = [1, 'name', [1, 'sub', 'list'], {'x' : 1}]
67
+ val = hashing.hash_object(obj)
68
+ ```
69
+
70
+ will:
71
+
72
+ - Make the input object into a JSON string
73
+ - Encode it to utf-8
74
+ - Make a 64-character hash out of it
75
+
76
+ in two lines, thus keeping the user's code clean.
77
+
54
78
  ## Timer
55
79
 
56
80
  In order to benchmark functions do:
@@ -67,9 +91,9 @@ def fun():
67
91
  fun()
68
92
  ```
69
93
 
70
- ## JSON dumper
94
+ ## JSON dumper and loader
71
95
 
72
- The following lines will dump data (dictionaries, lists, etc) to a JSON file:
96
+ The following lines will dump data (dictionaries, lists, etc) to a JSON file and load it back:
73
97
 
74
98
  ```python
75
99
  import dmu.generic.utilities as gut
@@ -77,8 +101,11 @@ import dmu.generic.utilities as gut
77
101
  data = [1,2,3,4]
78
102
 
79
103
  gut.dump_json(data, '/tmp/list.json')
104
+ data = gut.load_json('/tmp/list.json')
80
105
  ```
81
106
 
107
+ and it's meant to allow the user to bypass all the boilerplate and keep their code brief.
108
+
82
109
  # Physics
83
110
 
84
111
  ## Truth matching
@@ -132,7 +159,8 @@ from dmu.stats.model_factory import ModelFactory
132
159
 
133
160
  l_pdf = ['cbr'] + 2 * ['cbl']
134
161
  l_shr = ['mu', 'sg']
135
- mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr)
162
+ d_fix = {'al_cbl' : 3, 'nr_cbr' : 1} # This is optional and will fix two parameters whose names start with the keys
163
+ mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr, d_fix=d_fix)
136
164
  pdf = mod.get_pdf()
137
165
  ```
138
166
 
@@ -145,10 +173,40 @@ pol1: Polynomial of degree 1
145
173
  pol2: Polynomial of degree 2
146
174
  cbr : CrystalBall with right tail
147
175
  cbl : CrystalBall with left tail
148
- gauss : Gaussian
176
+ gauss : Gaussian
149
177
  dscb : Double sided CrystalBall
150
178
  ```
151
179
 
180
+ ### Model building with reparametrizations
181
+
182
+ In order to introduce reparametrizations for the means and the resolutions, such that:
183
+
184
+ $\mu\to\mu+\Delta\mu$
185
+ $\sigma\to\sigma\cdot s_{\sigma}$
186
+
187
+ where the reparametrized $\mu$ and $\sigma$ are constant, while the scale and resolution are floating, do:
188
+
189
+ ```python
190
+ import zfit
191
+ from dmu.stats.model_factory import ModelFactory
192
+
193
+ l_shr = ['mu', 'sg']
194
+ l_flt = []
195
+ d_rep = {'mu' : 'scale', 'sg' : 'reso'}
196
+ obs = zfit.Space('mass', limits=(5080, 5680))
197
+
198
+ mod = ModelFactory(
199
+ preffix = name,
200
+ obs = obs,
201
+ l_pdf = l_name,
202
+ d_rep = d_rep,
203
+ l_shared= l_shr,
204
+ l_float = l_flt)
205
+ pdf = mod.get_pdf()
206
+ ```
207
+
208
+ Here, the floating parameters **should not** be the same as the reparametrized ones.
209
+
152
210
  ### Printing PDFs
153
211
 
154
212
  One can print a zfit PDF by doing:
@@ -427,7 +485,7 @@ rdf_bkg = _get_rdf(kind='bkg')
427
485
  cfg = _get_config()
428
486
 
429
487
  obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
430
- obj.run()
488
+ obj.run(skip_fit=False) # by default it will be false, if true, it will only make plots of features
431
489
  ```
432
490
 
433
491
  where the settings for the training go in a config dictionary, which when written to YAML looks like:
@@ -549,9 +607,61 @@ When evaluating the model with real data, problems might occur, we deal with the
549
607
  ```python
550
608
  model.cfg
551
609
  ```
552
- - For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
610
+ - For whatever features that are still NaN, they will be _patched_ with zeros when evaluated. However, the returned probabilities will be
553
611
  saved as -1. I.e. entries with NaNs will have probabilities of -1.
554
612
 
613
+ ## Diagnostics
614
+
615
+ To run diagnostics on the trained model do:
616
+
617
+ ```python
618
+ from dmu.ml.cv_diagnostics import CVDiagnostics
619
+
620
+ # Where l_model is the list of models and cfg is a dictionary with the config
621
+ cvd = CVDiagnostics(models=l_model, rdf=rdf, cfg=cfg)
622
+ cvd.run()
623
+ ```
624
+
625
+ the configuration can be loaded from a YAML file and would look like:
626
+
627
+ ```yaml
628
+ # Directory where plots will go
629
+ output : /tmp/tests/dmu/ml/cv_diagnostics/overlay
630
+ # Optional, will assume that the target is already in the input dataframe
631
+ # and will use it, instead of evaluating models
632
+ score_from_rdf : mva
633
+ correlations:
634
+ # Variables with respect to which the correlations with the features will be measured
635
+ target :
636
+ name : mass
637
+ overlay :
638
+ wp :
639
+ - 0.2
640
+ - 0.5
641
+ - 0.7
642
+ - 0.9
643
+ general:
644
+ size : [20, 10]
645
+ saving:
646
+ plt_dir : /tmp/tests/dmu/ml/cv_diagnostics/from_rdf
647
+ plots:
648
+ z :
649
+ binning : [1000, 4000, 30]
650
+ yscale : 'linear'
651
+ labels : ['mass', 'Entries']
652
+ normalized : true
653
+ styling :
654
+ linestyle: '-' # By default there is no line, just pointer
655
+ methods:
656
+ - Pearson
657
+ - Kendall-$\tau$
658
+ figure:
659
+ title: Scores from file
660
+ size : [10, 8]
661
+ xlabelsize: 18 # Controls size of x axis labels. By default 30
662
+ rotate : 60 # Will rotate xlabels by 60 degrees
663
+ ```
664
+
555
665
  # Pandas dataframes
556
666
 
557
667
  ## Utilities
@@ -582,6 +692,19 @@ put.df_to_tex(df,
582
692
  caption = 'some caption')
583
693
  ```
584
694
 
695
+ ### Dataframe to and from YAML
696
+
697
+ This extends the existing JSON functionality
698
+
699
+ ```python
700
+ import dmu.pdataframe.utilities as put
701
+
702
+ df_1 = _get_df()
703
+ put.to_yaml(df_1, yml_path)
704
+ df_2 = put.from_yaml(yml_path)
705
+ ```
706
+
707
+ and is meant to be less verbose than doing it through the YAML module.
585
708
  # Rdataframes
586
709
 
587
710
  These are utility functions meant to be used with ROOT dataframes.
@@ -707,6 +830,11 @@ plots:
707
830
  labels : ['x', 'Entries'] # Labels are optional, will use varname and Entries as labels if not present
708
831
  title : 'some title can be added for different variable plots'
709
832
  name : 'plot_of_x' # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
833
+ # Can add styling to specific plots, this should be the argument of
834
+ # hist.plot(...)
835
+ styling :
836
+ label : x
837
+ linestyle: '-'
710
838
  y :
711
839
  binning : [-5.0, 8.0, 40]
712
840
  yscale : 'linear'
@@ -730,6 +858,47 @@ stats:
730
858
 
731
859
  it's up to the user to build this dictionary and load it.
732
860
 
861
+ ### Plugins
862
+
863
+ Extra functionality can be `plugged` into the code by using the plugins section like:
864
+
865
+ #### FWHM
866
+ ```yaml
867
+ plugin:
868
+ fwhm:
869
+ # Can control each variable fit separately
870
+ x :
871
+ plot : true
872
+ obs : [-2, 4]
873
+ plot : true
874
+ format : FWHM={:.3f}
875
+ add_std: True
876
+ y :
877
+ plot : true
878
+ obs : [-4, 8]
879
+ plot : true
880
+ format : FWHM={:.3f}
881
+ add_std: True
882
+ ```
883
+
884
+ where the section will
885
+
886
+ - Use a KDE to fit the distribution and plot it on top of the histogram
887
+ - Add the value of the Full Width at Half Maximum in the title, for each distribution with a specific formatting.
888
+
889
+ #### stats
890
+
891
+ ```yaml
892
+ plugin:
893
+ stats:
894
+ x :
895
+ mean : $\mu$={:.2f}
896
+ rms : $\sigma$={:.2f}
897
+ sum : $\Sigma$={:.0f}
898
+ ```
899
+
900
+ Can be used to print statistics, mean, rms and weighted sum of entries for each distribution.
901
+
733
902
  ## 2D plots
734
903
 
735
904
  For the 2D case it would look like:
@@ -1,33 +1,42 @@
1
- data_manipulation_utilities-0.2.5.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
1
+ data_manipulation_utilities-0.2.7.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
2
2
  dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
3
- dmu/generic/utilities.py,sha256=0Xnq9t35wuebAqKxbyAiMk1ISB7IcXK4cFH25MT1fgw,1741
4
- dmu/generic/version_management.py,sha256=G_HjGY-hu8lotZuTdVAg0B8yD0AltE866q2vJxvTg1g,3749
3
+ dmu/generic/hashing.py,sha256=Mxwr88jyGuEP5Bhqtw1gDva9fNO-l11jb2qA0ceOysw,1167
4
+ dmu/generic/utilities.py,sha256=dBvrucgdM3vyyETharLvPAC7qEbgF9cL1mLVQ0nxNJM,2020
5
+ dmu/generic/version_management.py,sha256=j0ImlAq6SVNjTh3xRsF6G7DSoyr1w8kTRY84dNriGRE,3750
5
6
  dmu/logging/log_store.py,sha256=umdvjNDuV3LdezbG26b0AiyTglbvkxST19CQu9QATbA,4184
6
7
  dmu/ml/cv_classifier.py,sha256=ZbzEm_jW9yoTC7k_xBA7hFpc1bDNayiVR3tbaj1_ieE,4228
8
+ dmu/ml/cv_diagnostics.py,sha256=T6wkNcuLkdLrZ9z1YtAYMeb-M_bxnSChRHtdPeQv_To,7507
7
9
  dmu/ml/cv_predict.py,sha256=4wwYL_jcUExDqLJVfClxEUWSd_QAx8yKHO3rX-mx4vw,6711
8
- dmu/ml/train_mva.py,sha256=XzXE92PzyF3cjlx5yMhtp5h4t7wzisRAyO1fBArssvc,17282
10
+ dmu/ml/train_mva.py,sha256=tkdSODahXG9R77wmcckKFxYceMmc-3O-Y4pJXMe8WAI,21137
9
11
  dmu/ml/utilities.py,sha256=PK_61fW7gBV9aGZyez3PI8zAT7_Fc6IlQzDB7f8iBTM,4133
10
- dmu/pdataframe/utilities.py,sha256=ypvLiFfJ82ga94qlW3t5dXnvEFwYOXnbtJb2zHwsbqk,987
12
+ dmu/pdataframe/utilities.py,sha256=kwv9PuSmccz518e5bkrASacewCkMLTSPLtLiMH3olBc,2011
13
+ dmu/plotting/fwhm.py,sha256=Vn0TWlxgNMNk_wb2CD_At-ltVmzI49fyhoJXa0CT0I4,1924
11
14
  dmu/plotting/matrix.py,sha256=pXuUJn-LgOvrI9qGkZQw16BzLjOjeikYQ_ll2VIcIXU,4978
12
- dmu/plotting/plotter.py,sha256=3WRbNOrFBWgI3iW5TbEgT4w_eF7-XUPs_32JL1AW3yY,7359
13
- dmu/plotting/plotter_1d.py,sha256=2AnVxulyhKtwN-2Srhfm6fqdEREZNhcpJolBsJrWcsc,5745
15
+ dmu/plotting/plotter.py,sha256=f3hoZF1n3G-yjHsQ_2huxqz5cqFelZGs0RiaHYjF0JM,7391
16
+ dmu/plotting/plotter_1d.py,sha256=zygY6XUk4_wmaAK-bietjYdhkA03TZyft5IYX5acoYI,9142
14
17
  dmu/plotting/plotter_2d.py,sha256=mZhp3D5I-JodOnFTEF1NqHtcLtuI-2WNpCQsrsoXNtw,3017
15
18
  dmu/plotting/utilities.py,sha256=SI9dvtZq2gr-PXVz71KE4o0i09rZOKgqJKD1jzf6KXk,1167
16
19
  dmu/rdataframe/atr_mgr.py,sha256=FdhaQWVpsm4OOe1IRbm7rfrq8VenTNdORyI-lZ2Bs1M,2386
17
20
  dmu/rdataframe/utilities.py,sha256=pNcQARMP7txMhy6k27UnDcYf0buNy5U2fshaJDl_h8o,3661
18
21
  dmu/rfile/rfprinter.py,sha256=mp5jd-oCJAnuokbdmGyL9i6tK2lY72jEfROuBIZ_ums,3941
19
22
  dmu/rfile/utilities.py,sha256=XuYY7HuSBj46iSu3c60UYBHtI6KIPoJU_oofuhb-be0,945
20
- dmu/stats/fitter.py,sha256=vHNZ16U3apoQyeyM8evq-if49doF48sKB3q9wmA96Fw,18387
23
+ dmu/stats/fitter.py,sha256=gmSejQmoqsHmg6QPypbNb0MTfDFkSMy-DsDLKk1H3KE,18387
21
24
  dmu/stats/function.py,sha256=yzi_Fvp_ASsFzbWFivIf-comquy21WoeY7is6dgY0Go,9491
22
25
  dmu/stats/gof_calculator.py,sha256=4EN6OhULcztFvsAZ00rxgohJemnjtDNB5o0IBcv6kbk,4657
23
- dmu/stats/minimizers.py,sha256=f9cilFY9Kp9UvbSIUsKBGFzOOg7EEWZJLPod-4k-LAQ,6216
24
- dmu/stats/model_factory.py,sha256=ixWnhE8gPiOYW5pCb3eoVIaSvbUopEx4ldkZ3xL54Xg,7714
26
+ dmu/stats/minimizers.py,sha256=db9R2G0SOV-k0BKi6m4EyB_yp6AtZdP23_28B0315oo,7094
27
+ dmu/stats/model_factory.py,sha256=N0Q5R7GDfy2r_08xGduzydYO-ALvNTIRE1g0KKcRFyc,15872
25
28
  dmu/stats/utilities.py,sha256=LQy4kd3xSXqpApcWuYfZxkGQyjowaXv2Wr1c4Bj-4ys,4523
26
- dmu/stats/zfit_plotter.py,sha256=Xs6kisNEmNQXhYRCcjowxO6xHuyAyrfyQIFhGAR61U4,19719
27
- dmu/testing/utilities.py,sha256=moImLqGX9LAt5zJtE5j0gHHkUJ5kpbodryhiVswOsyM,3696
29
+ dmu/stats/zfit_models.py,sha256=SI61KJ-OG1UAabDICU1iTh6JPKM3giR2ErDraRjkCV8,1842
30
+ dmu/stats/zfit_plotter.py,sha256=bnkqFKyxhOEb5hVv4mxh4gpg-MrGeTrCh_5QKIT9hL0,19928
31
+ dmu/testing/utilities.py,sha256=c91g_PEzIZfS3Kxj2EOh05mc5ia7vFpZM1VEhjh4DA0,4661
28
32
  dmu/text/transformer.py,sha256=4lrGknbAWRm0-rxbvgzOO-eR1-9bkYk61boJUEV3cQ0,6100
29
33
  dmu_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- dmu_data/ml/tests/train_mva.yaml,sha256=o0ZIe43qPC-KwLT9y1qfYYw2bbOLlJIKRkCMUnM5sBo,1177
34
+ dmu_data/ml/tests/diagnostics_from_file.yaml,sha256=quvXOPkRducnBsctyape_Rn5_aqMEpPo6nO_UweMORo,404
35
+ dmu_data/ml/tests/diagnostics_from_model.yaml,sha256=rtCQlmGS9ld2xoQJEE35nA07yfRMklEfQEW0w3gRv2A,261
36
+ dmu_data/ml/tests/diagnostics_multiple_methods.yaml,sha256=w8Fpmr7kX1Jsb_h6LL2hiuYKf5lYpckFCpYKzWetbA0,265
37
+ dmu_data/ml/tests/diagnostics_overlay.yaml,sha256=ZVOsxLL8_JQtf41n8Ct-M9Ch10xBwHK54q1fttWPDlE,866
38
+ dmu_data/ml/tests/train_mva.yaml,sha256=AFUJV5yrXIdHHzA9TzfJv3RWhvk0GUB_CFubePCzfxE,1437
39
+ dmu_data/ml/tests/train_mva_with_diagnostics.yaml,sha256=ZyKU7W_mDGVvg8xP_wQX0NtO6N_tLSnKO1CyHAaYhuk,2036
31
40
  dmu_data/plotting/tests/2d.yaml,sha256=HSAtER-8CEqIGBY_jdcIdSVOHMfYPYhmgeZghTpVYh8,516
32
41
  dmu_data/plotting/tests/fig_size.yaml,sha256=7ROq49nwZ1A2EbPiySmu6n3G-Jq6YAOkc3d2X3YNZv0,294
33
42
  dmu_data/plotting/tests/high_stat.yaml,sha256=bLglBLCZK6ft0xMhQ5OltxE76cWsBMPMjO6GG0OkDr8,522
@@ -35,8 +44,11 @@ dmu_data/plotting/tests/legend.yaml,sha256=wGpj58ig-GOlqbWoN894zrCet2Fj9f5QtY0ri
35
44
  dmu_data/plotting/tests/name.yaml,sha256=mkcPAVg8wBAmlSbSRQ1bcaMl4vOS6LXMtpqQeDrrtO4,312
36
45
  dmu_data/plotting/tests/no_bounds.yaml,sha256=8e1QdphBjz-suDr857DoeUC2DXiy6SE-gvkORJQYv80,257
37
46
  dmu_data/plotting/tests/normalized.yaml,sha256=Y0eKtyV5pvlSxvqfsLjytYtv8xYF3HZ5WEdCJdeHGQI,193
38
- dmu_data/plotting/tests/simple.yaml,sha256=N_TvNBh_2dU0-VYgu_LMrtY0kV_hg2HxVuEoDlr1HX8,138
47
+ dmu_data/plotting/tests/plug_fwhm.yaml,sha256=xl5LXc9Nt66anM-HOXAxCtlaxWNM7zzIXf1Y6U8M4Wg,449
48
+ dmu_data/plotting/tests/plug_stats.yaml,sha256=ROO8soYXBbZIFYZcGngA_K5XHgIAFCmuAGfZCJgMmd0,384
49
+ dmu_data/plotting/tests/simple.yaml,sha256=Xc59Pjfb3BKMicLVBxODVqomHFupcb5GvefKbKHCQWQ,195
39
50
  dmu_data/plotting/tests/stats.yaml,sha256=fSZjoV-xPnukpCH2OAXsz_SNPjI113qzDg8Ln3spaaA,165
51
+ dmu_data/plotting/tests/styling.yaml,sha256=yUV75HewT2tFofS-5i-mU-3s4PSHcpZOZRn5R_iSRAw,218
40
52
  dmu_data/plotting/tests/title.yaml,sha256=bawKp9aGpeRrHzv69BOCbFX8sq9bb3Es9tdsPTE7jIk,333
41
53
  dmu_data/plotting/tests/weights.yaml,sha256=RWQ1KxbCq-uO62WJ2AoY4h5Umc37zG35s-TpKnNMABI,312
42
54
  dmu_data/text/transform.toml,sha256=R-832BZalzHZ6c5gD6jtT_Hj8BCsM5vxa1v6oeiwaP4,94
@@ -50,8 +62,8 @@ dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki
50
62
  dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
51
63
  dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
52
64
  dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
53
- data_manipulation_utilities-0.2.5.dist-info/METADATA,sha256=d8rJbrtHEg_fOma5NA5qL4ox8bP4MaIV0mbyl6uRiJs,30104
54
- data_manipulation_utilities-0.2.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
55
- data_manipulation_utilities-0.2.5.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
56
- data_manipulation_utilities-0.2.5.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
57
- data_manipulation_utilities-0.2.5.dist-info/RECORD,,
65
+ data_manipulation_utilities-0.2.7.dist-info/METADATA,sha256=aaUGi8PUMDuJCdslarv5Rn10iI-Jvbu4WkWbEOwmtzw,34464
66
+ data_manipulation_utilities-0.2.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
67
+ data_manipulation_utilities-0.2.7.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
68
+ data_manipulation_utilities-0.2.7.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
69
+ data_manipulation_utilities-0.2.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
dmu/generic/hashing.py ADDED
@@ -0,0 +1,44 @@
1
+ '''
2
+ Module with functions needed to provide hashes
3
+ '''
4
+
5
+ import json
6
+ import hashlib
7
+ from typing import Any
8
+
9
+ import pandas as pnd
10
+ from dmu.logging.log_store import LogStore
11
+
12
+ log=LogStore.add_logger('dmu:generic.hashing')
13
+ # ------------------------------------
14
+ def _object_to_string(obj : Any) -> str:
15
+ try:
16
+ string = json.dumps(obj)
17
+ except Exception as exc:
18
+ raise ValueError(f'Cannot hash object: {obj}') from exc
19
+
20
+ return string
21
+ # ------------------------------------
22
+ def _dataframe_to_hash(df : pnd.DataFrame) -> str:
23
+ sr_hash = pnd.util.hash_pandas_object(df, index=True)
24
+ values = sr_hash.values
25
+ hsh = hashlib.sha256(values)
26
+ hsh = hsh.hexdigest()
27
+
28
+ return hsh
29
+ # ------------------------------------
30
+ def hash_object(obj : Any) -> str:
31
+ '''
32
+ Function taking a python object and returning
33
+ a string representing the hash
34
+ '''
35
+
36
+ if isinstance(obj, pnd.DataFrame):
37
+ return _dataframe_to_hash(df=obj)
38
+
39
+ string = _object_to_string(obj=obj)
40
+ string_bin = string.encode('utf-8')
41
+ hsh = hashlib.sha256(string_bin)
42
+
43
+ return hsh.hexdigest()
44
+ # ------------------------------------
dmu/generic/utilities.py CHANGED
@@ -52,7 +52,7 @@ def timeit(f):
52
52
  return result
53
53
  return wrap
54
54
  # --------------------------------
55
- def dump_json(data, path : str, sort_keys : bool = False):
55
+ def dump_json(data, path : str, sort_keys : bool = False) -> None:
56
56
  '''
57
57
  Saves data as JSON
58
58
 
@@ -67,3 +67,16 @@ def dump_json(data, path : str, sort_keys : bool = False):
67
67
  with open(path, 'w', encoding='utf-8') as ofile:
68
68
  json.dump(data, ofile, indent=4, sort_keys=sort_keys)
69
69
  # --------------------------------
70
+ def load_json(path : str):
71
+ '''
72
+ Loads data from JSON
73
+
74
+ Parameters
75
+ path : Path to JSON file where data is saved
76
+ '''
77
+
78
+ with open(path, encoding='utf-8') as ofile:
79
+ data = json.load(ofile)
80
+
81
+ return data
82
+ # --------------------------------
@@ -42,8 +42,7 @@ def get_last_version(dir_path : str, version_only : bool = True, main_only : boo
42
42
  l_obj = glob.glob(f'{dir_path}/*')
43
43
 
44
44
  if len(l_obj) == 0:
45
- log.error(f'Nothing found in {dir_path}')
46
- raise ValueError
45
+ raise ValueError(f'Nothing found in {dir_path}')
47
46
 
48
47
  d_dir_org = { os.path.basename(obj).replace('.', '') : obj for obj in l_obj if os.path.isdir(obj) }
49
48
  d_dir_num = { _get_numeric_version(name) : dir_path for name, dir_path in d_dir_org.items() }
@@ -52,9 +51,8 @@ def get_last_version(dir_path : str, version_only : bool = True, main_only : boo
52
51
 
53
52
  try:
54
53
  _, path = c_dir[-1]
55
- except:
56
- log.error(f'Cannot find path in: {dir_path}')
57
- raise
54
+ except Exception as exc:
55
+ raise ValueError(f'Cannot find path in: {dir_path}') from exc
58
56
 
59
57
  name = os.path.basename(path)
60
58
  dirn = os.path.dirname(path)
@@ -0,0 +1,221 @@
1
+ '''
2
+ Module containing CVDiagnostics class
3
+ '''
4
+ import os
5
+
6
+ import numpy
7
+ import matplotlib
8
+ import matplotlib.pyplot as plt
9
+ import pandas as pnd
10
+
11
+ from scipy.stats import kendalltau
12
+ from ROOT import RDataFrame, RDF
13
+ from dmu.ml.cv_classifier import CVClassifier
14
+ from dmu.ml.cv_predict import CVPredict
15
+ from dmu.logging.log_store import LogStore
16
+ from dmu.plotting.plotter_1d import Plotter1D as Plotter
17
+
18
+ NPA = numpy.ndarray
19
+ Axis= matplotlib.axes._axes.Axes
20
+ log = LogStore.add_logger('dmu:ml:cv_diagnostics')
21
+ # -------------------------
22
+ class CVDiagnostics:
23
+ '''
24
+ Class meant to run diagnostics on classifier
25
+
26
+ Correlations
27
+ ------------------
28
+ Will calculate correlations between features + signal probability and some external target variable specified in the config
29
+ '''
30
+ # -------------------------
31
+ def __init__(self, models : list[CVClassifier], rdf : RDataFrame, cfg : dict):
32
+ self._l_model = models
33
+ self._cfg = cfg
34
+ self._rdf = rdf
35
+ self._target = cfg['correlations']['target']['name']
36
+ self._l_feat = self._get_features()
37
+ self._d_xlab = self._get_xlabels()
38
+ # -------------------------
39
+ def _get_features(self) -> list[str]:
40
+ cfg = self._l_model[0].cfg
41
+ l_var = cfg['training']['features']
42
+
43
+ return l_var
44
+ # -------------------------
45
+ def _get_xlabels(self) -> dict[str,str]:
46
+ cfg = self._l_model[0].cfg
47
+ d_var = cfg['plotting']['features']['plots']
48
+
49
+ d_lab = { varname : d_field['labels'][0] for varname, d_field in d_var.items() }
50
+
51
+ target= self._cfg['correlations']['target']['name']
52
+ if 'overlay' not in self._cfg['correlations']['target']:
53
+ xlabel = target
54
+ else:
55
+ xlabel= self._cfg['correlations']['target']['overlay']['plots'][target]['labels'][0]
56
+
57
+ d_lab[target] = xlabel
58
+ d_lab['score'] = 'score'
59
+
60
+ d_lab = { var_id : var_name.replace('MeV', '') for var_id, var_name in d_lab.items() }
61
+
62
+ return d_lab
63
+ # -------------------------
64
+ def _add_columns(self, rdf : RDataFrame) -> RDataFrame:
65
+ cfg = self._l_model[0].cfg
66
+ d_def = cfg['dataset']['define']
67
+ for var, expr in d_def.items():
68
+ rdf = rdf.Define(var, expr)
69
+
70
+ return rdf
71
+ # -------------------------
72
+ def _get_scores(self) -> NPA:
73
+ if 'score_from_rdf' not in self._cfg:
74
+ log.debug('Using score from model')
75
+ prd = CVPredict(models=self._l_model, rdf = self._rdf)
76
+
77
+ return prd.predict()
78
+
79
+ name = self._cfg['score_from_rdf']
80
+ log.debug(f'Picking up score from dataframe, column: {name}')
81
+ arr_score = self._rdf.AsNumpy([name])[name]
82
+
83
+ return arr_score
84
+ # -------------------------
85
+ def _get_arrays(self) -> dict[str, NPA]:
86
+ rdf = self._add_columns(self._rdf)
87
+ l_col = [ name.c_str() for name in rdf.GetColumnNames() ]
88
+
89
+ missing= False
90
+ l_var = self._l_feat + [self._target]
91
+ for var in l_var:
92
+ if var not in l_col:
93
+ log.error(f'{"Missing":<20}{var}')
94
+ missing=True
95
+
96
+ if missing:
97
+ raise ValueError('Columns missing')
98
+
99
+ d_var = rdf.AsNumpy(l_var)
100
+ d_var['score'] = self._get_scores()
101
+
102
+ return d_var
103
+ # -------------------------
104
+ def _run_correlations(self, method : str, ax : Axis) -> Axis:
105
+ d_arr = self._get_arrays()
106
+ arr_target = d_arr[self._target]
107
+
108
+ d_corr= {}
109
+ for name, arr_val in d_arr.items():
110
+ if name == self._target:
111
+ continue
112
+
113
+ d_corr[name] = self._calculate_correlations(var=arr_val, target=arr_target, method=method)
114
+
115
+ ax = self._plot_correlations(d_corr=d_corr, method=method, ax=ax)
116
+
117
+ return ax
118
+ # -------------------------
119
+ def _plot_correlations(self, d_corr : dict[str,float], method : str, ax : Axis) -> Axis:
120
+ df = pnd.DataFrame.from_dict(d_corr, orient="index", columns=[method])
121
+ df['variable'] = df.index.map(self._d_xlab)
122
+
123
+ figsize = self._cfg['correlations']['figure']['size']
124
+ ax = df.plot(x='variable', y=method,label=method, figsize=figsize, ax=ax)
125
+
126
+ # Needed to show all labels on x axis
127
+ plt.xticks(ticks=range(len(df)), labels=df.variable)
128
+ if 'xlabelsize' in self._cfg['correlations']['figure']:
129
+ xlabsize= self._cfg['correlations']['figure']['xlabelsize']
130
+ else:
131
+ xlabsize= 30
132
+
133
+ ax.tick_params(axis='x', labelsize=xlabsize)
134
+
135
+ return ax
136
+ # -------------------------
137
+ def _save_plot(self):
138
+ plot_dir = self._cfg['output']
139
+ os.makedirs(plot_dir, exist_ok=True)
140
+
141
+ plot_path = f'{plot_dir}/correlations.png'
142
+ log.info(f'Saving to: {plot_path}')
143
+
144
+ title = None
145
+ if 'title' in self._cfg['correlations']['figure']:
146
+ title = self._cfg['correlations']['figure']['title']
147
+
148
+ rotation=30
149
+ if 'rotate' in self._cfg['correlations']['figure']:
150
+ rotation = self._cfg['correlations']['figure']['rotate']
151
+
152
+ plt.ylim(-1, +1)
153
+ plt.title(title)
154
+ plt.xlabel('')
155
+ plt.ylabel('Correlation')
156
+ plt.grid()
157
+ plt.xticks(rotation=rotation)
158
+ plt.tight_layout()
159
+ plt.savefig(plot_path)
160
+ plt.close()
161
+ # -------------------------
162
+ def _remove_nans(self, var : NPA, tgt : NPA) -> tuple[NPA,NPA]:
163
+ arr_nan_var = numpy.isnan(var)
164
+ arr_nan_tgt = numpy.isnan(tgt)
165
+ arr_is_nan = numpy.logical_or(arr_nan_var, arr_nan_tgt)
166
+ arr_not_nan = numpy.logical_not(arr_is_nan)
167
+
168
+ var = var[arr_not_nan]
169
+ tgt = tgt[arr_not_nan]
170
+
171
+ return var, tgt
172
+ # -------------------------
173
+ def _calculate_correlations(self, var : NPA, target : NPA, method : str) -> float:
174
+ var, target = self._remove_nans(var, target)
175
+
176
+ if method == 'Pearson':
177
+ mat = numpy.corrcoef(var, target)
178
+
179
+ return mat[0,1]
180
+
181
+ if method == r'Kendall-$\tau$':
182
+ tau, _ = kendalltau(var, target)
183
+
184
+ return tau
185
+
186
+ raise NotImplementedError(f'Correlation coefficient {method} not implemented')
187
+ # -------------------------
188
+ def _plot_cutflow(self) -> None:
189
+ if 'overlay' not in self._cfg['correlations']['target']:
190
+ log.debug('Not plotting cutflow of target distribution')
191
+ return
192
+
193
+ arr_score = self._get_scores()
194
+ arr_target= self._rdf.AsNumpy([self._target])[self._target]
195
+ arr_wp = self._cfg['correlations']['target']['overlay']['wp']
196
+ rdf = RDF.FromNumpy({'Score' : arr_score, self._target : arr_target})
197
+
198
+ d_rdf = {}
199
+ for wp in arr_wp:
200
+ name = f'WP > {wp:.2}'
201
+ expr = f'Score > {wp:.3}'
202
+ d_rdf[name] = rdf.Filter(expr)
203
+
204
+ cfg_target = self._cfg['correlations']['target']['overlay']
205
+
206
+ ptr=Plotter(d_rdf=d_rdf, cfg=cfg_target)
207
+ ptr.run()
208
+ # -------------------------
209
+ def run(self) -> None:
210
+ '''
211
+ Runs diagnostics
212
+ '''
213
+ if 'correlations' in self._cfg:
214
+ ax = None
215
+ for method in self._cfg['correlations']['methods']:
216
+ ax = self._run_correlations(method=method, ax=ax)
217
+
218
+ self._save_plot()
219
+
220
+ self._plot_cutflow()
221
+ # -------------------------