data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +641 -44
  2. data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
  3. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
  4. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
  5. dmu/__init__.py +0 -0
  6. dmu/generic/hashing.py +34 -8
  7. dmu/generic/utilities.py +164 -11
  8. dmu/logging/log_store.py +34 -2
  9. dmu/logging/messages.py +96 -0
  10. dmu/ml/cv_classifier.py +3 -3
  11. dmu/ml/cv_diagnostics.py +3 -0
  12. dmu/ml/cv_performance.py +58 -0
  13. dmu/ml/cv_predict.py +149 -46
  14. dmu/ml/train_mva.py +482 -100
  15. dmu/ml/utilities.py +29 -10
  16. dmu/pdataframe/utilities.py +28 -3
  17. dmu/plotting/fwhm.py +2 -2
  18. dmu/plotting/matrix.py +1 -1
  19. dmu/plotting/plotter.py +23 -3
  20. dmu/plotting/plotter_1d.py +96 -32
  21. dmu/plotting/plotter_2d.py +5 -0
  22. dmu/rdataframe/utilities.py +54 -3
  23. dmu/rfile/ddfgetter.py +102 -0
  24. dmu/stats/fit_stats.py +129 -0
  25. dmu/stats/fitter.py +55 -22
  26. dmu/stats/gof_calculator.py +7 -0
  27. dmu/stats/model_factory.py +153 -62
  28. dmu/stats/parameters.py +100 -0
  29. dmu/stats/utilities.py +443 -12
  30. dmu/stats/wdata.py +187 -0
  31. dmu/stats/zfit.py +17 -0
  32. dmu/stats/zfit_plotter.py +147 -36
  33. dmu/testing/utilities.py +102 -24
  34. dmu/workflow/__init__.py +0 -0
  35. dmu/workflow/cache.py +266 -0
  36. dmu_data/ml/tests/train_mva.yaml +9 -7
  37. dmu_data/ml/tests/train_mva_def.yaml +75 -0
  38. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +10 -5
  39. dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
  40. dmu_data/plotting/tests/2d.yaml +5 -5
  41. dmu_data/plotting/tests/line.yaml +15 -0
  42. dmu_data/plotting/tests/styling.yaml +8 -1
  43. dmu_data/rfile/friends.yaml +13 -0
  44. dmu_data/stats/fitter/test_simple.yaml +28 -0
  45. dmu_data/stats/kde_optimizer/control.json +1 -0
  46. dmu_data/stats/kde_optimizer/signal.json +1 -0
  47. dmu_data/stats/parameters/data.yaml +178 -0
  48. dmu_data/tests/config.json +6 -0
  49. dmu_data/tests/config.yaml +4 -0
  50. dmu_data/tests/pdf_to_tex.txt +34 -0
  51. dmu_scripts/kerberos/check_expiration +21 -0
  52. dmu_scripts/kerberos/convert_certificate +22 -0
  53. dmu_scripts/ml/compare_classifiers.py +85 -0
  54. data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
  55. {data_manipulation_utilities-0.2.7.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
  56. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0
data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD ADDED
@@ -0,0 +1,93 @@
1
+ data_manipulation_utilities-0.2.8.dev714.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
2
+ dmu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
4
+ dmu/generic/hashing.py,sha256=QR5Gbv6-ANvi5hL232UNMrw9DONpU27BWTynXGxQLGU,1806
5
+ dmu/generic/utilities.py,sha256=0tT93vF_x0q8STRrTD0GvBEpALz-mqE-vJyen4zWCO8,6861
6
+ dmu/generic/version_management.py,sha256=j0ImlAq6SVNjTh3xRsF6G7DSoyr1w8kTRY84dNriGRE,3750
7
+ dmu/logging/log_store.py,sha256=eRSy8Y4fuiDFJK02Z6fq67XQzOrhQ7GMr2LvvJQbJ40,5172
8
+ dmu/logging/messages.py,sha256=Oj3O5EO2KOPtffyVq2P7RPzjpoXtxZ6yXO5HwTftVcM,2903
9
+ dmu/ml/cv_classifier.py,sha256=6rjezMahwL-WzLGKU-fzMzNxJZAGbM7YAbhaZVcJ3F0,4258
10
+ dmu/ml/cv_diagnostics.py,sha256=PLh41mSVE8Kagp9KcuRDN_7tDL9MjPxQzuewY8jDnNo,7600
11
+ dmu/ml/cv_performance.py,sha256=q9sLxIx7GP-dand3tnhHCBJnT6xqssNdRYv_TVjYWUM,1910
12
+ dmu/ml/cv_predict.py,sha256=0sc_OqwOewKvipcMyi3QqkgG30nkpZZjE-SOhHWHMd0,10778
13
+ dmu/ml/train_mva.py,sha256=7KAFX_zOx8MGbYx62U81JbdBkrZvqclSSkgmYvWX-60,34861
14
+ dmu/ml/utilities.py,sha256=A9j3tBh-jfaFdwwLUleo1QnttfawN7XDiQRh4VTvqVY,4597
15
+ dmu/pdataframe/utilities.py,sha256=xl6iLVKUccqVXYjuHsDUZ6UrCKQPw1k8D-f6407Yq30,2742
16
+ dmu/plotting/fwhm.py,sha256=4e8n6624pxWLcOOtayCQ_hDSSMKU21-3UsdmbkX1ojk,1949
17
+ dmu/plotting/matrix.py,sha256=s_5W8O3yXF3u8OX3f4J4hCoxIVZt1TF8S-qJsFBh2Go,5005
18
+ dmu/plotting/plotter.py,sha256=oc_n9ug0JPaQZycrW_TJkgNxjr0LHNrVJcijqmiLUR4,8136
19
+ dmu/plotting/plotter_1d.py,sha256=Kyoyh-QyZLXXqX19wqEDUWCD1nJEvEonGp9nlgEaoZE,10936
20
+ dmu/plotting/plotter_2d.py,sha256=dXC-7Rsquibe5cn7622ryoKpuv7KCAmouIIXwQ_VEFM,3172
21
+ dmu/plotting/utilities.py,sha256=SI9dvtZq2gr-PXVz71KE4o0i09rZOKgqJKD1jzf6KXk,1167
22
+ dmu/rdataframe/atr_mgr.py,sha256=FdhaQWVpsm4OOe1IRbm7rfrq8VenTNdORyI-lZ2Bs1M,2386
23
+ dmu/rdataframe/utilities.py,sha256=cY1Na8HbJ7kB2dwmBagRdsRyCA4ZT_vyIU86ewREj2Y,5322
24
+ dmu/rfile/ddfgetter.py,sha256=0jfNzpv72_NQUKOK5SBsn289rUqVt2BMvuL-Ro5oY7I,3316
25
+ dmu/rfile/rfprinter.py,sha256=mp5jd-oCJAnuokbdmGyL9i6tK2lY72jEfROuBIZ_ums,3941
26
+ dmu/rfile/utilities.py,sha256=XuYY7HuSBj46iSu3c60UYBHtI6KIPoJU_oofuhb-be0,945
27
+ dmu/stats/fit_stats.py,sha256=wzkQT9U32ljGe4azUj1Fj0ECF3zmnH2Ncn0O-_Pl1zQ,4070
28
+ dmu/stats/fitter.py,sha256=rm_fwjkq-0LSjXB_gt3y6BnHoK8Xvd4gHYwKBUJaItQ,19603
29
+ dmu/stats/function.py,sha256=yzi_Fvp_ASsFzbWFivIf-comquy21WoeY7is6dgY0Go,9491
30
+ dmu/stats/gof_calculator.py,sha256=63zNJJGKPy-j_hPNPfu9qNlhrHjYIgJOyL8-VDtbwuI,4894
31
+ dmu/stats/minimizers.py,sha256=db9R2G0SOV-k0BKi6m4EyB_yp6AtZdP23_28B0315oo,7094
32
+ dmu/stats/model_factory.py,sha256=0_o5OmiX0cNhp9_cNqBOYfasBgKlQkQPiy5nqi9qQKA,18966
33
+ dmu/stats/parameters.py,sha256=9lycexTT5ZcxXciiQY9HoJV8O1ahrTEkagd7dYXcfj8,3224
34
+ dmu/stats/utilities.py,sha256=7_tr1j-dl3lLNpxIMWruZs4yUtlNuUTknwGMERpfLhs,17338
35
+ dmu/stats/wdata.py,sha256=IbjZFU9SHTLSYfaBgqamDvqy1K7-3-SaKbU4bGsamK0,6799
36
+ dmu/stats/zfit.py,sha256=aSZj_4IHi9IBthfqlNJeA8YSoMmXO5WipgiKnXKGbnM,286
37
+ dmu/stats/zfit_models.py,sha256=SI61KJ-OG1UAabDICU1iTh6JPKM3giR2ErDraRjkCV8,1842
38
+ dmu/stats/zfit_plotter.py,sha256=gbN5KxhJcP4ItCi98c-fj5_UtvVWL_NA9jkTHiRjvnE,23854
39
+ dmu/testing/utilities.py,sha256=WYlz7Ve5lQjuWhhNL4gWe6_qcByBLV762Lhrc6A0P9E,7421
40
+ dmu/text/transformer.py,sha256=4lrGknbAWRm0-rxbvgzOO-eR1-9bkYk61boJUEV3cQ0,6100
41
+ dmu/workflow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
+ dmu/workflow/cache.py,sha256=CtkGwxuF4UJlD55SmUJcRgWYLsbZOyUvYLI8oTVzk_g,8768
43
+ dmu_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
+ dmu_data/ml/tests/diagnostics_from_file.yaml,sha256=quvXOPkRducnBsctyape_Rn5_aqMEpPo6nO_UweMORo,404
45
+ dmu_data/ml/tests/diagnostics_from_model.yaml,sha256=rtCQlmGS9ld2xoQJEE35nA07yfRMklEfQEW0w3gRv2A,261
46
+ dmu_data/ml/tests/diagnostics_multiple_methods.yaml,sha256=w8Fpmr7kX1Jsb_h6LL2hiuYKf5lYpckFCpYKzWetbA0,265
47
+ dmu_data/ml/tests/diagnostics_overlay.yaml,sha256=ZVOsxLL8_JQtf41n8Ct-M9Ch10xBwHK54q1fttWPDlE,866
48
+ dmu_data/ml/tests/train_mva.yaml,sha256=KArbTkaj6FqerrUhlkgyBde_4DfkpVza6kCMgMQPi9g,1388
49
+ dmu_data/ml/tests/train_mva_def.yaml,sha256=UyPMo-9nshoB8BHxm9E6S0xd9ngRARdgUq6vnuMlhwI,1765
50
+ dmu_data/ml/tests/train_mva_with_diagnostics.yaml,sha256=-2KKIJ8CiNgMlgpCXkmZRdPEo-sJmAqr01vizfeqkj0,2098
51
+ dmu_data/ml/tests/train_mva_with_preffix.yaml,sha256=Q9SsJSXGbkHWGBvMZIkTZlKNUz5ZcSVBscrKgeMWBvE,1386
52
+ dmu_data/plotting/tests/2d.yaml,sha256=40wKQmNbIabZ7CI8-2QnD6mG1a_B7vEcPdzvehHkseY,520
53
+ dmu_data/plotting/tests/fig_size.yaml,sha256=7ROq49nwZ1A2EbPiySmu6n3G-Jq6YAOkc3d2X3YNZv0,294
54
+ dmu_data/plotting/tests/high_stat.yaml,sha256=bLglBLCZK6ft0xMhQ5OltxE76cWsBMPMjO6GG0OkDr8,522
55
+ dmu_data/plotting/tests/legend.yaml,sha256=wGpj58ig-GOlqbWoN894zrCet2Fj9f5QtY0rig_UC-c,213
56
+ dmu_data/plotting/tests/line.yaml,sha256=EERDeTctbauwqAvmKFXC4Ot3Tgx-8kcIniGbepXwsKs,305
57
+ dmu_data/plotting/tests/name.yaml,sha256=mkcPAVg8wBAmlSbSRQ1bcaMl4vOS6LXMtpqQeDrrtO4,312
58
+ dmu_data/plotting/tests/no_bounds.yaml,sha256=8e1QdphBjz-suDr857DoeUC2DXiy6SE-gvkORJQYv80,257
59
+ dmu_data/plotting/tests/normalized.yaml,sha256=Y0eKtyV5pvlSxvqfsLjytYtv8xYF3HZ5WEdCJdeHGQI,193
60
+ dmu_data/plotting/tests/plug_fwhm.yaml,sha256=xl5LXc9Nt66anM-HOXAxCtlaxWNM7zzIXf1Y6U8M4Wg,449
61
+ dmu_data/plotting/tests/plug_stats.yaml,sha256=ROO8soYXBbZIFYZcGngA_K5XHgIAFCmuAGfZCJgMmd0,384
62
+ dmu_data/plotting/tests/simple.yaml,sha256=Xc59Pjfb3BKMicLVBxODVqomHFupcb5GvefKbKHCQWQ,195
63
+ dmu_data/plotting/tests/stats.yaml,sha256=fSZjoV-xPnukpCH2OAXsz_SNPjI113qzDg8Ln3spaaA,165
64
+ dmu_data/plotting/tests/styling.yaml,sha256=ZglA4fG6gr5Q_K2VinwVDPjIitiFizCzxr-KsHw2ERI,370
65
+ dmu_data/plotting/tests/title.yaml,sha256=bawKp9aGpeRrHzv69BOCbFX8sq9bb3Es9tdsPTE7jIk,333
66
+ dmu_data/plotting/tests/weights.yaml,sha256=RWQ1KxbCq-uO62WJ2AoY4h5Umc37zG35s-TpKnNMABI,312
67
+ dmu_data/rfile/friends.yaml,sha256=sEGKFKK0q1U6b9qlfHUFBLZW0FeruR1t2LCOo6Ck1Rg,264
68
+ dmu_data/stats/fitter/test_simple.yaml,sha256=lBw6igBT57BZnuG3GgoxOiXTMFHfs5LchbI3Ubb8Qz0,1549
69
+ dmu_data/stats/kde_optimizer/control.json,sha256=EiArsHUAHBmzw4gmaNyOOW1ziYtNhdelIAqc3EH0K_M,1327616
70
+ dmu_data/stats/kde_optimizer/signal.json,sha256=MocwnYizcKki4dlxEIsWwE8HzY-ZBQaUo-lrCR5N3Tw,1327616
71
+ dmu_data/stats/parameters/data.yaml,sha256=lNmuolhUQmwB6sxHQvBRm-Kz5MUW_H1qAouynzBiWvs,2087
72
+ dmu_data/tests/config.json,sha256=QSfx-irgPV-BHAVe1Xe1dgiVkZGPp0fxb9OhXeVaEBg,60
73
+ dmu_data/tests/config.yaml,sha256=rFTk9PSFOgEVEcGDxr4K9vFIUrCVhbEMUoj683Py1AQ,38
74
+ dmu_data/tests/pdf_to_tex.txt,sha256=yzzH1L7P2SOFrVxS737Ykg1SlcD0jhrrBwQGsui2oAQ,3854
75
+ dmu_data/text/transform.toml,sha256=R-832BZalzHZ6c5gD6jtT_Hj8BCsM5vxa1v6oeiwaP4,94
76
+ dmu_data/text/transform.txt,sha256=EX760da6Vkf-_EPxnQlC5hGSkfFhJCCGCD19NU-1Qto,44
77
+ dmu_data/text/transform_set.toml,sha256=Jeh7BTz82idqvbOQJtl9-ur56mZkzDn5WtvmIb48LoE,150
78
+ dmu_data/text/transform_set.txt,sha256=1KivMoP9LxPn9955QrRmOzjEqduEjhTetQ9MXykO5LY,46
79
+ dmu_data/text/transform_trf.txt,sha256=zxBRTgcSmX7RdqfmWF88W1YqbyNHa4Ccruf1MmnYv2A,74
80
+ dmu_scripts/git/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
81
+ dmu_scripts/kerberos/check_expiration,sha256=PRJopcyFSeiAHdWpLEZp6mu_OKctUdIJj0HZfC0EWxg,308
82
+ dmu_scripts/kerberos/convert_certificate,sha256=_4k4fmxpK-MbSLkkRYEPLQc9twfYBqOIiYZqL9yAXKE,445
83
+ dmu_scripts/ml/compare_classifiers.py,sha256=XuHdcVyDLFGoKfvfv6YrgIavRpjpMrnBSqUnlliD7ew,2312
84
+ dmu_scripts/physics/check_truth.py,sha256=b1P_Pa9ef6VcFtyY6Y9KS9Om9L-QrCBjDKp4dqca0PQ,3964
85
+ dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki7ZQynxXX9Q,9540
86
+ dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
87
+ dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
88
+ dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
89
+ data_manipulation_utilities-0.2.8.dev714.dist-info/METADATA,sha256=M5n-tPUt3o_0kY4viuQj6lbP4JQxWhpxkSnWCW29PFg,50263
90
+ data_manipulation_utilities-0.2.8.dev714.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
91
+ data_manipulation_utilities-0.2.8.dev714.dist-info/entry_points.txt,sha256=-02cr8ibY6L_reX-_Owz2N7OUQyTAwydRIvLr9kKZK0,332
92
+ data_manipulation_utilities-0.2.8.dev714.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
93
+ data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD,,
data_manipulation_utilities-0.2.8.dev714.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
data_manipulation_utilities-0.2.8.dev714.dist-info/entry_points.txt CHANGED
@@ -1,5 +1,6 @@
1
1
  [console_scripts]
2
2
  check_truth = dmu_scripts.physics.check_truth:main
3
+ compare_classifiers = dmu_scripts.ml.compare_classifiers:main
3
4
  compare_root_files = dmu_scripts.rfile.compare_root_files:main
4
5
  coned = dmu_scripts.ssh.coned:main
5
6
  print_trees = dmu_scripts.rfile.print_trees:main
dmu/__init__.py ADDED
File without changes
dmu/generic/hashing.py CHANGED
@@ -2,6 +2,7 @@
2
2
  Module with functions needed to provide hashes
3
3
  '''
4
4
 
5
+ import os
5
6
  import json
6
7
  import hashlib
7
8
  from typing import Any
@@ -12,12 +13,10 @@ from dmu.logging.log_store import LogStore
12
13
  log=LogStore.add_logger('dmu:generic.hashing')
13
14
  # ------------------------------------
14
15
  def _object_to_string(obj : Any) -> str:
15
- try:
16
- string = json.dumps(obj)
17
- except Exception as exc:
18
- raise ValueError(f'Cannot hash object: {obj}') from exc
16
+ def default_encoder(x):
17
+ raise TypeError(f"Unserializable type: {type(x)}")
19
18
 
20
- return string
19
+ return json.dumps(obj, sort_keys=True, default=default_encoder)
21
20
  # ------------------------------------
22
21
  def _dataframe_to_hash(df : pnd.DataFrame) -> str:
23
22
  sr_hash = pnd.util.hash_pandas_object(df, index=True)
@@ -29,16 +28,43 @@ def _dataframe_to_hash(df : pnd.DataFrame) -> str:
29
28
  # ------------------------------------
30
29
  def hash_object(obj : Any) -> str:
31
30
  '''
32
- Function taking a python object and returning
31
+ Function taking a python object and returning
33
32
  a string representing the hash
34
33
  '''
35
34
 
36
35
  if isinstance(obj, pnd.DataFrame):
37
- return _dataframe_to_hash(df=obj)
36
+ value = _dataframe_to_hash(df=obj)
37
+ value = value[:10]
38
+
39
+ return value
38
40
 
39
41
  string = _object_to_string(obj=obj)
40
42
  string_bin = string.encode('utf-8')
41
43
  hsh = hashlib.sha256(string_bin)
44
+ value = hsh.hexdigest()
45
+ value = value[:10]
46
+
47
+ return value
48
+ # ------------------------------------
49
+ def hash_file(path : str) -> str:
50
+ '''
51
+ Parameters
52
+ ----------------
53
+ path: Path to file whose content has to be hashed
54
+
55
+ Returns
56
+ ----------------
57
+ A string representing the hash
58
+ '''
59
+ if not os.path.isfile(path):
60
+ raise FileNotFoundError(f'Cannot find: {path}')
61
+
62
+ h = hashlib.sha256()
63
+ with open(path, 'rb') as f:
64
+ for chunk in iter(lambda: f.read(8192), b''):
65
+ h.update(chunk)
66
+
67
+ value = h.hexdigest()
42
68
 
43
- return hsh.hexdigest()
69
+ return value[:10]
44
70
  # ------------------------------------
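The hunk above truncates the hashes returned by hash_object to 10 characters, switches the serialization to json.dumps with sort_keys=True, and adds a hash_file helper that streams a file in 8192-byte chunks. Below is a minimal usage sketch, not taken from the package documentation; the import path matches the one used in the dmu/generic/utilities.py diff further down, and the temporary file only exists to give hash_file something to read.

import os
import tempfile

from dmu.generic import hashing

# hash_object serializes with json.dumps(sort_keys=True), so key order does not matter,
# and returns the first 10 characters of the SHA-256 hex digest
h1 = hashing.hash_object({'a': 1, 'b': 2})
h2 = hashing.hash_object({'b': 2, 'a': 1})
assert h1 == h2
assert len(h1) == 10

# hash_file streams the file in 8192-byte chunks and truncates the digest the same way;
# a missing path raises FileNotFoundError
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as ofile:
    ofile.write('some text')
    file_path = ofile.name

print(hashing.hash_file(path=file_path))
os.remove(file_path)

Note that objects json cannot serialize now surface a TypeError from the default encoder, whereas the old code wrapped the failure in a ValueError, so callers catching the previous exception may need adjusting.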
dmu/generic/utilities.py CHANGED
@@ -4,17 +4,69 @@ Module containing generic utility functions
4
4
  import os
5
5
  import time
6
6
  import json
7
+ import pickle
7
8
  import inspect
8
-
9
- from typing import Callable
10
-
9
+ from importlib.resources import files
10
+ from typing import Callable, Any
11
11
  from functools import wraps
12
+ from contextlib import contextmanager
13
+
14
+ import yaml
15
+ from omegaconf import OmegaConf, DictConfig
16
+ from dmu.generic import hashing
17
+ from dmu.generic import utilities as gut
12
18
  from dmu.logging.log_store import LogStore
13
19
 
14
20
  TIMER_ON=False
15
21
 
16
22
  log = LogStore.add_logger('dmu:generic:utilities')
23
+ # --------------------------------
24
+ class BlockStyleDumper(yaml.SafeDumper):
25
+ '''
26
+ Class needed to specify proper indentation when
27
+ dumping data to YAML files
28
+ '''
29
+ def increase_indent(self, flow=False, indentless=False):
30
+ return super().increase_indent(flow=flow, indentless=False)
31
+ # ---------------------------------
32
+ def load_data(package : str, fpath : str) -> Any:
33
+ '''
34
+ This function will load a YAML or JSON file from a data package
35
+
36
+ Parameters
37
+ ---------------------
38
+ package: Data package, e.g. `dmu_data`
39
+ path : Path to YAML/JSON file, relative to the data package
40
+
41
+ Returns
42
+ ---------------------
43
+ Dictionary or whatever structure the file is holding
44
+ '''
45
+
46
+ cpath = files(package).joinpath(fpath)
47
+ cpath = str(cpath)
48
+ data = load_json(cpath)
49
+
50
+ return data
51
+ # --------------------------------
52
+ def load_conf(package : str, fpath : str) -> DictConfig:
53
+ '''
54
+ This function will load a YAML or JSON file from a data package
55
+
56
+ Parameters
57
+ ---------------------
58
+ package: Data package, e.g. `dmu_data`
59
+ path : Path to YAML/JSON file, relative to the data package
60
+
61
+ Returns
62
+ ---------------------
63
+ DictConfig class from the OmegaConf package
64
+ '''
65
+
66
+ cpath = files(package).joinpath(fpath)
67
+ cfg = OmegaConf.load(cpath)
17
68
 
69
+ return cfg
18
70
  # --------------------------------
19
71
  def _get_module_name( fun : Callable) -> str:
20
72
  mod = inspect.getmodule(fun)
@@ -28,7 +80,7 @@ def timeit(f):
28
80
  Decorator used to time functions, it is turned off by default, can be turned on with:
29
81
 
30
82
  from dmu.generic.utilities import TIMER_ON
31
- from dmu.generic.utilities import timeit
83
+ from dmu.generic.utilities import timeit
32
84
 
33
85
  TIMER_ON=True
34
86
 
@@ -54,29 +106,130 @@ def timeit(f):
54
106
  # --------------------------------
55
107
  def dump_json(data, path : str, sort_keys : bool = False) -> None:
56
108
  '''
57
- Saves data as JSON
109
+ Saves data as JSON or YAML, depending on the extension, supported .json, .yaml, .yml
58
110
 
59
111
  Parameters
60
112
  data : dictionary, list, etc
61
- path : Path to JSON file where to save it
62
- sort_keys: Will set sort_keys argument of json.dump function
113
+ path : Path to output file where to save it
114
+ sort_keys: Will set sort_keys argument of json.dump function
63
115
  '''
64
116
  dir_name = os.path.dirname(path)
65
117
  os.makedirs(dir_name, exist_ok=True)
66
118
 
67
119
  with open(path, 'w', encoding='utf-8') as ofile:
68
- json.dump(data, ofile, indent=4, sort_keys=sort_keys)
120
+ if path.endswith('.json'):
121
+ json.dump(data, ofile, indent=4, sort_keys=sort_keys)
122
+ return
123
+
124
+ if path.endswith('.yaml') or path.endswith('.yml'):
125
+ yaml.dump(data, ofile, Dumper=BlockStyleDumper, sort_keys=sort_keys)
126
+ return
127
+
128
+ raise NotImplementedError(f'Cannot deduce format from extension in path: {path}')
69
129
  # --------------------------------
70
130
  def load_json(path : str):
71
131
  '''
72
- Loads data from JSON
132
+ Loads data from JSON or YAML, depending on extension of files, supported .json, .yaml, .yml
73
133
 
74
134
  Parameters
75
- path : Path to JSON file where data is saved
135
+ path : Path to outut file where data is saved
76
136
  '''
77
137
 
78
138
  with open(path, encoding='utf-8') as ofile:
79
- data = json.load(ofile)
139
+ if path.endswith('.json'):
140
+ data = json.load(ofile)
141
+ return data
142
+
143
+ if path.endswith('.yaml') or path.endswith('.yml'):
144
+ data = yaml.safe_load(ofile)
145
+ return data
146
+
147
+ raise NotImplementedError(f'Cannot deduce format from extension in path: {path}')
148
+ # --------------------------------
149
+ def dump_pickle(data, path : str) -> None:
150
+ '''
151
+ Saves data as pickle file
152
+
153
+ Parameters
154
+ data : dictionary, list, etc
155
+ path : Path to output file where to save it
156
+ '''
157
+ dir_name = os.path.dirname(path)
158
+ os.makedirs(dir_name, exist_ok=True)
159
+
160
+ with open(path, 'wb') as ofile:
161
+ pickle.dump(data, ofile)
162
+ # --------------------------------
163
+ def load_pickle(path : str) -> None:
164
+ '''
165
+ loads data file
166
+
167
+ Parameters
168
+ path : Path to output file where to save it
169
+ '''
170
+ with open(path, 'rb') as ofile:
171
+ data = pickle.load(ofile)
80
172
 
81
173
  return data
82
174
  # --------------------------------
175
+ @contextmanager
176
+ def silent_import():
177
+ '''
178
+ In charge of suppressing messages
179
+ of imported modules
180
+ '''
181
+ saved_stdout_fd = os.dup(1)
182
+ saved_stderr_fd = os.dup(2)
183
+
184
+ with open(os.devnull, 'w', encoding='utf-8') as devnull:
185
+ os.dup2(devnull.fileno(), 1)
186
+ os.dup2(devnull.fileno(), 2)
187
+ try:
188
+ yield
189
+ finally:
190
+ os.dup2(saved_stdout_fd, 1)
191
+ os.dup2(saved_stderr_fd, 2)
192
+ os.close(saved_stdout_fd)
193
+ os.close(saved_stderr_fd)
194
+ # --------------------------------
195
+ # Caching
196
+ # --------------------------------
197
+ def cache_data(obj : Any, hash_obj : Any) -> None:
198
+ '''
199
+ Will save data to a text file using a name from a hash
200
+
201
+ Parameters
202
+ -----------
203
+ obj : Object that can be saved to a text file, e.g. list, number, dictionary
204
+ hash_obj : Object that can be used to get hash e.g. immutable
205
+ '''
206
+ try:
207
+ json.dumps(obj)
208
+ except Exception as exc:
209
+ raise ValueError('Object is not JSON serializable') from exc
210
+
211
+ val = hashing.hash_object(hash_obj)
212
+ path = f'/tmp/dmu/cache/{val}.json'
213
+ gut.dump_json(obj, path)
214
+ # --------------------------------
215
+ def load_cached(hash_obj : Any, on_fail : Any = None) -> Any:
216
+ '''
217
+ Loads data corresponding to hash from hash_obj
218
+
219
+ Parameters
220
+ ---------------
221
+ hash_obj: Object used to calculate hash, which is in the file name
222
+ on_fail : Value returned if no data was found.
223
+ By default None, and it will just raise a FileNotFoundError
224
+ '''
225
+ val = hashing.hash_object(hash_obj)
226
+ path = f'/tmp/dmu/cache/{val}.json'
227
+ if os.path.isfile(path):
228
+ data = gut.load_json(path)
229
+ return data
230
+
231
+ if on_fail is not None:
232
+ return on_fail
233
+
234
+ raise FileNotFoundError(f'Cannot find cached data at: {path}')
235
+ # --------------------------------
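The additions above turn dump_json and load_json into extension-dispatched writers and readers (.json, .yaml, .yml) and introduce data-package loaders, pickle helpers, a silent_import context manager and a small /tmp based cache keyed on hash_object. The sketch below assumes only that /tmp is writable; the /tmp/dmu_example path and the hash keys are made up, while the two dmu_data files are taken from the RECORD listed at the top of this diff.

from dmu.generic import utilities as gut

# Extension-based dispatch: .json goes through json.dump, .yaml/.yml through a
# block-style YAML dumper, any other extension raises NotImplementedError
gut.dump_json({'x': [1, 2, 3]}, '/tmp/dmu_example/data.yaml')
data = gut.load_json('/tmp/dmu_example/data.yaml')

# Loaders for files shipped inside a data package; both files appear in the RECORD above
d_cfg = gut.load_data(package='dmu_data', fpath='tests/config.json')
cfg   = gut.load_conf(package='dmu_data', fpath='tests/config.yaml')

# cache_data stores a JSON-serializable object in /tmp/dmu/cache/<hash>.json, where
# <hash> comes from hashing.hash_object(hash_obj); load_cached reads it back and can
# return on_fail instead of raising FileNotFoundError when nothing was cached
gut.cache_data([1, 2, 3], hash_obj=('example_task', 1))
l_val = gut.load_cached(hash_obj=('example_task', 1))
l_dft = gut.load_cached(hash_obj=('missing', 'key'), on_fail=[])

# silent_import points the stdout and stderr file descriptors at /dev/null for the
# duration of the block, muting banners printed by noisy third-party imports
with gut.silent_import():
    pass  # noisy imports would go here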
dmu/logging/log_store.py CHANGED
@@ -1,10 +1,11 @@
1
1
  '''
2
2
  Module holding LogStore
3
3
  '''
4
-
5
4
  import logging
6
- from logging import Logger
5
+ import contextlib
6
+ from typing import Union
7
7
 
8
+ from logging import Logger
8
9
  import logzero
9
10
 
10
11
  #------------------------------------------------------------
@@ -40,6 +41,36 @@ class LogStore:
40
41
  backend = 'logging'
41
42
  #--------------------------
42
43
  @staticmethod
44
+ @contextlib.contextmanager
45
+ def level(name : str, lvl : int) -> None:
46
+ '''
47
+ Context manager used to set the logging level of a given logger
48
+
49
+ Parameters
50
+ ------------------
51
+ name : Name of logger
52
+ lvl : Integer representing logging level
53
+ '''
54
+ log = LogStore.get_logger(name=name)
55
+ if log is None:
56
+ raise ValueError(f'Cannot find logger {name}')
57
+
58
+ old_lvl = log.getEffectiveLevel()
59
+
60
+ LogStore.set_level(name, lvl)
61
+ try:
62
+ yield
63
+ finally:
64
+ LogStore.set_level(name, old_lvl)
65
+ #--------------------------
66
+ @staticmethod
67
+ def get_logger(name : str) -> Union[Logger,None]:
68
+ '''
69
+ Returns logger for a given name or None, if no logger found for that name
70
+ '''
71
+ return LogStore.d_logger.get(name)
72
+ #--------------------------
73
+ @staticmethod
43
74
  def add_logger(name : str, exists_ok : bool = False) -> Logger:
44
75
  '''
45
76
  Will use underlying logging library logzero/logging, etc to make logger
@@ -78,6 +109,7 @@ class LogStore:
78
109
  @staticmethod
79
110
  def _get_logging_logger(name : str, level : int) -> Logger:
80
111
  logger = logging.getLogger(name=name)
112
+ logger.propagate = False
81
113
 
82
114
  logger.setLevel(level)
83
115
 
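The log_store.py changes add a LogStore.level context manager that temporarily overrides a logger's level and restores it afterwards, a LogStore.get_logger lookup, and propagate=False for loggers created with the plain logging backend. A short sketch follows; the logger name is made up for the example.

import logging

from dmu.logging.log_store import LogStore

log = LogStore.add_logger('dmu_example:demo')   # hypothetical logger name

# Messages below WARNING are dropped inside the block; the previous effective level
# is restored on exit, even if an exception is raised
with LogStore.level('dmu_example:demo', logging.WARNING):
    log.info('suppressed inside the block')

log.info('back at the original level')

# get_logger returns the registered Logger instance, or None for unknown names
assert LogStore.get_logger('never_registered') is None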
dmu/logging/messages.py ADDED
@@ -0,0 +1,96 @@
1
+ '''
2
+ Module containing code meant to deal with logging of
3
+ third party tools
4
+ '''
5
+ import os
6
+ import sys
7
+ import time
8
+ import threading
9
+ from io import StringIO
10
+ from contextlib import contextmanager
11
+ from dmu.logging.log_store import LogStore
12
+
13
+ log = LogStore.add_logger('dmu:logging:messages')
14
+ # --------------------------------
15
+ class FilteredStderr:
16
+ '''
17
+ This class is meant to be used to filter the messages
18
+ in the error stream by substrings
19
+ '''
20
+ # --------------------------------
21
+ def __init__(
22
+ self,
23
+ banned_substrings : list[str],
24
+ capture_stream : StringIO):
25
+ '''
26
+ Parameters
27
+ -------------
28
+ banned_substrings : List of substrings that, if found in error message, will drop error
29
+ capture_stream : Used to store error stream filtered messages, expected to be sys.__stderr__
30
+ '''
31
+ self._banned = banned_substrings
32
+ self._capture_stream = capture_stream
33
+ # --------------------------------
34
+ def write(self, message : str):
35
+ '''
36
+ Should allow filtering error messages
37
+ '''
38
+ if not any(bad in message for bad in self._banned):
39
+ # This will make it to the error messages
40
+ self._capture_stream.write(message)
41
+ # --------------------------------
42
+ def flush(self):
43
+ '''
44
+ Should override the error stream's flush method
45
+ '''
46
+ self._capture_stream.flush()
47
+ # --------------------------------
48
+ @contextmanager
49
+ def filter_stderr(
50
+ banned_substrings : list[str],
51
+ capture_stream : StringIO|None=None):
52
+ '''
53
+ This contextmanager will suppress error messages
54
+
55
+ Parameters
56
+ -----------------
57
+ banned_substrings : List of substrings that need to be found in error messages
58
+ in order for them to be suppressed
59
+ capture_stream : Buffer needed to run tests, not needed for normal use
60
+ '''
61
+ if capture_stream is None:
62
+ capture_stream = sys.__stderr__
63
+
64
+ read_fd, write_fd = os.pipe()
65
+ saved_fd = os.dup(2)
66
+
67
+ os.dup2(write_fd, 2)
68
+ os.close(write_fd)
69
+
70
+ filtered = FilteredStderr(banned_substrings, capture_stream)
71
+ reader_finished = threading.Event()
72
+
73
+ def reader():
74
+ try:
75
+ with os.fdopen(read_fd, 'r', buffering=1) as pipe:
76
+ while True:
77
+ line = pipe.readline()
78
+ if not line:
79
+ break
80
+ filtered.write(line)
81
+ filtered.flush()
82
+ finally:
83
+ reader_finished.set()
84
+
85
+ thread = threading.Thread(target=reader, daemon=True)
86
+ thread.start()
87
+
88
+ try:
89
+ yield
90
+ finally:
91
+ os.dup2(saved_fd, 2)
92
+ os.close(saved_fd)
93
+
94
+ time.sleep(0.1)
95
+ reader_finished.wait(timeout=1.0)
96
+ # --------------------------------
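dmu/logging/messages.py is a new module: filter_stderr redirects file descriptor 2 into a pipe, a reader thread scans it line by line, and lines containing any of the banned substrings are dropped while everything else is forwarded, by default to sys.__stderr__ or to the capture_stream used in tests. A minimal sketch with made-up messages and substrings:

import os

from dmu.logging.messages import filter_stderr

# Anything written to fd 2 inside the block is filtered line by line
with filter_stderr(banned_substrings=['harmless warning']):
    os.write(2, b'some harmless warning from a dependency\n')  # dropped
    os.write(2, b'a real error message\n')                     # forwarded to stderr

Because the filtering happens at the file-descriptor level, it also catches messages emitted by C or C++ libraries that bypass Python's sys.stderr object.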
dmu/ml/cv_classifier.py CHANGED
@@ -37,17 +37,17 @@ class CVClassifier(GradientBoostingClassifier):
37
37
 
38
38
  self._s_hash = set()
39
39
  self._data = {}
40
- self._l_ft_name = None
40
+ self._l_ft_name : list[str]
41
41
  # ----------------------------------
42
42
  @property
43
- def features(self):
43
+ def features(self) -> list[str]:
44
44
  '''
45
45
  Returns list of feature names used in training dataset
46
46
  '''
47
47
  return self._l_ft_name
48
48
  # ----------------------------------
49
49
  @property
50
- def hashes(self):
50
+ def hashes(self) -> set[str]:
51
51
  '''
52
52
  Will return set with hashes of training data
53
53
  '''
dmu/ml/cv_diagnostics.py CHANGED
@@ -186,6 +186,9 @@ class CVDiagnostics:
186
186
  raise NotImplementedError(f'Correlation coefficient {method} not implemented')
187
187
  # -------------------------
188
188
  def _plot_cutflow(self) -> None:
189
+ '''
190
+ Plot the 'mass' column for different values of working point
191
+ '''
189
192
  if 'overlay' not in self._cfg['correlations']['target']:
190
193
  log.debug('Not plotting cutflow of target distribution')
191
194
  return
dmu/ml/cv_performance.py ADDED
@@ -0,0 +1,58 @@
1
+ '''
2
+ This module contains the class CVPerformance
3
+ '''
4
+ # pylint: disable=too-many-positional-arguments, too-many-arguments
5
+
6
+ from ROOT import RDataFrame
7
+ from dmu.ml.cv_classifier import CVClassifier
8
+ from dmu.ml.cv_predict import CVPredict
9
+ from dmu.ml.train_mva import TrainMva
10
+ from dmu.logging.log_store import LogStore
11
+
12
+ log=LogStore.add_logger('dmu:ml:cv_performance')
13
+ # -----------------------------------------------------
14
+ class CVPerformance:
15
+ '''
16
+ This class is meant to:
17
+
18
+ - Compare the classifier performance, through the ROC curve, of a model, for a given background and signal sample
19
+ '''
20
+ # ---------------------------
21
+ def plot_roc(
22
+ self,
23
+ name : str,
24
+ color : str,
25
+ sig : RDataFrame,
26
+ bkg : RDataFrame,
27
+ model : list[CVClassifier] ) -> float:
28
+ '''
29
+ Method in charge of picking up model and data and plotting ROC curve
30
+
31
+ Parameters
32
+ --------------------------
33
+ name : Label of combination, used for plots
34
+ sig : ROOT dataframe storing signal samples
35
+ bkg : ROOT dataframe storing background samples
36
+ model: List of instances of the CVClassifier
37
+
38
+ Returns
39
+ --------------------------
40
+ Area under the ROC curve
41
+ '''
42
+ log.info(f'Loading {name}')
43
+
44
+ cvp_sig = CVPredict(models=model, rdf=sig)
45
+ arr_sig = cvp_sig.predict()
46
+
47
+ cvp_bkg = CVPredict(models=model, rdf=bkg)
48
+ arr_bkg = cvp_bkg.predict()
49
+
50
+ _, _, auc = TrainMva.plot_roc_from_prob(
51
+ arr_sig_prb=arr_sig,
52
+ arr_bkg_prb=arr_bkg,
53
+ kind = name,
54
+ color = color, # This should allow the function to pick kind
55
+ ifold = 999) # for the label
56
+
57
+ return auc
58
+ # -----------------------------------------------------
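dmu/ml/cv_performance.py adds a small wrapper that scores signal and background ROOT dataframes with a set of cross-validation classifiers and draws the corresponding ROC curve through TrainMva.plot_roc_from_prob. The sketch below is illustrative only: the ROOT files, the tree name and the pickled CVClassifier folds are placeholders, and how the trained folds are actually persisted is not shown in this diff.

import glob
import pickle

from ROOT import RDataFrame
from dmu.ml.cv_performance import CVPerformance

# Hypothetical inputs: trees holding the training features and one classifier per fold
rdf_sig = RDataFrame('tree', 'signal.root')
rdf_bkg = RDataFrame('tree', 'background.root')

l_model = []
for path in sorted(glob.glob('models/model_*.pkl')):
    with open(path, 'rb') as ifile:
        l_model.append(pickle.load(ifile))

cvp = CVPerformance()
auc = cvp.plot_roc(
    name ='baseline',
    color='blue',
    sig  =rdf_sig,
    bkg  =rdf_bkg,
    model=l_model)

print(f'AUC: {auc:.3f}')

plot_roc returns the area under the curve; judging from the snippet above it delegates the drawing to TrainMva.plot_roc_from_prob, so saving or displaying the resulting figure is presumably left to the caller.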