molSimplify 1.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +224 -0
- molSimplify/Classes/__init__.py +6 -0
- molSimplify/Classes/atom3D.py +235 -0
- molSimplify/Classes/dft_obs.py +130 -0
- molSimplify/Classes/globalvars.py +827 -0
- molSimplify/Classes/helpers.py +161 -0
- molSimplify/Classes/ligand.py +2330 -0
- molSimplify/Classes/mGUI.py +2493 -0
- molSimplify/Classes/mWidgets.py +438 -0
- molSimplify/Classes/miniGUI.py +41 -0
- molSimplify/Classes/mol2D.py +260 -0
- molSimplify/Classes/mol3D.py +5846 -0
- molSimplify/Classes/monomer3D.py +253 -0
- molSimplify/Classes/partialcharges.py +226 -0
- molSimplify/Classes/protein3D.py +1178 -0
- molSimplify/Classes/rundiag.py +151 -0
- molSimplify/Data/ML.dat +212 -0
- molSimplify/Data/MLS_FSR_for_inter.dat +23 -0
- molSimplify/Data/MLS_FSR_for_inter2.dat +23 -0
- molSimplify/Data/MLS_angle_for_click.dat +8 -0
- molSimplify/Data/MLS_angle_for_inter.dat +23 -0
- molSimplify/Data/MLS_angle_for_inter2.dat +48 -0
- molSimplify/Data/MLS_angle_for_intra.dat +10 -0
- molSimplify/Data/MLS_angle_for_intra2.dat +6 -0
- molSimplify/Data/MLS_angle_for_oa.dat +18 -0
- molSimplify/Data/ML_FSR_for_inter.dat +112 -0
- molSimplify/Data/ML_FSR_for_inter2.dat +110 -0
- molSimplify/Data/ML_bond_for_cat.dat +8 -0
- molSimplify/Data/ML_bond_for_click.dat +8 -0
- molSimplify/Data/ML_bond_for_inter.dat +48 -0
- molSimplify/Data/ML_bond_for_inter2.dat +48 -0
- molSimplify/Data/ML_bond_for_intra.dat +10 -0
- molSimplify/Data/ML_bond_for_intra2.dat +6 -0
- molSimplify/Data/ML_bond_for_oa.dat +18 -0
- molSimplify/Data/bp1.dat +21 -0
- molSimplify/Data/li.dat +3 -0
- molSimplify/Data/no.dat +2 -0
- molSimplify/Data/oct.dat +7 -0
- molSimplify/Data/pbp.dat +8 -0
- molSimplify/Data/spy.dat +6 -0
- molSimplify/Data/sqap.dat +9 -0
- molSimplify/Data/sqp.dat +5 -0
- molSimplify/Data/tbp.dat +6 -0
- molSimplify/Data/tdhd.dat +9 -0
- molSimplify/Data/thd.dat +5 -0
- molSimplify/Data/tpl.dat +4 -0
- molSimplify/Data/tpr.dat +7 -0
- molSimplify/Informatics/HFXsensitivity/__init__.py +0 -0
- molSimplify/Informatics/HFXsensitivity/measure_HFX_sensitivity_oxo_hat_reb_rel.py +443 -0
- molSimplify/Informatics/HFXsensitivity/measure_HFX_stable.py +346 -0
- molSimplify/Informatics/MOF/Linker_rotation.py +179 -0
- molSimplify/Informatics/MOF/MOF_descriptors.py +1299 -0
- molSimplify/Informatics/MOF/MOF_descriptors_alternate_functional.py +589 -0
- molSimplify/Informatics/MOF/MOF_functionalizer.py +1648 -0
- molSimplify/Informatics/MOF/PBC_functions.py +1347 -0
- molSimplify/Informatics/MOF/__init__.py +0 -0
- molSimplify/Informatics/MOF/atomic.py +267 -0
- molSimplify/Informatics/MOF/cluster_extraction.py +388 -0
- molSimplify/Informatics/MOF/fragment_MOFs_for_pormake.py +895 -0
- molSimplify/Informatics/MOF/monofunctionalized_BDC/index_information.py +10 -0
- molSimplify/Informatics/Mol2Parser.py +46 -0
- molSimplify/Informatics/RACassemble.py +408 -0
- molSimplify/Informatics/__init__.py +0 -0
- molSimplify/Informatics/active_learning/__init__.py +0 -0
- molSimplify/Informatics/active_learning/expected_improvement.py +269 -0
- molSimplify/Informatics/autocorrelation.py +1930 -0
- molSimplify/Informatics/clean_autocorrelation.py +778 -0
- molSimplify/Informatics/coulomb_analyze.py +67 -0
- molSimplify/Informatics/decoration_manager.py +193 -0
- molSimplify/Informatics/geo_analyze.py +88 -0
- molSimplify/Informatics/geometrics.py +56 -0
- molSimplify/Informatics/graph_analyze.py +163 -0
- molSimplify/Informatics/graph_racs.py +288 -0
- molSimplify/Informatics/jupyter_vis.py +172 -0
- molSimplify/Informatics/lacRACAssemble.py +2192 -0
- molSimplify/Informatics/lacRACAssemble_bisdithiolenes.py +236 -0
- molSimplify/Informatics/misc_descriptors.py +198 -0
- molSimplify/Informatics/organic_fingerprints.py +61 -0
- molSimplify/Informatics/partialcharges.py +345 -0
- molSimplify/Informatics/protein/activesite.py +53 -0
- molSimplify/Informatics/protein/pymol_add_hs.py +33 -0
- molSimplify/Informatics/rac155_geo.py +48 -0
- molSimplify/Ligands/(1_methylbenzimidazol_2_yl)pyridine.xyz +45 -0
- molSimplify/Ligands/1-4-dimethyl-1-2-3-triazole.xyz +15 -0
- molSimplify/Ligands/12crown4.mol +62 -0
- molSimplify/Ligands/Antipyrine.mol +58 -0
- molSimplify/Ligands/BPAbipy.mol +106 -0
- molSimplify/Ligands/Hpyrrole.mol +26 -0
- molSimplify/Ligands/N-quinolinylbutyramidate.xyz +31 -0
- molSimplify/Ligands/N-quinolinylmethylmethinylacetamidate.xyz +30 -0
- molSimplify/Ligands/NMe2_-1.xyz +11 -0
- molSimplify/Ligands/PCy3.mol +111 -0
- molSimplify/Ligands/PMe3.xyz +15 -0
- molSimplify/Ligands/PPh3.mol +76 -0
- molSimplify/Ligands/Propyphenazone.mol +77 -0
- molSimplify/Ligands/acac.mol +33 -0
- molSimplify/Ligands/acacen.mol +76 -0
- molSimplify/Ligands/acetate.smi +1 -0
- molSimplify/Ligands/acetate.xyz +9 -0
- molSimplify/Ligands/aceticacidbipyridine.mol +70 -0
- molSimplify/Ligands/acetonitrile.mol +17 -0
- molSimplify/Ligands/alanine.mol +30 -0
- molSimplify/Ligands/alphabetizer.py +21 -0
- molSimplify/Ligands/amine.mol +11 -0
- molSimplify/Ligands/ammonia.mol +12 -0
- molSimplify/Ligands/arginine.mol +58 -0
- molSimplify/Ligands/asparagine.mol +38 -0
- molSimplify/Ligands/aspartic_acid.mol +35 -0
- molSimplify/Ligands/azide.mol +11 -0
- molSimplify/Ligands/benzene.mol +28 -0
- molSimplify/Ligands/benzene_pi.mol +30 -0
- molSimplify/Ligands/benzenedithiol.mol +30 -0
- molSimplify/Ligands/benzenethiol.mol +30 -0
- molSimplify/Ligands/benzylisocy.mol +38 -0
- molSimplify/Ligands/bidiazine.mol +42 -0
- molSimplify/Ligands/bidiazole.mol +38 -0
- molSimplify/Ligands/bifuran.mol +38 -0
- molSimplify/Ligands/bihydrodiazine.mol +58 -0
- molSimplify/Ligands/bihydrodiazole.mol +46 -0
- molSimplify/Ligands/bihydrooxazine.mol +54 -0
- molSimplify/Ligands/bihydrooxazole.mol +42 -0
- molSimplify/Ligands/bihydrothiazine.mol +54 -0
- molSimplify/Ligands/bihydrothiazole.mol +42 -0
- molSimplify/Ligands/biimidazole.mol +38 -0
- molSimplify/Ligands/bioxazole.mol +34 -0
- molSimplify/Ligands/bipy.mol +46 -0
- molSimplify/Ligands/bipyrazine.xyz +20 -0
- molSimplify/Ligands/bipyrimidine.mol +42 -0
- molSimplify/Ligands/bipyrrole.mol +42 -0
- molSimplify/Ligands/bisnapthyridylpyridine.mol +111 -0
- molSimplify/Ligands/bithiazole.mol +34 -0
- molSimplify/Ligands/bromide.mol +7 -0
- molSimplify/Ligands/bromide.smi +1 -0
- molSimplify/Ligands/c2.mol +9 -0
- molSimplify/Ligands/caprolactone.mol +41 -0
- molSimplify/Ligands/carbonyl.mol +8 -0
- molSimplify/Ligands/carboxyl.mol +13 -0
- molSimplify/Ligands/cat.mol +30 -0
- molSimplify/Ligands/chloride.mol +7 -0
- molSimplify/Ligands/chloride.smi +1 -0
- molSimplify/Ligands/chloropyridine.mol +27 -0
- molSimplify/Ligands/co2.mol +10 -0
- molSimplify/Ligands/corrolazine.mol +72 -0
- molSimplify/Ligands/cs.mol +8 -0
- molSimplify/Ligands/cyanate.xyz +5 -0
- molSimplify/Ligands/cyanide.mol +9 -0
- molSimplify/Ligands/cyanoaceticporphyrin.mol +114 -0
- molSimplify/Ligands/cyanopyridine.mol +29 -0
- molSimplify/Ligands/cyclam.mol +81 -0
- molSimplify/Ligands/cyclen.mol +69 -0
- molSimplify/Ligands/cyclopentadienyl.mol +26 -0
- molSimplify/Ligands/cysteine.mol +32 -0
- molSimplify/Ligands/diaminomethyl.mol +19 -0
- molSimplify/Ligands/diazine.mol +25 -0
- molSimplify/Ligands/diazole.mol +23 -0
- molSimplify/Ligands/dicyanamide.mol +15 -0
- molSimplify/Ligands/dihydrofuran.mol +27 -0
- molSimplify/Ligands/dmap.xyz +35 -0
- molSimplify/Ligands/dmf.mol +28 -0
- molSimplify/Ligands/dmi.mol +41 -0
- molSimplify/Ligands/dmpe.mol +52 -0
- molSimplify/Ligands/dpmu.mol +47 -0
- molSimplify/Ligands/dppe.mol +112 -0
- molSimplify/Ligands/edta.mol +69 -0
- molSimplify/Ligands/en.mol +28 -0
- molSimplify/Ligands/ethanethiol.mol +21 -0
- molSimplify/Ligands/ethanolamine.mol +26 -0
- molSimplify/Ligands/ethbipy.mol +70 -0
- molSimplify/Ligands/ethyl.mol +19 -0
- molSimplify/Ligands/ethylamine.mol +24 -0
- molSimplify/Ligands/ethylene.mol +16 -0
- molSimplify/Ligands/ethylesteracac.mol +57 -0
- molSimplify/Ligands/fluoride.mol +7 -0
- molSimplify/Ligands/fluoride.smi +1 -0
- molSimplify/Ligands/formaldehyde.mol +12 -0
- molSimplify/Ligands/formamidate.xyz +8 -0
- molSimplify/Ligands/formate.xyz +6 -0
- molSimplify/Ligands/furan.mol +23 -0
- molSimplify/Ligands/glutamic_acid.mol +42 -0
- molSimplify/Ligands/glutamine.mol +44 -0
- molSimplify/Ligands/glycinate.mol +23 -0
- molSimplify/Ligands/glycine.mol +24 -0
- molSimplify/Ligands/h2s.mol +10 -0
- molSimplify/Ligands/helium.mol +6 -0
- molSimplify/Ligands/histidine.mol +45 -0
- molSimplify/Ligands/hmpa.mol +62 -0
- molSimplify/Ligands/hs-.mol +9 -0
- molSimplify/Ligands/hydride.mol +7 -0
- molSimplify/Ligands/hydrocarboxyacetylide.xyz +8 -0
- molSimplify/Ligands/hydrocyanide.mol +10 -0
- molSimplify/Ligands/hydrodiazine.mol +33 -0
- molSimplify/Ligands/hydrodiazole.mol +27 -0
- molSimplify/Ligands/hydrogensulfide.mol +10 -0
- molSimplify/Ligands/hydroisocyanide.mol +11 -0
- molSimplify/Ligands/hydrooxazine.mol +31 -0
- molSimplify/Ligands/hydrooxazole.mol +25 -0
- molSimplify/Ligands/hydrothiazine.mol +31 -0
- molSimplify/Ligands/hydrothiazole.mol +25 -0
- molSimplify/Ligands/hydroxyl.mol +9 -0
- molSimplify/Ligands/imidazole.mol +23 -0
- molSimplify/Ligands/imidazolidinone.mol +29 -0
- molSimplify/Ligands/imine.mol +13 -0
- molSimplify/Ligands/iminodiacetic.mol +33 -0
- molSimplify/Ligands/iodide.mol +7 -0
- molSimplify/Ligands/iodobenzene.xyz +14 -0
- molSimplify/Ligands/isoleucine.mol +48 -0
- molSimplify/Ligands/isothiocyanate.mol +11 -0
- molSimplify/Ligands/leucine.mol +48 -0
- molSimplify/Ligands/ligands.dict +257 -0
- molSimplify/Ligands/lysine.mol +54 -0
- molSimplify/Ligands/mebenzenedithiol.mol +36 -0
- molSimplify/Ligands/mebim_py.xyz +29 -0
- molSimplify/Ligands/mebim_pz.xyz +28 -0
- molSimplify/Ligands/mebipy.mol +58 -0
- molSimplify/Ligands/mecat.mol +36 -0
- molSimplify/Ligands/methanal.mol +11 -0
- molSimplify/Ligands/methanethiol.mol +15 -0
- molSimplify/Ligands/methanol.mol +16 -0
- molSimplify/Ligands/methionine.mol +44 -0
- molSimplify/Ligands/methyl.mol +13 -0
- molSimplify/Ligands/methylacetylide.xyz +8 -0
- molSimplify/Ligands/methylamine.mol +19 -0
- molSimplify/Ligands/methylazide.xyz +9 -0
- molSimplify/Ligands/methylisocy.mol +17 -0
- molSimplify/Ligands/methylpyridine.mol +33 -0
- molSimplify/Ligands/n2.mol +8 -0
- molSimplify/Ligands/n4py.xyz +51 -0
- molSimplify/Ligands/nch.mol +10 -0
- molSimplify/Ligands/nco-.mol +11 -0
- molSimplify/Ligands/nethanolamine.mol +26 -0
- molSimplify/Ligands/nitrate.mol +14 -0
- molSimplify/Ligands/nitrite.mol +11 -0
- molSimplify/Ligands/nitro.mol +11 -0
- molSimplify/Ligands/nitrobipy.mol +54 -0
- molSimplify/Ligands/nitroso.mol +8 -0
- molSimplify/Ligands/nme3.mol +30 -0
- molSimplify/Ligands/no-.mol +10 -0
- molSimplify/Ligands/no2-.mol +11 -0
- molSimplify/Ligands/noxygen.mol +8 -0
- molSimplify/Ligands/ns-.mol +10 -0
- molSimplify/Ligands/o-pyridylbenzene.xyz +23 -0
- molSimplify/Ligands/o-pyridylphenylanion.xyz +22 -0
- molSimplify/Ligands/o2-.mol +9 -0
- molSimplify/Ligands/o2.xyz +4 -0
- molSimplify/Ligands/och2.mol +12 -0
- molSimplify/Ligands/oethanolamine.mol +26 -0
- molSimplify/Ligands/ome2.mol +22 -0
- molSimplify/Ligands/ooh.xyz +5 -0
- molSimplify/Ligands/oxalate.mol +17 -0
- molSimplify/Ligands/oxalate.smi +1 -0
- molSimplify/Ligands/oxygen.mol +7 -0
- molSimplify/Ligands/pentacyanocyclopentadienide.mol +36 -0
- molSimplify/Ligands/ph2-.mol +11 -0
- molSimplify/Ligands/ph3.mol +12 -0
- molSimplify/Ligands/phen.mol +51 -0
- molSimplify/Ligands/phenacac.mol +63 -0
- molSimplify/Ligands/phenalalanine.mol +51 -0
- molSimplify/Ligands/phendione.mol +51 -0
- molSimplify/Ligands/phenphen.mol +75 -0
- molSimplify/Ligands/phenylbenzoxazole.mol +54 -0
- molSimplify/Ligands/phenylcyc.mol +99 -0
- molSimplify/Ligands/phenylenediamine.mol +37 -0
- molSimplify/Ligands/phenylisocy.mol +32 -0
- molSimplify/Ligands/phosacidbipy.mol +66 -0
- molSimplify/Ligands/phosphine.mol +13 -0
- molSimplify/Ligands/phosphorine.mol +27 -0
- molSimplify/Ligands/phosphorustrifluoride.mol +12 -0
- molSimplify/Ligands/phthalocyanine.mol +126 -0
- molSimplify/Ligands/pme3o.mol +32 -0
- molSimplify/Ligands/porphyrin.mol +82 -0
- molSimplify/Ligands/pph3o.mol +77 -0
- molSimplify/Ligands/proline.mol +39 -0
- molSimplify/Ligands/propdiol.mol +21 -0
- molSimplify/Ligands/propylene.mol +23 -0
- molSimplify/Ligands/pyridine.mol +27 -0
- molSimplify/Ligands/pyrimidone.mol +27 -0
- molSimplify/Ligands/pyrrole.mol +24 -0
- molSimplify/Ligands/quinoxalinedithiol.mol +39 -0
- molSimplify/Ligands/s2-.mol +9 -0
- molSimplify/Ligands/salen.mol +75 -0
- molSimplify/Ligands/salphen.mol +84 -0
- molSimplify/Ligands/serine.mol +32 -0
- molSimplify/Ligands/simple_ligands.dict +14 -0
- molSimplify/Ligands/sulfacidbipy.mol +63 -0
- molSimplify/Ligands/tbucat.mol +54 -0
- molSimplify/Ligands/tbuphisocy.mol +56 -0
- molSimplify/Ligands/tbutylcyclen.mol +166 -0
- molSimplify/Ligands/tbutylisocy.mol +35 -0
- molSimplify/Ligands/tbutylthiol.mol +33 -0
- molSimplify/Ligands/tcnoet.mol +43 -0
- molSimplify/Ligands/tcnoetOH.mol +45 -0
- molSimplify/Ligands/terpy.mol +65 -0
- molSimplify/Ligands/tetrahydrofuran.mol +31 -0
- molSimplify/Ligands/thiane.mol +37 -0
- molSimplify/Ligands/thiazole.mol +21 -0
- molSimplify/Ligands/thiocyanate.mol +11 -0
- molSimplify/Ligands/thiol.mol +9 -0
- molSimplify/Ligands/thiophene.mol +23 -0
- molSimplify/Ligands/thiopyridine.mol +29 -0
- molSimplify/Ligands/threonine.mol +38 -0
- molSimplify/Ligands/tpp.mol +165 -0
- molSimplify/Ligands/tricyanomethyl.mol +19 -0
- molSimplify/Ligands/trifluoromethyl.mol +13 -0
- molSimplify/Ligands/tryptophan.mol +60 -0
- molSimplify/Ligands/tyrosine.mol +53 -0
- molSimplify/Ligands/uthiol.mol +11 -0
- molSimplify/Ligands/uthiolme2.mol +23 -0
- molSimplify/Ligands/valine.mol +42 -0
- molSimplify/Ligands/water.mol +10 -0
- molSimplify/Ligands/x.mol +6 -0
- molSimplify/Scripts/__init__.py +0 -0
- molSimplify/Scripts/addtodb.py +308 -0
- molSimplify/Scripts/cellbuilder.py +1592 -0
- molSimplify/Scripts/cellbuilder_tools.py +701 -0
- molSimplify/Scripts/chains.py +342 -0
- molSimplify/Scripts/convert_2to3.py +23 -0
- molSimplify/Scripts/dbinteract.py +631 -0
- molSimplify/Scripts/distgeom.py +617 -0
- molSimplify/Scripts/findcorrelations.py +287 -0
- molSimplify/Scripts/generator.py +267 -0
- molSimplify/Scripts/geometry.py +1224 -0
- molSimplify/Scripts/grabguivars.py +845 -0
- molSimplify/Scripts/in_b3lyp_usetc.py +141 -0
- molSimplify/Scripts/inparse.py +1673 -0
- molSimplify/Scripts/io.py +1149 -0
- molSimplify/Scripts/isomers.py +415 -0
- molSimplify/Scripts/jobgen.py +247 -0
- molSimplify/Scripts/krr_prep.py +1262 -0
- molSimplify/Scripts/molSimplify_io.py +18 -0
- molSimplify/Scripts/molden2psi4wfn.py +166 -0
- molSimplify/Scripts/namegen.py +32 -0
- molSimplify/Scripts/nn_prep.py +561 -0
- molSimplify/Scripts/oct_check_mols.py +782 -0
- molSimplify/Scripts/periodic_QE.py +97 -0
- molSimplify/Scripts/postmold.py +304 -0
- molSimplify/Scripts/postmwfn.py +709 -0
- molSimplify/Scripts/postparse.py +488 -0
- molSimplify/Scripts/postproc.py +139 -0
- molSimplify/Scripts/qcgen.py +1450 -0
- molSimplify/Scripts/rmsd.py +489 -0
- molSimplify/Scripts/rungen.py +670 -0
- molSimplify/Scripts/structgen.py +3040 -0
- molSimplify/Scripts/tf_nn_prep.py +894 -0
- molSimplify/Scripts/tsgen.py +295 -0
- molSimplify/Scripts/uq_calibration.py +69 -0
- molSimplify/__init__.py +0 -0
- molSimplify/__main__.py +197 -0
- molSimplify/icons/chemdb.png +0 -0
- molSimplify/icons/hjklogo.png +0 -0
- molSimplify/icons/icon.png +0 -0
- molSimplify/icons/logo.png +0 -0
- molSimplify/icons/logo_old.png +0 -0
- molSimplify/icons/petachem.png +0 -0
- molSimplify/icons/petachem2.png +0 -0
- molSimplify/icons/petachem_full.png +0 -0
- molSimplify/icons/pythonlogo.png +0 -0
- molSimplify/icons/sge copy.png +0 -0
- molSimplify/icons/sge.png +0 -0
- molSimplify/icons/slurm.png +0 -0
- molSimplify/icons/wft1.png +0 -0
- molSimplify/icons/wft2.png +0 -0
- molSimplify/icons/wft3.png +0 -0
- molSimplify/ml/__init__.py +0 -0
- molSimplify/ml/kernels.py +36 -0
- molSimplify/ml/layers.py +29 -0
- molSimplify/molscontrol/__init__.py +14 -0
- molSimplify/molscontrol/_version.py +521 -0
- molSimplify/molscontrol/clf_tools.py +144 -0
- molSimplify/molscontrol/data/README.md +21 -0
- molSimplify/molscontrol/data/look_and_say.dat +15 -0
- molSimplify/molscontrol/dynamic_classifier.py +514 -0
- molSimplify/molscontrol/io_tools.py +363 -0
- molSimplify/molscontrol/molscontrol.py +49 -0
- molSimplify/molscontrol/terachem/jobscript_control.sh +31 -0
- molSimplify/molscontrol/terachem/terachem_input +22 -0
- molSimplify/python_krr/X_train_TS.csv +535 -0
- molSimplify/python_krr/__init__.py +0 -0
- molSimplify/python_krr/hat2_X_mean_std.csv +3 -0
- molSimplify/python_krr/hat2_feature_names.csv +1 -0
- molSimplify/python_krr/hat2_y_mean_std.csv +2 -0
- molSimplify/python_krr/hat_X_mean_std.csv +6 -0
- molSimplify/python_krr/hat_feature_names.csv +1 -0
- molSimplify/python_krr/hat_krr_X_train.csv +5205 -0
- molSimplify/python_krr/hat_krr_dual_coef.csv +1 -0
- molSimplify/python_krr/hat_y_mean_std.csv +2 -0
- molSimplify/python_krr/sklearn_models.py +34 -0
- molSimplify/python_krr/y_train_TS.csv +535 -0
- molSimplify/python_nn/ANN.py +198 -0
- molSimplify/python_nn/__init__.py +0 -0
- molSimplify/python_nn/clf_analysis_tool.py +125 -0
- molSimplify/python_nn/dictionary_toolbox.py +49 -0
- molSimplify/python_nn/ensemble_test.py +309 -0
- molSimplify/python_nn/hs_center.csv +26 -0
- molSimplify/python_nn/hs_scale.csv +26 -0
- molSimplify/python_nn/ls_center.csv +26 -0
- molSimplify/python_nn/ls_scale.csv +26 -0
- molSimplify/python_nn/ms_hs_b1.csv +50 -0
- molSimplify/python_nn/ms_hs_b2.csv +50 -0
- molSimplify/python_nn/ms_hs_b3.csv +1 -0
- molSimplify/python_nn/ms_hs_w1.csv +50 -0
- molSimplify/python_nn/ms_hs_w2.csv +50 -0
- molSimplify/python_nn/ms_hs_w3.csv +1 -0
- molSimplify/python_nn/ms_ls_b1.csv +50 -0
- molSimplify/python_nn/ms_ls_b2.csv +50 -0
- molSimplify/python_nn/ms_ls_b3.csv +1 -0
- molSimplify/python_nn/ms_ls_w1.csv +50 -0
- molSimplify/python_nn/ms_ls_w2.csv +50 -0
- molSimplify/python_nn/ms_ls_w3.csv +1 -0
- molSimplify/python_nn/ms_slope_b1.csv +50 -0
- molSimplify/python_nn/ms_slope_b2.csv +50 -0
- molSimplify/python_nn/ms_slope_b3.csv +1 -0
- molSimplify/python_nn/ms_slope_w1.csv +50 -0
- molSimplify/python_nn/ms_slope_w2.csv +50 -0
- molSimplify/python_nn/ms_slope_w3.csv +1 -0
- molSimplify/python_nn/ms_split_b1.csv +50 -0
- molSimplify/python_nn/ms_split_b2.csv +50 -0
- molSimplify/python_nn/ms_split_b3.csv +1 -0
- molSimplify/python_nn/ms_split_w1.csv +50 -0
- molSimplify/python_nn/ms_split_w2.csv +50 -0
- molSimplify/python_nn/ms_split_w3.csv +1 -0
- molSimplify/python_nn/slope_center.csv +25 -0
- molSimplify/python_nn/slope_scale.csv +25 -0
- molSimplify/python_nn/split_center.csv +26 -0
- molSimplify/python_nn/split_scale.csv +26 -0
- molSimplify/python_nn/tf_ANN.py +762 -0
- molSimplify/python_nn/train_data.csv +1211 -0
- molSimplify/tf_nn/__init__.py +0 -0
- molSimplify/tf_nn/geo_static_clf/geo_static_clf_model.h5 +0 -0
- molSimplify/tf_nn/geo_static_clf/geo_static_clf_train_name.csv +1591 -0
- molSimplify/tf_nn/geo_static_clf/geo_static_clf_train_x.csv +2790 -0
- molSimplify/tf_nn/geo_static_clf/geo_static_clf_train_y.csv +2790 -0
- molSimplify/tf_nn/geo_static_clf/geo_static_clf_vars.csv +154 -0
- molSimplify/tf_nn/geos/hs_ii_bl_x.csv +1577 -0
- molSimplify/tf_nn/geos/hs_ii_bl_y.csv +1577 -0
- molSimplify/tf_nn/geos/hs_ii_model.h5 +0 -0
- molSimplify/tf_nn/geos/hs_ii_model.json +1 -0
- molSimplify/tf_nn/geos/hs_ii_vars.csv +154 -0
- molSimplify/tf_nn/geos/hs_iii_bl_x.csv +1659 -0
- molSimplify/tf_nn/geos/hs_iii_bl_y.csv +1659 -0
- molSimplify/tf_nn/geos/hs_iii_model.h5 +0 -0
- molSimplify/tf_nn/geos/hs_iii_model.json +1 -0
- molSimplify/tf_nn/geos/hs_iii_vars.csv +154 -0
- molSimplify/tf_nn/geos/ls_ii_bl_x.csv +1374 -0
- molSimplify/tf_nn/geos/ls_ii_bl_y.csv +1374 -0
- molSimplify/tf_nn/geos/ls_ii_model.h5 +0 -0
- molSimplify/tf_nn/geos/ls_ii_model.json +1 -0
- molSimplify/tf_nn/geos/ls_ii_vars.csv +154 -0
- molSimplify/tf_nn/geos/ls_iii_bl_x.csv +1364 -0
- molSimplify/tf_nn/geos/ls_iii_bl_y.csv +1364 -0
- molSimplify/tf_nn/geos/ls_iii_model.h5 +0 -0
- molSimplify/tf_nn/geos/ls_iii_model.json +1 -0
- molSimplify/tf_nn/geos/ls_iii_vars.csv +154 -0
- molSimplify/tf_nn/homolumo/gap_model.h5 +0 -0
- molSimplify/tf_nn/homolumo/gap_model.json +1 -0
- molSimplify/tf_nn/homolumo/gap_test_names.csv +175 -0
- molSimplify/tf_nn/homolumo/gap_test_x.csv +176 -0
- molSimplify/tf_nn/homolumo/gap_test_y.csv +176 -0
- molSimplify/tf_nn/homolumo/gap_train_names.csv +699 -0
- molSimplify/tf_nn/homolumo/gap_train_x.csv +700 -0
- molSimplify/tf_nn/homolumo/gap_train_y.csv +700 -0
- molSimplify/tf_nn/homolumo/gap_vars.csv +153 -0
- molSimplify/tf_nn/homolumo/homo_model.h5 +0 -0
- molSimplify/tf_nn/homolumo/homo_model.json +126 -0
- molSimplify/tf_nn/homolumo/homo_test_names.csv +175 -0
- molSimplify/tf_nn/homolumo/homo_test_x.csv +176 -0
- molSimplify/tf_nn/homolumo/homo_test_y.csv +176 -0
- molSimplify/tf_nn/homolumo/homo_train_names.csv +699 -0
- molSimplify/tf_nn/homolumo/homo_train_x.csv +700 -0
- molSimplify/tf_nn/homolumo/homo_train_y.csv +700 -0
- molSimplify/tf_nn/homolumo/homo_vars.csv +153 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_info.json +7 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_model.h5 +0 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_model.json +1 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_test_names.csv +143 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_test_x.csv +144 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_test_y.csv +144 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_train_names.csv +513 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_train_x.csv +514 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_train_y.csv +514 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_val_names.csv +143 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_val_x.csv +58 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_val_y.csv +58 -0
- molSimplify/tf_nn/oxoandhomo/homo_empty_vars.csv +155 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_info.json +7 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_model.h5 +0 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_model.json +1 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_test_names.csv +143 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_test_x.csv +144 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_test_y.csv +144 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_train_names.csv +513 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_train_x.csv +514 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_train_y.csv +514 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_val_names.csv +143 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_val_x.csv +58 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_val_y.csv +58 -0
- molSimplify/tf_nn/oxoandhomo/oxo20_vars.csv +154 -0
- molSimplify/tf_nn/oxocatalysis/hat_model.h5 +0 -0
- molSimplify/tf_nn/oxocatalysis/hat_model.json +1 -0
- molSimplify/tf_nn/oxocatalysis/hat_test_names.csv +419 -0
- molSimplify/tf_nn/oxocatalysis/hat_test_x.csv +420 -0
- molSimplify/tf_nn/oxocatalysis/hat_test_y.csv +420 -0
- molSimplify/tf_nn/oxocatalysis/hat_train_names.csv +1507 -0
- molSimplify/tf_nn/oxocatalysis/hat_train_x.csv +1508 -0
- molSimplify/tf_nn/oxocatalysis/hat_train_y.csv +1508 -0
- molSimplify/tf_nn/oxocatalysis/hat_val_x.csv +169 -0
- molSimplify/tf_nn/oxocatalysis/hat_val_y.csv +169 -0
- molSimplify/tf_nn/oxocatalysis/hat_vars.csv +162 -0
- molSimplify/tf_nn/oxocatalysis/oxo_model.h5 +0 -0
- molSimplify/tf_nn/oxocatalysis/oxo_model.json +1 -0
- molSimplify/tf_nn/oxocatalysis/oxo_test_names.csv +527 -0
- molSimplify/tf_nn/oxocatalysis/oxo_test_x.csv +528 -0
- molSimplify/tf_nn/oxocatalysis/oxo_test_y.csv +528 -0
- molSimplify/tf_nn/oxocatalysis/oxo_train_names.csv +1897 -0
- molSimplify/tf_nn/oxocatalysis/oxo_train_x.csv +1898 -0
- molSimplify/tf_nn/oxocatalysis/oxo_train_y.csv +1898 -0
- molSimplify/tf_nn/oxocatalysis/oxo_val_x.csv +212 -0
- molSimplify/tf_nn/oxocatalysis/oxo_val_y.csv +212 -0
- molSimplify/tf_nn/oxocatalysis/oxo_vars.csv +162 -0
- molSimplify/tf_nn/rescaling_data/gap_mean_x.csv +153 -0
- molSimplify/tf_nn/rescaling_data/gap_mean_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/gap_var_x.csv +153 -0
- molSimplify/tf_nn/rescaling_data/gap_var_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/geo_static_clf_mean_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/geo_static_clf_mean_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/geo_static_clf_var_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/geo_static_clf_var_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/hat_mean_x.csv +162 -0
- molSimplify/tf_nn/rescaling_data/hat_mean_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/hat_var_x.csv +162 -0
- molSimplify/tf_nn/rescaling_data/hat_var_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/homo_empty_mean_x.csv +155 -0
- molSimplify/tf_nn/rescaling_data/homo_empty_mean_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/homo_empty_var_x.csv +155 -0
- molSimplify/tf_nn/rescaling_data/homo_empty_var_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/homo_mean_x.csv +153 -0
- molSimplify/tf_nn/rescaling_data/homo_mean_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/homo_var_x.csv +153 -0
- molSimplify/tf_nn/rescaling_data/homo_var_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/hs_ii_mean_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/hs_ii_mean_y.csv +3 -0
- molSimplify/tf_nn/rescaling_data/hs_ii_var_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/hs_ii_var_y.csv +3 -0
- molSimplify/tf_nn/rescaling_data/hs_iii_mean_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/hs_iii_mean_y.csv +3 -0
- molSimplify/tf_nn/rescaling_data/hs_iii_var_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/hs_iii_var_y.csv +3 -0
- molSimplify/tf_nn/rescaling_data/ls_ii_mean_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/ls_ii_mean_y.csv +3 -0
- molSimplify/tf_nn/rescaling_data/ls_ii_var_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/ls_ii_var_y.csv +3 -0
- molSimplify/tf_nn/rescaling_data/ls_iii_mean_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/ls_iii_mean_y.csv +3 -0
- molSimplify/tf_nn/rescaling_data/ls_iii_var_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/ls_iii_var_y.csv +3 -0
- molSimplify/tf_nn/rescaling_data/oxo20_mean_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/oxo20_mean_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/oxo20_var_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/oxo20_var_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/oxo_mean_x.csv +162 -0
- molSimplify/tf_nn/rescaling_data/oxo_mean_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/oxo_var_x.csv +162 -0
- molSimplify/tf_nn/rescaling_data/oxo_var_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/sc_static_clf_mean_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/sc_static_clf_mean_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/sc_static_clf_var_x.csv +154 -0
- molSimplify/tf_nn/rescaling_data/sc_static_clf_var_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/split_mean_x.csv +155 -0
- molSimplify/tf_nn/rescaling_data/split_mean_y.csv +1 -0
- molSimplify/tf_nn/rescaling_data/split_var_x.csv +155 -0
- molSimplify/tf_nn/rescaling_data/split_var_y.csv +1 -0
- molSimplify/tf_nn/sc_static_clf/sc_static_clf_model.h5 +0 -0
- molSimplify/tf_nn/sc_static_clf/sc_static_clf_train_name.csv +1591 -0
- molSimplify/tf_nn/sc_static_clf/sc_static_clf_train_x.csv +1592 -0
- molSimplify/tf_nn/sc_static_clf/sc_static_clf_train_y.csv +1592 -0
- molSimplify/tf_nn/sc_static_clf/sc_static_clf_vars.csv +154 -0
- molSimplify/tf_nn/split/split_model.h5 +0 -0
- molSimplify/tf_nn/split/split_model.json +1 -0
- molSimplify/tf_nn/split/split_vars.csv +155 -0
- molSimplify/tf_nn/split/split_x.csv +1902 -0
- molSimplify/tf_nn/split/split_y.csv +1902 -0
- molSimplify/tf_nn/split/train_names.csv +1901 -0
- molSimplify/utils/__init__.py +0 -0
- molSimplify/utils/decorators.py +16 -0
- molSimplify/utils/metaclasses.py +12 -0
- molSimplify/utils/tensorflow.py +23 -0
- molSimplify/utils/timer.py +16 -0
- molSimplify-1.7.4.dist-info/LICENSE +674 -0
- molSimplify-1.7.4.dist-info/METADATA +821 -0
- molSimplify-1.7.4.dist-info/RECORD +651 -0
- molSimplify-1.7.4.dist-info/WHEEL +5 -0
- molSimplify-1.7.4.dist-info/entry_points.txt +3 -0
- molSimplify-1.7.4.dist-info/top_level.txt +4 -0
- tests/generateTests.py +122 -0
- tests/helperFuncs.py +658 -0
- tests/informatics/test_MOF_descriptors.py +128 -0
- tests/informatics/test_active_learning.py +113 -0
- tests/informatics/test_coulomb_analyze.py +24 -0
- tests/informatics/test_graph_racs.py +193 -0
- tests/ml/test_kernels.py +20 -0
- tests/ml/test_layers.py +47 -0
- tests/runtest.py +10 -0
- tests/test_Mol2D.py +128 -0
- tests/test_basic_imports.py +62 -0
- tests/test_bidentate.py +25 -0
- tests/test_cli.py +20 -0
- tests/test_distgeom.py +106 -0
- tests/test_example_1.py +29 -0
- tests/test_example_3.py +31 -0
- tests/test_example_5.py +43 -0
- tests/test_example_7.py +28 -0
- tests/test_example_8.py +15 -0
- tests/test_example_tbp.py +15 -0
- tests/test_ff_xtb.py +111 -0
- tests/test_geocheck_oct.py +26 -0
- tests/test_geocheck_one_empty.py +15 -0
- tests/test_geometry.py +44 -0
- tests/test_inparse.py +76 -0
- tests/test_io.py +84 -0
- tests/test_jobgen.py +84 -0
- tests/test_joption_pythonic.py +27 -0
- tests/test_ligand_assign.py +58 -0
- tests/test_ligand_assign_consistent.py +60 -0
- tests/test_ligand_class.py +26 -0
- tests/test_ligand_from_mol_file.py +35 -0
- tests/test_ligands.py +86 -0
- tests/test_mol3D.py +337 -0
- tests/test_molcas_caspt2.py +15 -0
- tests/test_molcas_casscf.py +15 -0
- tests/test_old_ANNs.py +68 -0
- tests/test_orca_ccsdt.py +15 -0
- tests/test_orca_dft.py +15 -0
- tests/test_qcgen.py +50 -0
- tests/test_racs.py +124 -0
- tests/test_rmsd.py +68 -0
- tests/test_structgen_functions.py +198 -0
- tests/test_tetrahedral.py +29 -0
- tests/test_tutorial_10_part_one.py +16 -0
- tests/test_tutorial_10_part_two.py +15 -0
- tests/test_tutorial_2.py +11 -0
- tests/test_tutorial_3.py +15 -0
- tests/test_tutorial_4.py +57 -0
- tests/test_tutorial_6.py +10 -0
- tests/test_tutorial_8.py +29 -0
- tests/test_tutorial_9_part_one.py +15 -0
- tests/test_tutorial_9_part_two.py +15 -0
- tests/test_tutorial_qm9_part_one.py +6 -0
- tests/testresources/refs/racs/generate_references.py +85 -0
- workflows/NandyJACSAu2022/bridge_functionalizer.py +253 -0
- workflows/NandyJACSAu2022/frag_functionalizer.py +242 -0
- workflows/NandyJACSAu2022/fragment_classes.py +586 -0
- workflows/NandyJACSAu2022/macrocycle_synthesis.py +179 -0
|
@@ -0,0 +1,762 @@
|
|
|
1
|
+
# Written by JP Janet for HJK Group
|
|
2
|
+
# Dept. of Chemical Engineering, MIT
|
|
3
|
+
|
|
4
|
+
##########################################################
|
|
5
|
+
######## This script contains a neural network ##########
|
|
6
|
+
##### trained on octahedral metal-ligand #######
|
|
7
|
+
######## bond distances and spin propensity ###########
|
|
8
|
+
##########################################################
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
import csv
|
|
12
|
+
import glob
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
import pandas as pd
|
|
18
|
+
import scipy
|
|
19
|
+
from typing import List, Tuple, Union, Optional
|
|
20
|
+
from tensorflow.keras import backend as K
|
|
21
|
+
from tensorflow.keras.models import model_from_json, load_model
|
|
22
|
+
from importlib_resources import files as resource_files
|
|
23
|
+
from packaging import version
|
|
24
|
+
import tensorflow as tf
|
|
25
|
+
|
|
26
|
+
from molSimplify.python_nn.clf_analysis_tool import array_stack, get_layer_outputs, dist_neighbor, get_entropy
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def perform_ANN_prediction(RAC_dataframe: pd.DataFrame, predictor_name: str,
                           RAC_column: str = 'RACs') -> pd.DataFrame:
    '''
    Run a normalized/rescaled ANN prediction for the property `predictor_name`.

    Also computes, for every row, the latent vector (output of the second-to-last
    layer) and the smallest Euclidean latent distance to the training data.

    Inputs:
        RAC_dataframe: dataframe that holds the required RAC features either as
            individual columns or as dictionaries in the column `RAC_column`.
            Other columns are carried through untouched.
        predictor_name: name such as ls_ii, hs_iii, homo, oxo, hat, etc.
        RAC_column: column holding serialized RAC dictionaries (used only when
            individual feature columns are missing).

    Output: RAC_dataframe joined with the new columns
        - <predictor_name>_latent_vector
        - <predictor_name>_min_latent_distance
        - <predictor_name>_prediction

    Raises ValueError if required features are missing (also after deserializing
    `RAC_column`) or if any `alpha` value is larger than 1.
    '''
    assert type(RAC_dataframe) is pd.DataFrame
    train_vars = load_ANN_variables(predictor_name)
    train_mean_x, train_mean_y, train_var_x, train_var_y = load_normalization_data(predictor_name)
    my_ANN = load_keras_ann(predictor_name)

    # Check if any RAC elements are missing from the provided dataframe
    missing_labels = [i for i in train_vars if i not in RAC_dataframe.columns]
    if missing_labels:
        if RAC_column not in RAC_dataframe.columns:
            raise ValueError('Please supply missing variables in your RAC dataframe: %s' % missing_labels)
        # Deserialize the dictionaries stored in `RAC_column` into feature columns.
        deserialized_RACs = pd.DataFrame.from_records(RAC_dataframe[RAC_column].values,
                                                      index=RAC_dataframe.index.values)
        deserialized_RACs = deserialized_RACs.astype(float)
        RAC_dataframe = RAC_dataframe.join(deserialized_RACs)
        # Re-check once instead of recursing: a second pass would only try to
        # join the same columns again and fail with an unrelated overlap error.
        missing_labels = [i for i in train_vars if i not in RAC_dataframe.columns]
        if missing_labels:
            raise ValueError('Please supply missing variables in your RAC dataframe: %s' % missing_labels)

    if 'alpha' in train_vars:
        if any(RAC_dataframe.alpha > 1):
            raise ValueError('Alpha is too large - should be between 0 and 1.')

    RAC_subset_for_ANN = RAC_dataframe.loc[:, train_vars].astype(float)
    normalized_input = data_normalize(RAC_subset_for_ANN, train_mean_x, train_var_x)
    ANN_prediction = my_ANN.predict(normalized_input, verbose=0)
    rescaled_output = data_rescale(ANN_prediction, train_mean_y, train_var_y)

    # Get latent vectors for training data and queried data
    # NOTE(review): other latent-space helpers in this module switch to
    # get_layer_outputs for TF >= 2; this one always uses K.function with
    # K.learning_phase() - confirm this works on the supported TF versions.
    train_x = pd.DataFrame(load_training_data(predictor_name), columns=train_vars).astype(float)
    get_outputs = K.function([my_ANN.layers[0].input, K.learning_phase()],
                             [my_ANN.layers[len(my_ANN.layers) - 2].output])
    normalized_train = data_normalize(train_x, train_mean_x, train_var_x)
    training_latent = get_outputs([normalized_train, 0])[0]  # 0 -> test phase
    query_latent = get_outputs([normalized_input, 0])[0]

    # Append all results to dataframe
    results_list = []
    for i in range(len(RAC_dataframe)):
        min_latent_distance = min(np.linalg.norm(training_latent - query_latent[i][:], axis=1))
        output_value = rescaled_output[i]
        if len(output_value) == 1:  # squash array of length 1 to the value it contains
            output_value = output_value[0]
        results_list.append({
            '%s_latent_vector' % predictor_name: query_latent[i],
            '%s_min_latent_distance' % predictor_name: min_latent_distance,
            '%s_prediction' % predictor_name: output_value,
        })
    results_df = pd.DataFrame(results_list, index=RAC_dataframe.index)
    return RAC_dataframe.join(results_df)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def get_error_params(latent_distances, errors):
    '''
    Get the maximum-likelihood parameters for an error model N(a+b*(latent_distance)).
    Inputs: latent_distances (vector), errors (vector)
    Output: [a, b]

    The errors are modelled as zero-mean normal with standard deviation
    a + b * latent_distance; both a and b are bounded below by 1e-9.
    NaN log-density terms are ignored (np.nansum).
    '''
    # Import the submodules explicitly: a bare `import scipy` does not
    # guarantee that `scipy.stats` / `scipy.optimize` are loaded.
    from scipy import optimize, stats

    def negative_log_likelihood(params):
        a = params[0]
        b = params[1]
        return -np.nansum(stats.norm.logpdf(errors, loc=0, scale=a + latent_distances * b))

    results = optimize.minimize(negative_log_likelihood, np.array([0.2, 0.01]),
                                bounds=[(1e-9, None), (1e-9, None)])
    return results.x
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def matrix_loader(path: str, rownames: bool = False) -> Union[Tuple[List[List[str]], List[str]], List[List[str]]]:
    '''
    Load a comma-separated matrix from the molSimplify.python_nn package data.

    Leading/trailing slashes are stripped from `path` before it is joined to
    the package directory. With rownames=True the first entry of every row is
    split off and (matrix, row_names) is returned; otherwise only the matrix
    (list of rows of strings) is returned.
    '''
    matrix_path = resource_files("molSimplify.python_nn").joinpath(path.strip("/"))
    with open(matrix_path, 'r') as handle:
        rows = list(csv.reader(handle, delimiter=','))
    if not rownames:
        return rows
    row_names = [entries[0] for entries in rows]
    mat = [entries[1:] for entries in rows]
    return mat, row_names
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def get_key(predictor: str, suffix: Optional[str] = None) -> str:
    '''
    Build the resource key (relative path stem) for a predictor.

    Predictors that share a data folder (geos, homolumo, oxocatalysis,
    oxoandhomo) map to that folder; every other predictor uses a folder
    named after itself.

    With a non-empty suffix: '<folder>/<predictor>_<suffix>'.
    Without a suffix: '<folder>/' for the known predictors (including the
    static classifiers) and the bare predictor name otherwise.
    '''
    shared_folders = {
        'ls_ii': 'geos', 'hs_ii': 'geos', 'ls_iii': 'geos', 'hs_iii': 'geos',
        'homo': 'homolumo', 'gap': 'homolumo',
        'oxo': 'oxocatalysis', 'hat': 'oxocatalysis',
        'oxo20': 'oxoandhomo', 'homo_empty': 'oxoandhomo',
    }
    folder = shared_folders.get(predictor)
    if suffix:
        if folder is None:
            folder = predictor
        return folder + '/' + predictor + '_%s' % suffix
    if folder is not None:
        return folder + '/'
    if predictor in ('geo_static_clf', 'sc_static_clf'):
        # Previously this formatted the missing suffix into the key
        # ('<predictor>/<predictor>_None'); return the folder prefix instead.
        return predictor + '/'
    return predictor
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def data_rescale(scaled_dat, train_mean, train_var, debug=False) -> np.ndarray:
    '''
    Undo standardization: multiply by sqrt(train_var) and add train_mean.

    The arithmetic is done on the transpose of `scaled_dat` and the result is
    transposed back, so `train_mean` / `train_var` broadcast against the
    transposed data.
    '''
    n_dims = np.shape(train_mean)[0]
    if debug:
        print(('unnormalizing with number of dimensions = ' + str(n_dims)))
    train_std = np.sqrt(train_var)
    rescaled = np.multiply(scaled_dat.T, train_std) + train_mean
    return rescaled.T
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def data_normalize(data, train_mean, train_var, debug=False) -> np.ndarray:
    '''
    Standardize `data` with the training mean and variance.

    Features whose training variance is below 1e-16 are removed from the
    data (axis 1) and from train_mean / train_var (axis 0) before scaling,
    and a note is printed. The scaling itself is
    ((data.T - train_mean) / sqrt(train_var)).T.
    '''
    data = data.astype(float)  # Make sure the data is always in float form
    d = np.shape(train_mean)[0]

    if debug:
        print(('normalizing with number of dimensions = ' + str(d)))
        print('shape of things in normalize:')
        print(('data.shape ' + str(data.shape)))
        print(('train_mean.shape ' + str(train_mean.shape)))
        print(('train_var.shape ' + str(train_var.shape)))

    # Double check the variance in the training data. atleast_1d keeps a
    # single-feature variance iterable (squeeze alone would give a 0-d array).
    variances = np.atleast_1d(np.squeeze(train_var))
    delete_ind = [idx for idx, var in enumerate(variances) if var < 1e-16]
    if delete_ind:
        print(('Note: There are %d features with a variance smaller than 1e-16.' % len(delete_ind)))
        print('Please double check your input data if this number is not what you expect...')
        data = np.delete(data, delete_ind, axis=1)
        train_mean = np.delete(train_mean, delete_ind, axis=0)
        train_var = np.delete(train_var, delete_ind, axis=0)

    scaled_dat = np.divide((data.T - train_mean), np.sqrt(train_var), ).T
    return scaled_dat
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def load_normalization_data(name: str):
    '''
    Load the training mean/variance of features (x) and labels (y) for `name`.

    If rescaling_data/<name>_mean_x.csv exists in molSimplify.tf_nn, the four
    files <name>_{mean_x,var_x,mean_y,var_y}.csv are read (one value per line,
    optional surrounding brackets). Otherwise the statistics are computed from
    the training data via get_data_mean_std.

    Returns (train_mean_x, train_mean_y, train_var_x, train_var_y) as arrays.
    '''
    package_dir = resource_files("molSimplify.tf_nn")

    def read_column(statistic):
        # One float per line, each stored as a single-element row.
        stat_path = package_dir.joinpath(f'rescaling_data/{name}_{statistic}.csv')
        with open(stat_path, 'r') as handle:
            return [[float(entry.strip().strip('[]'))] for entry in handle.readlines()]

    # Only the presence of the mean_x file decides which source is used.
    if os.path.isfile(package_dir.joinpath(f'rescaling_data/{name}_mean_x.csv')):
        train_mean_x = read_column('mean_x')
        train_var_x = read_column('var_x')
        train_mean_y = read_column('mean_y')
        train_var_y = read_column('var_y')
    else:
        print('---Mean and Variance information do not exist. Calculate from training data...---')
        train_mean_x, train_mean_y, train_var_x, train_var_y = get_data_mean_std(predictor=name)

    return np.array(train_mean_x), np.array(train_mean_y), np.array(train_var_x), np.array(train_var_y)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def get_data_mean_std(predictor: str):
    '''
    Compute per-column mean and variance of the training features and labels.

    The feature file is '<stem>_x.csv' and the label file '<stem>_y.csv' in
    molSimplify.tf_nn, where the stem depends on the predictor family.

    Returns (train_mean_x, train_mean_y, train_var_x, train_var_y), each a
    list of single-element lists (one entry per CSV column).
    '''
    if predictor in ('ls_ii', 'hs_ii', 'ls_iii', 'hs_iii'):
        stem = 'geos/' + predictor + '_bl'
    elif predictor in ('homo', 'gap'):
        stem = 'homolumo/' + predictor + '_train'
    elif predictor in ('oxo', 'hat'):
        stem = 'oxocatalysis/' + predictor + '_train'
    elif predictor in ('oxo20', 'homo_empty'):
        stem = 'oxoandhomo/' + predictor + '_train'
    elif predictor in ('geo_static_clf', 'sc_static_clf'):
        stem = predictor + '/' + predictor + '_train'
    else:
        # Covers "split" and any other predictor with its own folder.
        stem = predictor + '/' + predictor

    def column_statistics(csv_key):
        frame = pd.read_csv(resource_files("molSimplify.tf_nn").joinpath(f'{csv_key}.csv'))
        means, variances = [], []
        for column in frame:
            values = np.array(frame[column])
            means.append([np.mean(values)])
            variances.append([np.var(values)])
        return means, variances

    train_mean_x, train_var_x = column_statistics(stem + '_x')
    # labels
    train_mean_y, train_var_y = column_statistics(stem + '_y')
    return train_mean_x, train_mean_y, train_var_x, train_var_y
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def load_ANN_variables(predictor: str, suffix: str = 'vars') -> List[str]:
    '''
    Return the variable names stored in '<key>.csv' for this predictor,
    one name per line with surrounding whitespace stripped.
    '''
    variables_path = resource_files("molSimplify.tf_nn").joinpath(f'{get_key(predictor, suffix)}.csv')
    with open(variables_path, 'r') as handle:
        return [entry.strip() for entry in handle.readlines()]
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def load_training_data(predictor: str) -> List[List[str]]:
    '''
    Read the training feature CSV for `predictor` from molSimplify.tf_nn.

    Returns the rows as lists of strings; the first (header) row is dropped.
    '''
    templates = {}
    templates.update(dict.fromkeys(('ls_ii', 'hs_ii', 'ls_iii', 'hs_iii'), 'geos/{0}_bl_x'))
    templates.update(dict.fromkeys(('homo', 'gap'), 'homolumo/{0}_train_x'))
    templates.update(dict.fromkeys(('oxo', 'hat'), 'oxocatalysis/{0}_train_x'))
    templates.update(dict.fromkeys(('oxo20', 'homo_empty'), 'oxoandhomo/{0}_train_x'))
    templates.update(dict.fromkeys(('geo_static_clf', 'sc_static_clf'), '{0}/{0}_train_x'))
    # "split" and all remaining predictors live in '<predictor>/<predictor>_x'.
    key = templates.get(predictor, '{0}/{0}_x').format(predictor)

    data_path = resource_files("molSimplify.tf_nn").joinpath(f'{key}.csv')
    with open(data_path, "r") as handle:
        rows = list(csv.reader(handle))
    return rows[1:]
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def load_latent_training_data(predictor):
    '''
    Read the latent-space training CSV for `predictor` from molSimplify.tf_nn.

    Returns the rows as lists of strings with the first (header) row dropped.
    NOTE: per the original author, latent training data is currently not
    available.
    '''
    geo_models = ('ls_ii', 'hs_ii', 'ls_iii', 'hs_iii')
    shared_train_folders = {'homo': 'homolumo', 'gap': 'homolumo',
                            'oxo': 'oxocatalysis', 'hat': 'oxocatalysis',
                            'oxo20': 'oxoandhomo', 'homo_empty': 'oxoandhomo',
                            'geo_static_clf': 'geo_static_clf',
                            'sc_static_clf': 'sc_static_clf'}
    if predictor in geo_models:
        key = f'geos/{predictor}_latent_bl_x'
    elif predictor in shared_train_folders:
        key = f'{shared_train_folders[predictor]}/{predictor}_latent_train_x'
    elif predictor == "split":
        key = f'{predictor}/{predictor}_latent_x_41_OHE'
    else:
        key = f'{predictor}/{predictor}_latent_x_OHE'

    latent_path = resource_files("molSimplify.tf_nn").joinpath(f'{key}.csv')
    with open(latent_path, "r") as handle:
        rows = list(csv.reader(handle))
    return rows[1:]
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def load_test_data(predictor):
    '''
    Read the test feature CSV for `predictor` from molSimplify.tf_nn.

    Returns the rows as lists of strings with the first (header) row dropped.
    For the geometry models, "split" and unknown predictors no separate test
    set is available, so the training features are returned instead.
    '''
    if predictor in ['ls_ii', 'hs_ii', 'ls_iii', 'hs_iii']:
        key = 'geos/' + predictor + '_bl_x'  # Note, this test data is not available, will return train.
    elif predictor in ['homo', 'gap']:
        key = 'homolumo/' + predictor + '_test_x'
    elif predictor in ['oxo', 'hat']:
        key = 'oxocatalysis/' + predictor + '_test_x'
    elif predictor in ['oxo20', 'homo_empty']:
        # Same folder the other loaders (e.g. load_test_labels) use for these
        # predictors; previously they fell through to '<predictor>/<predictor>_x'.
        # NOTE(review): assumes '<predictor>_test_x.csv' ships next to the
        # '_test_y.csv' file - confirm against the packaged data.
        key = 'oxoandhomo/' + predictor + '_test_x'
    elif predictor in ['geo_static_clf', 'sc_static_clf']:
        key = predictor + '/' + predictor + '_test_x'
    else:
        # Includes "split". Note, this test data is not available, will return train.
        key = predictor + '/' + predictor + '_x'
    path_to_file = resource_files("molSimplify.tf_nn").joinpath(f'{key}.csv')
    with open(path_to_file, "r") as f:
        csv_lines = list(csv.reader(f))
    return csv_lines[1:]
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def load_training_labels(predictor: str) -> List[List[str]]:
    '''
    Read the training label CSV for `predictor` from molSimplify.tf_nn.

    Returns the rows as lists of strings; the first (header) row is dropped.
    '''
    # predictor -> (folder, file-name tail); anything not listed (including
    # "split") uses its own folder and the plain '_y' tail.
    locations = {name: ('geos', '_bl_y') for name in ('ls_ii', 'hs_ii', 'ls_iii', 'hs_iii')}
    locations.update({name: ('homolumo', '_train_y') for name in ('homo', 'gap')})
    locations.update({name: ('oxocatalysis', '_train_y') for name in ('oxo', 'hat')})
    locations.update({name: ('oxoandhomo', '_train_y') for name in ('oxo20', 'homo_empty')})
    locations.update({name: (name, '_train_y') for name in ('geo_static_clf', 'sc_static_clf')})
    folder, tail = locations.get(predictor, (predictor, '_y'))
    key = folder + '/' + predictor + tail

    label_path = resource_files("molSimplify.tf_nn").joinpath(f'{key}.csv')
    with open(label_path, "r") as handle:
        rows = list(csv.reader(handle))
    return rows[1:]
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def load_test_labels(predictor: str) -> List[List[str]]:
    '''
    Read the test label CSV for `predictor` from molSimplify.tf_nn.

    Returns the rows as lists of strings; the first (header) row is dropped.
    The geometry models, "split" and unknown predictors use the same key as
    the training labels ('_bl_y' / '_y').
    '''
    if predictor in ['ls_ii', 'hs_ii', 'ls_iii', 'hs_iii']:
        key = 'geos/' + predictor + '_bl_y'
    elif predictor in ['homo', 'gap']:
        key = 'homolumo/' + predictor + '_test_y'
    elif predictor in ['oxo', 'hat']:
        key = 'oxocatalysis/' + predictor + '_test_y'
    elif predictor in ['oxo20', 'homo_empty']:
        key = 'oxoandhomo/' + predictor + '_test_y'
    elif predictor in ['geo_static_clf', 'sc_static_clf']:
        key = predictor + '/' + predictor + '_test_y'
    else:
        # Includes "split".
        key = predictor + '/' + predictor + '_y'
    path_to_file = resource_files("molSimplify.tf_nn").joinpath(f'{key}.csv')
    # Plain "r": the "U" flag was removed in Python 3.11 (open() raises
    # ValueError) and universal newlines are the text-mode default anyway.
    with open(path_to_file, "r") as f:
        csv_lines = list(csv.reader(f))
    return csv_lines[1:]
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def load_train_info(predictor: str, suffix: str = 'info') -> dict:
    '''
    Load and return the JSON document stored at '<key>.json' in
    molSimplify.tf_nn, where the key comes from get_key(predictor, suffix).
    '''
    info_path = resource_files("molSimplify.tf_nn").joinpath(f'{get_key(predictor, suffix)}.json')
    with open(info_path, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def load_keras_ann(predictor: str, suffix: str = 'model', compile: bool = False) -> tf.keras.Model:
    '''
    Load the Keras ANN for the property `predictor`.

    Classifier predictors (name contains "clf") are loaded as a full model
    from '<key>.h5'. All others are built from the architecture in
    '<key>.json' and then get their weights from '<key>.h5'.
    When `compile` is True the model is compiled with predictor-specific
    Adam hyperparameters (plain 'adam' for predictors without an entry).
    '''
    # disable TF output text to reduce console spam
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    key = get_key(predictor, suffix)
    if "clf" in predictor:
        loaded_model = load_model(resource_files("molSimplify.tf_nn").joinpath(f'{key}.h5'))
    else:
        architecture_path = resource_files("molSimplify.tf_nn").joinpath(f'{key}.json')
        with open(architecture_path, 'r') as json_file:
            loaded_model = model_from_json(json_file.read())
        # load weights into model
        loaded_model.load_weights(resource_files("molSimplify.tf_nn").joinpath(f'{key}.h5'))

    if not compile:
        return loaded_model

    from tensorflow.keras.optimizers.legacy import Adam

    # oxo/hat reuse the oxo20 hyperparameters; their own earlier set was
    # decommissioned on 06/20/2019 by Aditya.
    oxo20_settings = dict(lr=0.0012838133056087084, beta_1=0.9811686522122317,
                          beta_2=0.8264616523572279, decay=0.0005114008091318582)
    regression_settings = {
        'homo': dict(beta_2=1 - 0.0016204733101599046, beta_1=0.8718839135783554,
                     decay=7.770243145972892e-05, lr=0.0004961686075897741),
        'gap': dict(beta_2=1 - 0.00010929248596488832, beta_1=0.8406735969305784,
                    decay=0.00011224350434148253, lr=0.0006759924688701965),
        'oxo': oxo20_settings,
        'hat': oxo20_settings,
        'oxo20': oxo20_settings,
        'homo_empty': dict(lr=0.006677578283098809, beta_1=0.8556594887870226,
                           beta_2=0.9463468021275508, decay=0.0006621877134674607),
    }
    if predictor in ('geo_static_clf', 'sc_static_clf'):
        loaded_model.compile(loss='binary_crossentropy',
                             optimizer=Adam(lr=0.00005, beta_1=0.95, decay=0.0001, amsgrad=True),
                             metrics=['accuracy'])
    elif predictor in regression_settings:
        loaded_model.compile(loss="mse", optimizer=Adam(**regression_settings[predictor]),
                             metrics=['mse', 'mae', 'mape'])
    else:
        loaded_model.compile(loss="mse", optimizer='adam',
                             metrics=['mse', 'mae', 'mape'])
    return loaded_model
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
def tf_ANN_excitation_prepare(predictor: str, descriptors: List[float], descriptor_names: List[str]) -> np.ndarray:
    '''
    Reorder the provided descriptors to match the variables the target ANN expects.

    The result has shape (1, number of model variables). This function does
    NOT perform standardization.

    Raises ValueError when a variable required by the model is not present in
    `descriptor_names`.
    '''
    # get variable names
    target_names = load_ANN_variables(predictor)
    if len(target_names) > len(descriptors):
        print('Error: preparing features for ' + str(predictor) + ', recieved '
              + str(len(descriptors)) + ' descriptors')
        print(('model requires ' + str(len(target_names)) + ' descriptors, attempting match'))

    # Map each name to its first position once instead of scanning the name
    # list for every model variable.
    first_position = {}
    for position, name in enumerate(descriptor_names):
        first_position.setdefault(name, position)

    excitation = []
    for var_name in target_names:
        if var_name not in first_position:
            print(('looking for ' + str(var_name)))
            print(('Error! variable ' + str(var_name) + ' not found!'))
            # Fail here with a clear message; continuing would only fail
            # later in the reshape with an unrelated size error.
            raise ValueError('variable ' + str(var_name) + ' required by predictor '
                             + str(predictor) + ' was not found in descriptor_names')
        excitation.append(descriptors[first_position[var_name]])
    output = np.array(excitation)
    output = np.reshape(output, (1, len(target_names)))
    return output
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def ANN_supervisor(predictor: str,
                   descriptors: List[float],
                   descriptor_names: List[str],
                   debug: bool = False) -> Tuple[np.ndarray, np.ndarray]:
    '''
    Run the ANN for `predictor` on one set of descriptors.

    The descriptors are reordered for the model, normalized with the training
    statistics, passed through the network, and the output is rescaled.

    Returns (result, latent_space_vector). For non-classifier predictors the
    latent vector is the output of the second-to-last layer; for classifier
    predictors (name contains "clf") it is whatever find_clf_lse returns.
    '''
    if debug:
        print(('ANN activated for ' + str(predictor)))

    # form the excitation in the correct order/variables
    excitation = tf_ANN_excitation_prepare(predictor, descriptors, descriptor_names)
    if debug:
        print(('excitation is ' + str(excitation.shape)))
        print('fetching non-dimensionalization data... ')
    train_mean_x, train_mean_y, train_var_x, train_var_y = load_normalization_data(predictor)
    if debug:
        print('rescaling input excitation...')
    excitation = data_normalize(excitation, train_mean_x, train_var_x, debug=debug)

    # fetch ANN and predict
    loaded_model = load_keras_ann(predictor)
    raw_prediction = loaded_model.predict(excitation, verbose=0)
    result = data_rescale(raw_prediction, train_mean_y, train_var_y, debug=debug)

    if "clf" in predictor:
        latent_space_vector = find_clf_lse(predictor, excitation, loaded_model=loaded_model, ensemble=False,
                                           modelname=None, debug=debug)
        return result, latent_space_vector

    n_layers = len(loaded_model.layers)
    if debug:
        print(('LOADED MODEL HAS ' + str(n_layers)
               + ' layers, so latent space measure will be from first ' + str(n_layers - 1) + ' layers'))
    if version.parse(tf.__version__) >= version.parse('2.0.0'):
        latent_space_vector = get_layer_outputs(loaded_model, n_layers - 2,
                                                excitation, training_flag=False)
    else:
        get_outputs = K.function([loaded_model.layers[0].input, K.learning_phase()],
                                 [loaded_model.layers[n_layers - 2].output])
        latent_space_vector = get_outputs([excitation, 0])  # Using test phase.
    if debug:
        print('calling ANN model...')
    return result, latent_space_vector
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
def find_true_min_eu_dist(predictor: str,
                          descriptors: List[float],
                          descriptor_names: List[str],
                          debug: bool = False) -> float:
    '''
    Return the scaled Euclidean distance to the nearest training vector in
    descriptor space.

    Both the query and every training row are normalized with the training
    mean/variance before the distance is measured (changed by Aditya on
    08/13/2018; the nearest neighbor used to be searched in the unnormalized
    space, which gave bad candidate structures).
    For predictors with a '<predictor>_train_names.csv' file the entry at the
    index of the closest row is printed.
    '''
    train_mean_x, train_mean_y, train_var_x, train_var_y = load_normalization_data(predictor)

    # form the excitation in the correct order/variables
    excitation = tf_ANN_excitation_prepare(predictor, descriptors, descriptor_names)
    excitation = excitation.astype(float)  # ensure that the excitation is a float, and not strings
    scaled_excitation = data_normalize(excitation, train_mean_x, train_var_x, debug=debug)
    query_point = np.array(scaled_excitation)

    # getting train matrix info
    train_mat = np.array(load_training_data(predictor), dtype='float64')

    # scan all training rows for the closest one
    min_dist = np.inf
    min_ind = 0
    for row_index, train_row in enumerate(train_mat):
        normalized_row = data_normalize(train_row, train_mean_x.T, train_var_x.T, debug=debug)
        offset = np.subtract(np.squeeze(normalized_row), query_point)
        this_dist = float(np.linalg.norm(offset))  # Cast to float for mypy typing
        if this_dist < min_dist:
            min_dist, min_ind = this_dist, row_index

    if debug:
        print(('min dist EU is ' + str(min_dist)))

    folder_dict = {'homo': 'homolumo', 'gap': 'homolumo',
                   'oxo': 'oxocatalysis', 'hat': 'oxocatalysis',
                   'oxo20': 'oxoandhomo', 'homo_empty': 'oxoandhomo'}
    if predictor in folder_dict:
        names_key = f'{folder_dict[predictor]}/{predictor}_train_names'
        names_path = resource_files("molSimplify.tf_nn").joinpath(f'{names_key}.csv')
        with open(names_path, "r") as handle:
            name_rows = list(csv.reader(handle))
        # NOTE(review): min_ind indexes the training rows after their header
        # row was dropped; confirm the names file lines up with that index.
        closest_name = str(name_rows[min_ind]).strip('[]')
        print(('Closest Euc Dist Structure: ' + closest_name + ' for predictor ' + str(predictor)))

    return (min_dist)
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
def find_ANN_10_NN_normalized_latent_dist(predictor, latent_space_vector, debug=False):
    '''
    Average latent-space distance from a query to its 10 nearest training
    points, normalized by the corresponding train-train average.

    The training data is normalized, mapped to the output of the
    second-to-last layer of the ANN, and compared with `latent_space_vector`.

    Returns (norm_avg_10_NN_dist, avg_10_NN_dist, avg_traintrain) where
    avg_traintrain is the mean of the 10 smallest entries of every row of the
    train-train distance matrix (the zero self-distance is included).
    '''
    from scipy.spatial import distance_matrix

    train_mean_x, train_mean_y, train_var_x, train_var_y = load_normalization_data(predictor)

    # getting train matrix info
    train_mat = np.array(load_training_data(predictor), dtype='float64')

    # Load the model once; it is reused for the debug output and the latent map.
    loaded_model = load_keras_ann(predictor)
    latent_layer = len(loaded_model.layers) - 2
    if debug:
        print('measuring latent distances:')
        print(('loaded model has ' + str(
            len(loaded_model.layers)) + ' layers, so latent space measure will be from first ' + str(
            len(loaded_model.layers) - 1) + ' layers'))

    # Normalize row by row (statistics transposed to match a single row).
    norm_train_mat = np.squeeze(np.array(
        [data_normalize(row, train_mean_x.T, train_var_x.T) for row in train_mat]))

    if not version.parse(tf.__version__) >= version.parse('2.0.0'):
        get_outputs = K.function([loaded_model.layers[0].input, K.learning_phase()],
                                 [loaded_model.layers[latent_layer].output])
        latent_space_train = get_outputs([norm_train_mat, 0])  # 0 -> test phase
    else:
        latent_space_train = get_layer_outputs(loaded_model, latent_layer,
                                               norm_train_mat, training_flag=False)
    latent_space_train = np.squeeze(np.array(latent_space_train))

    # distances query -> every training point
    dist_array = np.linalg.norm(np.subtract(latent_space_train, np.squeeze(latent_space_vector)), axis=1)

    # 10 smallest train-train distances per training point
    train_dist_array = distance_matrix(latent_space_train, latent_space_train)
    nearest_10_NN_train = np.sort(train_dist_array, axis=1)[:, 0:10]
    avg_traintrain = np.mean(nearest_10_NN_train)

    sorted_dist = np.sort(np.squeeze(dist_array))
    avg_10_NN_dist = np.mean(sorted_dist[0:10])
    norm_avg_10_NN_dist = avg_10_NN_dist / avg_traintrain
    return norm_avg_10_NN_dist, avg_10_NN_dist, avg_traintrain
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
def find_ANN_latent_dist(predictor, latent_space_vector, debug=False):
    """Return the smallest latent-space distance to any training point.

    Each row of the training matrix is normalized, passed through the loaded
    model up to the layer at index ``len(layers) - 2``, and compared with
    ``latent_space_vector`` using the Euclidean norm.

    Parameters
    ----------
        predictor : str
            Name handed to the loaders for normalization data, training
            data and the Keras model.
        latent_space_vector : array-like
            Latent vector to measure from; it is squeezed before subtraction.
        debug : bool, optional
            When True, progress information is printed and the flag is
            forwarded to ``data_normalize``.

    Returns
    -------
        min_dist : float
            Smallest distance found. Stays at the initial sentinel
            ``100000000`` when no training row is closer than that.
    """
    train_mean_x, _train_mean_y, train_var_x, _train_var_y = load_normalization_data(predictor)

    # Training matrix as floats, one candidate neighbor per row.
    train_mat = np.array(load_training_data(predictor), dtype='float64')

    loaded_model = load_keras_ann(predictor)
    layer_count = len(loaded_model.layers)
    latent_layer_index = layer_count - 2

    if debug:
        print('measuring latent distances:')
        print(('loaded model has ' + str(layer_count)
               + ' layers, so latent space measure will be from first '
               + str(layer_count - 1) + ' layers'))

    # The installed TensorFlow version cannot change while the loop runs, so
    # the comparison is evaluated a single time.
    legacy_tf = version.parse(tf.__version__) < version.parse('2.0.0')
    if legacy_tf:
        get_outputs = K.function([loaded_model.layers[0].input, K.learning_phase()],
                                 [loaded_model.layers[latent_layer_index].output])

    min_dist = 100000000
    min_ind = 0
    for row_index, raw_row in enumerate(train_mat):
        # Normalize the row before measuring the distance.
        scaled_row = np.squeeze(
            data_normalize(raw_row, train_mean_x.T, train_var_x.T, debug=debug))
        single_row_batch = np.array([scaled_row])
        if legacy_tf:
            latent_train_row = get_outputs([single_row_batch, 0])
        else:
            latent_train_row = get_layer_outputs(loaded_model, latent_layer_index,
                                                 [single_row_batch], training_flag=False)
        offset = np.subtract(np.squeeze(latent_train_row), np.squeeze(latent_space_vector))
        row_dist = np.linalg.norm(offset)
        if row_dist < min_dist:
            min_dist, min_ind = row_dist, row_index

    # NOTE(review): the listing this block was recovered from had lost its
    # indentation; the training-name lookup is assumed to be part of the
    # debug output -- confirm against the upstream file.
    if debug:
        print(('min dist is ' + str(min_dist) + ' at ' + str(min_ind)))
        folder_dict = {'homo': 'homolumo', 'gap': 'homolumo',
                       'oxo': 'oxocatalysis', 'hat': 'oxocatalysis',
                       'oxo20': 'oxoandhomo', 'homo_empty': 'oxoandhomo'}
        if predictor in folder_dict:
            key = f'{folder_dict[predictor]}/{predictor}_train_names'
            path_to_file = resource_files("molSimplify.tf_nn").joinpath(f'{key}.csv')
            with open(path_to_file, "r") as f:
                csv_lines = list(csv.reader(f))
            print(('Closest Latent Dist Structure: ' + str(csv_lines[min_ind]) + ' for predictor ' + str(predictor)))
    return min_dist
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def find_clf_lse(predictor: str,
                 excitation,
                 loaded_model,
                 ensemble: bool = False,
                 modelname: Optional[str] = None,
                 debug: bool = False) -> np.ndarray:
    """Compute the latent space entropy (LSE) of a classifier prediction.

    The training features are normalized and mapped, together with
    ``excitation``, to the output of layer ``-4`` of the model. The neighbor
    distances and labels returned by ``dist_neighbor`` are then turned into
    entropies by ``get_entropy``.

    Parameters
    ----------
        predictor : str
            Either ``"geo_static_clf"`` or ``"sc_static_clf"``. Any other
            value prints a message and yields zeros.
        excitation : array-like
            Input passed to the model; presumably already normalized by the
            caller -- TODO confirm.
        loaded_model
            Model used when ``ensemble`` is False. It is ignored (rebound to
            each ensemble member) when ``ensemble`` is True.
        ensemble : bool, optional
            Average the entropies over the ``*.h5`` models found in the
            ``ensemble_<modelname>`` directory of the predictor.
        modelname : str, optional
            Ensemble name; defaults to ``"spectro"``.
        debug : bool, optional
            Forwarded to ``data_normalize``.

    Returns
    -------
        lse : np.ndarray
            Entropies from ``get_entropy`` (averaged over models for an
            ensemble), or ``np.zeros_like(excitation)`` for an unknown
            predictor or when the ensemble does not hold exactly 10 models.
    """
    if modelname is None:
        modelname = "spectro"
    # Reference distance passed to dist_neighbor as dist_ref for the
    # single-model case; one hard-coded value per supported predictor.
    if predictor == "geo_static_clf":
        avrg_latent_dist = 33.21736244173539
    elif predictor == "sc_static_clf":
        avrg_latent_dist = 38.276809428032685
    else:
        print("Unknown model type")
        return np.zeros_like(excitation)
    key = get_key(predictor, suffix='')
    base_path = resource_files("molSimplify.tf_nn").joinpath(key)
    train_mean_x, _train_mean_y, train_var_x, _train_var_y = load_normalization_data(predictor)
    labels_train = np.array(load_training_labels(predictor), dtype='int')
    fmat_train = np.array(load_training_data(predictor), dtype='float64')
    fmat_train = data_normalize(fmat_train, train_mean_x, train_var_x, debug=debug)
    fmat_train = np.array(fmat_train)
    if not ensemble:
        train_latent = get_layer_outputs(loaded_model, -4, fmat_train, training_flag=False)
        test_latent = get_layer_outputs(loaded_model, -4, excitation, training_flag=False)
        _, nn_dists, nn_labels = dist_neighbor(test_latent, train_latent, labels_train,
                                               l=5, dist_ref=avrg_latent_dist)
        lse = get_entropy(nn_dists, nn_labels)
    else:
        print("Using ensemble averaged LSE.")
        # base_path is a path object returned by joinpath(), which does not
        # support "+" with a str; extend it with joinpath() and convert to a
        # plain string only for glob.
        ensemble_dir = str(base_path.joinpath('ensemble_%s' % modelname))
        model_list = sorted(glob.glob(os.path.join(ensemble_dir, '*.h5')))
        if len(model_list) != 10:
            print(key)
            print(ensemble_dir)
            print(model_list)
            print(("Error: LSE cannot be calculated with modelname %s--The number of models is wrong." % modelname))
            return np.zeros_like(excitation)
        fmat_train_split = np.array_split(fmat_train, 10, axis=0)
        labels_train_split = np.array_split(labels_train, 10, axis=0)
        entropies_list = []
        for model in model_list:
            print(model)
            loaded_model = load_model(model)
            # The trailing "_<n>" of the file name selects the data split;
            # basename() keeps this working with any path separator.
            model_idx = int(os.path.basename(model).split(".")[0].split("_")[-1])
            # presumably array_stack assembles the training subset belonging
            # to this ensemble member -- verify against its definition.
            _fmat_train = array_stack(fmat_train_split, model_idx)
            _labels_train = array_stack(labels_train_split, model_idx)
            train_latent = get_layer_outputs(loaded_model, -4, _fmat_train, training_flag=False)
            test_latent = get_layer_outputs(loaded_model, -4, excitation, training_flag=False)
            # Train-vs-train distances with dist_ref=1 give the per-model
            # reference distance used for the test points below.
            nn_latent_dist_train, _, __ = dist_neighbor(train_latent, train_latent, _labels_train,
                                                        l=5, dist_ref=1)
            avrg_latent_dist = np.mean(nn_latent_dist_train)
            _, nn_dists, nn_labels = dist_neighbor(test_latent, train_latent, _labels_train,
                                                   l=5, dist_ref=avrg_latent_dist)
            entropies_list.append(get_entropy(nn_dists, nn_labels))
        lse = np.mean(np.array(entropies_list), axis=0)
    return lse
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def save_model(model: tf.keras.Model, predictor: str,
               num: Optional[int] = None, suffix: Optional[str] = None):
    """Write a model's architecture (JSON) and weights (HDF5) to disk.

    Files are placed in an ``ensemble_models`` directory below the package
    resource location derived from ``get_key(predictor, suffix)``; the
    directory is created when missing.

    Parameters
    ----------
        model : tf.keras.Model
            Model to serialize via ``to_json`` and ``save_weights``.
        predictor : str
            Predictor name, used for the resource key and the file stem.
        num : int, optional
            When given, ``_<num>`` is appended to the file stem.
        suffix : str, optional
            Forwarded to ``get_key``.
    """
    key = get_key(predictor, suffix)
    # joinpath() returns a path object, which cannot be extended with
    # "+ 'ensemble_models'"; append the directory with joinpath() instead and
    # use the string form for the os/open calls below.
    # NOTE(review): if get_key returns a file-name prefix rather than a
    # directory for some suffix values, confirm this is the intended layout.
    target_dir = resource_files("molSimplify.tf_nn").joinpath(key).joinpath('ensemble_models')
    base_path = str(target_dir)
    os.makedirs(base_path, exist_ok=True)
    if num is not None:
        name = '%s/%s_%d' % (base_path, predictor, num)
    else:
        name = '%s/%s' % (base_path, predictor)
    # serialize model to JSON
    model_json = model.to_json()
    with open("%s.json" % name, "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("%s.h5" % name)
    print(("Saved model !%s! to disk" % name.split('/')[-1]))
|
|
751
|
+
|
|
752
|
+
|
|
753
|
+
def initialize_model_weights(model: tf.keras.Model) -> tf.keras.Model:
    """Run the initializer of every layer attribute that exposes one.

    Each entry of every layer's ``__dict__`` is inspected; attributes that
    have an ``initializer`` get it run in the backend session obtained from
    ``K.get_session()``.

    Parameters
    ----------
        model : tf.keras.Model
            Model whose layers are processed in place.

    Returns
    -------
        model : tf.keras.Model
            The same model object that was passed in.
    """
    backend_session = K.get_session()
    for layer in model.layers:
        for attr_name in layer.__dict__:
            attr_value = getattr(layer, attr_name)
            if not hasattr(attr_value, 'initializer'):
                continue
            attr_value.initializer.run(session=backend_session)
    return model
|