PyPI - emmet-builders - Versions diffs - 0.84.2__py3-none-any.whl → 0.86.0__py3-none-any.whl - Mend

emmet-builders 0.84.2__py3-none-any.whl → 0.86.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

emmet/builders/abinit/phonon.py +27 -25
emmet/builders/abinit/sound_velocity.py +15 -11
emmet/builders/feff/xas.py +1 -2
emmet/builders/materials/absorption_spectrum.py +25 -14
emmet/builders/materials/alloys.py +3 -4
emmet/builders/materials/chemenv.py +2 -3
emmet/builders/materials/corrected_entries.py +15 -9
emmet/builders/materials/dielectric.py +19 -11
emmet/builders/materials/elasticity.py +44 -33
emmet/builders/materials/electrodes.py +24 -19
emmet/builders/materials/electronic_structure.py +17 -17
emmet/builders/materials/magnetism.py +11 -4
emmet/builders/materials/optimade.py +7 -3
emmet/builders/materials/piezoelectric.py +24 -21
emmet/builders/materials/provenance.py +15 -12
emmet/builders/materials/robocrys.py +2 -3
emmet/builders/materials/substrates.py +9 -8
emmet/builders/materials/summary.py +3 -3
emmet/builders/materials/thermo.py +17 -11
emmet/builders/matscholar/missing_compositions.py +12 -8
emmet/builders/mobility/migration_graph.py +5 -5
emmet/builders/settings.py +21 -17
emmet/builders/utils.py +15 -10
emmet/builders/vasp/materials.py +32 -16
emmet/builders/vasp/task_validator.py +15 -11
{emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/METADATA +21 -36
emmet_builders-0.86.0.dist-info/RECORD +41 -0
{emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/WHEEL +1 -1
emmet/builders/materials/ml.py +0 -87
emmet/builders/molecules/atomic.py +0 -589
emmet/builders/molecules/bonds.py +0 -324
emmet/builders/molecules/metal_binding.py +0 -526
emmet/builders/molecules/orbitals.py +0 -288
emmet/builders/molecules/redox.py +0 -496
emmet/builders/molecules/summary.py +0 -383
emmet/builders/molecules/thermo.py +0 -500
emmet/builders/molecules/vibration.py +0 -278
emmet/builders/qchem/__init__.py +0 -0
emmet/builders/qchem/molecules.py +0 -734
emmet_builders-0.84.2.dist-info/RECORD +0 -52
/emmet/builders/{molecules/__init__.py → py.typed} +0 -0
{emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/top_level.txt +0 -0

emmet/builders/molecules/redox.py DELETED Viewed

@@ -1,496 +0,0 @@
-from collections import defaultdict
-import copy
-from datetime import datetime
-from itertools import chain, groupby
-from math import ceil
-from typing import Any, Dict, Iterable, Iterator, List, Optional, Union
-from maggma.builders import Builder
-from maggma.core import Store
-from maggma.utils import grouper
-from emmet.core.qchem.task import TaskDocument
-from emmet.core.qchem.molecule import MoleculeDoc
-from emmet.core.molecules.bonds import metals
-from emmet.core.molecules.thermo import MoleculeThermoDoc
-from emmet.core.molecules.redox import RedoxDoc
-from emmet.core.utils import confirm_molecule, get_graph_hash, jsanitize
-from emmet.builders.settings import EmmetBuildSettings
-__author__ = "Evan Spotte-Smith"
-SETTINGS = EmmetBuildSettings()
-class RedoxBuilder(Builder):
-    """
-    The RedoxBuilder extracts the highest-quality redox data (vertical and
-    adiabatic reduction and oxidation potentials, etc.)
-    from a MoleculeDoc (lowest electronic energy, highest level of theory).
-    The process is as follows:
-        1. Gather MoleculeDocs by formula
-        2. Further group based on (covalent) isomorphism and charge
-        3. For each MoleculeDoc:
-            3a. Identify relevant MoleculeThermoDocs
-            3b. Look for single-point energy calculations conducted at the
-            molecule's charge +- 1. These will be used to calculate
-            vertical electron affinities and ionization energies
-            3c. Group MoleculeThermoDocs and single-point calculations based on solvent
-            and level of theory
-        4. Construct RedoxDocs by looking for molecules (with associated
-            calculations) that:
-            - Have charges that differ by +- 1
-            - Use the same solvent and level of theory
-    """
-    def __init__(
-        self,
-        tasks: Store,
-        molecules: Store,
-        thermo: Store,
-        redox: Store,
-        query: Optional[Dict] = None,
-        settings: Optional[EmmetBuildSettings] = None,
-        **kwargs,
-    ):
-        self.tasks = tasks
-        self.molecules = molecules
-        self.thermo = thermo
-        self.redox = redox
-        self.query = query if query else dict()
-        self.settings = EmmetBuildSettings.autoload(settings)
-        self.kwargs = kwargs
-        super().__init__(sources=[tasks, molecules, thermo], targets=[redox], **kwargs)
-        # Uncomment in case of issue with mrun not connecting automatically to collections
-        # for i in [self.tasks, self.molecules, self.thermo, self.redox]:
-        #     try:
-        #         i.connect()
-        #     except Exception as e:
-        #         print("Could not connect,", e)
-    def ensure_indexes(self):
-        """
-        Ensures indices on the collections needed for building
-        """
-        # Basic search index for tasks
-        self.tasks.ensure_index("task_id")
-        self.tasks.ensure_index("last_updated")
-        self.tasks.ensure_index("state")
-        self.tasks.ensure_index("formula_alphabetical")
-        # Search index for molecules
-        self.molecules.ensure_index("molecule_id")
-        self.molecules.ensure_index("last_updated")
-        self.molecules.ensure_index("task_ids")
-        self.molecules.ensure_index("formula_alphabetical")
-        # Search index for thermo
-        self.thermo.ensure_index("molecule_id")
-        self.thermo.ensure_index("task_id")
-        self.thermo.ensure_index("solvent")
-        self.thermo.ensure_index("lot_solvent")
-        self.thermo.ensure_index("property_id")
-        self.thermo.ensure_index("last_updated")
-        self.thermo.ensure_index("formula_alphabetical")
-        # Search index for redox
-        self.redox.ensure_index("molecule_id")
-        self.redox.ensure_index("solvent")
-        self.redox.ensure_index("lot_solvent")
-        self.redox.ensure_index("property_id")
-        self.redox.ensure_index("last_updated")
-        self.redox.ensure_index("formula_alphabetical")
-    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
-        """Prechunk the builder for distributed computation"""
-        temp_query = dict(self.query)
-        temp_query["deprecated"] = False
-        self.logger.info("Finding documents to process")
-        all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
-        )
-        processed_docs = set([e for e in self.redox.distinct("molecule_id")])
-        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_forms = {
-            d["formula_alphabetical"]
-            for d in all_mols
-            if d[self.molecules.key] in to_process_docs
-        }
-        N = ceil(len(to_process_forms) / number_splits)
-        for formula_chunk in grouper(to_process_forms, N):
-            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
-    def get_items(self) -> Iterator[List[Dict]]:
-        """
-        Gets all items to process into redox documents.
-        This does no datetime checking, relying instead on whether
-        molecule IDs are already present in the redox Store
-        Returns:
-            generator of lists of molecule documents (one list per formula) to process into redox documents
-        """
-        self.logger.info("Redox builder started")
-        self.logger.info("Setting indexes")
-        self.ensure_indexes()
-        # Save timestamp to mark buildtime
-        self.timestamp = datetime.utcnow()
-        # Get all processed molecules
-        temp_query = dict(self.query)
-        temp_query["deprecated"] = False
-        self.logger.info("Finding documents to process")
-        all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
-        )
-        processed_docs = set([e for e in self.redox.distinct("molecule_id")])
-        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_forms = {
-            d["formula_alphabetical"]
-            for d in all_mols
-            if d[self.molecules.key] in to_process_docs
-        }
-        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
-        # Set the total so that the builder's progress bars have a total
-        self.total = len(to_process_forms)
-        for formula in to_process_forms:
-            mol_query = dict(temp_query)
-            mol_query["formula_alphabetical"] = formula
-            molecules = list(self.molecules.query(criteria=mol_query))
-            yield molecules
-    def process_item(self, items: List[Dict]) -> List[Dict]:
-        """
-        Process a group of molecule documents into RedoxDocs
-        Args:
-            items List[Dict] : a list of MoleculeDocs in dict form
-        Returns:
-            [dict] : a list of new redox docs
-        """
-        mols = [MoleculeDoc(**item) for item in items]
-        formula = mols[0].formula_alphabetical
-        mol_ids = [m.molecule_id for m in mols]
-        self.logger.debug(f"Processing {formula} : {mol_ids}")
-        redox_docs = list()
-        # Group by (covalent) molecular graph connectivity
-        group_by_graph = self._group_by_graph(mols)
-        for graph_group in group_by_graph.values():
-            # Molecule docs will be grouped by charge
-            charges: Dict[int, Any] = dict()
-            for gg in graph_group:
-                # First, grab relevant MoleculeThermoDocs and identify possible IE/EA single-points
-                thermo_docs = [
-                    MoleculeThermoDoc(**e)
-                    for e in self.thermo.query({"molecule_id": gg.molecule_id})
-                ]
-                if len(thermo_docs) == 0:
-                    # Current building scheme requires a MoleculeThermoDoc
-                    continue
-                ie_sp_task_ids = [
-                    e["task_id"]
-                    for e in gg.entries
-                    if e["charge"] == gg.charge + 1
-                    and e["task_type"] == "Single Point"
-                    and e["output"].get("final_energy")
-                ]
-                ie_tasks = list()
-                for i in ie_sp_task_ids:
-                    tdoc = self.tasks.query_one(
-                        {
-                            "task_id": i,
-                            "formula_alphabetical": formula,
-                            "orig": {"$exists": True},
-                        }
-                    )
-                    if tdoc is None:
-                        try:
-                            tdoc = self.tasks.query_one(
-                                {
-                                    "task_id": int(i),
-                                    "formula_alphabetical": formula,
-                                    "orig": {"$exists": True},
-                                }
-                            )
-                        except ValueError:
-                            tdoc = None
-                    if tdoc is None:
-                        continue
-                    ie_tasks.append(TaskDocument(**tdoc))
-                ea_sp_task_ids = [
-                    e["task_id"]
-                    for e in gg.entries
-                    if e["charge"] == gg.charge - 1
-                    and e["task_type"] == "Single Point"
-                    and e["output"].get("final_energy")
-                ]
-                ea_tasks = list()
-                for i in ea_sp_task_ids:
-                    tdoc = self.tasks.query_one(
-                        {
-                            "task_id": i,
-                            "formula_alphabetical": formula,
-                            "orig": {"$exists": True},
-                        }
-                    )
-                    if tdoc is None:
-                        try:
-                            tdoc = self.tasks.query_one(
-                                {
-                                    "task_id": int(i),
-                                    "formula_alphabetical": formula,
-                                    "orig": {"$exists": True},
-                                }
-                            )
-                        except ValueError:
-                            tdoc = None
-                    if tdoc is None:
-                        continue
-                    ea_tasks.append(TaskDocument(**tdoc))
-                grouped_docs = self._collect_by_lot_solvent(
-                    thermo_docs, ie_tasks, ea_tasks
-                )
-                if gg.charge in charges:
-                    charges[gg.charge].append((gg, grouped_docs))
-                else:
-                    charges[gg.charge] = [(gg, grouped_docs)]
-            for charge, collection in charges.items():
-                for mol, docs in collection:
-                    # Get all possible molecules for adiabatic oxidation and reduction
-                    red_coll = charges.get(charge - 1, list())
-                    ox_coll = charges.get(charge + 1, list())
-                    for lot_solv, docset in docs.items():
-                        # Collect other molecules that have MoleculeThermoDocs at the
-                        # exact same level of theory
-                        combined = docset["thermo_doc"].combined_lot_solvent
-                        relevant_red = list()
-                        relevant_ox = list()
-                        for rmol, rdocs in red_coll:
-                            if lot_solv in rdocs:
-                                if (
-                                    rdocs[lot_solv]["thermo_doc"].combined_lot_solvent
-                                    == combined
-                                ):
-                                    relevant_red.append(rdocs[lot_solv])
-                        for omol, odocs in ox_coll:
-                            if lot_solv in odocs:
-                                if (
-                                    odocs[lot_solv]["thermo_doc"].combined_lot_solvent
-                                    == combined
-                                ):
-                                    relevant_ox.append(odocs[lot_solv])
-                        # Take best options (based on electronic energy), where available
-                        if len(relevant_red) == 0:
-                            red_doc = None
-                        else:
-                            red_doc = sorted(
-                                relevant_red,
-                                key=lambda x: x["thermo_doc"].electronic_energy,
-                            )[0]["thermo_doc"]
-                        if len(relevant_ox) == 0:
-                            ox_doc = None
-                        else:
-                            ox_doc = sorted(
-                                relevant_ox,
-                                key=lambda x: x["thermo_doc"].electronic_energy,
-                            )[0]["thermo_doc"]
-                        ea_doc = docset.get("ea_doc")
-                        ie_doc = docset.get("ie_doc")
-                        redox_docs.append(
-                            RedoxDoc.from_docs(
-                                base_molecule_doc=mol,
-                                base_thermo_doc=docset["thermo_doc"],
-                                red_doc=red_doc,
-                                ox_doc=ox_doc,
-                                ea_doc=ea_doc,
-                                ie_doc=ie_doc,
-                            )
-                        )
-        self.logger.debug(f"Produced {len(redox_docs)} redox docs for {formula}")
-        return jsanitize(
-            [doc.model_dump() for doc in redox_docs if doc is not None], allow_bson=True
-        )
-    def update_targets(self, items: List[List[Dict]]):
-        """
-        Inserts the new documents into the redox collection
-        Args:
-            items [[dict]]: A list of lists of documents to update
-        """
-        docs = list(chain.from_iterable(items))  # type: ignore
-        # Add timestamp
-        for item in docs:
-            item.update(
-                {
-                    "_bt": self.timestamp,
-                }
-            )
-        molecule_ids = list({item["molecule_id"] for item in docs})
-        if len(items) > 0:
-            self.logger.info(f"Updating {len(docs)} redox documents")
-            self.redox.remove_docs({self.redox.key: {"$in": molecule_ids}})
-            self.redox.update(
-                docs=docs,
-                key=["molecule_id", "solvent"],
-            )
-        else:
-            self.logger.info("No items to update")
-    @staticmethod
-    def _group_by_graph(mol_docs: List[MoleculeDoc]) -> Dict[int, List[MoleculeDoc]]:
-        """
-        Group molecule docs by molecular graph connectivity
-        :param mol_docs: List of MoleculeDocs to group
-        :return: Dict mapping a group index to the MoleculeDocs sharing a graph hash
-        """
-        graph_hashes_nometal: List[str] = list()
-        results = defaultdict(list)
-        # Within each group, group by the covalent molecular graph
-        for t in mol_docs:
-            mol = confirm_molecule(t.molecule)
-            mol_nometal = copy.deepcopy(mol)
-            if mol.composition.alphabetical_formula not in [m + "1" for m in metals]:
-                mol_nometal.remove_species(metals)
-            mol_nometal.set_charge_and_spin(0)
-            gh_nometal = get_graph_hash(mol_nometal, node_attr="specie")
-            match = None
-            for i, gh in enumerate(graph_hashes_nometal):
-                if gh_nometal == gh:
-                    match = i
-                    break
-            if match is None:
-                results[len(graph_hashes_nometal)].append(t)
-                graph_hashes_nometal.append(gh_nometal)
-            else:
-                results[match].append(t)
-        return results
-    @staticmethod
-    def _collect_by_lot_solvent(
-        thermo_docs: List[MoleculeThermoDoc],
-        ie_docs: List[TaskDocument],
-        ea_docs: List[TaskDocument],
-    ) -> Dict[str, Any]:
-        """
-        For a given MoleculeDoc, group potential MoleculeThermoDocs and TaskDocs for
-        IE/EA calculations based on level of theory and solvent.
-        Args:
-            thermo_docs (list of MoleculeThermoDocs): List of MoleculeThermoDocs for this MoleculeDoc
-            ie_docs (list of TaskDocuments): List of TaskDocs which could be used
-                to calculate vertical ionization energies for this MoleculeDoc
-            ea_docs (list of TaskDocuments): List of TaskDocs which could be used
-                to calculate vertical electron affinities for this MoleculeDoc
-        Returns:
-            dict {<lot_solvent>: {
-                        "thermo_doc": MoleculeThermoDoc, "ie_doc": TaskDocument, "ea_doc": TaskDocument
-                    }
-                 }
-        """
-        def _lot_solv(doc: Union[MoleculeThermoDoc, TaskDocument]):
-            if isinstance(doc, MoleculeThermoDoc):
-                if doc.correction:
-                    return doc.correction_lot_solvent
-            return doc.lot_solvent
-        thermo_grouped = groupby(sorted(thermo_docs, key=_lot_solv), key=_lot_solv)
-        ie_grouped = groupby(sorted(ie_docs, key=_lot_solv), key=_lot_solv)
-        ea_grouped = groupby(sorted(ea_docs, key=_lot_solv), key=_lot_solv)
-        groups = dict()
-        for k, g in thermo_grouped:
-            g_list = list(g)
-            # Should never be more than one MoleculeThermoDoc per MoleculeDoc within a
-            # level-of-theory/solvent group; if there is, keep the lowest electronic energy.
-            if len(g_list) > 1:
-                g_list_sorted = sorted(g_list, key=lambda x: x.electronic_energy)
-                this_thermo_doc = g_list_sorted[0]
-            else:
-                this_thermo_doc = g_list[0]
-            groups[k] = {"thermo_doc": this_thermo_doc}
-        for k, g in ie_grouped:
-            # Must be a MoleculeThermoDoc to make a RedoxDoc
-            if k not in groups:
-                continue
-            this_ie_doc = sorted(list(g), key=lambda x: x.output.final_energy)[0]
-            groups[k]["ie_doc"] = this_ie_doc
-        for k, g in ea_grouped:
-            # Must be a MoleculeThermoDoc to make a RedoxDoc
-            if k not in groups:
-                continue
-            this_ea_doc = sorted(list(g), key=lambda x: x.output.final_energy)[0]
-            groups[k]["ea_doc"] = this_ea_doc
-        return groups

emmet-builders 0.84.2__py3-none-any.whl → 0.86.0__py3-none-any.whl

emmet-builders 0.84.2__py3-none-any.whl → 0.86.0__py3-none-any.whl