PyPI - emmet-builders - Versions diffs - 0.84.2rc7__py3-none-any.whl → 0.84.2rc9__py3-none-any.whl - Mend

emmet-builders 0.84.2rc7__py3-none-any.whl → 0.84.2rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of emmet-builders might be problematic. Click here for more details.

Files changed (16)

emmet/builders/molecules/atomic.py +48 -46
emmet/builders/molecules/bonds.py +24 -24
emmet/builders/molecules/metal_binding.py +21 -20
emmet/builders/molecules/orbitals.py +23 -23
emmet/builders/molecules/redox.py +27 -27
emmet/builders/molecules/summary.py +21 -36
emmet/builders/molecules/thermo.py +23 -23
emmet/builders/molecules/vibration.py +23 -23
emmet/builders/qchem/molecules.py +15 -21
emmet/builders/vasp/mp_potcar_stats.json.gz +0 -0
{emmet_builders-0.84.2rc7.dist-info → emmet_builders-0.84.2rc9.dist-info}/METADATA +1 -1
{emmet_builders-0.84.2rc7.dist-info → emmet_builders-0.84.2rc9.dist-info}/RECORD +14 -16
emmet/builders/molecules/electric.py +0 -282
emmet/builders/molecules/trajectory.py +0 -525
{emmet_builders-0.84.2rc7.dist-info → emmet_builders-0.84.2rc9.dist-info}/WHEEL +0 -0
{emmet_builders-0.84.2rc7.dist-info → emmet_builders-0.84.2rc9.dist-info}/top_level.txt +0 -0

emmet/builders/molecules/redox.py CHANGED Viewed

@@ -30,7 +30,7 @@ class RedoxBuilder(Builder):
     from a MoleculeDoc (lowest electronic energy, highest level of theory).
     The process is as follows:
-        1. Gather MoleculeDocs by species hash
+        1. Gather MoleculeDocs by formula
         2. Further group based on (covalent) isomorphism and charge
         3. For each MoleculeDoc:
             3a. Identify relevant MoleculeThermoDocs
@@ -81,14 +81,12 @@ class RedoxBuilder(Builder):
         self.tasks.ensure_index("last_updated")
         self.tasks.ensure_index("state")
         self.tasks.ensure_index("formula_alphabetical")
-        self.tasks.ensure_index("species_hash")
         # Search index for molecules
         self.molecules.ensure_index("molecule_id")
         self.molecules.ensure_index("last_updated")
         self.molecules.ensure_index("task_ids")
         self.molecules.ensure_index("formula_alphabetical")
-        self.molecules.ensure_index("species_hash")
         # Search index for thermo
         self.thermo.ensure_index("molecule_id")
@@ -115,23 +113,23 @@ class RedoxBuilder(Builder):
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
+            self.molecules.query(
+                temp_query, [self.molecules.key, "formula_alphabetical"]
+            )
         )
         processed_docs = set([e for e in self.redox.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
+        to_process_forms = {
+            d["formula_alphabetical"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
-        N = ceil(len(to_process_hashes) / number_splits)
+        N = ceil(len(to_process_forms) / number_splits)
-        for hash_chunk in grouper(to_process_hashes, N):
-            query = dict(temp_query)
-            query["species_hash"] = {"$in": list(hash_chunk)}
-            yield {"query": query}
+        for formula_chunk in grouper(to_process_forms, N):
+            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
     def get_items(self) -> Iterator[List[Dict]]:
         """
@@ -156,26 +154,28 @@ class RedoxBuilder(Builder):
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
+            self.molecules.query(
+                temp_query, [self.molecules.key, "formula_alphabetical"]
+            )
         )
         processed_docs = set([e for e in self.redox.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
+        to_process_forms = {
+            d["formula_alphabetical"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
         self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
+        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
         # Set total for builder bars to have a total
-        self.total = len(to_process_hashes)
+        self.total = len(to_process_forms)
-        for shash in to_process_hashes:
+        for formula in to_process_forms:
             mol_query = dict(temp_query)
-            mol_query["species_hash"] = shash
+            mol_query["formula_alphabetical"] = formula
             molecules = list(self.molecules.query(criteria=mol_query))
             yield molecules
@@ -192,9 +192,9 @@ class RedoxBuilder(Builder):
         """
         mols = [MoleculeDoc(**item) for item in items]
-        shash = mols[0].species_hash
+        formula = mols[0].formula_alphabetical
         mol_ids = [m.molecule_id for m in mols]
-        self.logger.debug(f"Processing {shash} : {mol_ids}")
+        self.logger.debug(f"Processing {formula} : {mol_ids}")
         redox_docs = list()
@@ -220,7 +220,7 @@ class RedoxBuilder(Builder):
                     e["task_id"]
                     for e in gg.entries
                     if e["charge"] == gg.charge + 1
-                    and e["task_type"] in ["Single Point", "Force"]
+                    and e["task_type"] == "Single Point"
                     and e["output"].get("final_energy")
                 ]
                 ie_tasks = list()
@@ -228,7 +228,7 @@ class RedoxBuilder(Builder):
                     tdoc = self.tasks.query_one(
                         {
                             "task_id": i,
-                            "species_hash": shash,
+                            "formula_alphabetical": formula,
                             "orig": {"$exists": True},
                         }
                     )
@@ -238,7 +238,7 @@ class RedoxBuilder(Builder):
                             tdoc = self.tasks.query_one(
                                 {
                                     "task_id": int(i),
-                                    "species_hash": shash,
+                                    "formula_alphabetical": formula,
                                     "orig": {"$exists": True},
                                 }
                             )
@@ -254,7 +254,7 @@ class RedoxBuilder(Builder):
                     e["task_id"]
                     for e in gg.entries
                     if e["charge"] == gg.charge - 1
-                    and e["task_type"] in ["Single Point", "Force"]
+                    and e["task_type"] == "Single Point"
                     and e["output"].get("final_energy")
                 ]
                 ea_tasks = list()
@@ -262,7 +262,7 @@ class RedoxBuilder(Builder):
                     tdoc = self.tasks.query_one(
                         {
                             "task_id": i,
-                            "species_hash": shash,
+                            "formula_alphabetical": formula,
                             "orig": {"$exists": True},
                         }
                     )
@@ -272,7 +272,7 @@ class RedoxBuilder(Builder):
                             tdoc = self.tasks.query_one(
                                 {
                                     "task_id": int(i),
-                                    "species_hash": shash,
+                                    "formula_alphabetical": formula,
                                     "orig": {"$exists": True},
                                 }
                             )
@@ -354,7 +354,7 @@ class RedoxBuilder(Builder):
                             )
                         )
-        self.logger.debug(f"Produced {len(redox_docs)} redox docs for {shash}")
+        self.logger.debug(f"Produced {len(redox_docs)} redox docs for {formula}")
         return jsanitize(
             [doc.model_dump() for doc in redox_docs if doc is not None], allow_bson=True

emmet/builders/molecules/summary.py CHANGED Viewed

@@ -36,7 +36,6 @@ class SummaryBuilder(Builder):
         charges: Store,
         spins: Store,
         bonds: Store,
-        multipoles: Store,
         metal_binding: Store,
         orbitals: Store,
         redox: Store,
@@ -51,7 +50,6 @@ class SummaryBuilder(Builder):
         self.charges = charges
         self.spins = spins
         self.bonds = bonds
-        self.multipoles = multipoles
         self.metal_binding = metal_binding
         self.orbitals = orbitals
         self.redox = redox
@@ -68,7 +66,6 @@ class SummaryBuilder(Builder):
                 charges,
                 spins,
                 bonds,
-                multipoles,
                 metal_binding,
                 orbitals,
                 redox,
@@ -84,7 +81,6 @@ class SummaryBuilder(Builder):
         #     self.charges,
         #     self.spins,
         #     self.bonds,
-        #     self.multipoles,
         #     self.metal_binding,
         #     self.orbitals,
         #     self.redox,
@@ -107,7 +103,6 @@ class SummaryBuilder(Builder):
         self.molecules.ensure_index("last_updated")
         self.molecules.ensure_index("task_ids")
         self.molecules.ensure_index("formula_alphabetical")
-        self.molecules.ensure_index("species_hash")
         # Search index for charges
         self.charges.ensure_index("molecule_id")
@@ -139,15 +134,6 @@ class SummaryBuilder(Builder):
         self.bonds.ensure_index("last_updated")
         self.bonds.ensure_index("formula_alphabetical")
-        # Search index for multipoles
-        self.multipoles.ensure_index("molecule_id")
-        self.multipoles.ensure_index("task_id")
-        self.multipoles.ensure_index("solvent")
-        self.multipoles.ensure_index("lot_solvent")
-        self.multipoles.ensure_index("property_id")
-        self.multipoles.ensure_index("last_updated")
-        self.multipoles.ensure_index("formula_alphabetical")
         # Search index for metal_binding
         self.metal_binding.ensure_index("molecule_id")
         self.metal_binding.ensure_index("solvent")
@@ -206,23 +192,23 @@ class SummaryBuilder(Builder):
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
+            self.molecules.query(
+                temp_query, [self.molecules.key, "formula_alphabetical"]
+            )
         )
         processed_docs = set([e for e in self.summary.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
+        to_process_forms = {
+            d["formula_alphabetical"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
-        N = ceil(len(to_process_hashes) / number_splits)
+        N = ceil(len(to_process_forms) / number_splits)
-        for hash_chunk in grouper(to_process_hashes, N):
-            query = dict(temp_query)
-            query["species_hash"] = {"$in": list(hash_chunk)}
-            yield {"query": query}
+        for formula_chunk in grouper(to_process_forms, N):
+            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
     def get_items(self) -> Iterator[List[Dict]]:
         """
@@ -247,26 +233,28 @@ class SummaryBuilder(Builder):
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
+            self.molecules.query(
+                temp_query, [self.molecules.key, "formula_alphabetical"]
+            )
         )
         processed_docs = set([e for e in self.summary.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
+        to_process_forms = {
+            d["formula_alphabetical"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
         self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
+        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
         # Set total for builder bars to have a total
-        self.total = len(to_process_hashes)
+        self.total = len(to_process_forms)
-        for shash in to_process_hashes:
+        for formula in to_process_forms:
             mol_query = dict(temp_query)
-            mol_query["species_hash"] = shash
+            mol_query["formula_alphabetical"] = formula
             molecules = list(self.molecules.query(criteria=mol_query))
             yield molecules
@@ -304,12 +292,12 @@ class SummaryBuilder(Builder):
                     else:
                         grouped[solvent][method] = doc
-            return grouped
+            return (grouped, by_method)
         mols = items
-        shash = mols[0]["species_hash"]
+        formula = mols[0]["formula_alphabetical"]
         mol_ids = [m["molecule_id"] for m in mols]
-        self.logger.debug(f"Processing {shash} : {mol_ids}")
+        self.logger.debug(f"Processing {formula} : {mol_ids}")
         summary_docs = list()
@@ -330,9 +318,6 @@ class SummaryBuilder(Builder):
                 "metal_binding": _group_docs(
                     list(self.metal_binding.query({"molecule_id": mol_id})), True
                 ),
-                "multipole_moments": _group_docs(
-                    list(self.multipoles.query({"molecule_id": mol_id})), False
-                ),
                 "orbitals": _group_docs(
                     list(self.orbitals.query({"molecule_id": mol_id})), False
                 ),
@@ -363,7 +348,7 @@ class SummaryBuilder(Builder):
             summary_doc = MoleculeSummaryDoc.from_docs(molecule_id=mol_id, docs=d)
             summary_docs.append(summary_doc)
-        self.logger.debug(f"Produced {len(summary_docs)} summary docs for {shash}")
+        self.logger.debug(f"Produced {len(summary_docs)} summary docs for {formula}")
         return jsanitize([doc.model_dump() for doc in summary_docs], allow_bson=True)

emmet/builders/molecules/thermo.py CHANGED Viewed

@@ -84,7 +84,7 @@ class ThermoBuilder(Builder):
     single-point energy corrections.
     Before any documents are constructed, the following steps are taken:
-        1. Gather MoleculeDocs by species hash
+        1. Gather MoleculeDocs by formula
         2. For each doc, identify tasks with thermodynamic information such as
             zero-point energy, enthalpy, and entropy. Collect these "documents
              including complete thermodynamics" (DICTs).
@@ -148,14 +148,12 @@ class ThermoBuilder(Builder):
         self.tasks.ensure_index("last_updated")
         self.tasks.ensure_index("state")
         self.tasks.ensure_index("formula_alphabetical")
-        self.tasks.ensure_index("species_hash")
         # Search index for molecules
         self.molecules.ensure_index("molecule_id")
         self.molecules.ensure_index("last_updated")
         self.molecules.ensure_index("task_ids")
         self.molecules.ensure_index("formula_alphabetical")
-        self.molecules.ensure_index("species_hash")
         # Search index for thermo
         self.thermo.ensure_index("molecule_id")
@@ -174,23 +172,23 @@ class ThermoBuilder(Builder):
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
+            self.molecules.query(
+                temp_query, [self.molecules.key, "formula_alphabetical"]
+            )
         )
         processed_docs = set([e for e in self.thermo.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
+        to_process_forms = {
+            d["formula_alphabetical"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
-        N = ceil(len(to_process_hashes) / number_splits)
+        N = ceil(len(to_process_forms) / number_splits)
-        for hash_chunk in grouper(to_process_hashes, N):
-            query = dict(temp_query)
-            query["species_hash"] = {"$in": list(hash_chunk)}
-            yield {"query": query}
+        for formula_chunk in grouper(to_process_forms, N):
+            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
     def get_items(self) -> Iterator[List[Dict]]:
         """
@@ -215,26 +213,28 @@ class ThermoBuilder(Builder):
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
+            self.molecules.query(
+                temp_query, [self.molecules.key, "formula_alphabetical"]
+            )
         )
         processed_docs = set([e for e in self.thermo.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
+        to_process_forms = {
+            d["formula_alphabetical"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
         self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
+        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
         # Set total for builder bars to have a total
-        self.total = len(to_process_hashes)
+        self.total = len(to_process_forms)
-        for shash in to_process_hashes:
+        for formula in to_process_forms:
             mol_query = dict(temp_query)
-            mol_query["species_hash"] = shash
+            mol_query["formula_alphabetical"] = formula
             molecules = list(self.molecules.query(criteria=mol_query))
             yield molecules
@@ -273,9 +273,9 @@ class ThermoBuilder(Builder):
             return doc
         mols = [MoleculeDoc(**item) for item in items]
-        shash = mols[0].species_hash
+        formula = mols[0].formula_alphabetical
         mol_ids = [m.molecule_id for m in mols]
-        self.logger.debug(f"Processing {shash} : {mol_ids}")
+        self.logger.debug(f"Processing {formula} : {mol_ids}")
         thermo_docs = list()
@@ -334,7 +334,7 @@ class ThermoBuilder(Builder):
                 tdoc = self.tasks.query_one(
                     {
                         "task_id": task,
-                        "species_hash": shash,
+                        "formula_alphabetical": formula,
                         "orig": {"$exists": True},
                     }
                 )
@@ -344,7 +344,7 @@ class ThermoBuilder(Builder):
                         tdoc = self.tasks.query_one(
                             {
                                 "task_id": int(task),
-                                "species_hash": shash,
+                                "formula_alphabetical": formula,
                                 "orig": {"$exists": True},
                             }
                         )
@@ -465,7 +465,7 @@ class ThermoBuilder(Builder):
                     sorted(with_eval_e, key=lambda x: (x[1], x[2]))[0][0]
                 )
-        self.logger.debug(f"Produced {len(thermo_docs)} thermo docs for {shash}")
+        self.logger.debug(f"Produced {len(thermo_docs)} thermo docs for {formula}")
         return jsanitize([doc.model_dump() for doc in thermo_docs], allow_bson=True)

emmet/builders/molecules/vibration.py CHANGED Viewed

@@ -27,7 +27,7 @@ class VibrationBuilder(Builder):
     each solvent available).
     The process is as follows:
-        1. Gather MoleculeDocs by species hash
+        1. Gather MoleculeDocs by formula
         2. For each doc, sort tasks by solvent
         3. For each solvent, grab the best TaskDoc (doc with vibrational
             information that has the highest level of theory with lowest
@@ -73,14 +73,12 @@ class VibrationBuilder(Builder):
         self.tasks.ensure_index("last_updated")
         self.tasks.ensure_index("state")
         self.tasks.ensure_index("formula_alphabetical")
-        self.tasks.ensure_index("species_hash")
         # Search index for molecules
         self.molecules.ensure_index("molecule_id")
         self.molecules.ensure_index("last_updated")
         self.molecules.ensure_index("task_ids")
         self.molecules.ensure_index("formula_alphabetical")
-        self.molecules.ensure_index("species_hash")
         # Search index for vibrational properties
         self.vibes.ensure_index("molecule_id")
@@ -99,23 +97,23 @@ class VibrationBuilder(Builder):
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
+            self.molecules.query(
+                temp_query, [self.molecules.key, "formula_alphabetical"]
+            )
         )
         processed_docs = set([e for e in self.vibes.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
+        to_process_forms = {
+            d["formula_alphabetical"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
-        N = ceil(len(to_process_hashes) / number_splits)
+        N = ceil(len(to_process_forms) / number_splits)
-        for hash_chunk in grouper(to_process_hashes, N):
-            query = dict(temp_query)
-            query["species_hash"] = {"$in": list(hash_chunk)}
-            yield {"query": query}
+        for formula_chunk in grouper(to_process_forms, N):
+            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
     def get_items(self) -> Iterator[List[Dict]]:
         """
@@ -140,26 +138,28 @@ class VibrationBuilder(Builder):
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
+            self.molecules.query(
+                temp_query, [self.molecules.key, "formula_alphabetical"]
+            )
         )
         processed_docs = set([e for e in self.vibes.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
+        to_process_forms = {
+            d["formula_alphabetical"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
         self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
+        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
         # Set total for builder bars to have a total
-        self.total = len(to_process_hashes)
+        self.total = len(to_process_forms)
-        for shash in to_process_hashes:
+        for formula in to_process_forms:
             mol_query = dict(temp_query)
-            mol_query["species_hash"] = shash
+            mol_query["formula_alphabetical"] = formula
             molecules = list(self.molecules.query(criteria=mol_query))
             yield molecules
@@ -176,9 +176,9 @@ class VibrationBuilder(Builder):
         """
         mols = [MoleculeDoc(**item) for item in items]
-        shash = mols[0].species_hash
+        formula = mols[0].formula_alphabetical
         mol_ids = [m.molecule_id for m in mols]
-        self.logger.debug(f"Processing {shash} : {mol_ids}")
+        self.logger.debug(f"Processing {formula} : {mol_ids}")
         vibe_docs = list()
@@ -213,7 +213,7 @@ class VibrationBuilder(Builder):
                 tdoc = self.tasks.query_one(
                     {
                         "task_id": task,
-                        "species_hash": shash,
+                        "formula_alphabetical": formula,
                         "orig": {"$exists": True},
                     }
                 )
@@ -223,7 +223,7 @@ class VibrationBuilder(Builder):
                         tdoc = self.tasks.query_one(
                             {
                                 "task_id": int(task),
-                                "species_hash": shash,
+                                "formula_alphabetical": formula,
                                 "orig": {"$exists": True},
                             }
                         )
@@ -243,7 +243,7 @@ class VibrationBuilder(Builder):
                 )
                 vibe_docs.append(vibe_doc)
-        self.logger.debug(f"Produced {len(vibe_docs)} vibration docs for {shash}")
+        self.logger.debug(f"Produced {len(vibe_docs)} vibration docs for {formula}")
         return jsanitize([doc.model_dump() for doc in vibe_docs], allow_bson=True)

emmet-builders 0.84.2rc7__py3-none-any.whl → 0.84.2rc9__py3-none-any.whl

Potentially problematic release.

emmet-builders 0.84.2rc7__py3-none-any.whl → 0.84.2rc9__py3-none-any.whl