emmet-builders 0.84.10rc2__py3-none-any.whl → 0.85.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of emmet-builders might be problematic. Click here for more details.

Files changed (36)
  1. emmet/builders/abinit/phonon.py +12 -14
  2. emmet/builders/abinit/sound_velocity.py +1 -1
  3. emmet/builders/materials/absorption_spectrum.py +16 -10
  4. emmet/builders/materials/alloys.py +1 -1
  5. emmet/builders/materials/corrected_entries.py +1 -1
  6. emmet/builders/materials/dielectric.py +10 -7
  7. emmet/builders/materials/elasticity.py +12 -9
  8. emmet/builders/materials/electrodes.py +1 -1
  9. emmet/builders/materials/electronic_structure.py +1 -1
  10. emmet/builders/materials/magnetism.py +2 -1
  11. emmet/builders/materials/piezoelectric.py +23 -19
  12. emmet/builders/materials/provenance.py +3 -4
  13. emmet/builders/materials/summary.py +1 -1
  14. emmet/builders/settings.py +14 -9
  15. emmet/builders/utils.py +5 -4
  16. emmet/builders/vasp/materials.py +11 -4
  17. emmet/builders/vasp/task_validator.py +3 -1
  18. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/METADATA +7 -30
  19. emmet_builders-0.85.0.dist-info/RECORD +41 -0
  20. emmet/builders/materials/ml.py +0 -101
  21. emmet/builders/molecules/atomic.py +0 -592
  22. emmet/builders/molecules/bonds.py +0 -329
  23. emmet/builders/molecules/electric.py +0 -287
  24. emmet/builders/molecules/metal_binding.py +0 -528
  25. emmet/builders/molecules/orbitals.py +0 -292
  26. emmet/builders/molecules/redox.py +0 -502
  27. emmet/builders/molecules/summary.py +0 -406
  28. emmet/builders/molecules/thermo.py +0 -505
  29. emmet/builders/molecules/trajectory.py +0 -530
  30. emmet/builders/molecules/vibration.py +0 -282
  31. emmet/builders/qchem/__init__.py +0 -0
  32. emmet/builders/qchem/molecules.py +0 -745
  33. emmet_builders-0.84.10rc2.dist-info/RECORD +0 -54
  34. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  35. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/WHEEL +0 -0
  36. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/top_level.txt +0 -0
@@ -1,329 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from collections import defaultdict
4
- from datetime import datetime
5
- from itertools import chain
6
- from math import ceil
7
-
8
- from maggma.builders import Builder
9
- from maggma.core import Store
10
- from maggma.utils import grouper
11
-
12
- from emmet.builders.settings import EmmetBuildSettings
13
- from emmet.core.molecules.bonds import BOND_METHODS, MoleculeBondingDoc
14
- from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
15
- from emmet.core.qchem.task import TaskDocument
16
- from emmet.core.utils import jsanitize
17
-
18
- from typing import TYPE_CHECKING
19
-
20
- if TYPE_CHECKING:
21
- from collections.abc import Iterable, Iterator
22
-
23
- __author__ = "Evan Spotte-Smith"
24
-
25
- SETTINGS = EmmetBuildSettings()
26
-
27
-
28
- class BondingBuilder(Builder):
29
- """
30
- The BondingBuilder defines the bonds in a MoleculeDoc.
31
-
32
- Various methods can be used to define bonding, including:
33
- - OpenBabelNN + metal_edge_extender: Combining the bond detection algorithms in OpenBabel (OpenBabelNN in
34
- pymatgen) with a heuristic to add metal coordinate bonds (metal_edge_extender
35
- in pymatgen)
36
- - critic2: Using critical points of the electron density to define bonds
37
- - nbo: Using Natural Bonding Orbital analysis to define bonds and other
38
- interatomic interactions
39
-
40
- NOTE: Only NBO7 can be used to generate bonding. Bonding (especially when metals
41
- are involved) is unreliable with earlier versions of NBO!
42
-
43
- This builder will attempt to build documents for each molecule, in each solvent,
44
- with each method. For each molecule-solvent-method combination, the highest-quality
45
- data available (based on level of theory and electronic energy) will be used.
46
-
47
- The process is as follows:
48
- 1. Gather MoleculeDocs by species hash
49
- 2. For each molecule, group all tasks by solvent.
50
- 3. For each solvent, sort tasks by level of theory and electronic energy
51
- 4. For each method:
52
- 4.1. Find task docs with necessary data to define bonding by that method
53
- 4.2. Take best (defined by level of theory and electronic energy) task
54
- 4.3. Convert TaskDoc to MoleculeBondingDoc
55
- """
56
-
57
- def __init__(
58
- self,
59
- tasks: Store,
60
- molecules: Store,
61
- bonds: Store,
62
- query: dict | None = None,
63
- methods: list | None = None,
64
- settings: EmmetBuildSettings | None = None,
65
- **kwargs,
66
- ):
67
- self.tasks = tasks
68
- self.molecules = molecules
69
- self.bonds = bonds
70
- self.query = query if query else dict()
71
- self.methods = methods if methods else BOND_METHODS
72
- self.settings = EmmetBuildSettings.autoload(settings)
73
- self.kwargs = kwargs
74
-
75
- super().__init__(sources=[tasks, molecules], targets=[bonds], **kwargs)
76
- # Uncomment in case of issue with mrun not connecting automatically to collections
77
- # for i in [self.tasks, self.molecules, self.bonds]:
78
- # try:
79
- # i.connect()
80
- # except Exception as e:
81
- # print("Could not connect,", e)
82
-
83
- def ensure_indexes(self):
84
- """
85
- Ensures indices on the collections needed for building
86
- """
87
-
88
- # Basic search index for tasks
89
- self.tasks.ensure_index("task_id")
90
- self.tasks.ensure_index("last_updated")
91
- self.tasks.ensure_index("state")
92
- self.tasks.ensure_index("formula_alphabetical")
93
- self.tasks.ensure_index("species_hash")
94
-
95
- # Search index for molecules
96
- self.molecules.ensure_index("molecule_id")
97
- self.molecules.ensure_index("last_updated")
98
- self.molecules.ensure_index("task_ids")
99
- self.molecules.ensure_index("formula_alphabetical")
100
- self.molecules.ensure_index("species_hash")
101
-
102
- # Search index for bonds
103
- self.bonds.ensure_index("molecule_id")
104
- self.bonds.ensure_index("method")
105
- self.bonds.ensure_index("task_id")
106
- self.bonds.ensure_index("solvent")
107
- self.bonds.ensure_index("lot_solvent")
108
- self.bonds.ensure_index("property_id")
109
- self.bonds.ensure_index("last_updated")
110
- self.bonds.ensure_index("formula_alphabetical")
111
-
112
- def prechunk(self, number_splits: int) -> Iterable[dict]: # pragma: no cover
113
- """Prechunk the builder for distributed computation"""
114
-
115
- temp_query = dict(self.query)
116
- temp_query["deprecated"] = False
117
-
118
- self.logger.info("Finding documents to process")
119
- all_mols = list(
120
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
121
- )
122
-
123
- processed_docs = set([e for e in self.bonds.distinct("molecule_id")])
124
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
125
- to_process_hashes = {
126
- d["species_hash"]
127
- for d in all_mols
128
- if d[self.molecules.key] in to_process_docs
129
- }
130
-
131
- N = ceil(len(to_process_hashes) / number_splits)
132
-
133
- for hash_chunk in grouper(to_process_hashes, N):
134
- query = dict(temp_query)
135
- query["species_hash"] = {"$in": list(hash_chunk)}
136
- yield {"query": query}
137
-
138
- def get_items(self) -> Iterator[list[dict]]:
139
- """
140
- Gets all items to process into bonding documents.
141
- This does no datetime checking; it relies on whether
142
- molecule_ids are already present in the bonds Store
143
-
144
- Returns:
145
- generator yielding, for each unprocessed species hash, the list of matching molecule documents
146
- """
147
-
148
- self.logger.info("Bonding builder started")
149
- self.logger.info("Setting indexes")
150
- self.ensure_indexes()
151
-
152
- # Save timestamp to mark buildtime
153
- self.timestamp = datetime.utcnow()
154
-
155
- # Get all processed molecules
156
- temp_query = dict(self.query)
157
- temp_query["deprecated"] = False
158
-
159
- self.logger.info("Finding documents to process")
160
- all_mols = list(
161
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
162
- )
163
-
164
- processed_docs = set([e for e in self.bonds.distinct("molecule_id")])
165
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
166
- to_process_hashes = {
167
- d["species_hash"]
168
- for d in all_mols
169
- if d[self.molecules.key] in to_process_docs
170
- }
171
-
172
- self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
173
- self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
174
-
175
- # Set total for builder bars to have a total
176
- self.total = len(to_process_hashes)
177
-
178
- for shash in to_process_hashes:
179
- mol_query = dict(temp_query)
180
- mol_query["species_hash"] = shash
181
- molecules = list(self.molecules.query(criteria=mol_query))
182
-
183
- yield molecules
184
-
185
- def process_item(self, items: list[dict]) -> list[dict]:
186
- """
187
- Process the tasks into MoleculeBondingDocs
188
-
189
- Args:
190
- items list[dict] : a list of MoleculeDocs in dict form
191
-
192
- Returns:
193
- [dict] : a list of new bonding docs
194
- """
195
-
196
- mols = [MoleculeDoc(**item) for item in items]
197
- shash = mols[0].species_hash
198
- mol_ids = [m.molecule_id for m in mols]
199
- self.logger.debug(f"Processing {shash} : {mol_ids}")
200
-
201
- bonding_docs = list()
202
-
203
- for mol in mols:
204
- correct_charge_spin = [
205
- e
206
- for e in mol.entries
207
- if e["charge"] == mol.charge
208
- and e["spin_multiplicity"] == mol.spin_multiplicity
209
- ]
210
-
211
- # Organize by solvent environment
212
- by_solvent = defaultdict(list)
213
- for entry in correct_charge_spin:
214
- by_solvent[entry["solvent"]].append(entry)
215
-
216
- for solvent, entries in by_solvent.items():
217
- sorted_entries = sorted(
218
- entries,
219
- key=lambda x: (
220
- sum(evaluate_lot(x["level_of_theory"])),
221
- x["energy"],
222
- ),
223
- )
224
-
225
- for method in self.methods:
226
- # For each method, grab entries that have the relevant data
227
- if method == "OpenBabelNN + metal_edge_extender":
228
- # This is sort of silly. Since, at the MoleculeDoc level,
229
- # the structures have to be identical, bonding defined
230
- # using heuristic methods like OpenBabel should always
231
- # be identical.
232
- # TODO: Decide if only one OpenBabelNN + m_e_e doc
233
- # TODO: should be allowed.
234
- relevant_entries = sorted_entries
235
- else:
236
- relevant_entries = [
237
- e
238
- for e in sorted_entries
239
- if e.get(method) is not None
240
- or e["output"].get(method) is not None
241
- ]
242
-
243
- if method == "nbo":
244
- # Only allow NBO7 to be used. No earlier versions can be
245
- # relied upon for bonding
246
- relevant_entries = [
247
- e
248
- for e in relevant_entries
249
- if e["orig"]["rem"].get("run_nbo6", False)
250
- or e["orig"]["rem"].get("nbo_external", False)
251
- ]
252
-
253
- if len(relevant_entries) == 0:
254
- continue
255
-
256
- # Grab task document of best entry
257
- best_entry = relevant_entries[0]
258
- task = best_entry["task_id"]
259
-
260
- tdoc = self.tasks.query_one(
261
- {
262
- "task_id": task,
263
- "species_hash": shash,
264
- "orig": {"$exists": True},
265
- }
266
- )
267
-
268
- if tdoc is None:
269
- try:
270
- tdoc = self.tasks.query_one(
271
- {
272
- "task_id": int(task),
273
- "species_hash": shash,
274
- "orig": {"$exists": True},
275
- }
276
- )
277
- except ValueError:
278
- tdoc = None
279
-
280
- if tdoc is None:
281
- continue
282
-
283
- task_doc = TaskDocument(**tdoc)
284
-
285
- if task_doc is None:
286
- continue
287
-
288
- doc = MoleculeBondingDoc.from_task(
289
- task_doc,
290
- molecule_id=mol.molecule_id,
291
- preferred_methods=[method],
292
- deprecated=False,
293
- )
294
- bonding_docs.append(doc)
295
-
296
- self.logger.debug(f"Produced {len(bonding_docs)} bonding docs for {shash}")
297
-
298
- return jsanitize([doc.model_dump() for doc in bonding_docs], allow_bson=True)
299
-
300
- def update_targets(self, items: list[list[dict]]):
301
- """
302
- Inserts the new documents into the bonds collection
303
-
304
- Args:
305
- items [[dict]]: A list of documents to update
306
- """
307
-
308
- docs = list(chain.from_iterable(items)) # type: ignore
309
-
310
- # Add timestamp
311
- for item in docs:
312
- item.update(
313
- {
314
- "_bt": self.timestamp,
315
- }
316
- )
317
-
318
- molecule_ids = list({item["molecule_id"] for item in docs})
319
-
320
- if len(items) > 0:
321
- self.logger.info(f"Updating {len(docs)} bonding documents")
322
- self.bonds.remove_docs({self.bonds.key: {"$in": molecule_ids}})
323
- # molecule_id, method and solvent need not be unique individually, but their combination must be
324
- self.bonds.update(
325
- docs=docs,
326
- key=["molecule_id", "method", "solvent"],
327
- )
328
- else:
329
- self.logger.info("No items to update")
@@ -1,287 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from collections import defaultdict
4
- from datetime import datetime
5
- from itertools import chain
6
- from math import ceil
7
-
8
- from maggma.builders import Builder
9
- from maggma.core import Store
10
- from maggma.utils import grouper
11
-
12
- from emmet.builders.settings import EmmetBuildSettings
13
- from emmet.core.molecules.electric import ElectricMultipoleDoc
14
- from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
15
- from emmet.core.qchem.task import TaskDocument
16
- from emmet.core.utils import jsanitize
17
-
18
- from typing import TYPE_CHECKING
19
-
20
- if TYPE_CHECKING:
21
- from collections.abc import Iterable, Iterator
22
-
23
- __author__ = "Evan Spotte-Smith"
24
-
25
- SETTINGS = EmmetBuildSettings()
26
-
27
-
28
- class ElectricMultipoleBuilder(Builder):
29
- """
30
- The ElectricMultipoleBuilder defines the electric multipole properties of a MoleculeDoc.
31
-
32
- This builder will attempt to build documents for each molecule, in each solvent.
33
- For each molecule-solvent combination, the highest-quality
34
- data available (based on level of theory and electronic energy) will be used.
35
-
36
- The process is as follows:
37
- 1. Gather MoleculeDocs by species hash
38
- 2. For each molecule, group all tasks by solvent.
39
- 3. For each solvent, grab the best TaskDoc (doc with electric dipole/multipole information
40
- that has the highest level of theory with the lowest electronic energy) for the molecule
41
- 4. Convert TaskDoc to ElectricMultipoleDoc
42
- """
43
-
44
- def __init__(
45
- self,
46
- tasks: Store,
47
- molecules: Store,
48
- multipoles: Store,
49
- query: dict | None = None,
50
- settings: EmmetBuildSettings | None = None,
51
- **kwargs,
52
- ):
53
- self.tasks = tasks
54
- self.molecules = molecules
55
- self.multipoles = multipoles
56
- self.query = query if query else dict()
57
- self.settings = EmmetBuildSettings.autoload(settings)
58
- self.kwargs = kwargs
59
-
60
- super().__init__(sources=[tasks, molecules], targets=[multipoles], **kwargs)
61
- # Uncomment in case of issue with mrun not connecting automatically to collections
62
- # for i in [self.tasks, self.molecules, self.multipoles]:
63
- # try:
64
- # i.connect()
65
- # except Exception as e:
66
- # print("Could not connect,", e)
67
-
68
- def ensure_indexes(self):
69
- """
70
- Ensures indices on the collections needed for building
71
- """
72
-
73
- # Basic search index for tasks
74
- self.tasks.ensure_index("task_id")
75
- self.tasks.ensure_index("last_updated")
76
- self.tasks.ensure_index("state")
77
- self.tasks.ensure_index("formula_alphabetical")
78
- self.tasks.ensure_index("species_hash")
79
-
80
- # Search index for molecules
81
- self.molecules.ensure_index("molecule_id")
82
- self.molecules.ensure_index("last_updated")
83
- self.molecules.ensure_index("task_ids")
84
- self.molecules.ensure_index("formula_alphabetical")
85
- self.molecules.ensure_index("species_hash")
86
-
87
- # Search index for electric
88
- self.multipoles.ensure_index("method")
89
- self.multipoles.ensure_index("molecule_id")
90
- self.multipoles.ensure_index("task_id")
91
- self.multipoles.ensure_index("solvent")
92
- self.multipoles.ensure_index("lot_solvent")
93
- self.multipoles.ensure_index("property_id")
94
- self.multipoles.ensure_index("last_updated")
95
- self.multipoles.ensure_index("formula_alphabetical")
96
-
97
- def prechunk(self, number_splits: int) -> Iterable[dict]: # pragma: no cover
98
- """Prechunk the builder for distributed computation"""
99
-
100
- temp_query = dict(self.query)
101
- temp_query["deprecated"] = False
102
-
103
- self.logger.info("Finding documents to process")
104
- all_mols = list(
105
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
106
- )
107
-
108
- processed_docs = set([e for e in self.multipoles.distinct("molecule_id")])
109
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
110
- to_process_hashes = {
111
- d["species_hash"]
112
- for d in all_mols
113
- if d[self.molecules.key] in to_process_docs
114
- }
115
-
116
- N = ceil(len(to_process_hashes) / number_splits)
117
-
118
- for hash_chunk in grouper(to_process_hashes, N):
119
- query = dict(temp_query)
120
- query["species_hash"] = {"$in": list(hash_chunk)}
121
- yield {"query": query}
122
-
123
- def get_items(self) -> Iterator[list[dict]]:
124
- """
125
- Gets all items to process into multipole documents.
126
- This does no datetime checking; it relies on whether
127
- molecule_ids are already present in the multipoles Store
128
-
129
- Returns:
130
- generator yielding, for each unprocessed species hash, the list of matching molecule documents
131
- """
132
-
133
- self.logger.info("Electric multipoles builder started")
134
- self.logger.info("Setting indexes")
135
- self.ensure_indexes()
136
-
137
- # Save timestamp to mark buildtime
138
- self.timestamp = datetime.utcnow()
139
-
140
- # Get all processed molecules
141
- temp_query = dict(self.query)
142
- temp_query["deprecated"] = False
143
-
144
- self.logger.info("Finding documents to process")
145
- all_mols = list(
146
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
147
- )
148
-
149
- processed_docs = set([e for e in self.multipoles.distinct("molecule_id")])
150
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
151
- to_process_hashes = {
152
- d["species_hash"]
153
- for d in all_mols
154
- if d[self.molecules.key] in to_process_docs
155
- }
156
-
157
- self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
158
- self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
159
-
160
- # Set total for builder bars to have a total
161
- self.total = len(to_process_hashes)
162
-
163
- for shash in to_process_hashes:
164
- mol_query = dict(temp_query)
165
- mol_query["species_hash"] = shash
166
- molecules = list(self.molecules.query(criteria=mol_query))
167
-
168
- yield molecules
169
-
170
- def process_item(self, items: list[dict]) -> list[dict]:
171
- """
172
- Process the tasks into ElectricMultipoleDocs
173
-
174
- Args:
175
- items list[dict] : a list of MoleculeDocs in dict form
176
-
177
- Returns:
178
- [dict] : a list of new electric multipole docs
179
- """
180
-
181
- mols = [MoleculeDoc(**item) for item in items]
182
- shash = mols[0].species_hash
183
- mol_ids = [m.molecule_id for m in mols]
184
- self.logger.debug(f"Processing {shash} : {mol_ids}")
185
-
186
- multipole_docs = list()
187
-
188
- for mol in mols:
189
- # Relevant tasks are those with the correct charge and spin
190
- # for which there are AT LEAST electric dipoles present
191
- # (ideally, multipole information would also be present)
192
- multip_entries = [
193
- e
194
- for e in mol.entries
195
- if e["charge"] == mol.charge
196
- and e["spin_multiplicity"] == mol.spin_multiplicity
197
- and (e["output"].get("dipoles") is not None)
198
- ]
199
-
200
- # Organize by solvent environment
201
- by_solvent = defaultdict(list)
202
- for entry in multip_entries:
203
- by_solvent[entry["solvent"]].append(entry)
204
-
205
- for solvent, entries in by_solvent.items():
206
- # No entries with electric dipole data for this solvent
207
- if len(entries) == 0:
208
- continue
209
- else:
210
- best = sorted(
211
- entries,
212
- key=lambda x: (
213
- sum(evaluate_lot(x["level_of_theory"])),
214
- x["energy"],
215
- ),
216
- )[0]
217
- task = best["task_id"]
218
-
219
- tdoc = self.tasks.query_one(
220
- {
221
- "task_id": task,
222
- "species_hash": shash,
223
- "orig": {"$exists": True},
224
- }
225
- )
226
-
227
- if tdoc is None:
228
- try:
229
- tdoc = self.tasks.query_one(
230
- {
231
- "task_id": int(task),
232
- "species_hash": shash,
233
- "orig": {"$exists": True},
234
- }
235
- )
236
- except ValueError:
237
- tdoc = None
238
-
239
- if tdoc is None:
240
- continue
241
-
242
- task_doc = TaskDocument(**tdoc)
243
-
244
- if task_doc is None:
245
- continue
246
-
247
- multipole_doc = ElectricMultipoleDoc.from_task(
248
- task_doc, molecule_id=mol.molecule_id, deprecated=False
249
- )
250
- multipole_docs.append(multipole_doc)
251
-
252
- self.logger.debug(
253
- f"Produced {len(multipole_docs)} electric multipole docs for {shash}"
254
- )
255
-
256
- return jsanitize([doc.model_dump() for doc in multipole_docs], allow_bson=True)
257
-
258
- def update_targets(self, items: list[list[dict]]):
259
- """
260
- Inserts the new documents into the multipoles collection
261
-
262
- Args:
263
- items [[dict]]: A list of documents to update
264
- """
265
-
266
- docs = list(chain.from_iterable(items)) # type: ignore
267
-
268
- # Add timestamp
269
- for item in docs:
270
- item.update(
271
- {
272
- "_bt": self.timestamp,
273
- }
274
- )
275
-
276
- molecule_ids = list({item["molecule_id"] for item in docs})
277
-
278
- if len(items) > 0:
279
- self.logger.info(f"Updating {len(docs)} electric multipole documents")
280
- self.multipoles.remove_docs({self.multipoles.key: {"$in": molecule_ids}})
281
- # Neither molecule_id nor solvent needs to be unique, but the combination must be
282
- self.multipoles.update(
283
- docs=docs,
284
- key=["molecule_id", "solvent"],
285
- )
286
- else:
287
- self.logger.info("No items to update")