emmet-builders 0.84.2__py3-none-any.whl → 0.86.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. emmet/builders/abinit/phonon.py +27 -25
  2. emmet/builders/abinit/sound_velocity.py +15 -11
  3. emmet/builders/feff/xas.py +1 -2
  4. emmet/builders/materials/absorption_spectrum.py +25 -14
  5. emmet/builders/materials/alloys.py +3 -4
  6. emmet/builders/materials/chemenv.py +2 -3
  7. emmet/builders/materials/corrected_entries.py +15 -9
  8. emmet/builders/materials/dielectric.py +19 -11
  9. emmet/builders/materials/elasticity.py +44 -33
  10. emmet/builders/materials/electrodes.py +24 -19
  11. emmet/builders/materials/electronic_structure.py +17 -17
  12. emmet/builders/materials/magnetism.py +11 -4
  13. emmet/builders/materials/optimade.py +7 -3
  14. emmet/builders/materials/piezoelectric.py +24 -21
  15. emmet/builders/materials/provenance.py +15 -12
  16. emmet/builders/materials/robocrys.py +2 -3
  17. emmet/builders/materials/substrates.py +9 -8
  18. emmet/builders/materials/summary.py +3 -3
  19. emmet/builders/materials/thermo.py +17 -11
  20. emmet/builders/matscholar/missing_compositions.py +12 -8
  21. emmet/builders/mobility/migration_graph.py +5 -5
  22. emmet/builders/settings.py +21 -17
  23. emmet/builders/utils.py +15 -10
  24. emmet/builders/vasp/materials.py +32 -16
  25. emmet/builders/vasp/task_validator.py +15 -11
  26. {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/METADATA +21 -36
  27. emmet_builders-0.86.0.dist-info/RECORD +41 -0
  28. {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/WHEEL +1 -1
  29. emmet/builders/materials/ml.py +0 -87
  30. emmet/builders/molecules/atomic.py +0 -589
  31. emmet/builders/molecules/bonds.py +0 -324
  32. emmet/builders/molecules/metal_binding.py +0 -526
  33. emmet/builders/molecules/orbitals.py +0 -288
  34. emmet/builders/molecules/redox.py +0 -496
  35. emmet/builders/molecules/summary.py +0 -383
  36. emmet/builders/molecules/thermo.py +0 -500
  37. emmet/builders/molecules/vibration.py +0 -278
  38. emmet/builders/qchem/__init__.py +0 -0
  39. emmet/builders/qchem/molecules.py +0 -734
  40. emmet_builders-0.84.2.dist-info/RECORD +0 -52
  41. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  42. {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/top_level.txt +0 -0
@@ -1,496 +0,0 @@
1
- from collections import defaultdict
2
- import copy
3
- from datetime import datetime
4
- from itertools import chain, groupby
5
- from math import ceil
6
- from typing import Any, Dict, Iterable, Iterator, List, Optional, Union
7
-
8
- from maggma.builders import Builder
9
- from maggma.core import Store
10
- from maggma.utils import grouper
11
-
12
- from emmet.core.qchem.task import TaskDocument
13
- from emmet.core.qchem.molecule import MoleculeDoc
14
- from emmet.core.molecules.bonds import metals
15
- from emmet.core.molecules.thermo import MoleculeThermoDoc
16
- from emmet.core.molecules.redox import RedoxDoc
17
- from emmet.core.utils import confirm_molecule, get_graph_hash, jsanitize
18
- from emmet.builders.settings import EmmetBuildSettings
19
-
20
-
21
- __author__ = "Evan Spotte-Smith"
22
-
23
- SETTINGS = EmmetBuildSettings()
24
-
25
-
26
- class RedoxBuilder(Builder):
27
- """
28
- The RedoxBuilder extracts the highest-quality redox data (vertical and
29
- adiabatic reduction and oxidation potentials, etc.)
30
- from a MoleculeDoc (lowest electronic energy, highest level of theory).
31
-
32
- The process is as follows:
33
- 1. Gather MoleculeDocs by formula
34
- 2. Further group based on (covalent) isomorphism and charge
35
- 3. For each MoleculeDoc:
36
- 3a. Identify relevant MoleculeThermoDocs
37
- 3b. Look for single-point energy calculations conducted at the
38
- molecule's charge +- 1. These will be used to calculate
39
- vertical electron affinities and ionization energies
40
- 3c. Group MoleculeThermoDocs and single-point calculations based on solvent
41
- and level of theory
42
- 4. Construct RedoxDocs by looking for molecules (with associated
43
- calculations) that:
44
- - Have charges that differ by +- 1
45
- - Use the same solvent and level of theory
46
- """
47
-
48
- def __init__(
49
- self,
50
- tasks: Store,
51
- molecules: Store,
52
- thermo: Store,
53
- redox: Store,
54
- query: Optional[Dict] = None,
55
- settings: Optional[EmmetBuildSettings] = None,
56
- **kwargs,
57
- ):
58
- self.tasks = tasks
59
- self.molecules = molecules
60
- self.thermo = thermo
61
- self.redox = redox
62
- self.query = query if query else dict()
63
- self.settings = EmmetBuildSettings.autoload(settings)
64
- self.kwargs = kwargs
65
-
66
- super().__init__(sources=[tasks, molecules, thermo], targets=[redox], **kwargs)
67
- # Uncomment in case of issue with mrun not connecting automatically to collections
68
- # for i in [self.tasks, self.molecules, self.thermo, self.redox]:
69
- # try:
70
- # i.connect()
71
- # except Exception as e:
72
- # print("Could not connect,", e)
73
-
74
- def ensure_indexes(self):
75
- """
76
- Ensures indices on the collections needed for building
77
- """
78
-
79
- # Basic search index for tasks
80
- self.tasks.ensure_index("task_id")
81
- self.tasks.ensure_index("last_updated")
82
- self.tasks.ensure_index("state")
83
- self.tasks.ensure_index("formula_alphabetical")
84
-
85
- # Search index for molecules
86
- self.molecules.ensure_index("molecule_id")
87
- self.molecules.ensure_index("last_updated")
88
- self.molecules.ensure_index("task_ids")
89
- self.molecules.ensure_index("formula_alphabetical")
90
-
91
- # Search index for thermo
92
- self.thermo.ensure_index("molecule_id")
93
- self.thermo.ensure_index("task_id")
94
- self.thermo.ensure_index("solvent")
95
- self.thermo.ensure_index("lot_solvent")
96
- self.thermo.ensure_index("property_id")
97
- self.thermo.ensure_index("last_updated")
98
- self.thermo.ensure_index("formula_alphabetical")
99
-
100
- # Search index for redox
101
- self.redox.ensure_index("molecule_id")
102
- self.redox.ensure_index("solvent")
103
- self.redox.ensure_index("lot_solvent")
104
- self.redox.ensure_index("property_id")
105
- self.redox.ensure_index("last_updated")
106
- self.redox.ensure_index("formula_alphabetical")
107
-
108
- def prechunk(self, number_splits: int) -> Iterable[Dict]: # pragma: no cover
109
- """Prechunk the builder for distributed computation"""
110
-
111
- temp_query = dict(self.query)
112
- temp_query["deprecated"] = False
113
-
114
- self.logger.info("Finding documents to process")
115
- all_mols = list(
116
- self.molecules.query(
117
- temp_query, [self.molecules.key, "formula_alphabetical"]
118
- )
119
- )
120
-
121
- processed_docs = set([e for e in self.redox.distinct("molecule_id")])
122
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
123
- to_process_forms = {
124
- d["formula_alphabetical"]
125
- for d in all_mols
126
- if d[self.molecules.key] in to_process_docs
127
- }
128
-
129
- N = ceil(len(to_process_forms) / number_splits)
130
-
131
- for formula_chunk in grouper(to_process_forms, N):
132
- yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
133
-
134
- def get_items(self) -> Iterator[List[Dict]]:
135
- """
136
- Gets all items to process into redox documents.
137
- This does no datetime checking; relying on whether
138
- task_ids are included in the redox Store
139
-
140
- Returns:
141
- generator or list relevant tasks and molecules to process into documents
142
- """
143
-
144
- self.logger.info("Redox builder started")
145
- self.logger.info("Setting indexes")
146
- self.ensure_indexes()
147
-
148
- # Save timestamp to mark buildtime
149
- self.timestamp = datetime.utcnow()
150
-
151
- # Get all processed molecules
152
- temp_query = dict(self.query)
153
- temp_query["deprecated"] = False
154
-
155
- self.logger.info("Finding documents to process")
156
- all_mols = list(
157
- self.molecules.query(
158
- temp_query, [self.molecules.key, "formula_alphabetical"]
159
- )
160
- )
161
-
162
- processed_docs = set([e for e in self.redox.distinct("molecule_id")])
163
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
164
- to_process_forms = {
165
- d["formula_alphabetical"]
166
- for d in all_mols
167
- if d[self.molecules.key] in to_process_docs
168
- }
169
-
170
- self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
171
- self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
172
-
173
- # Set total for builder bars to have a total
174
- self.total = len(to_process_forms)
175
-
176
- for formula in to_process_forms:
177
- mol_query = dict(temp_query)
178
- mol_query["formula_alphabetical"] = formula
179
- molecules = list(self.molecules.query(criteria=mol_query))
180
-
181
- yield molecules
182
-
183
- def process_item(self, items: List[Dict]) -> List[Dict]:
184
- """
185
- Process the tasks into a RedoxDoc
186
-
187
- Args:
188
- tasks List[Dict] : a list of MoleculeDocs in dict form
189
-
190
- Returns:
191
- [dict] : a list of new redox docs
192
- """
193
-
194
- mols = [MoleculeDoc(**item) for item in items]
195
- formula = mols[0].formula_alphabetical
196
- mol_ids = [m.molecule_id for m in mols]
197
- self.logger.debug(f"Processing {formula} : {mol_ids}")
198
-
199
- redox_docs = list()
200
-
201
- # Group by (covalent) molecular graph connectivity
202
- group_by_graph = self._group_by_graph(mols)
203
-
204
- for graph_group in group_by_graph.values():
205
- # Molecule docs will be grouped by charge
206
- charges: Dict[int, Any] = dict()
207
-
208
- for gg in graph_group:
209
- # First, grab relevant MoleculeThermoDocs and identify possible IE/EA single-points
210
- thermo_docs = [
211
- MoleculeThermoDoc(**e)
212
- for e in self.thermo.query({"molecule_id": gg.molecule_id})
213
- ]
214
-
215
- if len(thermo_docs) == 0:
216
- # Current building scheme requires a MoleculeThermoDoc
217
- continue
218
-
219
- ie_sp_task_ids = [
220
- e["task_id"]
221
- for e in gg.entries
222
- if e["charge"] == gg.charge + 1
223
- and e["task_type"] == "Single Point"
224
- and e["output"].get("final_energy")
225
- ]
226
- ie_tasks = list()
227
- for i in ie_sp_task_ids:
228
- tdoc = self.tasks.query_one(
229
- {
230
- "task_id": i,
231
- "formula_alphabetical": formula,
232
- "orig": {"$exists": True},
233
- }
234
- )
235
-
236
- if tdoc is None:
237
- try:
238
- tdoc = self.tasks.query_one(
239
- {
240
- "task_id": int(i),
241
- "formula_alphabetical": formula,
242
- "orig": {"$exists": True},
243
- }
244
- )
245
- except ValueError:
246
- tdoc = None
247
-
248
- if tdoc is None:
249
- continue
250
-
251
- ie_tasks.append(TaskDocument(**tdoc))
252
-
253
- ea_sp_task_ids = [
254
- e["task_id"]
255
- for e in gg.entries
256
- if e["charge"] == gg.charge - 1
257
- and e["task_type"] == "Single Point"
258
- and e["output"].get("final_energy")
259
- ]
260
- ea_tasks = list()
261
- for i in ea_sp_task_ids:
262
- tdoc = self.tasks.query_one(
263
- {
264
- "task_id": i,
265
- "formula_alphabetical": formula,
266
- "orig": {"$exists": True},
267
- }
268
- )
269
-
270
- if tdoc is None:
271
- try:
272
- tdoc = self.tasks.query_one(
273
- {
274
- "task_id": int(i),
275
- "formula_alphabetical": formula,
276
- "orig": {"$exists": True},
277
- }
278
- )
279
- except ValueError:
280
- tdoc = None
281
-
282
- if tdoc is None:
283
- continue
284
-
285
- ea_tasks.append(TaskDocument(**tdoc))
286
-
287
- grouped_docs = self._collect_by_lot_solvent(
288
- thermo_docs, ie_tasks, ea_tasks
289
- )
290
- if gg.charge in charges:
291
- charges[gg.charge].append((gg, grouped_docs))
292
- else:
293
- charges[gg.charge] = [(gg, grouped_docs)]
294
-
295
- for charge, collection in charges.items():
296
- for mol, docs in collection:
297
- # Get all possible molecules for adiabatic oxidation and reduction
298
- red_coll = charges.get(charge - 1, list())
299
- ox_coll = charges.get(charge + 1, list())
300
-
301
- for lot_solv, docset in docs.items():
302
- # Collect other molecules that have MoleculeThermoDocs at the
303
- # exact same level of theory
304
-
305
- combined = docset["thermo_doc"].combined_lot_solvent
306
-
307
- relevant_red = list()
308
- relevant_ox = list()
309
-
310
- for rmol, rdocs in red_coll:
311
- if lot_solv in rdocs:
312
- if (
313
- rdocs[lot_solv]["thermo_doc"].combined_lot_solvent
314
- == combined
315
- ):
316
- relevant_red.append(rdocs[lot_solv])
317
-
318
- for omol, odocs in ox_coll:
319
- if lot_solv in odocs:
320
- if (
321
- odocs[lot_solv]["thermo_doc"].combined_lot_solvent
322
- == combined
323
- ):
324
- relevant_ox.append(odocs[lot_solv])
325
-
326
- # Take best options (based on electronic energy), where available
327
- if len(relevant_red) == 0:
328
- red_doc = None
329
- else:
330
- red_doc = sorted(
331
- relevant_red,
332
- key=lambda x: x["thermo_doc"].electronic_energy,
333
- )[0]["thermo_doc"]
334
-
335
- if len(relevant_ox) == 0:
336
- ox_doc = None
337
- else:
338
- ox_doc = sorted(
339
- relevant_ox,
340
- key=lambda x: x["thermo_doc"].electronic_energy,
341
- )[0]["thermo_doc"]
342
-
343
- ea_doc = docset.get("ea_doc")
344
- ie_doc = docset.get("ie_doc")
345
-
346
- redox_docs.append(
347
- RedoxDoc.from_docs(
348
- base_molecule_doc=mol,
349
- base_thermo_doc=docset["thermo_doc"],
350
- red_doc=red_doc,
351
- ox_doc=ox_doc,
352
- ea_doc=ea_doc,
353
- ie_doc=ie_doc,
354
- )
355
- )
356
-
357
- self.logger.debug(f"Produced {len(redox_docs)} redox docs for {formula}")
358
-
359
- return jsanitize(
360
- [doc.model_dump() for doc in redox_docs if doc is not None], allow_bson=True
361
- )
362
-
363
- def update_targets(self, items: List[List[Dict]]):
364
- """
365
- Inserts the new documents into the redox collection
366
-
367
- Args:
368
- items [[dict]]: A list of documents to update
369
- """
370
-
371
- docs = list(chain.from_iterable(items)) # type: ignore
372
-
373
- # Add timestamp
374
- for item in docs:
375
- item.update(
376
- {
377
- "_bt": self.timestamp,
378
- }
379
- )
380
-
381
- molecule_ids = list({item["molecule_id"] for item in docs})
382
-
383
- if len(items) > 0:
384
- self.logger.info(f"Updating {len(docs)} redox documents")
385
- self.redox.remove_docs({self.redox.key: {"$in": molecule_ids}})
386
- self.redox.update(
387
- docs=docs,
388
- key=["molecule_id", "solvent"],
389
- )
390
- else:
391
- self.logger.info("No items to update")
392
-
393
- @staticmethod
394
- def _group_by_graph(mol_docs: List[MoleculeDoc]) -> Dict[int, List[MoleculeDoc]]:
395
- """
396
- Group molecule docs by molecular graph connectivity
397
-
398
- :param entries: List of entries (dicts derived from TaskDocuments)
399
- :return: Grouped molecule entries
400
- """
401
-
402
- graph_hashes_nometal: List[str] = list()
403
- results = defaultdict(list)
404
-
405
- # Within each group, group by the covalent molecular graph
406
- for t in mol_docs:
407
- mol = confirm_molecule(t.molecule)
408
-
409
- mol_nometal = copy.deepcopy(mol)
410
-
411
- if mol.composition.alphabetical_formula not in [m + "1" for m in metals]:
412
- mol_nometal.remove_species(metals)
413
-
414
- mol_nometal.set_charge_and_spin(0)
415
- gh_nometal = get_graph_hash(mol_nometal, node_attr="specie")
416
-
417
- match = None
418
- for i, gh in enumerate(graph_hashes_nometal):
419
- if gh_nometal == gh:
420
- match = i
421
- break
422
-
423
- if match is None:
424
- results[len(graph_hashes_nometal)].append(t)
425
- graph_hashes_nometal.append(gh_nometal)
426
- else:
427
- results[match].append(t)
428
-
429
- return results
430
-
431
- @staticmethod
432
- def _collect_by_lot_solvent(
433
- thermo_docs: List[MoleculeThermoDoc],
434
- ie_docs: List[TaskDocument],
435
- ea_docs: List[TaskDocument],
436
- ) -> Dict[str, Any]:
437
- """
438
- For a given MoleculeDoc, group potential MoleculeThermoDocs and TaskDocs for
439
- IE/EA calculations based on level of theory and solvent.
440
-
441
- Args:
442
- thermo_docs (list of MoleculeThermoDocs): List of MoleculeThermoDocs for this MoleculeDoc
443
- ie_docs (list of TaskDocuments): List of TaskDocs which could be used
444
- to calculate vertical ionization energies for this MoleculeDoc
445
- ea_docs (list of TaskDocuments): List of TaskDocs which could be used
446
- to calculate vertical electron affinities for this MoleculeDoc:
447
-
448
- Returns:
449
- dict {<lot_solvent>: {
450
- "thermo_doc": MoleculeThermoDoc, "ie_doc": TaskDocument, "ea_doc": TaskDocument
451
- }
452
- }
453
- """
454
-
455
- def _lot_solv(doc: Union[MoleculeThermoDoc, TaskDocument]):
456
- if isinstance(doc, MoleculeThermoDoc):
457
- if doc.correction:
458
- return doc.correction_lot_solvent
459
- return doc.lot_solvent
460
-
461
- thermo_grouped = groupby(sorted(thermo_docs, key=_lot_solv), key=_lot_solv)
462
- ie_grouped = groupby(sorted(ie_docs, key=_lot_solv), key=_lot_solv)
463
- ea_grouped = groupby(sorted(ea_docs, key=_lot_solv), key=_lot_solv)
464
-
465
- groups = dict()
466
-
467
- for k, g in thermo_grouped:
468
- g_list = list(g)
469
-
470
- # Should never be more than one MoleculeThermoDoc per MoleculeDoc
471
- # Just for safety...
472
- if len(g_list) > 1:
473
- g_list_sorted = sorted(g_list, key=lambda x: x.electronic_energy)
474
- this_thermo_doc = g_list_sorted[0]
475
- else:
476
- this_thermo_doc = g_list[0]
477
-
478
- groups[k] = {"thermo_doc": this_thermo_doc}
479
-
480
- for k, g in ie_grouped:
481
- # Must be a MoleculeThermoDoc to make a RedoxDoc
482
- if k not in groups:
483
- continue
484
-
485
- this_ie_doc = sorted(list(g), key=lambda x: x.output.final_energy)[0]
486
- groups[k]["ie_doc"] = this_ie_doc
487
-
488
- for k, g in ea_grouped:
489
- # Must be a MoleculeThermoDoc to make a RedoxDoc
490
- if k not in groups:
491
- continue
492
-
493
- this_ea_doc = sorted(list(g), key=lambda x: x.output.final_energy)[0]
494
- groups[k]["ea_doc"] = this_ea_doc
495
-
496
- return groups