emmet-builders 0.78.3__py3-none-any.whl → 0.86.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. emmet/builders/abinit/phonon.py +47 -47
  2. emmet/builders/abinit/sound_velocity.py +15 -11
  3. emmet/builders/feff/xas.py +1 -2
  4. emmet/builders/materials/absorption_spectrum.py +25 -14
  5. emmet/builders/materials/alloys.py +10 -11
  6. emmet/builders/materials/chemenv.py +2 -3
  7. emmet/builders/materials/corrected_entries.py +21 -15
  8. emmet/builders/materials/dielectric.py +19 -11
  9. emmet/builders/materials/elasticity.py +44 -33
  10. emmet/builders/materials/electrodes.py +35 -28
  11. emmet/builders/materials/electronic_structure.py +17 -17
  12. emmet/builders/materials/magnetism.py +11 -4
  13. emmet/builders/materials/optimade.py +7 -3
  14. emmet/builders/materials/piezoelectric.py +24 -21
  15. emmet/builders/materials/provenance.py +16 -13
  16. emmet/builders/materials/robocrys.py +2 -3
  17. emmet/builders/materials/substrates.py +9 -8
  18. emmet/builders/materials/summary.py +3 -3
  19. emmet/builders/materials/thermo.py +17 -11
  20. emmet/builders/matscholar/missing_compositions.py +12 -8
  21. emmet/builders/mobility/migration_graph.py +5 -5
  22. emmet/builders/settings.py +21 -17
  23. emmet/builders/utils.py +101 -12
  24. emmet/builders/vasp/materials.py +40 -51
  25. emmet/builders/vasp/mp_potcar_stats.json.gz +0 -0
  26. emmet/builders/vasp/task_validator.py +25 -36
  27. emmet_builders-0.86.0.dist-info/METADATA +37 -0
  28. emmet_builders-0.86.0.dist-info/RECORD +41 -0
  29. {emmet_builders-0.78.3.dist-info → emmet_builders-0.86.0.dist-info}/WHEEL +1 -1
  30. emmet/builders/materials/ml.py +0 -87
  31. emmet/builders/molecules/atomic.py +0 -589
  32. emmet/builders/molecules/bonds.py +0 -324
  33. emmet/builders/molecules/metal_binding.py +0 -526
  34. emmet/builders/molecules/orbitals.py +0 -288
  35. emmet/builders/molecules/redox.py +0 -496
  36. emmet/builders/molecules/summary.py +0 -383
  37. emmet/builders/molecules/thermo.py +0 -500
  38. emmet/builders/molecules/vibration.py +0 -278
  39. emmet/builders/qchem/__init__.py +0 -0
  40. emmet/builders/qchem/molecules.py +0 -734
  41. emmet_builders-0.78.3.dist-info/METADATA +0 -47
  42. emmet_builders-0.78.3.dist-info/RECORD +0 -51
  43. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  44. {emmet_builders-0.78.3.dist-info → emmet_builders-0.86.0.dist-info}/top_level.txt +0 -0
@@ -1,500 +0,0 @@
1
- from collections import defaultdict
2
- from datetime import datetime
3
- from itertools import chain
4
- from math import ceil
5
- from typing import Optional, Iterable, Iterator, List, Dict
6
-
7
- from pymatgen.core.structure import Molecule
8
- from pymatgen.analysis.molecule_matcher import MoleculeMatcher
9
-
10
- from maggma.builders import Builder
11
- from maggma.core import Store
12
- from maggma.utils import grouper
13
-
14
- from emmet.core.qchem.task import TaskDocument
15
- from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
16
- from emmet.core.molecules.thermo import get_free_energy, MoleculeThermoDoc
17
- from emmet.core.qchem.calc_types import TaskType
18
- from emmet.core.utils import jsanitize
19
- from emmet.builders.settings import EmmetBuildSettings
20
-
21
-
22
# Author credit for this module.
__author__ = "Evan Spotte-Smith"

# Build settings instantiated once at import time. The builder class in this
# module loads its own settings through ``EmmetBuildSettings.autoload`` and
# does not read this object.
SETTINGS = EmmetBuildSettings()
25
-
26
# Enthalpy value shared by every single-atom entry in the table below.
# NOTE(review): the consumer multiplies it by 0.043363, so it is presumably
# in kcal/mol -- confirm against emmet.core.molecules.thermo.
_SINGLE_ATOM_ENTHALPY = 1.481

# (formula, entropy) pairs for isolated atoms. The order is kept exactly as in
# the original literal so that iteration order of the resulting dict is the same.
# NOTE(review): the consumer multiplies entropy by 0.000043363, so it is
# presumably in cal/mol-K -- confirm.
_SINGLE_ATOM_ENTROPIES = (
    ("Zn1", 38.384),
    ("Xe1", 40.543),
    ("Tl1", 41.857),
    ("Ti1", 37.524),
    ("Te1", 40.498),
    ("Sr1", 39.334),
    ("Sn1", 40.229),
    ("Si1", 35.921),
    ("Sb1", 40.284),
    ("Se1", 39.05),
    ("S1", 36.319),
    ("Rn1", 42.095),
    ("Pt1", 41.708),
    ("Rb1", 39.23),
    ("Po1", 41.915),
    ("Pb1", 41.901),
    ("P1", 36.224),
    ("O1", 34.254),
    ("Ne1", 34.919),
    ("N1", 33.858),
    ("Na1", 35.336),
    ("Mg1", 35.462),
    ("Li1", 31.798),
    ("Kr1", 39.191),
    ("K1", 36.908),
    ("In1", 40.132),
    ("I1", 40.428),
    ("H1", 26.014),
    ("He1", 30.125),
    ("Ge1", 38.817),
    ("Ga1", 38.609),
    ("F1", 34.767),
    ("Cu1", 38.337),
    ("Cl1", 36.586),
    ("Ca1", 36.984),
    ("C1", 33.398),
    ("Br1", 39.012),
    ("Bi1", 41.915),
    ("Be1", 32.544),
    ("Ba1", 40.676),
    ("B1", 33.141),
    ("Au1", 41.738),
    ("At1", 41.929),
    ("As1", 38.857),
    ("Ar1", 36.983),
    ("Al1", 35.813),
    ("Ag1", 39.917),
)

# Lookup of tabulated enthalpy/entropy for single-atom "molecules", keyed by
# alphabetical formula (e.g. "H1"). Each value is its own dict object.
single_mol_thermo = {
    atom_formula: {"enthalpy": _SINGLE_ATOM_ENTHALPY, "entropy": atom_entropy}
    for atom_formula, atom_entropy in _SINGLE_ATOM_ENTROPIES
}
75
-
76
-
77
class ThermoBuilder(Builder):
    """
    The ThermoBuilder extracts the highest-quality thermodynamic data from a
    MoleculeDoc (lowest electronic energy, highest level of theory for each
    solvent available).

    This builder constructs MoleculeThermoDocs in two different ways: with and
    without single-point energy corrections.

    Before any documents are constructed, the following steps are taken:
        1. Gather MoleculeDocs by formula
        2. For each doc, identify tasks with thermodynamic information such as
            zero-point energy, enthalpy, and entropy. Collect these "documents
            including complete thermodynamics" (DICTs).
        3. Separately, collect single-point energy calculations (SPECs).
        4. Sort both sets of collected tasks (DICT and SPEC) by solvent

    The first type of doc - those without corrections - can be constructed in
    a straightforward fashion:
        5. For each solvent, grab the best DICT (where "best" is defined as the
            task generated using the highest level of theory with the lowest
            electronic energy). If a molecule has no DICTs at all, the SPECs
            are used for this step instead.
        6. Convert this TaskDoc to MoleculeThermoDoc

    The second type - those involving single-point energy corrections - are
    generated differently and in a slightly more involved process:
        7. For each solvent, walk through the SPECs from best to worst:
            7.1 Try to match the SPEC with the DICTs that have an identical
                structure and whose ``evaluate_lot`` score is strictly larger
                than the SPEC's (the comparison in the code is a strict ``<``)
            7.2 Pick the best of the matching DICTs
            7.3 Convert the DICT-SPEC combination to a MoleculeThermoDoc and
                stop at the first SPEC for which this succeeds

    In the case where there are multiple MoleculeThermoDocs made for a given
    solvent, the different MoleculeThermoDocs will be ranked, first by level of
    theory (for a doc made with an energy correction, the scores of the DICT
    and the SPEC levels of theory will be averaged) and then by electronic
    energy.
    """

    def __init__(
        self,
        tasks: Store,
        molecules: Store,
        thermo: Store,
        query: Optional[Dict] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        """
        Args:
            tasks: Store of task documents (source)
            molecules: Store of molecule documents (source)
            thermo: Store the thermo documents are written to (target)
            query: optional criteria used to limit the molecules considered
            settings: optional settings, resolved via EmmetBuildSettings.autoload
            **kwargs: forwarded unchanged to the Builder constructor
        """
        self.tasks = tasks
        self.molecules = molecules
        self.thermo = thermo
        self.query = query if query else dict()
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks, molecules], targets=[thermo], **kwargs)
        # Uncomment in case of issue with mrun not connecting automatically to collections
        # for i in [self.tasks, self.molecules, self.thermo]:
        #     try:
        #         i.connect()
        #     except Exception as e:
        #         print("Could not connect,", e)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        for field in ("task_id", "last_updated", "state", "formula_alphabetical"):
            self.tasks.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
        ):
            self.molecules.ensure_index(field)

        # Search index for thermo
        for field in (
            "molecule_id",
            "task_id",
            "solvent",
            "lot_solvent",
            "property_id",
            "last_updated",
            "formula_alphabetical",
        ):
            self.thermo.ensure_index(field)

    def _find_unprocessed(self, criteria: Dict) -> tuple:
        """
        Determine which molecules still need thermo documents.

        Args:
            criteria: query applied to the molecules Store

        Returns:
            (molecule keys, formulas): the set of molecule keys matching
            ``criteria`` that do not appear among the ``molecule_id`` values
            of the thermo Store, and the set of formulas of those molecules
        """
        all_mols = list(
            self.molecules.query(criteria, [self.molecules.key, "formula_alphabetical"])
        )

        processed_docs = set(self.thermo.distinct("molecule_id"))
        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
        to_process_forms = {
            d["formula_alphabetical"]
            for d in all_mols
            if d[self.molecules.key] in to_process_docs
        }
        return to_process_docs, to_process_forms

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """Prechunk the builder for distributed computation"""

        temp_query = dict(self.query)
        temp_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        _, to_process_forms = self._find_unprocessed(temp_query)

        # Nothing left to do: stop here instead of asking grouper for chunks
        # of size zero.
        if not to_process_forms:
            return

        N = ceil(len(to_process_forms) / number_splits)

        for formula_chunk in grouper(to_process_forms, N):
            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}

    def get_items(self) -> Iterator[List[Dict]]:
        """
        Gets all items to process into thermo documents.
        This does no datetime checking; relying on whether
        molecule_ids are already present in the thermo Store

        Returns:
            generator yielding, for each unprocessed formula, the list of
            molecule documents with that formula
        """

        self.logger.info("Thermo builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        # Get all processed molecules
        temp_query = dict(self.query)
        temp_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        to_process_docs, to_process_forms = self._find_unprocessed(temp_query)

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")

        # Set total for builder bars to have a total
        self.total = len(to_process_forms)

        for formula in to_process_forms:
            mol_query = dict(temp_query)
            mol_query["formula_alphabetical"] = formula
            molecules = list(self.molecules.query(criteria=mol_query))

            yield molecules

    @staticmethod
    def _entry_rank(entry: Dict) -> tuple:
        """Sort key for molecule entries: (level-of-theory score, energy); smaller is better."""
        return (sum(evaluate_lot(entry["level_of_theory"])), entry["energy"])

    @staticmethod
    def _doc_rank(doc: MoleculeThermoDoc) -> tuple:
        """
        Sort key for finished thermo docs; smaller is better.

        For a doc with a correction, the level-of-theory scores of the base
        task and the correction task are averaged.
        """
        lot_score = sum(evaluate_lot(doc.level_of_theory))
        if doc.correction_level_of_theory is not None:
            spec_lot = sum(evaluate_lot(doc.correction_level_of_theory))
            lot_score = (lot_score + spec_lot) / 2
        return (lot_score, doc.electronic_energy)

    def _query_task(self, task_id, extra_criteria: Optional[Dict] = None):
        """
        Fetch one task document by ``task_id``.

        The lookup is first made with ``task_id`` as given; if nothing is
        found it is retried with ``int(task_id)``. Returns None when neither
        lookup finds a document or the id cannot be converted to an int.

        Args:
            task_id: task identifier as stored on the molecule entry
            extra_criteria: additional query criteria combined with the id
        """
        criteria = {"task_id": task_id}
        if extra_criteria:
            criteria.update(extra_criteria)

        tdoc = self.tasks.query_one(criteria)
        if tdoc is not None:
            return tdoc

        try:
            criteria["task_id"] = int(task_id)
        except (TypeError, ValueError):
            return None
        return self.tasks.query_one(criteria)

    @staticmethod
    def _add_single_atom_enthalpy_entropy(
        task: TaskDocument, doc: MoleculeThermoDoc
    ) -> MoleculeThermoDoc:
        """
        Fill in enthalpy/entropy for single-atom molecules from the
        ``single_mol_thermo`` table when the doc lacks either value.

        The doc is modified in place and also returned. Docs for molecules
        with more than one atom, docs that already have both values, and
        atoms missing from the table are returned unchanged.
        """
        initial_mol = task.output.initial_molecule
        if len(initial_mol) != 1:
            return doc
        if doc.total_enthalpy is not None and doc.total_entropy is not None:
            return doc

        vals = single_mol_thermo.get(initial_mol.composition.alphabetical_formula)
        if vals is None:
            return doc

        # Same conversion factors as the original code applied to the table.
        enthalpy = vals["enthalpy"] * 0.043363
        entropy = vals["entropy"] * 0.000043363
        doc.total_enthalpy = enthalpy
        doc.total_entropy = entropy
        doc.translational_enthalpy = enthalpy
        doc.translational_entropy = entropy
        # get_free_energy receives the unconverted table values, as before.
        doc.free_energy = get_free_energy(
            doc.electronic_energy,
            vals["enthalpy"],
            vals["entropy"],
            convert_energy=False,
        )
        return doc

    def process_item(self, items: List[Dict]) -> List[Dict]:
        """
        Process the tasks into a MoleculeThermoDoc

        Args:
            items List[dict] : a list of MoleculeDocs in dict form

        Returns:
            [dict] : a list of new thermo docs (empty if ``items`` is empty)
        """

        # An empty group has no formula to report and nothing to build.
        if not items:
            return []

        mols = [MoleculeDoc(**item) for item in items]
        formula = mols[0].formula_alphabetical
        mol_ids = [m.molecule_id for m in mols]
        self.logger.debug(f"Processing {formula} : {mol_ids}")

        thermo_docs = list()

        mm = MoleculeMatcher(tolerance=0.000001)

        for mol in mols:
            this_thermo_docs = list()

            # Collect DICTs: entries with enthalpy and entropy that share the
            # molecule's charge and spin multiplicity
            thermo_entries = [
                e
                for e in mol.entries
                if e["output"]["enthalpy"] is not None
                and e["output"]["entropy"] is not None
                and e["charge"] == mol.charge
                and e["spin_multiplicity"] == mol.spin_multiplicity
            ]

            # Collect SPECs: single-point / force entries with the same
            # charge and spin multiplicity
            sp_entries = list()
            for entry in mol.entries:
                task_type = entry["task_type"]
                if isinstance(task_type, TaskType):
                    task_type = task_type.value

                if (
                    task_type in ("Single Point", "Force")
                    and entry["charge"] == mol.charge
                    and entry["spin_multiplicity"] == mol.spin_multiplicity
                ):
                    sp_entries.append(entry)

            # Group both DICTs and SPECs by solvent environment
            by_solvent_dict = defaultdict(list)
            by_solvent_spec = defaultdict(list)
            for entry in thermo_entries:
                by_solvent_dict[entry["solvent"]].append(entry)
            for entry in sp_entries:
                by_solvent_spec[entry["solvent"]].append(entry)

            # Without any DICT, fall back to the SPECs for uncorrected docs
            without_corrections = by_solvent_dict if thermo_entries else by_solvent_spec

            # Construct without corrections
            for entries in without_corrections.values():
                best = min(entries, key=self._entry_rank)

                tdoc = self._query_task(
                    best["task_id"],
                    {"formula_alphabetical": formula, "orig": {"$exists": True}},
                )
                if tdoc is None:
                    continue

                task_doc = TaskDocument(**tdoc)
                thermo_doc = MoleculeThermoDoc.from_task(
                    task_doc, molecule_id=mol.molecule_id, deprecated=False
                )
                thermo_doc = self._add_single_atom_enthalpy_entropy(
                    task_doc, thermo_doc
                )
                this_thermo_docs.append(thermo_doc)

            # Construct with corrections. Without DICTs no SPEC can be paired,
            # so the section is skipped entirely in that case.
            if thermo_entries:
                for entries in by_solvent_spec.values():
                    for best_spec in sorted(entries, key=self._entry_rank):
                        # Invariant for the inner loop over DICTs
                        spec_mol = Molecule.from_dict(best_spec["molecule"])
                        spec_lot = sum(evaluate_lot(best_spec["level_of_theory"]))

                        matching_structures = list()
                        for entry in thermo_entries:
                            dict_mol = Molecule.from_dict(entry["molecule"])
                            if (
                                mm.fit(dict_mol, spec_mol) or dict_mol == spec_mol
                            ) and spec_lot < sum(
                                evaluate_lot(entry["level_of_theory"])
                            ):
                                matching_structures.append(entry)

                        if not matching_structures:
                            continue

                        best_dict = min(matching_structures, key=self._entry_rank)

                        tdict = self._query_task(best_dict["task_id"])
                        tspec = self._query_task(best_spec["task_id"])
                        if tdict is None or tspec is None:
                            continue

                        task_doc_dict = TaskDocument(**tdict)
                        task_doc_spec = TaskDocument(**tspec)
                        thermo_doc = MoleculeThermoDoc.from_task(
                            task_doc_dict,
                            correction_task=task_doc_spec,
                            molecule_id=mol.molecule_id,
                            deprecated=False,
                        )
                        thermo_doc = self._add_single_atom_enthalpy_entropy(
                            task_doc_dict, thermo_doc
                        )
                        this_thermo_docs.append(thermo_doc)
                        # Only the first SPEC that yields a doc is used
                        break

            # Corrected docs are grouped under the correction's solvent
            docs_by_solvent = defaultdict(list)
            for doc in this_thermo_docs:
                if doc.correction_solvent is not None:
                    docs_by_solvent[doc.correction_solvent].append(doc)
                else:
                    docs_by_solvent[doc.solvent].append(doc)

            # If multiple documents exist for the same solvent, grab the best one
            for collection in docs_by_solvent.values():
                thermo_docs.append(min(collection, key=self._doc_rank))

        self.logger.debug(f"Produced {len(thermo_docs)} thermo docs for {formula}")

        return jsanitize([doc.model_dump() for doc in thermo_docs], allow_bson=True)

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new thermo docs into the thermo collection

        Args:
            items [[dict]]: A list of documents to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Decide on the flattened list: a batch made only of empty lists has
        # nothing to write and must not touch the store.
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp
        for item in docs:
            item.update(
                {
                    "_bt": self.timestamp,
                }
            )

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} thermo documents")
        # NOTE(review): the removal filters on the store's key field using
        # molecule ids; this only matches if that key is "molecule_id" -- confirm.
        self.thermo.remove_docs({self.thermo.key: {"$in": molecule_ids}})
        self.thermo.update(
            docs=docs,
            key=["molecule_id", "solvent"],
        )