emmet-builders 0.84.10rc1__py3-none-any.whl → 0.85.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of emmet-builders might be problematic. Click here for more details.

Files changed (35):
  1. emmet/builders/abinit/phonon.py +12 -14
  2. emmet/builders/abinit/sound_velocity.py +1 -1
  3. emmet/builders/materials/absorption_spectrum.py +16 -10
  4. emmet/builders/materials/dielectric.py +10 -7
  5. emmet/builders/materials/elasticity.py +13 -9
  6. emmet/builders/materials/electrodes.py +1 -1
  7. emmet/builders/materials/electronic_structure.py +1 -1
  8. emmet/builders/materials/magnetism.py +2 -1
  9. emmet/builders/materials/piezoelectric.py +23 -19
  10. emmet/builders/materials/provenance.py +3 -4
  11. emmet/builders/materials/substrates.py +2 -2
  12. emmet/builders/materials/summary.py +2 -2
  13. emmet/builders/settings.py +14 -9
  14. emmet/builders/utils.py +5 -4
  15. emmet/builders/vasp/materials.py +11 -4
  16. emmet/builders/vasp/task_validator.py +3 -1
  17. {emmet_builders-0.84.10rc1.dist-info → emmet_builders-0.85.0rc0.dist-info}/METADATA +7 -30
  18. emmet_builders-0.85.0rc0.dist-info/RECORD +41 -0
  19. emmet/builders/materials/ml.py +0 -101
  20. emmet/builders/molecules/atomic.py +0 -592
  21. emmet/builders/molecules/bonds.py +0 -329
  22. emmet/builders/molecules/electric.py +0 -287
  23. emmet/builders/molecules/metal_binding.py +0 -528
  24. emmet/builders/molecules/orbitals.py +0 -292
  25. emmet/builders/molecules/redox.py +0 -502
  26. emmet/builders/molecules/summary.py +0 -406
  27. emmet/builders/molecules/thermo.py +0 -505
  28. emmet/builders/molecules/trajectory.py +0 -530
  29. emmet/builders/molecules/vibration.py +0 -282
  30. emmet/builders/qchem/__init__.py +0 -0
  31. emmet/builders/qchem/molecules.py +0 -745
  32. emmet_builders-0.84.10rc1.dist-info/RECORD +0 -54
  33. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  34. {emmet_builders-0.84.10rc1.dist-info → emmet_builders-0.85.0rc0.dist-info}/WHEEL +0 -0
  35. {emmet_builders-0.84.10rc1.dist-info → emmet_builders-0.85.0rc0.dist-info}/top_level.txt +0 -0
@@ -1,505 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from collections import defaultdict
4
- from datetime import datetime
5
- from itertools import chain
6
- from math import ceil
7
-
8
- from maggma.builders import Builder
9
- from maggma.core import Store
10
- from maggma.utils import grouper
11
- from pymatgen.analysis.molecule_matcher import MoleculeMatcher
12
- from pymatgen.core.structure import Molecule
13
-
14
- from emmet.builders.settings import EmmetBuildSettings
15
- from emmet.core.molecules.thermo import MoleculeThermoDoc, get_free_energy
16
- from emmet.core.qchem.calc_types import TaskType
17
- from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
18
- from emmet.core.qchem.task import TaskDocument
19
- from emmet.core.utils import jsanitize
20
-
21
- from typing import TYPE_CHECKING
22
-
23
- if TYPE_CHECKING:
24
- from collections.abc import Iterable, Iterator
25
-
26
-
27
# Module author metadata.
__author__ = "Evan Spotte-Smith"

# Module-level build settings, instantiated once at import time.
# NOTE(review): ThermoBuilder loads its own settings through
# EmmetBuildSettings.autoload(), so nothing visible in this module reads
# SETTINGS — confirm whether external code imports it before removing.
SETTINGS = EmmetBuildSettings()
30
-
31
# Fallback thermodynamic values for single-atom "molecules", keyed by
# alphabetical formula (e.g. "H1").  Every species shares the same enthalpy,
# so only the entropies are tabulated and the public table is expanded below.
# NOTE(review): units are presumably kcal/mol (enthalpy) and cal/(mol*K)
# (entropy), judging by the conversion factors applied where the table is
# consumed — confirm against the data source.
_SINGLE_ATOM_ENTHALPY = 1.481

_SINGLE_ATOM_ENTROPY = {
    "Zn1": 38.384,
    "Xe1": 40.543,
    "Tl1": 41.857,
    "Ti1": 37.524,
    "Te1": 40.498,
    "Sr1": 39.334,
    "Sn1": 40.229,
    "Si1": 35.921,
    "Sb1": 40.284,
    "Se1": 39.05,
    "S1": 36.319,
    "Rn1": 42.095,
    "Pt1": 41.708,
    "Rb1": 39.23,
    "Po1": 41.915,
    "Pb1": 41.901,
    "P1": 36.224,
    "O1": 34.254,
    "Ne1": 34.919,
    "N1": 33.858,
    "Na1": 35.336,
    "Mg1": 35.462,
    "Li1": 31.798,
    "Kr1": 39.191,
    "K1": 36.908,
    "In1": 40.132,
    "I1": 40.428,
    "H1": 26.014,
    "He1": 30.125,
    "Ge1": 38.817,
    "Ga1": 38.609,
    "F1": 34.767,
    "Cu1": 38.337,
    "Cl1": 36.586,
    "Ca1": 36.984,
    "C1": 33.398,
    "Br1": 39.012,
    "Bi1": 41.915,
    "Be1": 32.544,
    "Ba1": 40.676,
    "B1": 33.141,
    "Au1": 41.738,
    "At1": 41.929,
    "As1": 38.857,
    "Ar1": 36.983,
    "Al1": 35.813,
    "Ag1": 39.917,
}

# Public table: formula -> {"enthalpy": ..., "entropy": ...}; insertion order
# follows the entropy table above, and each value is its own dict object.
single_mol_thermo = {
    formula: {"enthalpy": _SINGLE_ATOM_ENTHALPY, "entropy": entropy}
    for formula, entropy in _SINGLE_ATOM_ENTROPY.items()
}
80
-
81
-
82
class ThermoBuilder(Builder):
    """
    The ThermoBuilder extracts the highest-quality thermodynamic data from a
    MoleculeDoc (lowest electronic energy, highest level of theory for each
    solvent available).

    This builder constructs MoleculeThermoDocs in two different ways: with and
    without single-point energy corrections.

    Before any documents are constructed, the following steps are taken:
        1. Gather MoleculeDocs by species hash
        2. For each doc, identify entries with thermodynamic information
           (enthalpy and entropy both present). Collect these "documents
           including complete thermodynamics" (DICTs).
        3. Separately, collect single-point energy calculations (SPECs);
           entries of task type "Single Point" or "Force" qualify.
        4. Sort both sets of collected entries (DICT and SPEC) by solvent

    The first type of doc - those without corrections - can be constructed in
    a straightforward fashion:
        5. For each solvent, grab the best DICT (where "best" means the lowest
           summed ``evaluate_lot`` score, ties broken by the lowest
           electronic energy). If a molecule has no DICTs at all, the SPECs
           are used in their place.
        6. Convert the corresponding task to a MoleculeThermoDoc

    The second type - those involving single-point energy corrections - are
    generated differently and in a slightly more involved process:
        7. For each solvent that has SPECs, walk the SPECs from best to worst:
            7.1 Collect every DICT whose structure matches the SPEC and whose
                level-of-theory score is strictly worse than the SPEC's (the
                comparison is a strict "<", so a DICT and SPEC at the same
                level of theory are not paired)
            7.2 Take the best of those DICTs
            7.3 Convert the DICT-SPEC combination to a MoleculeThermoDoc and
                stop at the first SPEC for which this succeeds

    In the case where there are multiple MoleculeThermoDocs made for a given
    solvent, the different MoleculeThermoDocs will be ranked, first by level of
    theory (for a doc made with an energy correction, the scores of the DICT
    and the SPEC levels of theory will be averaged) and then by electronic
    energy.
    """

    def __init__(
        self,
        tasks: Store,
        molecules: Store,
        thermo: Store,
        query: dict | None = None,
        settings: EmmetBuildSettings | None = None,
        **kwargs,
    ):
        """
        Args:
            tasks: source Store of task documents
            molecules: source Store of molecule documents
            thermo: target Store receiving the thermo documents
            query: extra criteria applied when selecting molecules; an empty
                or missing query selects everything
            settings: build settings, resolved via EmmetBuildSettings.autoload
            **kwargs: forwarded unchanged to the Builder base class
        """
        self.tasks = tasks
        self.molecules = molecules
        self.thermo = thermo
        self.query = query or {}
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks, molecules], targets=[thermo], **kwargs)
        # Uncomment in case of issue with mrun not connecting automatically to collections
        # for i in [self.tasks, self.molecules, self.thermo]:
        #     try:
        #         i.connect()
        #     except Exception as e:
        #         print("Could not connect,", e)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        for field in (
            "task_id",
            "last_updated",
            "state",
            "formula_alphabetical",
            "species_hash",
        ):
            self.tasks.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
            "species_hash",
        ):
            self.molecules.ensure_index(field)

        # Search index for thermo
        for field in (
            "molecule_id",
            "task_id",
            "solvent",
            "lot_solvent",
            "property_id",
            "last_updated",
            "formula_alphabetical",
        ):
            self.thermo.ensure_index(field)

    def _find_unprocessed(self, criteria: dict) -> tuple[set, set]:
        """
        Find molecules matching ``criteria`` that have no thermo document yet.

        Returns:
            (molecule keys still to process, species hashes of those molecules)
        """

        self.logger.info("Finding documents to process")
        key = self.molecules.key
        all_mols = list(self.molecules.query(criteria, [key, "species_hash"]))

        processed_docs = set(self.thermo.distinct("molecule_id"))
        to_process_docs = {d[key] for d in all_mols} - processed_docs
        to_process_hashes = {
            d["species_hash"] for d in all_mols if d[key] in to_process_docs
        }
        return to_process_docs, to_process_hashes

    def _find_task(self, task_id, **criteria) -> dict | None:
        """
        Fetch one task document by ``task_id`` plus any extra ``criteria``.

        If nothing is found, the lookup is retried with ``int(task_id)``; a
        ValueError from that retry is treated as "not found".
        """

        doc = self.tasks.query_one({"task_id": task_id, **criteria})
        if doc is None:
            try:
                doc = self.tasks.query_one({"task_id": int(task_id), **criteria})
            except ValueError:
                doc = None
        return doc

    @staticmethod
    def _lot_score(level_of_theory) -> float:
        """Collapse the evaluate_lot result into one number (lower sorts first)."""
        return sum(evaluate_lot(level_of_theory))

    @staticmethod
    def _entry_rank(entry: dict) -> tuple:
        """Sort key for entries: level-of-theory score, then energy."""
        return (sum(evaluate_lot(entry["level_of_theory"])), entry["energy"])

    @staticmethod
    def _add_single_atom_enthalpy_entropy(
        task: TaskDocument, doc: MoleculeThermoDoc
    ) -> MoleculeThermoDoc:
        """
        Fill in tabulated enthalpy/entropy for a single-atom molecule.

        Only acts when the task's initial molecule has exactly one site, the
        doc lacks total enthalpy or total entropy, and the formula is present
        in ``single_mol_thermo``. The doc is modified in place and returned.
        """

        initial_mol = task.output.initial_molecule
        if len(initial_mol) != 1:
            return doc
        if doc.total_enthalpy is not None and doc.total_entropy is not None:
            return doc

        vals = single_mol_thermo.get(initial_mol.composition.alphabetical_formula)
        if vals is None:
            return doc

        # NOTE(review): the factors presumably convert kcal/mol -> eV and
        # cal/(mol*K) -> eV/K — confirm against the table's source units.
        doc.total_enthalpy = vals["enthalpy"] * 0.043363
        doc.total_entropy = vals["entropy"] * 0.000043363
        doc.translational_enthalpy = vals["enthalpy"] * 0.043363
        doc.translational_entropy = vals["entropy"] * 0.000043363
        # The free energy is computed from the unconverted table values.
        doc.free_energy = get_free_energy(
            doc.electronic_energy,
            vals["enthalpy"],
            vals["entropy"],
            convert_energy=False,
        )
        return doc

    def prechunk(self, number_splits: int) -> Iterable[dict]:  # pragma: no cover
        """Prechunk the builder for distributed computation"""

        temp_query = {**self.query, "deprecated": False}
        _, to_process_hashes = self._find_unprocessed(temp_query)

        # Nothing to split; also avoids asking grouper for chunks of size 0.
        if not to_process_hashes:
            return

        N = ceil(len(to_process_hashes) / number_splits)

        for hash_chunk in grouper(to_process_hashes, N):
            query = dict(temp_query)
            query["species_hash"] = {"$in": list(hash_chunk)}
            yield {"query": query}

    def get_items(self) -> Iterator[list[dict]]:
        """
        Gets all items to process into thermo documents.
        This does no datetime checking; it relies on whether the molecule's
        ID already appears as a molecule_id in the thermo Store.

        Returns:
            generator of lists of molecule documents, one list per species hash
        """

        self.logger.info("Thermo builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime.
        # NOTE: utcnow() yields a naive datetime and is deprecated since
        # Python 3.12; it is kept so the stored "_bt" value does not change.
        self.timestamp = datetime.utcnow()

        # Only non-deprecated molecules are considered
        temp_query = {**self.query, "deprecated": False}
        to_process_docs, to_process_hashes = self._find_unprocessed(temp_query)

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")

        # Set total for builder bars to have a total
        self.total = len(to_process_hashes)

        for shash in to_process_hashes:
            mol_query = {**temp_query, "species_hash": shash}
            yield list(self.molecules.query(criteria=mol_query))

    def process_item(self, items: list[dict]) -> list[dict]:
        """
        Process the tasks into a MoleculeThermoDoc

        Args:
            items list[dict] : a list of MoleculeDocs in dict form

        Returns:
            [dict] : a list of new thermo docs
        """

        # Guard: an empty group has no species hash to report and no docs.
        if not items:
            return []

        mols = [MoleculeDoc(**item) for item in items]
        shash = mols[0].species_hash
        mol_ids = [m.molecule_id for m in mols]
        self.logger.debug(f"Processing {shash} : {mol_ids}")

        thermo_docs = []

        mm = MoleculeMatcher(tolerance=0.000001)

        for mol in mols:
            this_thermo_docs = []

            # Collect DICTs: entries with full thermodynamics whose charge and
            # spin multiplicity match the molecule
            thermo_entries = [
                e
                for e in mol.entries
                if e["output"]["enthalpy"] is not None
                and e["output"]["entropy"] is not None
                and e["charge"] == mol.charge
                and e["spin_multiplicity"] == mol.spin_multiplicity
            ]

            # Collect SPECs: single-point / force entries with matching charge
            # and spin multiplicity
            sp_entries = []
            for entry in mol.entries:
                task_type = entry["task_type"]
                if isinstance(task_type, TaskType):
                    task_type = task_type.value

                if (
                    task_type in ("Single Point", "Force")
                    and entry["charge"] == mol.charge
                    and entry["spin_multiplicity"] == mol.spin_multiplicity
                ):
                    sp_entries.append(entry)

            # Group both DICTs and SPECs by solvent environment
            by_solvent_dict = defaultdict(list)
            by_solvent_spec = defaultdict(list)
            for entry in thermo_entries:
                by_solvent_dict[entry["solvent"]].append(entry)
            for entry in sp_entries:
                by_solvent_spec[entry["solvent"]].append(entry)

            # Without any DICTs, fall back to the SPECs for uncorrected docs
            without_corrections = by_solvent_dict if thermo_entries else by_solvent_spec

            # Construct without corrections
            for entries in without_corrections.values():
                best = min(entries, key=self._entry_rank)

                tdoc = self._find_task(
                    best["task_id"],
                    species_hash=shash,
                    orig={"$exists": True},
                )
                if tdoc is None:
                    continue

                task_doc = TaskDocument(**tdoc)
                thermo_doc = MoleculeThermoDoc.from_task(
                    task_doc, molecule_id=mol.molecule_id, deprecated=False
                )
                thermo_doc = self._add_single_atom_enthalpy_entropy(
                    task_doc, thermo_doc
                )
                this_thermo_docs.append(thermo_doc)

            # Construct with corrections. A correction needs a DICT to pair
            # with, so there is nothing to do when no DICTs were collected.
            corrected_groups = by_solvent_spec if thermo_entries else {}
            for entries in corrected_groups.values():
                for best_spec in sorted(entries, key=self._entry_rank):
                    # Invariant across the DICT scan below, so computed once
                    spec_mol = Molecule.from_dict(best_spec["molecule"])
                    spec_score = self._lot_score(best_spec["level_of_theory"])

                    matching_structures = []
                    for entry in thermo_entries:
                        dict_mol = Molecule.from_dict(entry["molecule"])
                        same_structure = (
                            mm.fit(dict_mol, spec_mol) or dict_mol == spec_mol
                        )
                        # Strict comparison: the SPEC must score lower than
                        # the DICT for the pair to be used
                        if same_structure and spec_score < self._lot_score(
                            entry["level_of_theory"]
                        ):
                            matching_structures.append(entry)

                    if not matching_structures:
                        continue

                    best_dict = min(matching_structures, key=self._entry_rank)

                    tdict = self._find_task(best_dict["task_id"])
                    tspec = self._find_task(best_spec["task_id"])
                    if tdict is None or tspec is None:
                        continue

                    task_doc_dict = TaskDocument(**tdict)
                    task_doc_spec = TaskDocument(**tspec)
                    thermo_doc = MoleculeThermoDoc.from_task(
                        task_doc_dict,
                        correction_task=task_doc_spec,
                        molecule_id=mol.molecule_id,
                        deprecated=False,
                    )
                    thermo_doc = self._add_single_atom_enthalpy_entropy(
                        task_doc_dict, thermo_doc
                    )
                    this_thermo_docs.append(thermo_doc)
                    # Only the first SPEC that yields a doc is used per solvent
                    break

            # Corrected docs are filed under the correction's solvent
            docs_by_solvent = defaultdict(list)
            for doc in this_thermo_docs:
                if doc.correction_solvent is not None:
                    docs_by_solvent[doc.correction_solvent].append(doc)
                else:
                    docs_by_solvent[doc.solvent].append(doc)

            # If multiple documents exist for the same solvent, grab the best one
            for collection in docs_by_solvent.values():
                ranked = []
                for member in collection:
                    score = self._lot_score(member.level_of_theory)
                    if member.correction_level_of_theory is not None:
                        # Average the DICT and SPEC level-of-theory scores
                        spec_lot = self._lot_score(member.correction_level_of_theory)
                        score = (score + spec_lot) / 2
                    ranked.append((score, member.electronic_energy, member))

                thermo_docs.append(min(ranked, key=lambda r: (r[0], r[1]))[2])

        self.logger.debug(f"Produced {len(thermo_docs)} thermo docs for {shash}")

        return jsanitize([doc.model_dump() for doc in thermo_docs], allow_bson=True)

    def update_targets(self, items: list[list[dict]]):
        """
        Inserts the new thermo docs into the thermo collection

        Args:
            items [[dict]]: A list of documents to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Decide on the flattened docs: a batch made only of empty lists has
        # nothing to write even though ``items`` itself is non-empty.
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp
        for item in docs:
            item.update({"_bt": self.timestamp})

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} thermo documents")
        # NOTE(review): the removal is keyed on self.thermo.key but filtered
        # by molecule ids, so it presumably expects the thermo store's key to
        # be "molecule_id" — confirm against how the store is configured.
        self.thermo.remove_docs({self.thermo.key: {"$in": molecule_ids}})
        self.thermo.update(
            docs=docs,
            key=["molecule_id", "solvent"],
        )