emmet-builders 0.84.10rc2__py3-none-any.whl → 0.85.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of emmet-builders might be problematic. Click here for more details.

Files changed (36) hide show
  1. emmet/builders/abinit/phonon.py +12 -14
  2. emmet/builders/abinit/sound_velocity.py +1 -1
  3. emmet/builders/materials/absorption_spectrum.py +16 -10
  4. emmet/builders/materials/alloys.py +1 -1
  5. emmet/builders/materials/corrected_entries.py +1 -1
  6. emmet/builders/materials/dielectric.py +10 -7
  7. emmet/builders/materials/elasticity.py +12 -9
  8. emmet/builders/materials/electrodes.py +1 -1
  9. emmet/builders/materials/electronic_structure.py +1 -1
  10. emmet/builders/materials/magnetism.py +2 -1
  11. emmet/builders/materials/piezoelectric.py +23 -19
  12. emmet/builders/materials/provenance.py +3 -4
  13. emmet/builders/materials/summary.py +1 -1
  14. emmet/builders/settings.py +14 -9
  15. emmet/builders/utils.py +5 -4
  16. emmet/builders/vasp/materials.py +11 -4
  17. emmet/builders/vasp/task_validator.py +3 -1
  18. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/METADATA +7 -30
  19. emmet_builders-0.85.0.dist-info/RECORD +41 -0
  20. emmet/builders/materials/ml.py +0 -101
  21. emmet/builders/molecules/atomic.py +0 -592
  22. emmet/builders/molecules/bonds.py +0 -329
  23. emmet/builders/molecules/electric.py +0 -287
  24. emmet/builders/molecules/metal_binding.py +0 -528
  25. emmet/builders/molecules/orbitals.py +0 -292
  26. emmet/builders/molecules/redox.py +0 -502
  27. emmet/builders/molecules/summary.py +0 -406
  28. emmet/builders/molecules/thermo.py +0 -505
  29. emmet/builders/molecules/trajectory.py +0 -530
  30. emmet/builders/molecules/vibration.py +0 -282
  31. emmet/builders/qchem/__init__.py +0 -0
  32. emmet/builders/qchem/molecules.py +0 -745
  33. emmet_builders-0.84.10rc2.dist-info/RECORD +0 -54
  34. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  35. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/WHEEL +0 -0
  36. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/top_level.txt +0 -0
@@ -1,502 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import copy
4
- from collections import defaultdict
5
- from datetime import datetime
6
- from itertools import chain, groupby
7
- from math import ceil
8
-
9
- from maggma.builders import Builder
10
- from maggma.core import Store
11
- from maggma.utils import grouper
12
-
13
- from emmet.builders.settings import EmmetBuildSettings
14
- from emmet.core.molecules.bonds import metals
15
- from emmet.core.molecules.redox import RedoxDoc
16
- from emmet.core.molecules.thermo import MoleculeThermoDoc
17
- from emmet.core.qchem.molecule import MoleculeDoc
18
- from emmet.core.qchem.task import TaskDocument
19
- from emmet.core.utils import confirm_molecule, get_graph_hash, jsanitize
20
-
21
- from typing import TYPE_CHECKING
22
-
23
- if TYPE_CHECKING:
24
- from collections.abc import Iterable, Iterator
25
- from typing import Any
26
-
27
__author__ = "Evan Spotte-Smith"

# Module-level settings object built with default arguments at import time.
# NOTE(review): RedoxBuilder below uses its own `self.settings`; nothing in
# this module reads SETTINGS — confirm external users before removing.
SETTINGS = EmmetBuildSettings()
30
-
31
-
32
class RedoxBuilder(Builder):
    """
    Build RedoxDocs (vertical and adiabatic reduction and oxidation
    potentials, etc.) from MoleculeDocs, their MoleculeThermoDocs, and
    single-point tasks.

    The process is as follows:
        1. Gather MoleculeDocs by species hash
        2. Further group based on (covalent) isomorphism and charge
        3. For each MoleculeDoc:
            3a. Identify relevant MoleculeThermoDocs
            3b. Look for single-point energy calculations conducted at the
                molecule's charge +- 1. These will be used to calculate
                vertical electron affinities and ionization energies
            3c. Group MoleculeThermoDocs and single-point calculations based
                on solvent and level of theory
        4. Construct RedoxDocs by looking for molecules (with associated
           calculations) that:
            - Have charges that differ by +- 1
            - Use the same solvent and level of theory
    """

    def __init__(
        self,
        tasks: Store,
        molecules: Store,
        thermo: Store,
        redox: Store,
        query: dict | None = None,
        settings: EmmetBuildSettings | None = None,
        **kwargs,
    ):
        """
        Args:
            tasks: Store of task documents (source).
            molecules: Store of molecule documents (source).
            thermo: Store of molecule thermo documents (source).
            redox: Store that receives the redox documents (target).
            query: Extra criteria applied when selecting molecule documents;
                a falsy value is treated as an empty query.
            settings: Build settings, resolved via EmmetBuildSettings.autoload.
            **kwargs: Forwarded unchanged to Builder.__init__.
        """
        self.tasks = tasks
        self.molecules = molecules
        self.thermo = thermo
        self.redox = redox
        self.query = query if query else dict()
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks, molecules, thermo], targets=[redox], **kwargs)
        # If mrun does not connect to the collections automatically, call
        # .connect() on self.tasks, self.molecules, self.thermo and self.redox
        # here (wrapped in try/except) as a workaround.

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        index_plan = (
            # Basic search index for tasks
            (
                self.tasks,
                (
                    "task_id",
                    "last_updated",
                    "state",
                    "formula_alphabetical",
                    "species_hash",
                ),
            ),
            # Search index for molecules
            (
                self.molecules,
                (
                    "molecule_id",
                    "last_updated",
                    "task_ids",
                    "formula_alphabetical",
                    "species_hash",
                ),
            ),
            # Search index for thermo
            (
                self.thermo,
                (
                    "molecule_id",
                    "task_id",
                    "solvent",
                    "lot_solvent",
                    "property_id",
                    "last_updated",
                    "formula_alphabetical",
                ),
            ),
            # Search index for redox
            (
                self.redox,
                (
                    "molecule_id",
                    "solvent",
                    "lot_solvent",
                    "property_id",
                    "last_updated",
                    "formula_alphabetical",
                ),
            ),
        )

        for store, fields in index_plan:
            for field in fields:
                store.ensure_index(field)

    def _find_unprocessed(self) -> tuple[dict, set, set]:
        """
        Work out which molecules still lack a redox document.

        Returns:
            A 3-tuple of:
              - the base molecule query (self.query plus deprecated=False),
              - the set of molecule keys with no entry in the redox store,
              - the set of species hashes those molecules belong to.
        """

        temp_query = dict(self.query)
        temp_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        all_mols = list(
            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
        )

        processed_docs = set(self.redox.distinct("molecule_id"))
        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
        to_process_hashes = {
            d["species_hash"]
            for d in all_mols
            if d[self.molecules.key] in to_process_docs
        }

        return temp_query, to_process_docs, to_process_hashes

    def prechunk(self, number_splits: int) -> Iterable[dict]:  # pragma: no cover
        """
        Prechunk the builder for distributed computation

        Args:
            number_splits: Number of chunks the unprocessed species hashes
                should be divided into.

        Yields:
            dicts of the form {"query": <molecule query>} where the query is
            restricted to one chunk of species hashes.
        """

        temp_query, _, to_process_hashes = self._find_unprocessed()

        chunk_size = ceil(len(to_process_hashes) / number_splits)

        for hash_chunk in grouper(to_process_hashes, chunk_size):
            query = dict(temp_query)
            query["species_hash"] = {"$in": list(hash_chunk)}
            yield {"query": query}

    def get_items(self) -> Iterator[list[dict]]:
        """
        Gets all items to process into redox documents.
        This does no datetime checking; it relies on whether the molecule
        already has an entry in the redox Store.

        Yields:
            One list of molecule documents (as dicts) per unprocessed
            species hash.
        """

        self.logger.info("Redox builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime; update_targets stamps it on docs
        self.timestamp = datetime.utcnow()

        temp_query, to_process_docs, to_process_hashes = self._find_unprocessed()

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")

        # Set total for builder bars to have a total
        self.total = len(to_process_hashes)

        for shash in to_process_hashes:
            mol_query = dict(temp_query)
            mol_query["species_hash"] = shash
            yield list(self.molecules.query(criteria=mol_query))

    def _get_vertical_sp_tasks(
        self, mol_doc: MoleculeDoc, shash: str, charge_shift: int
    ) -> list[TaskDocument]:
        """
        Fetch the single-point/force tasks of a molecule run at a shifted charge.

        Args:
            mol_doc: Molecule document whose entries are scanned.
            shash: Species hash the task documents must carry.
            charge_shift: +1 for ionization-energy candidates, -1 for
                electron-affinity candidates.

        Returns:
            TaskDocuments for every matching entry that could be found in the
            tasks store; entries without a stored task are skipped.
        """

        target_charge = mol_doc.charge + charge_shift
        task_ids = [
            e["task_id"]
            for e in mol_doc.entries
            if e["charge"] == target_charge
            and e["task_type"] in ["Single Point", "Force"]
            and e["output"].get("final_energy")
        ]

        found = list()
        for task_id in task_ids:
            tdoc = self.tasks.query_one(
                {
                    "task_id": task_id,
                    "species_hash": shash,
                    "orig": {"$exists": True},
                }
            )

            if tdoc is None:
                # Retry with the task_id coerced to an integer; ids that are
                # not integer-like simply have no fallback.
                try:
                    tdoc = self.tasks.query_one(
                        {
                            "task_id": int(task_id),
                            "species_hash": shash,
                            "orig": {"$exists": True},
                        }
                    )
                except ValueError:
                    tdoc = None

            if tdoc is None:
                continue

            found.append(TaskDocument(**tdoc))

        return found

    @staticmethod
    def _lowest_energy_thermo(
        neighbours: list, lot_solv: str, combined: str
    ) -> MoleculeThermoDoc | None:
        """
        Pick the best thermo doc among molecules at a neighbouring charge.

        Args:
            neighbours: (MoleculeDoc, grouped docs) pairs as stored per charge
                in process_item.
            lot_solv: Level-of-theory/solvent key to look up in each group.
            combined: combined_lot_solvent the candidate must match exactly.

        Returns:
            The matching thermo doc with the lowest electronic energy (first
            one wins ties), or None when nothing matches.
        """

        candidates = [
            docs[lot_solv]["thermo_doc"]
            for _, docs in neighbours
            if lot_solv in docs
            and docs[lot_solv]["thermo_doc"].combined_lot_solvent == combined
        ]
        if not candidates:
            return None
        return min(candidates, key=lambda x: x.electronic_energy)

    def process_item(self, items: list[dict]) -> list[dict]:
        """
        Process the molecule documents of one species hash into RedoxDocs

        Args:
            items list[dict] : a list of MoleculeDocs in dict form

        Returns:
            [dict] : a list of new redox docs
        """

        if not items:
            # Nothing to group; avoids indexing into an empty list below
            return list()

        mols = [MoleculeDoc(**item) for item in items]
        shash = mols[0].species_hash
        mol_ids = [m.molecule_id for m in mols]
        self.logger.debug(f"Processing {shash} : {mol_ids}")

        redox_docs = list()

        # Group by (covalent) molecular graph connectivity
        for graph_group in self._group_by_graph(mols).values():
            # Molecule docs will be grouped by charge
            charges: dict[int, Any] = dict()

            for gg in graph_group:
                thermo_docs = [
                    MoleculeThermoDoc(**e)
                    for e in self.thermo.query({"molecule_id": gg.molecule_id})
                ]

                if not thermo_docs:
                    # Current building scheme requires a MoleculeThermoDoc
                    continue

                # Possible single-points for vertical IE (charge + 1) and
                # vertical EA (charge - 1)
                ie_tasks = self._get_vertical_sp_tasks(gg, shash, 1)
                ea_tasks = self._get_vertical_sp_tasks(gg, shash, -1)

                grouped_docs = self._collect_by_lot_solvent(
                    thermo_docs, ie_tasks, ea_tasks
                )
                charges.setdefault(gg.charge, list()).append((gg, grouped_docs))

            for charge, collection in charges.items():
                # All possible molecules for adiabatic reduction and oxidation
                red_coll = charges.get(charge - 1, list())
                ox_coll = charges.get(charge + 1, list())

                for mol, docs in collection:
                    for lot_solv, docset in docs.items():
                        base_thermo = docset["thermo_doc"]
                        # Neighbouring molecules only count when their thermo
                        # doc is at the exact same level of theory
                        combined = base_thermo.combined_lot_solvent

                        redox_docs.append(
                            RedoxDoc.from_docs(
                                base_molecule_doc=mol,
                                base_thermo_doc=base_thermo,
                                red_doc=self._lowest_energy_thermo(
                                    red_coll, lot_solv, combined
                                ),
                                ox_doc=self._lowest_energy_thermo(
                                    ox_coll, lot_solv, combined
                                ),
                                ea_doc=docset.get("ea_doc"),
                                ie_doc=docset.get("ie_doc"),
                            )
                        )

        self.logger.debug(f"Produced {len(redox_docs)} redox docs for {shash}")

        return jsanitize(
            [doc.model_dump() for doc in redox_docs if doc is not None], allow_bson=True
        )

    def update_targets(self, items: list[list[dict]]):
        """
        Inserts the new documents into the redox collection

        Args:
            items [[dict]]: A list of lists of documents to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Check the flattened docs, not the outer list: a batch made only of
        # empty result lists has nothing to write.
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp
        for item in docs:
            item.update(
                {
                    "_bt": self.timestamp,
                }
            )

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} redox documents")
        # NOTE(review): this filters on the store's key field using molecule
        # ids — confirm the redox store is keyed on molecule_id.
        self.redox.remove_docs({self.redox.key: {"$in": molecule_ids}})
        self.redox.update(
            docs=docs,
            key=["molecule_id", "solvent"],
        )

    @staticmethod
    def _group_by_graph(mol_docs: list[MoleculeDoc]) -> dict[int, list[MoleculeDoc]]:
        """
        Group molecule docs by molecular graph connectivity

        :param mol_docs: List of MoleculeDocs to group
        :return: Mapping from group index (assigned in order of first
            appearance of each graph hash) to the MoleculeDocs in that group
        """

        # Formulas of single metal atoms; those keep their metal so that the
        # graph is not emptied out.
        lone_metal_formulas = {m + "1" for m in metals}

        group_index: dict[str, int] = dict()
        results = defaultdict(list)

        # Within each group, group by the covalent molecular graph
        for doc in mol_docs:
            mol = confirm_molecule(doc.molecule)

            mol_nometal = copy.deepcopy(mol)

            if mol.composition.alphabetical_formula not in lone_metal_formulas:
                mol_nometal.remove_species(metals)

            mol_nometal.set_charge_and_spin(0)
            gh_nometal = get_graph_hash(mol_nometal, node_attr="specie")

            # A new hash gets the next free index; a known hash reuses its own
            idx = group_index.setdefault(gh_nometal, len(group_index))
            results[idx].append(doc)

        return results

    @staticmethod
    def _collect_by_lot_solvent(
        thermo_docs: list[MoleculeThermoDoc],
        ie_docs: list[TaskDocument],
        ea_docs: list[TaskDocument],
    ) -> dict[str, Any]:
        """
        For a given MoleculeDoc, group potential MoleculeThermoDocs and TaskDocs for
        IE/EA calculations based on level of theory and solvent.

        Args:
            thermo_docs (list of MoleculeThermoDocs): List of MoleculeThermoDocs for this MoleculeDoc
            ie_docs (list of TaskDocuments): List of TaskDocs which could be used
                to calculate vertical ionization energies for this MoleculeDoc
            ea_docs (list of TaskDocuments): List of TaskDocs which could be used
                to calculate vertical electron affinities for this MoleculeDoc

        Returns:
            dict {<lot_solvent>: {
                "thermo_doc": MoleculeThermoDoc, "ie_doc": TaskDocument, "ea_doc": TaskDocument
                }
            }
            "ie_doc" and "ea_doc" are only present when a matching task exists.
        """

        def _lot_solv(doc: MoleculeThermoDoc | TaskDocument):
            # Corrected thermo docs are keyed by the correction's level of theory
            if isinstance(doc, MoleculeThermoDoc):
                if doc.correction:
                    return doc.correction_lot_solvent
            return doc.lot_solvent

        def _grouped(docs):
            # groupby only merges adjacent items, so sort by the same key first
            return groupby(sorted(docs, key=_lot_solv), key=_lot_solv)

        groups = dict()

        for k, g in _grouped(thermo_docs):
            # Should never be more than one MoleculeThermoDoc per MoleculeDoc
            # and key; if there is, keep the one with lowest electronic energy
            groups[k] = {"thermo_doc": min(g, key=lambda x: x.electronic_energy)}

        for label, task_docs in (("ie_doc", ie_docs), ("ea_doc", ea_docs)):
            for k, g in _grouped(task_docs):
                # Must be a MoleculeThermoDoc to make a RedoxDoc
                if k not in groups:
                    continue

                groups[k][label] = min(g, key=lambda x: x.output.final_energy)

        return groups