emmet-builders 0.78.3__py3-none-any.whl → 0.86.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. emmet/builders/abinit/phonon.py +47 -47
  2. emmet/builders/abinit/sound_velocity.py +15 -11
  3. emmet/builders/feff/xas.py +1 -2
  4. emmet/builders/materials/absorption_spectrum.py +25 -14
  5. emmet/builders/materials/alloys.py +10 -11
  6. emmet/builders/materials/chemenv.py +2 -3
  7. emmet/builders/materials/corrected_entries.py +21 -15
  8. emmet/builders/materials/dielectric.py +19 -11
  9. emmet/builders/materials/elasticity.py +44 -33
  10. emmet/builders/materials/electrodes.py +35 -28
  11. emmet/builders/materials/electronic_structure.py +17 -17
  12. emmet/builders/materials/magnetism.py +11 -4
  13. emmet/builders/materials/optimade.py +7 -3
  14. emmet/builders/materials/piezoelectric.py +24 -21
  15. emmet/builders/materials/provenance.py +16 -13
  16. emmet/builders/materials/robocrys.py +2 -3
  17. emmet/builders/materials/substrates.py +9 -8
  18. emmet/builders/materials/summary.py +3 -3
  19. emmet/builders/materials/thermo.py +17 -11
  20. emmet/builders/matscholar/missing_compositions.py +12 -8
  21. emmet/builders/mobility/migration_graph.py +5 -5
  22. emmet/builders/settings.py +21 -17
  23. emmet/builders/utils.py +101 -12
  24. emmet/builders/vasp/materials.py +40 -51
  25. emmet/builders/vasp/mp_potcar_stats.json.gz +0 -0
  26. emmet/builders/vasp/task_validator.py +25 -36
  27. emmet_builders-0.86.0.dist-info/METADATA +37 -0
  28. emmet_builders-0.86.0.dist-info/RECORD +41 -0
  29. {emmet_builders-0.78.3.dist-info → emmet_builders-0.86.0.dist-info}/WHEEL +1 -1
  30. emmet/builders/materials/ml.py +0 -87
  31. emmet/builders/molecules/atomic.py +0 -589
  32. emmet/builders/molecules/bonds.py +0 -324
  33. emmet/builders/molecules/metal_binding.py +0 -526
  34. emmet/builders/molecules/orbitals.py +0 -288
  35. emmet/builders/molecules/redox.py +0 -496
  36. emmet/builders/molecules/summary.py +0 -383
  37. emmet/builders/molecules/thermo.py +0 -500
  38. emmet/builders/molecules/vibration.py +0 -278
  39. emmet/builders/qchem/__init__.py +0 -0
  40. emmet/builders/qchem/molecules.py +0 -734
  41. emmet_builders-0.78.3.dist-info/METADATA +0 -47
  42. emmet_builders-0.78.3.dist-info/RECORD +0 -51
  43. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  44. {emmet_builders-0.78.3.dist-info → emmet_builders-0.86.0.dist-info}/top_level.txt +0 -0
@@ -1,526 +0,0 @@
1
- from datetime import datetime
2
- from itertools import chain
3
- from math import ceil
4
- from typing import Optional, Iterable, Iterator, List, Dict
5
- import copy
6
-
7
- from pymatgen.core.structure import Molecule
8
- from pymatgen.util.graph_hashing import weisfeiler_lehman_graph_hash
9
-
10
- from maggma.builders import Builder
11
- from maggma.core import Store
12
- from maggma.utils import grouper
13
-
14
- from emmet.core.qchem.molecule import MoleculeDoc
15
- from emmet.core.molecules.atomic import PartialChargesDoc, PartialSpinsDoc
16
- from emmet.core.molecules.bonds import MoleculeBondingDoc, metals
17
- from emmet.core.molecules.thermo import MoleculeThermoDoc
18
- from emmet.core.molecules.metal_binding import MetalBindingDoc, METAL_BINDING_METHODS
19
- from emmet.core.utils import jsanitize
20
- from emmet.builders.settings import EmmetBuildSettings
21
-
22
-
23
- __author__ = "Evan Spotte-Smith"
24
-
25
- SETTINGS = EmmetBuildSettings()
26
-
27
-
28
- class MetalBindingBuilder(Builder):
29
- """
30
- The MetalBindingBuilder extracts information about metal binding in molecules
31
- from MoleculeDocs, MoleculeThermoDocs, MoleculeBondingDocs, PartialChargesDocs,
32
- and PartialSpinsDocs.
33
-
34
- NBO is the strongly preferred method to approximate partial charges and bonding,
35
- and so if NBO bonding and partial charges/spins documents are available, they will
36
- be used.
37
-
38
- If NBO docs are not available, then bonding can be taken from the "OpenBabelNN + metal_edge_extender"
39
- method, and partial charges and spins can be taken from "mulliken".
40
-
41
- This builder will attempt to build documents for each molecule in each solvent.
42
- For each molecule-solvent combination, the highest-quality data available
43
- (based on level of theory, electronic energy, and the method used to generate bonding/charges/spins)
44
- will be used.
45
-
46
- The process is as follows:
47
- 1. Gather MoleculeDocs by formula
48
- 2. For each molecule, first identify if there are any metals. If not, then no MetalBindingDoc can be made.
49
- If so, then identify the possible solvents that can be used to generate MetalBindingDocs
50
- 3. For each combination of Molecule ID and solvent, search for additional documents:
51
- - MoleculeBondingDocs
52
- - PartialChargesDocs
53
- - PartialSpinsDocs (for open-shell molecules)
54
- - MoleculeThermoDocs
55
- 4. Group these additional documents by level of theory and (where applicable) method, and choose the best
56
- possible methods for which all required data is available
57
- 5. For each metal in the molecule:
58
- 5.1 Use partial charge and spin information to determine the oxidation and spin state of the metal
59
- 5.2 Search for MoleculeThermoDocs for the metal atom/ion with appropriate charge and spin with the
60
- chosen level of theory
61
- 5.3 Use graph comparisons (hashing or isomorphism) to identify a molecule with the same structure as
62
- the molecule of interest WITHOUT the metal of interest, as well as the appropriate charge and spin
63
- 5.4 If an appropriate metal-less molecule can be found, search for a MoleculeThermoDoc for that molecule
64
- with the chosen level of theory
65
- 6. Use the obtained bonding, charges, spins, and thermo docs to construct a MetalBindingDoc
66
- """
67
-
68
def __init__(
    self,
    molecules: Store,
    charges: Store,
    spins: Store,
    bonds: Store,
    thermo: Store,
    metal_binding: Store,
    query: Optional[Dict] = None,
    methods: Optional[List] = None,
    settings: Optional[EmmetBuildSettings] = None,
    **kwargs,
):
    """
    Set up the builder with its source stores and its single target store.

    Args:
        molecules: Store of molecule documents (source)
        charges: Store of partial charges documents (source)
        spins: Store of partial spins documents (source)
        bonds: Store of molecule bonding documents (source)
        thermo: Store of molecule thermo documents (source)
        metal_binding: Store that receives the metal binding documents (target)
        query: extra criteria limiting which molecules are processed;
            a missing or empty value means no extra restriction
        methods: metal binding methods to attempt; a missing or empty value
            falls back to METAL_BINDING_METHODS
        settings: build settings, resolved through EmmetBuildSettings.autoload
        **kwargs: forwarded unchanged to the Builder base class
    """
    # Sources
    self.molecules = molecules
    self.charges = charges
    self.spins = spins
    self.bonds = bonds
    self.thermo = thermo

    # Target
    self.metal_binding = metal_binding

    # Options; falsy values (None or empty) are replaced by the defaults
    self.query = query or dict()
    self.methods = methods or METAL_BINDING_METHODS
    self.settings = EmmetBuildSettings.autoload(settings)
    self.kwargs = kwargs

    source_stores = [molecules, charges, spins, bonds, thermo]
    target_stores = [metal_binding]
    super().__init__(sources=source_stores, targets=target_stores, **kwargs)
97
- # Uncomment in case of issue with mrun not connecting automatically to collections
98
- # for i in [self.molecules, self.charges, self.spins, self.bonds, self.thermo, self.metal_binding]:
99
- # try:
100
- # i.connect()
101
- # except Exception as e:
102
- # print("Could not connect,", e)
103
-
104
def ensure_indexes(self):
    """
    Ensures indices on the collections needed for building
    """

    # Field list shared verbatim by the partial charges and partial spins stores
    charge_spin_fields = (
        "molecule_id",
        "task_id",
        "method",
        "solvent",
        "lot_solvent",
        "property_id",
        "last_updated",
        "formula_alphabetical",
    )

    # (store, fields) pairs; the order of stores and of fields within each
    # store is the order in which ensure_index is called.
    index_plan = (
        (
            self.molecules,
            ("molecule_id", "last_updated", "task_ids", "formula_alphabetical"),
        ),
        (self.charges, charge_spin_fields),
        (self.spins, charge_spin_fields),
        (
            self.bonds,
            (
                "molecule_id",
                "method",
                "task_id",
                "solvent",
                "lot_solvent",
                "property_id",
                "last_updated",
                "formula_alphabetical",
            ),
        ),
        (
            self.thermo,
            (
                "molecule_id",
                "task_id",
                "solvent",
                "lot_solvent",
                "property_id",
                "last_updated",
                "formula_alphabetical",
            ),
        ),
        (
            self.metal_binding,
            (
                "molecule_id",
                "solvent",
                "lot_solvent",
                "property_id",
                "last_updated",
                "formula_alphabetical",
                "method",
            ),
        ),
    )

    for store, fields in index_plan:
        for field in fields:
            store.ensure_index(field)
162
-
163
def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
    """
    Prechunk the builder for distributed computation

    Args:
        number_splits: how many chunks the pending formulas are spread over

    Yields:
        dicts of the form {"query": {"formula_alphabetical": {"$in": [...]}}},
        one per chunk of unprocessed formulas
    """

    # The builder's own query, restricted to non-deprecated molecules
    criteria = {**self.query, "deprecated": False}

    self.logger.info("Finding documents to process")
    id_field = self.molecules.key
    candidates = list(
        self.molecules.query(criteria, [id_field, "formula_alphabetical"])
    )

    # Molecules with no metal binding document yet, and the formulas they cover
    already_built = set(self.metal_binding.distinct("molecule_id"))
    pending_ids = {doc[id_field] for doc in candidates} - already_built
    pending_formulas = {
        doc["formula_alphabetical"]
        for doc in candidates
        if doc[id_field] in pending_ids
    }

    chunk_size = ceil(len(pending_formulas) / number_splits)

    for batch in grouper(pending_formulas, chunk_size):
        yield {"query": {"formula_alphabetical": {"$in": list(batch)}}}
188
-
189
def get_items(self) -> Iterator[List[Dict]]:
    """
    Gets all items to process into metal_binding documents.

    Side effects: ensures indexes, records the build time in self.timestamp
    and sets self.total to the number of formulas that will be yielded.

    Returns:
        generator yielding, for each unprocessed formula, the list of
        molecule documents (as dicts) with that formula
    """

    log = self.logger
    log.info("Metal binding builder started")
    log.info("Setting indexes")
    self.ensure_indexes()

    # Build time; update_targets stamps it onto every document as "_bt"
    self.timestamp = datetime.utcnow()

    # The builder's own query, restricted to non-deprecated molecules
    base_criteria = {**self.query, "deprecated": False}

    log.info("Finding documents to process")
    id_field = self.molecules.key
    candidates = list(
        self.molecules.query(base_criteria, [id_field, "formula_alphabetical"])
    )

    # A molecule is pending when no metal binding document references it yet
    already_built = set(self.metal_binding.distinct("molecule_id"))
    pending_ids = {doc[id_field] for doc in candidates} - already_built
    pending_formulas = {
        doc["formula_alphabetical"]
        for doc in candidates
        if doc[id_field] in pending_ids
    }

    log.info(f"Found {len(pending_ids)} unprocessed documents")
    log.info(f"Found {len(pending_formulas)} unprocessed formulas")

    # Set total for builder bars to have a total
    self.total = len(pending_formulas)

    for formula in pending_formulas:
        formula_criteria = {**base_criteria, "formula_alphabetical": formula}
        yield list(self.molecules.query(criteria=formula_criteria))
235
-
236
def process_item(self, items: List[Dict]) -> List[Dict]:
    """
    Process molecule, bonding, partial charges, partial spins, and thermo documents into MetalBindingDocs

    For every molecule that contains at least one metal and at least one
    other atom, and for every solvent and every configured method for which
    the required charge/spin/bonding/thermo documents exist, one
    MetalBindingDoc is attempted. Documents with no binding data are dropped.

    Args:
        items List[Dict] : a list of MoleculeDocs in dict form

    Returns:
        [dict] : a list of new metal binding docs
    """

    # Nothing to build for an empty batch; this also avoids indexing mols[0]
    # on an empty list below.
    if not items:
        return []

    mols = [MoleculeDoc(**item) for item in items]
    formula = mols[0].formula_alphabetical
    mol_ids = [m.molecule_id for m in mols]
    self.logger.debug(f"Processing {formula} : {mol_ids}")

    # Supported methods, mapped to (key of the charge/spin docs, key of the bonding doc):
    #   1. Using NBO for everything ("nbo")
    #   2. Using Mulliken for charges/spins and OpenBabel + metal_edge_extender for bonding
    #      ("mulliken-OB-mee")
    # Any other method name is skipped.
    method_keys = {
        "nbo": ("nbo", "nbo"),
        "mulliken-OB-mee": ("mulliken", "OpenBabelNN + metal_edge_extender"),
    }

    binding_docs = []

    for mol in mols:
        # First: do we need to do this? Are there actually metals in this molecule? And species other than metals?
        species = mol.species
        metal_indices = [i for i, e in enumerate(species) if e in metals]
        if not metal_indices or len(species) == 1:
            continue

        closed_shell = mol.spin_multiplicity == 1

        # Grab the basic documents needed to create a metal binding document
        molecule_id = mol.molecule_id
        charges = [
            PartialChargesDoc(**e)
            for e in self.charges.query({"molecule_id": molecule_id})
        ]
        if closed_shell:
            # Partial spins are only needed for open-shell molecules
            spins = []
        else:
            spins = [
                PartialSpinsDoc(**e)
                for e in self.spins.query({"molecule_id": molecule_id})
            ]
        bonds = [
            MoleculeBondingDoc(**e)
            for e in self.bonds.query({"molecule_id": molecule_id})
        ]
        thermo = [
            MoleculeThermoDoc(**e)
            for e in self.thermo.query({"molecule_id": molecule_id})
        ]

        if not (charges and bonds and thermo):
            # Not enough information to construct MetalBindingDoc
            continue
        if not closed_shell and not spins:
            # For open-shell molecule, partial spins information needed
            continue

        # Group by solvent and (where appropriate) method.
        # If several docs share a solvent (and method), the last one wins.
        charge_bysolv_meth: Dict = {}
        for c in charges:
            charge_bysolv_meth.setdefault(c.solvent, {})[c.method] = c

        spins_bysolv_meth: Dict = {}
        for s in spins:
            spins_bysolv_meth.setdefault(s.solvent, {})[s.method] = s

        bonds_bysolv_meth: Dict = {}
        for b in bonds:
            bonds_bysolv_meth.setdefault(b.solvent, {})[b.method] = b

        thermo_bysolv = {t.solvent: t for t in thermo}

        for solvent in mol.unique_solvents:
            this_charge = charge_bysolv_meth.get(solvent)
            this_spin = spins_bysolv_meth.get(solvent)
            this_bond = bonds_bysolv_meth.get(solvent)
            base_thermo_doc = thermo_bysolv.get(solvent)

            # Do we have the requisite docs for this solvent?
            needed = [this_charge, this_bond, base_thermo_doc]
            if not closed_shell:
                needed.append(this_spin)
            if any(x is None for x in needed):
                continue

            for method in self.methods:
                keys = method_keys.get(method)
                if keys is None:
                    # Unknown method: no plan for assembling the documents
                    continue
                charge_key, bond_key = keys

                charge_doc = this_charge.get(charge_key)  # type: ignore
                bond_doc = this_bond.get(bond_key)  # type: ignore
                if charge_doc is None or bond_doc is None:
                    continue

                if closed_shell:
                    spin_doc = None
                else:
                    # Spins come from the same scheme as the charges and must
                    # have been computed at the same level of theory.
                    spin_doc = this_spin.get(charge_key)  # type: ignore
                    if spin_doc is None:
                        continue
                    if charge_doc.level_of_theory != spin_doc.level_of_theory:
                        # Don't have the right combinations of level of theory and method
                        continue

                # Obtain relevant thermo documents for each metal atom/ion in the molecule
                metal_thermo = {}
                nometal_thermo = {}
                for metal_index in metal_indices:
                    # First, determine the appropriate charge and spin of the metal
                    # TODO: figure out better charge assignment
                    partial_charge = charge_doc.partial_charges[metal_index]

                    # For now, just round to nearest whole number
                    charge = round(partial_charge)
                    if closed_shell:
                        spin = 1
                    else:
                        partial_spin = spin_doc.partial_spins[metal_index]  # type: ignore
                        spin = round(abs(partial_spin)) + 1

                    # Sanity check that charge and spin are compatible
                    metal_species = species[metal_index]
                    try:
                        _ = Molecule(
                            [metal_species],
                            [[0.0, 0.0, 0.0]],
                            charge=charge,
                            spin_multiplicity=spin,
                        )
                    except ValueError:
                        # Assume spin assignment is correct, and change charge accordingly:
                        # move one unit towards whichever neighbour is closer to the partial charge
                        diff_up = abs(partial_charge - (charge + 1))
                        diff_down = abs(partial_charge - (charge - 1))
                        if diff_up < diff_down:
                            charge += 1
                        else:
                            charge -= 1

                    # Grab thermo doc for the relevant metal ion/atom (if available)
                    this_metal_thermo = [
                        MoleculeThermoDoc(**e)
                        for e in self.thermo.query(
                            {
                                "formula_alphabetical": f"{metal_species}1",
                                "charge": charge,
                                "spin_multiplicity": spin,
                                "lot_solvent": base_thermo_doc.lot_solvent,  # type: ignore
                            }
                        )
                    ]
                    if not this_metal_thermo:
                        continue

                    metal_thermo[metal_index] = this_metal_thermo[0]

                    # Now the (somewhat) harder part - finding the document for this molecule without the metal
                    # Make sure charges and spins add up
                    nometal_charge = mol.charge - charge
                    nometal_spin = mol.spin_multiplicity - spin + 1

                    # Work on a copy so the bonding doc's own graph keeps the metal
                    mg_copy = copy.deepcopy(bond_doc.molecule_graph)
                    mg_copy.remove_nodes([metal_index])
                    new_hash = weisfeiler_lehman_graph_hash(
                        mg_copy.graph.to_undirected(), node_attr="specie"
                    )
                    nometal_mol_doc = [
                        MoleculeDoc(**e)
                        for e in self.molecules.query(
                            {
                                "species_hash": new_hash,
                                "charge": nometal_charge,
                                "spin_multiplicity": nometal_spin,
                            }
                        )
                    ]
                    if not nometal_mol_doc:
                        continue

                    nometal_mol_id = nometal_mol_doc[0].molecule_id
                    this_nometal_thermo = [
                        MoleculeThermoDoc(**e)
                        for e in self.thermo.query(
                            {
                                "molecule_id": nometal_mol_id,
                                "lot_solvent": base_thermo_doc.lot_solvent,  # type: ignore
                            }
                        )
                    ]
                    if not this_nometal_thermo:
                        continue

                    nometal_thermo[metal_index] = this_nometal_thermo[0]

                doc = MetalBindingDoc.from_docs(
                    method=method,
                    metal_indices=metal_indices,
                    base_molecule_doc=mol,
                    partial_charges=charge_doc,
                    partial_spins=spin_doc,
                    bonding=bond_doc,
                    base_thermo=base_thermo_doc,
                    metal_thermo=metal_thermo,
                    nometal_thermo=nometal_thermo,
                )

                if doc is not None and len(doc.binding_data) != 0:
                    binding_docs.append(doc)

    self.logger.debug(
        f"Produced {len(binding_docs)} metal binding docs for {formula}"
    )

    return jsanitize([doc.model_dump() for doc in binding_docs], allow_bson=True)
494
-
495
def update_targets(self, items: List[List[Dict]]):
    """
    Inserts the new documents into the metal_binding collection

    Existing documents for the affected molecule IDs are removed first, then
    the new documents are written.

    Args:
        items [[dict]]: A list of documents to update
    """

    docs = list(chain.from_iterable(items))  # type: ignore

    # Decide on the flattened documents, not on the outer list: a batch made
    # only of empty lists must not trigger a removal or an empty update.
    if not docs:
        self.logger.info("No items to update")
        return

    # Add timestamp
    for item in docs:
        item.update({"_bt": self.timestamp})

    molecule_ids = list({item["molecule_id"] for item in docs})

    self.logger.info(f"Updating {len(docs)} metal binding documents")
    # NOTE(review): the removal filters on the store's key field using molecule IDs;
    # confirm the metal_binding store is keyed on "molecule_id".
    self.metal_binding.remove_docs({self.metal_binding.key: {"$in": molecule_ids}})
    # Neither molecule_id nor solvent need to be unique, but the combination must be
    self.metal_binding.update(
        docs=docs,
        key=["molecule_id", "solvent", "method"],
    )