emmet-builders 0.84.10rc2__py3-none-any.whl → 0.85.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of emmet-builders might be problematic. Click here for more details.

Files changed (33)
  1. emmet/builders/abinit/phonon.py +12 -14
  2. emmet/builders/abinit/sound_velocity.py +1 -1
  3. emmet/builders/materials/absorption_spectrum.py +16 -10
  4. emmet/builders/materials/dielectric.py +10 -7
  5. emmet/builders/materials/elasticity.py +12 -9
  6. emmet/builders/materials/electrodes.py +1 -1
  7. emmet/builders/materials/electronic_structure.py +1 -1
  8. emmet/builders/materials/magnetism.py +2 -1
  9. emmet/builders/materials/piezoelectric.py +23 -19
  10. emmet/builders/materials/provenance.py +3 -4
  11. emmet/builders/settings.py +14 -9
  12. emmet/builders/utils.py +5 -4
  13. emmet/builders/vasp/materials.py +11 -4
  14. emmet/builders/vasp/task_validator.py +3 -1
  15. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0rc0.dist-info}/METADATA +7 -30
  16. emmet_builders-0.85.0rc0.dist-info/RECORD +41 -0
  17. emmet/builders/materials/ml.py +0 -101
  18. emmet/builders/molecules/atomic.py +0 -592
  19. emmet/builders/molecules/bonds.py +0 -329
  20. emmet/builders/molecules/electric.py +0 -287
  21. emmet/builders/molecules/metal_binding.py +0 -528
  22. emmet/builders/molecules/orbitals.py +0 -292
  23. emmet/builders/molecules/redox.py +0 -502
  24. emmet/builders/molecules/summary.py +0 -406
  25. emmet/builders/molecules/thermo.py +0 -505
  26. emmet/builders/molecules/trajectory.py +0 -530
  27. emmet/builders/molecules/vibration.py +0 -282
  28. emmet/builders/qchem/__init__.py +0 -0
  29. emmet/builders/qchem/molecules.py +0 -745
  30. emmet_builders-0.84.10rc2.dist-info/RECORD +0 -54
  31. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  32. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0rc0.dist-info}/WHEEL +0 -0
  33. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0rc0.dist-info}/top_level.txt +0 -0
@@ -1,528 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import copy
4
- from datetime import datetime
5
- from itertools import chain
6
- from math import ceil
7
- from typing import TYPE_CHECKING
8
-
9
- from maggma.builders import Builder
10
- from maggma.core import Store
11
- from maggma.utils import grouper
12
- from pymatgen.core.structure import Molecule
13
- from pymatgen.util.graph_hashing import weisfeiler_lehman_graph_hash
14
-
15
- from emmet.builders.settings import EmmetBuildSettings
16
- from emmet.core.molecules.atomic import PartialChargesDoc, PartialSpinsDoc
17
- from emmet.core.molecules.bonds import MoleculeBondingDoc, metals
18
- from emmet.core.molecules.metal_binding import METAL_BINDING_METHODS, MetalBindingDoc
19
- from emmet.core.molecules.thermo import MoleculeThermoDoc
20
- from emmet.core.qchem.molecule import MoleculeDoc
21
- from emmet.core.utils import jsanitize
22
-
23
- if TYPE_CHECKING:
24
- from collections.abc import Iterable, Iterator
25
-
26
- __author__ = "Evan Spotte-Smith"
27
-
28
- SETTINGS = EmmetBuildSettings()
29
-
30
-
31
- class MetalBindingBuilder(Builder):
32
- """
33
- The MetalBindingBuilder extracts information about metal binding in molecules
34
- from MoleculeDocs, MoleculeThermoDocs, MoleculeBondingDocs, PartialChargesDocs,
35
- and PartialSpinsDocs.
36
-
37
- NBO is the strongly preferred method to approximate partial charges and bonding,
38
- and so if NBO bonding and partial charges/spins documents are available, they will
39
- be used.
40
-
41
- If NBO docs are not available, then bonding can be taken from the "OpenBabelNN + metal_edge_extender"
42
- method, and partial charges and spins can be taken from "mulliken".
43
-
44
- This builder will attempt to build documents for each molecule in each solvent.
45
- For each molecule-solvent combination, the highest-quality data available
46
- (based on level of theory, electronic energy, and the method used to generate bonding/charges/spins)
47
- will be used.
48
-
49
- The process is as follows:
50
- 1. Gather MoleculeDocs by species hash
51
- 2. For each molecule, first identify if there are any metals. If not, then no MetalBindingDoc can be made.
52
- If so, then identify the possible solvents that can be used to generate MetalBindingDocs
53
- 3. For each combination of Molecule ID and solvent, search for additional documents:
54
- - MoleculeBondingDocs
55
- - PartialChargesDocs
56
- - PartialSpinsDocs (for open-shell molecules)
57
- - MoleculeThermoDocs
58
- 4. Group these additional documents by level of theory and (where applicable) method, and choose the best
59
- possible methods for which all required data is available
60
- 5. For each metal in the molecule:
61
- 5.1 Use partial charge and spin information to determine the oxidation and spin state of the metal
62
- 5.2 Search for MoleculeThermoDocs for the metal atom/ion with appropriate charge and spin with the
63
- chosen level of theory
64
- 5.3 Use graph comparisons (hashing or isomorphism) to identify a molecule with the same structure as
65
- the molecule of interest WITHOUT the metal of interest, as well as the appropriate charge and spin
66
- 5.4 If an appropriate metal-less molecule can be found, search for a MoleculeThermoDoc for that molecule'
67
- with the chosen level of theory
68
- 6. Use the obtained bonding, charges, spins, and thermo docs to construct a MetalBindingDoc
69
- """
70
-
71
- def __init__(
72
- self,
73
- molecules: Store,
74
- charges: Store,
75
- spins: Store,
76
- bonds: Store,
77
- thermo: Store,
78
- metal_binding: Store,
79
- query: dict | None = None,
80
- methods: list | None = None,
81
- settings: EmmetBuildSettings | None = None,
82
- **kwargs,
83
- ):
84
- self.molecules = molecules
85
- self.charges = charges
86
- self.spins = spins
87
- self.bonds = bonds
88
- self.thermo = thermo
89
- self.metal_binding = metal_binding
90
- self.query = query if query else dict()
91
- self.methods = methods if methods else METAL_BINDING_METHODS
92
- self.settings = EmmetBuildSettings.autoload(settings)
93
- self.kwargs = kwargs
94
-
95
- super().__init__(
96
- sources=[molecules, charges, spins, bonds, thermo],
97
- targets=[metal_binding],
98
- **kwargs,
99
- )
100
- # Uncomment in case of issue with mrun not connecting automatically to collections
101
- # for i in [self.molecules, self.charges, self.spins, self.bonds, self.thermo, self.metal_binding]:
102
- # try:
103
- # i.connect()
104
- # except Exception as e:
105
- # print("Could not connect,", e)
106
-
107
- def ensure_indexes(self):
108
- """
109
- Ensures indices on the collections needed for building
110
- """
111
-
112
- # Search index for molecules
113
- self.molecules.ensure_index("molecule_id")
114
- self.molecules.ensure_index("last_updated")
115
- self.molecules.ensure_index("task_ids")
116
- self.molecules.ensure_index("formula_alphabetical")
117
- self.molecules.ensure_index("species_hash")
118
-
119
- # Search index for charges
120
- self.charges.ensure_index("molecule_id")
121
- self.charges.ensure_index("task_id")
122
- self.charges.ensure_index("method")
123
- self.charges.ensure_index("solvent")
124
- self.charges.ensure_index("lot_solvent")
125
- self.charges.ensure_index("property_id")
126
- self.charges.ensure_index("last_updated")
127
- self.charges.ensure_index("formula_alphabetical")
128
-
129
- # Search index for spins
130
- self.spins.ensure_index("molecule_id")
131
- self.spins.ensure_index("task_id")
132
- self.spins.ensure_index("method")
133
- self.spins.ensure_index("solvent")
134
- self.spins.ensure_index("lot_solvent")
135
- self.spins.ensure_index("property_id")
136
- self.spins.ensure_index("last_updated")
137
- self.spins.ensure_index("formula_alphabetical")
138
-
139
- # Search index for bonds
140
- self.bonds.ensure_index("molecule_id")
141
- self.bonds.ensure_index("method")
142
- self.bonds.ensure_index("task_id")
143
- self.bonds.ensure_index("solvent")
144
- self.bonds.ensure_index("lot_solvent")
145
- self.bonds.ensure_index("property_id")
146
- self.bonds.ensure_index("last_updated")
147
- self.bonds.ensure_index("formula_alphabetical")
148
-
149
- # Search index for thermo
150
- self.thermo.ensure_index("molecule_id")
151
- self.thermo.ensure_index("task_id")
152
- self.thermo.ensure_index("solvent")
153
- self.thermo.ensure_index("lot_solvent")
154
- self.thermo.ensure_index("property_id")
155
- self.thermo.ensure_index("last_updated")
156
- self.thermo.ensure_index("formula_alphabetical")
157
-
158
- # Search index for metal_binding
159
- self.metal_binding.ensure_index("molecule_id")
160
- self.metal_binding.ensure_index("solvent")
161
- self.metal_binding.ensure_index("lot_solvent")
162
- self.metal_binding.ensure_index("property_id")
163
- self.metal_binding.ensure_index("last_updated")
164
- self.metal_binding.ensure_index("formula_alphabetical")
165
- self.metal_binding.ensure_index("method")
166
-
167
- def prechunk(self, number_splits: int) -> Iterable[dict]: # pragma: no cover
168
- """Prechunk the builder for distributed computation"""
169
-
170
- temp_query = dict(self.query)
171
- temp_query["deprecated"] = False
172
-
173
- self.logger.info("Finding documents to process")
174
- all_mols = list(
175
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
176
- )
177
-
178
- processed_docs = set([e for e in self.metal_binding.distinct("molecule_id")])
179
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
180
- to_process_hashes = {
181
- d["species_hash"]
182
- for d in all_mols
183
- if d[self.molecules.key] in to_process_docs
184
- }
185
-
186
- N = ceil(len(to_process_hashes) / number_splits)
187
-
188
- for hash_chunk in grouper(to_process_hashes, N):
189
- query = dict(temp_query)
190
- query["species_hash"] = {"$in": list(hash_chunk)}
191
- yield {"query": query}
192
-
193
- def get_items(self) -> Iterator[list[dict]]:
194
- """
195
- Gets all items to process into metal_binding documents.
196
-
197
- Returns:
198
- generator or list relevant molecules to process into documents
199
- """
200
-
201
- self.logger.info("Metal binding builder started")
202
- self.logger.info("Setting indexes")
203
- self.ensure_indexes()
204
-
205
- # Save timestamp to mark buildtime
206
- self.timestamp = datetime.utcnow()
207
-
208
- # Get all processed molecules
209
- temp_query = dict(self.query)
210
- temp_query["deprecated"] = False
211
-
212
- self.logger.info("Finding documents to process")
213
- all_mols = list(
214
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
215
- )
216
-
217
- processed_docs = set([e for e in self.metal_binding.distinct("molecule_id")])
218
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
219
- to_process_hashes = {
220
- d["species_hash"]
221
- for d in all_mols
222
- if d[self.molecules.key] in to_process_docs
223
- }
224
-
225
- self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
226
- self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
227
-
228
- # Set total for builder bars to have a total
229
- self.total = len(to_process_hashes)
230
-
231
- for shash in to_process_hashes:
232
- mol_query = dict(temp_query)
233
- mol_query["species_hash"] = shash
234
- molecules = list(self.molecules.query(criteria=mol_query))
235
-
236
- yield molecules
237
-
238
- def process_item(self, items: list[dict]) -> list[dict]:
239
- """
240
- Process molecule, bonding, partial charges, partial spins, and thermo documents into MetalBindingDocs
241
-
242
- Args:
243
- tasks list[dict] : a list of MoleculeDocs in dict form
244
-
245
- Returns:
246
- [dict] : a list of new metal binding docs
247
- """
248
-
249
- mols = [MoleculeDoc(**item) for item in items]
250
- shash = mols[0].species_hash
251
- mol_ids = [m.molecule_id for m in mols]
252
- self.logger.debug(f"Processing {shash} : {mol_ids}")
253
-
254
- binding_docs = list()
255
-
256
- for mol in mols:
257
- # First: do we need to do this? Are there actually metals in this molecule? And species other than metals?
258
- species = mol.species
259
- metal_indices = [i for i, e in enumerate(species) if e in metals]
260
- if len(metal_indices) == 0 or len(species) == 1:
261
- # print(mol.molecule_id, mol.formula_alphabetical)
262
- continue
263
-
264
- # Grab the basic documents needed to create a metal binding document
265
- molecule_id = mol.molecule_id
266
- solvents = mol.unique_solvents
267
- charges = [
268
- PartialChargesDoc(**e)
269
- for e in self.charges.query({"molecule_id": molecule_id})
270
- ]
271
- if mol.spin_multiplicity != 1:
272
- spins = [
273
- PartialSpinsDoc(**e)
274
- for e in self.spins.query({"molecule_id": molecule_id})
275
- ]
276
- else:
277
- spins = list()
278
- bonds = [
279
- MoleculeBondingDoc(**e)
280
- for e in self.bonds.query({"molecule_id": molecule_id})
281
- ]
282
- thermo = [
283
- MoleculeThermoDoc(**e)
284
- for e in self.thermo.query({"molecule_id": molecule_id})
285
- ]
286
-
287
- if any([len(x) == 0 for x in [charges, bonds, thermo]]):
288
- # Not enough information to construct MetalBindingDoc
289
- continue
290
- elif mol.spin_multiplicity != 1 and len(spins) == 0:
291
- # For open-shell molecule, partial spins information needed
292
- continue
293
-
294
- # Group by solvent and (where appropriate) method
295
- charge_bysolv_meth = dict()
296
- for c in charges:
297
- if c.solvent not in charge_bysolv_meth:
298
- charge_bysolv_meth[c.solvent] = {c.method: c}
299
- else:
300
- charge_bysolv_meth[c.solvent][c.method] = c
301
-
302
- spins_bysolv_meth = dict()
303
- for s in spins:
304
- if s.solvent not in spins_bysolv_meth:
305
- spins_bysolv_meth[s.solvent] = {s.method: s}
306
- else:
307
- spins_bysolv_meth[s.solvent][s.method] = s
308
-
309
- bonds_bysolv_meth = dict()
310
- for b in bonds:
311
- if b.solvent not in bonds_bysolv_meth:
312
- bonds_bysolv_meth[b.solvent] = {b.method: b}
313
- else:
314
- bonds_bysolv_meth[b.solvent][b.method] = b
315
-
316
- thermo_bysolv = {t.solvent: t for t in thermo}
317
-
318
- for solvent in solvents:
319
- this_charge = charge_bysolv_meth.get(solvent) # type: ignore
320
- this_spin = spins_bysolv_meth.get(solvent) # type: ignore
321
- this_bond = bonds_bysolv_meth.get(solvent) # type: ignore
322
- base_thermo_doc = thermo_bysolv.get(solvent) # type: ignore
323
-
324
- # Do we have the requisite docs for this solvent?
325
- if mol.spin_multiplicity == 1:
326
- needed = [this_charge, this_bond, base_thermo_doc]
327
- else:
328
- needed = [this_charge, this_spin, this_bond, base_thermo_doc]
329
-
330
- if any([x is None for x in needed]):
331
- continue
332
-
333
- # What method will we use?
334
- # Currently allows two options:
335
- # 1. Using NBO for everything ("nbo")
336
- # 2. Using Mulliken for charges/spins and OpenBabel + metal_edge_extender for bonding
337
- # ("mulliken-OB-mee")
338
- for method in self.methods:
339
- plan = False
340
- if mol.spin_multiplicity == 1:
341
- if method == "nbo" and all(
342
- [x.get("nbo") is not None for x in [this_charge, this_bond]] # type: ignore
343
- ): # type: ignore
344
- plan = True
345
- charge_doc = this_charge.get("nbo") # type: ignore
346
- spin_doc = None
347
- bond_doc = this_bond.get("nbo") # type: ignore
348
- elif method == "mulliken-OB-mee" and (
349
- this_charge.get("mulliken") is not None # type: ignore
350
- and this_bond.get("OpenBabelNN + metal_edge_extender") is not None # type: ignore
351
- ):
352
- plan = True
353
- charge_doc = this_charge.get("mulliken") # type: ignore
354
- spin_doc = None
355
- bond_doc = this_bond.get("OpenBabelNN + metal_edge_extender") # type: ignore
356
- else:
357
- if method == "nbo" and all(
358
- [x.get("nbo") is not None for x in [this_charge, this_spin, this_bond]] # type: ignore
359
- ): # type: ignore
360
- charge_lot = this_charge.get("nbo").level_of_theory # type: ignore
361
- spin_lot = this_spin.get("nbo").level_of_theory # type: ignore
362
- if charge_lot == spin_lot: # type: ignore
363
- plan = True
364
- charge_doc = this_charge.get("nbo") # type: ignore
365
- spin_doc = this_spin.get("nbo") # type: ignore
366
- bond_doc = this_bond.get("nbo") # type: ignore
367
- elif (
368
- method == "mulliken-OB-mee"
369
- and this_charge.get("mulliken") is not None # type: ignore
370
- and this_spin.get("mulliken") is not None # type: ignore
371
- and this_bond.get("OpenBabelNN + metal_edge_extender") is not None # type: ignore
372
- ):
373
- charge_lot = this_charge.get("mulliken").level_of_theory # type: ignore
374
- spin_lot = this_spin.get("mulliken").level_of_theory # type: ignore
375
- if charge_lot == spin_lot: # type: ignore
376
- plan = True
377
- charge_doc = this_charge.get("mulliken") # type: ignore
378
- spin_doc = this_spin.get("mulliken") # type: ignore
379
- bond_doc = this_bond.get("OpenBabelNN + metal_edge_extender") # type: ignore
380
-
381
- # Don't have the right combinations of level of theory and method
382
- if plan is False:
383
- continue
384
-
385
- # Obtain relevant thermo documents for each metal atom/ion in the molecule
386
- metal_thermo = dict()
387
- nometal_thermo = dict()
388
- for metal_index in metal_indices:
389
- # First, determine the appropriate charge and spin of the metal
390
- # TODO: figure out better charge assignment
391
- partial_charge = charge_doc.partial_charges[metal_index] # type: ignore
392
-
393
- if mol.spin_multiplicity == 1:
394
- # For now, just round to nearest whole number
395
- charge = round(partial_charge)
396
- spin = 1
397
- else:
398
- partial_spin = spin_doc.partial_spins[metal_index] # type: ignore
399
- charge = round(partial_charge)
400
- spin = round(abs(partial_spin)) + 1
401
-
402
- # Sanity check that charge and spin are compatible
403
- metal_species = species[metal_index]
404
- try:
405
- _ = Molecule(
406
- [metal_species],
407
- [[0.0, 0.0, 0.0]],
408
- charge=charge,
409
- spin_multiplicity=spin,
410
- )
411
- except ValueError:
412
- # Assume spin assignment is correct, and change charge accordingly
413
- diff_up = abs(partial_charge - (charge + 1))
414
- diff_down = abs(partial_charge - (charge - 1))
415
- if diff_up < diff_down:
416
- charge += 1
417
- else:
418
- charge -= 1
419
-
420
- # Grab thermo doc for the relevant metal ion/atom (if available)
421
- this_metal_thermo = [
422
- MoleculeThermoDoc(**e)
423
- for e in self.thermo.query(
424
- {
425
- "formula_alphabetical": f"{metal_species}1",
426
- "charge": charge,
427
- "spin_multiplicity": spin,
428
- "lot_solvent": base_thermo_doc.lot_solvent, # type: ignore
429
- }
430
- )
431
- ]
432
- if len(this_metal_thermo) == 0:
433
- continue
434
-
435
- this_metal_thermo = this_metal_thermo[0]
436
- metal_thermo[metal_index] = this_metal_thermo
437
-
438
- # Now the (somewhat) harder part - finding the document for this molecule without the metal
439
- # Make sure charges and spins add up
440
- nometal_charge = mol.charge - charge
441
- nometal_spin = mol.spin_multiplicity - spin + 1
442
- mg_copy = copy.deepcopy(bond_doc.molecule_graph) # type: ignore
443
- mg_copy.remove_nodes([metal_index])
444
- new_hash = weisfeiler_lehman_graph_hash(
445
- mg_copy.graph.to_undirected(), node_attr="specie"
446
- )
447
- nometal_mol_doc = [
448
- MoleculeDoc(**e)
449
- for e in self.molecules.query(
450
- {
451
- "species_hash": new_hash,
452
- "charge": nometal_charge,
453
- "spin_multiplicity": nometal_spin,
454
- }
455
- )
456
- ]
457
- if len(nometal_mol_doc) == 0:
458
- continue
459
-
460
- nometal_mol_id = nometal_mol_doc[0].molecule_id
461
- this_nometal_thermo = [
462
- MoleculeThermoDoc(**e)
463
- for e in self.thermo.query(
464
- {
465
- "molecule_id": nometal_mol_id,
466
- "lot_solvent": base_thermo_doc.lot_solvent, # type: ignore
467
- }
468
- )
469
- ]
470
- if len(this_nometal_thermo) == 0:
471
- continue
472
-
473
- this_nometal_thermo = this_nometal_thermo[0]
474
- nometal_thermo[metal_index] = this_nometal_thermo
475
-
476
- doc = MetalBindingDoc.from_docs(
477
- method=method,
478
- metal_indices=metal_indices,
479
- base_molecule_doc=mol,
480
- partial_charges=charge_doc,
481
- partial_spins=spin_doc,
482
- bonding=bond_doc,
483
- base_thermo=base_thermo_doc,
484
- metal_thermo=metal_thermo,
485
- nometal_thermo=nometal_thermo,
486
- )
487
-
488
- if doc is not None and len(doc.binding_data) != 0:
489
- binding_docs.append(doc)
490
-
491
- self.logger.debug(
492
- f"Produced {len(binding_docs)} metal binding docs for {shash}"
493
- )
494
-
495
- return jsanitize([doc.model_dump() for doc in binding_docs], allow_bson=True)
496
-
497
- def update_targets(self, items: list[list[dict]]):
498
- """
499
- Inserts the new documents into the metal_binding collection
500
-
501
- Args:
502
- items [[dict]]: A list of documents to update
503
- """
504
-
505
- docs = list(chain.from_iterable(items)) # type: ignore
506
-
507
- # Add timestamp
508
- for item in docs:
509
- item.update(
510
- {
511
- "_bt": self.timestamp,
512
- }
513
- )
514
-
515
- molecule_ids = list({item["molecule_id"] for item in docs})
516
-
517
- if len(items) > 0:
518
- self.logger.info(f"Updating {len(docs)} metal binding documents")
519
- self.metal_binding.remove_docs(
520
- {self.metal_binding.key: {"$in": molecule_ids}}
521
- )
522
- # Neither molecule_id nor solvent need to be unique, but the combination must be
523
- self.metal_binding.update(
524
- docs=docs,
525
- key=["molecule_id", "solvent", "method"],
526
- )
527
- else:
528
- self.logger.info("No items to update")