emmet-builders 0.84.2__py3-none-any.whl → 0.86.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emmet/builders/abinit/phonon.py +27 -25
- emmet/builders/abinit/sound_velocity.py +15 -11
- emmet/builders/feff/xas.py +1 -2
- emmet/builders/materials/absorption_spectrum.py +25 -14
- emmet/builders/materials/alloys.py +3 -4
- emmet/builders/materials/chemenv.py +2 -3
- emmet/builders/materials/corrected_entries.py +15 -9
- emmet/builders/materials/dielectric.py +19 -11
- emmet/builders/materials/elasticity.py +44 -33
- emmet/builders/materials/electrodes.py +24 -19
- emmet/builders/materials/electronic_structure.py +17 -17
- emmet/builders/materials/magnetism.py +11 -4
- emmet/builders/materials/optimade.py +7 -3
- emmet/builders/materials/piezoelectric.py +24 -21
- emmet/builders/materials/provenance.py +15 -12
- emmet/builders/materials/robocrys.py +2 -3
- emmet/builders/materials/substrates.py +9 -8
- emmet/builders/materials/summary.py +3 -3
- emmet/builders/materials/thermo.py +17 -11
- emmet/builders/matscholar/missing_compositions.py +12 -8
- emmet/builders/mobility/migration_graph.py +5 -5
- emmet/builders/settings.py +21 -17
- emmet/builders/utils.py +15 -10
- emmet/builders/vasp/materials.py +32 -16
- emmet/builders/vasp/task_validator.py +15 -11
- {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/METADATA +21 -36
- emmet_builders-0.86.0.dist-info/RECORD +41 -0
- {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/WHEEL +1 -1
- emmet/builders/materials/ml.py +0 -87
- emmet/builders/molecules/atomic.py +0 -589
- emmet/builders/molecules/bonds.py +0 -324
- emmet/builders/molecules/metal_binding.py +0 -526
- emmet/builders/molecules/orbitals.py +0 -288
- emmet/builders/molecules/redox.py +0 -496
- emmet/builders/molecules/summary.py +0 -383
- emmet/builders/molecules/thermo.py +0 -500
- emmet/builders/molecules/vibration.py +0 -278
- emmet/builders/qchem/__init__.py +0 -0
- emmet/builders/qchem/molecules.py +0 -734
- emmet_builders-0.84.2.dist-info/RECORD +0 -52
- /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
- {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/top_level.txt +0 -0
|
@@ -1,324 +0,0 @@
|
|
|
1
|
-
from collections import defaultdict
|
|
2
|
-
from datetime import datetime
|
|
3
|
-
from itertools import chain
|
|
4
|
-
from math import ceil
|
|
5
|
-
from typing import Optional, Iterable, Iterator, List, Dict
|
|
6
|
-
|
|
7
|
-
from maggma.builders import Builder
|
|
8
|
-
from maggma.core import Store
|
|
9
|
-
from maggma.utils import grouper
|
|
10
|
-
|
|
11
|
-
from emmet.core.qchem.task import TaskDocument
|
|
12
|
-
from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
|
|
13
|
-
from emmet.core.molecules.bonds import MoleculeBondingDoc, BOND_METHODS
|
|
14
|
-
from emmet.core.utils import jsanitize
|
|
15
|
-
from emmet.builders.settings import EmmetBuildSettings
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
__author__ = "Evan Spotte-Smith"
|
|
19
|
-
|
|
20
|
-
# Module-level build settings, instantiated once when this module is imported.
# BondingBuilder does not read this object in the code below; it loads its own
# settings through EmmetBuildSettings.autoload() in __init__.
SETTINGS = EmmetBuildSettings()
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class BondingBuilder(Builder):
    """
    The BondingBuilder defines the bonds in a MoleculeDoc.

    Various methods can be used to define bonding, including:
        - OpenBabelNN + metal_edge_extender: Combining the bond detection
            algorithms in OpenBabel (OpenBabelNN in pymatgen) with a heuristic
            to add metal coordinate bonds (metal_edge_extender in pymatgen)
        - critic2: Using critical points of the electron density to define bonds
        - nbo: Using Natural Bonding Orbital analysis to define bonds and other
            interatomic interactions

    NOTE: Only NBO7 can be used to generate bonding. Bonding (especially when
        metals are involved) is unreliable with earlier versions of NBO!

    This builder will attempt to build documents for each molecule, in each
    solvent, with each method. For each molecule-solvent-method combination,
    the highest-quality data available (based on level of theory and
    electronic energy) will be used.

    The process is as follows:
        1. Gather MoleculeDocs by formula
        2. For each molecule, group all tasks by solvent.
        3. For each solvent, sort tasks by level of theory and electronic energy
        4. For each method:
            4.1. Find task docs with necessary data to define bonding by that
                method
            4.2. Take best (defined by level of theory and electronic energy)
                task
            4.3. Convert TaskDoc to MoleculeBondingDoc
    """

    def __init__(
        self,
        tasks: Store,
        molecules: Store,
        bonds: Store,
        query: Optional[Dict] = None,
        methods: Optional[List] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        """
        Args:
            tasks: Store of task documents (source).
            molecules: Store of molecule documents (source).
            bonds: Store that receives the bonding documents (target).
            query: Extra criteria applied when selecting molecules; an empty
                dict is used when omitted.
            methods: Bonding methods to build documents for; falls back to
                BOND_METHODS when omitted or empty.
            settings: Build settings, resolved via EmmetBuildSettings.autoload.
            **kwargs: Forwarded unchanged to the Builder base class.
        """
        self.tasks = tasks
        self.molecules = molecules
        self.bonds = bonds
        self.query = query or {}
        self.methods = methods or BOND_METHODS
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks, molecules], targets=[bonds], **kwargs)
        # Uncomment in case of issue with mrun not connecting automatically to collections
        # for i in [self.tasks, self.molecules, self.bonds]:
        #     try:
        #         i.connect()
        #     except Exception as e:
        #         print("Could not connect,", e)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        for field in ("task_id", "last_updated", "state", "formula_alphabetical"):
            self.tasks.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
        ):
            self.molecules.ensure_index(field)

        # Search index for bonds
        for field in (
            "molecule_id",
            "method",
            "task_id",
            "solvent",
            "lot_solvent",
            "property_id",
            "last_updated",
            "formula_alphabetical",
        ):
            self.bonds.ensure_index(field)

    def _active_query(self) -> Dict:
        """Return a copy of the user query restricted to non-deprecated molecules."""
        criteria = dict(self.query)
        criteria["deprecated"] = False
        return criteria

    def _find_unprocessed(self, criteria: Dict) -> tuple:
        """
        Work out which molecules still lack bonding documents.

        Args:
            criteria: Query used to select candidate molecules.

        Returns:
            A ``(molecule_keys, formulas)`` pair of sets: the keys of the
            matching molecules that do not appear among the ``molecule_id``
            values of the bonds store, and the alphabetical formulas of those
            molecules.
        """
        mol_key = self.molecules.key
        all_mols = list(
            self.molecules.query(criteria, [mol_key, "formula_alphabetical"])
        )

        processed_docs = set(self.bonds.distinct("molecule_id"))
        to_process_docs = {d[mol_key] for d in all_mols} - processed_docs
        to_process_forms = {
            d["formula_alphabetical"]
            for d in all_mols
            if d[mol_key] in to_process_docs
        }
        return to_process_docs, to_process_forms

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """Prechunk the builder for distributed computation"""

        self.logger.info("Finding documents to process")
        _, to_process_forms = self._find_unprocessed(self._active_query())

        # Nothing left to build: emit no chunks instead of asking grouper for
        # chunks of size zero.
        if not to_process_forms:
            return

        N = ceil(len(to_process_forms) / number_splits)

        for formula_chunk in grouper(to_process_forms, N):
            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}

    def get_items(self) -> Iterator[List[Dict]]:
        """
        Gets all items to process into bonding documents.
        This does no datetime checking; relying on whether
        molecule ids are already present in the bonds Store

        Returns:
            generator of lists of molecule documents (one list per formula)
            to process into bonding documents
        """

        self.logger.info("Bonding builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        # Get all processed molecules
        temp_query = self._active_query()

        self.logger.info("Finding documents to process")
        to_process_docs, to_process_forms = self._find_unprocessed(temp_query)

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")

        # Set total for builder bars to have a total
        self.total = len(to_process_forms)

        for formula in to_process_forms:
            mol_query = dict(temp_query)
            mol_query["formula_alphabetical"] = formula
            yield list(self.molecules.query(criteria=mol_query))

    @staticmethod
    def _entries_for_method(method: str, sorted_entries: List[Dict]) -> List[Dict]:
        """
        Keep the entries that carry the data needed by a bonding method.

        Args:
            method: Name of the bonding method.
            sorted_entries: Entries already ordered best-first.

        Returns:
            The usable entries, in the same order as ``sorted_entries``.
        """
        if method == "OpenBabelNN + metal_edge_extender":
            # This is sort of silly. Since, at the MoleculeDoc level,
            # the structures have to be identical, bonding defined
            # using heuristic methods like OpenBabel should always
            # be identical.
            # TODO: Decide if only one OpenBabelNN + m_e_e doc
            # TODO: should be allowed.
            return sorted_entries

        relevant_entries = [
            e
            for e in sorted_entries
            if e.get(method) is not None or e["output"].get(method) is not None
        ]

        if method == "nbo":
            # Only allow NBO7 to be used. No earlier versions can be
            # relied upon for bonding
            relevant_entries = [
                e
                for e in relevant_entries
                if e["orig"]["rem"].get("run_nbo6", False)
                or e["orig"]["rem"].get("nbo_external", False)
            ]

        return relevant_entries

    def _find_task(self, task_id, formula: str) -> Optional[Dict]:
        """
        Fetch the raw task document for ``task_id``.

        The id is tried as given first and then, if nothing matched, as an
        integer.

        Args:
            task_id: Task identifier taken from a molecule entry.
            formula: Alphabetical formula the task must match.

        Returns:
            The task document as a dict, or None when no task matched or the
            id cannot be converted to an integer for the second attempt.
        """
        criteria = {
            "task_id": task_id,
            "formula_alphabetical": formula,
            "orig": {"$exists": True},
        }
        tdoc = self.tasks.query_one(criteria)
        if tdoc is not None:
            return tdoc

        try:
            numeric_id = int(task_id)
        except ValueError:
            return None

        return self.tasks.query_one({**criteria, "task_id": numeric_id})

    def process_item(self, items: List[Dict]) -> List[Dict]:
        """
        Process the tasks into MoleculeBondingDocs

        Args:
            items List[Dict] : a list of MoleculeDocs in dict form

        Returns:
            [dict] : a list of new bonding docs
        """

        # An empty batch has no formula to report and nothing to build.
        if not items:
            return []

        mols = [MoleculeDoc(**item) for item in items]
        formula = mols[0].formula_alphabetical
        mol_ids = [m.molecule_id for m in mols]
        self.logger.debug(f"Processing {formula} : {mol_ids}")

        bonding_docs = []

        for mol in mols:
            # Organize by solvent environment, keeping only entries whose
            # charge and spin multiplicity match the molecule itself
            by_solvent = defaultdict(list)
            for entry in mol.entries:
                if (
                    entry["charge"] == mol.charge
                    and entry["spin_multiplicity"] == mol.spin_multiplicity
                ):
                    by_solvent[entry["solvent"]].append(entry)

            for entries in by_solvent.values():
                sorted_entries = sorted(
                    entries,
                    key=lambda x: (
                        sum(evaluate_lot(x["level_of_theory"])),
                        x["energy"],
                    ),
                )

                for method in self.methods:
                    # For each method, grab entries that have the relevant data
                    relevant_entries = self._entries_for_method(
                        method, sorted_entries
                    )
                    if not relevant_entries:
                        continue

                    # Grab task document of best entry
                    tdoc = self._find_task(relevant_entries[0]["task_id"], formula)
                    if tdoc is None:
                        continue

                    doc = MoleculeBondingDoc.from_task(
                        TaskDocument(**tdoc),
                        molecule_id=mol.molecule_id,
                        preferred_methods=[method],
                        deprecated=False,
                    )
                    bonding_docs.append(doc)

        self.logger.debug(f"Produced {len(bonding_docs)} bonding docs for {formula}")

        return jsanitize([doc.model_dump() for doc in bonding_docs], allow_bson=True)

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new documents into the bonds collection

        Args:
            items [[dict]]: A list of lists of documents to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Decide on the flattened documents, not on the outer list: a batch
        # made only of empty lists has nothing to write.
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp
        for item in docs:
            item.update(
                {
                    "_bt": self.timestamp,
                }
            )

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} bonding documents")
        # NOTE(review): removal matches on self.bonds.key, so it presumably
        # expects the bonds store to be keyed by molecule_id -- confirm.
        self.bonds.remove_docs({self.bonds.key: {"$in": molecule_ids}})
        # Neither molecule_id nor method need to be unique, but the combination must be
        self.bonds.update(
            docs=docs,
            key=["molecule_id", "method", "solvent"],
        )