emmet-builders 0.78.3__py3-none-any.whl → 0.86.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emmet/builders/abinit/phonon.py +47 -47
- emmet/builders/abinit/sound_velocity.py +15 -11
- emmet/builders/feff/xas.py +1 -2
- emmet/builders/materials/absorption_spectrum.py +25 -14
- emmet/builders/materials/alloys.py +10 -11
- emmet/builders/materials/chemenv.py +2 -3
- emmet/builders/materials/corrected_entries.py +21 -15
- emmet/builders/materials/dielectric.py +19 -11
- emmet/builders/materials/elasticity.py +44 -33
- emmet/builders/materials/electrodes.py +35 -28
- emmet/builders/materials/electronic_structure.py +17 -17
- emmet/builders/materials/magnetism.py +11 -4
- emmet/builders/materials/optimade.py +7 -3
- emmet/builders/materials/piezoelectric.py +24 -21
- emmet/builders/materials/provenance.py +16 -13
- emmet/builders/materials/robocrys.py +2 -3
- emmet/builders/materials/substrates.py +9 -8
- emmet/builders/materials/summary.py +3 -3
- emmet/builders/materials/thermo.py +17 -11
- emmet/builders/matscholar/missing_compositions.py +12 -8
- emmet/builders/mobility/migration_graph.py +5 -5
- emmet/builders/settings.py +21 -17
- emmet/builders/utils.py +101 -12
- emmet/builders/vasp/materials.py +40 -51
- emmet/builders/vasp/mp_potcar_stats.json.gz +0 -0
- emmet/builders/vasp/task_validator.py +25 -36
- emmet_builders-0.86.0.dist-info/METADATA +37 -0
- emmet_builders-0.86.0.dist-info/RECORD +41 -0
- {emmet_builders-0.78.3.dist-info → emmet_builders-0.86.0.dist-info}/WHEEL +1 -1
- emmet/builders/materials/ml.py +0 -87
- emmet/builders/molecules/atomic.py +0 -589
- emmet/builders/molecules/bonds.py +0 -324
- emmet/builders/molecules/metal_binding.py +0 -526
- emmet/builders/molecules/orbitals.py +0 -288
- emmet/builders/molecules/redox.py +0 -496
- emmet/builders/molecules/summary.py +0 -383
- emmet/builders/molecules/thermo.py +0 -500
- emmet/builders/molecules/vibration.py +0 -278
- emmet/builders/qchem/__init__.py +0 -0
- emmet/builders/qchem/molecules.py +0 -734
- emmet_builders-0.78.3.dist-info/METADATA +0 -47
- emmet_builders-0.78.3.dist-info/RECORD +0 -51
- /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
- {emmet_builders-0.78.3.dist-info → emmet_builders-0.86.0.dist-info}/top_level.txt +0 -0
|
@@ -1,383 +0,0 @@
|
|
|
1
|
-
from datetime import datetime
|
|
2
|
-
from itertools import chain
|
|
3
|
-
from math import ceil
|
|
4
|
-
from typing import Any, Optional, Iterable, Iterator, List, Dict
|
|
5
|
-
|
|
6
|
-
# from monty.serialization import loadfn, dumpfn
|
|
7
|
-
|
|
8
|
-
from maggma.builders import Builder
|
|
9
|
-
from maggma.core import Store
|
|
10
|
-
from maggma.utils import grouper
|
|
11
|
-
|
|
12
|
-
from emmet.core.molecules.summary import MoleculeSummaryDoc
|
|
13
|
-
from emmet.core.utils import jsanitize
|
|
14
|
-
from emmet.builders.settings import EmmetBuildSettings
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
__author__ = "Evan Spotte-Smith"
|
|
18
|
-
|
|
19
|
-
SETTINGS = EmmetBuildSettings()
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class SummaryBuilder(Builder):
|
|
23
|
-
"""
|
|
24
|
-
The SummaryBuilder collects all property documents and gathers their properties
|
|
25
|
-
into a single MoleculeSummaryDoc
|
|
26
|
-
|
|
27
|
-
The process is as follows:
|
|
28
|
-
1. Gather MoleculeDocs by formula
|
|
29
|
-
2. For each doc, grab the relevant property docs
|
|
30
|
-
3. Convert property docs to MoleculeSummaryDoc
|
|
31
|
-
"""
|
|
32
|
-
|
|
33
|
-
def __init__(
    self,
    molecules: Store,
    charges: Store,
    spins: Store,
    bonds: Store,
    metal_binding: Store,
    orbitals: Store,
    redox: Store,
    thermo: Store,
    vibes: Store,
    summary: Store,
    query: Optional[Dict] = None,
    settings: Optional[EmmetBuildSettings] = None,
    **kwargs,
):
    """
    Wire up the source stores and the single target store.

    Args:
        molecules: Store holding the molecule documents
        charges: Store holding partial-charge property documents
        spins: Store holding partial-spin property documents
        bonds: Store holding bonding property documents
        metal_binding: Store holding metal-binding property documents
        orbitals: Store holding orbital property documents
        redox: Store holding redox property documents
        thermo: Store holding thermo property documents
        vibes: Store holding vibration property documents
        summary: Target Store that receives the summary documents
        query: Optional query dict; kept as an empty dict when omitted or empty
        settings: Optional settings, resolved via EmmetBuildSettings.autoload
        **kwargs: Extra keyword arguments, stored and forwarded to Builder
    """
    self.molecules = molecules
    self.charges = charges
    self.spins = spins
    self.bonds = bonds
    self.metal_binding = metal_binding
    self.orbitals = orbitals
    self.redox = redox
    self.thermo = thermo
    self.vibes = vibes
    self.summary = summary
    self.query = query or {}
    self.settings = EmmetBuildSettings.autoload(settings)
    self.kwargs = kwargs

    # Every store that is read from; the summary store is the only target.
    source_stores = [
        molecules,
        charges,
        spins,
        bonds,
        metal_binding,
        orbitals,
        redox,
        thermo,
        vibes,
    ]
    super().__init__(sources=source_stores, targets=[summary], **kwargs)
|
|
78
|
-
# Uncomment in case of issue with mrun not connecting automatically to collections
|
|
79
|
-
# for i in [
|
|
80
|
-
# self.molecules,
|
|
81
|
-
# self.charges,
|
|
82
|
-
# self.spins,
|
|
83
|
-
# self.bonds,
|
|
84
|
-
# self.metal_binding,
|
|
85
|
-
# self.orbitals,
|
|
86
|
-
# self.redox,
|
|
87
|
-
# self.thermo,
|
|
88
|
-
# self.vibes,
|
|
89
|
-
# self.summary
|
|
90
|
-
# ]:
|
|
91
|
-
# try:
|
|
92
|
-
# i.connect()
|
|
93
|
-
# except Exception as e:
|
|
94
|
-
# print("Could not connect,", e)
|
|
95
|
-
|
|
96
|
-
def ensure_indexes(self):
    """
    Ensures indices on the collections needed for building.

    The fields requested for each store, and the order in which they are
    requested, are listed in the table below.
    """

    # Fields shared by the orbitals, redox, thermo and vibration stores.
    property_fields = (
        "molecule_id",
        "task_id",
        "solvent",
        "lot_solvent",
        "property_id",
        "last_updated",
        "formula_alphabetical",
    )

    # Charges, spins and bonds are additionally indexed on "method".
    method_property_fields = (
        "molecule_id",
        "method",
        "task_id",
        "solvent",
        "lot_solvent",
        "property_id",
        "last_updated",
        "formula_alphabetical",
    )

    index_plan = (
        # Search index for molecules
        (
            self.molecules,
            ("molecule_id", "last_updated", "task_ids", "formula_alphabetical"),
        ),
        # Search index for charges
        (self.charges, method_property_fields),
        # Search index for spins
        (self.spins, method_property_fields),
        # Search index for bonds
        (self.bonds, method_property_fields),
        # Search index for metal_binding (indexed on "method", not on "task_id")
        (
            self.metal_binding,
            (
                "molecule_id",
                "solvent",
                "lot_solvent",
                "property_id",
                "last_updated",
                "formula_alphabetical",
                "method",
            ),
        ),
        # Search index for orbitals
        (self.orbitals, property_fields),
        # Search index for redox
        (self.redox, property_fields),
        # Search index for thermo
        (self.thermo, property_fields),
        # Search index for vibrational properties
        (self.vibes, property_fields),
        # Search index for summary
        (self.summary, ("molecule_id", "last_updated", "formula_alphabetical")),
    )

    for store, fields in index_plan:
        for field in fields:
            store.ensure_index(field)
|
|
186
|
-
|
|
187
|
-
def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
    """
    Prechunk the builder for distributed computation.

    Args:
        number_splits: How many chunks the unprocessed formulas should be
            divided into; must be at least 1.

    Yields:
        Dicts of the form
        {"query": {"formula_alphabetical": {"$in": [...]}}}, one per chunk.
        Nothing is yielded when there is nothing left to process.

    Raises:
        ValueError: if number_splits is smaller than 1.
    """

    if number_splits < 1:
        # Previously surfaced as a ZeroDivisionError (or an error raised
        # from inside the chunking helper) with no hint about the cause.
        raise ValueError(f"number_splits must be >= 1, got {number_splits}")

    temp_query = dict(self.query)
    temp_query["deprecated"] = False

    self.logger.info("Finding documents to process")
    all_mols = list(
        self.molecules.query(
            temp_query, [self.molecules.key, "formula_alphabetical"]
        )
    )

    # Molecules that already have a summary document are skipped.
    processed_docs = set(self.summary.distinct("molecule_id"))
    to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
    to_process_forms = {
        d["formula_alphabetical"]
        for d in all_mols
        if d[self.molecules.key] in to_process_docs
    }

    if not to_process_forms:
        # Avoid handing a chunk size of 0 to grouper.
        return

    # Chunk size, rounded up so that at most number_splits chunks are produced
    N = ceil(len(to_process_forms) / number_splits)

    for formula_chunk in grouper(to_process_forms, N):
        yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
|
|
212
|
-
|
|
213
|
-
def get_items(self) -> Iterator[List[Dict]]:
    """
    Gets all items to process into summary documents.
    This does no datetime checking; it relies on whether a molecule_id
    is already present in the summary Store.

    Returns:
        generator of lists, each holding every non-deprecated molecule
        document sharing one formula that still has unprocessed molecules
    """

    log = self.logger
    log.info("Summary builder started")
    log.info("Setting indexes")
    self.ensure_indexes()

    # Save timestamp to mark buildtime
    self.timestamp = datetime.utcnow()

    # User query restricted to non-deprecated molecules
    base_criteria = {**self.query, "deprecated": False}

    log.info("Finding documents to process")
    id_field = self.molecules.key
    candidates = list(
        self.molecules.query(base_criteria, [id_field, "formula_alphabetical"])
    )

    # Anything whose id is already in the summary store counts as done
    already_built = set(self.summary.distinct("molecule_id"))
    pending_ids = {entry[id_field] for entry in candidates} - already_built
    pending_formulas = {
        entry["formula_alphabetical"]
        for entry in candidates
        if entry[id_field] in pending_ids
    }

    log.info(f"Found {len(pending_ids)} unprocessed documents")
    log.info(f"Found {len(pending_formulas)} unprocessed formulas")

    # Set total for builder bars to have a total
    self.total = len(pending_formulas)

    for formula in pending_formulas:
        formula_criteria = {**base_criteria, "formula_alphabetical": formula}
        yield list(self.molecules.query(criteria=formula_criteria))
|
|
261
|
-
|
|
262
|
-
def process_item(self, items: List[Dict]) -> List[Dict]:
    """
    Process molecule documents into MoleculeSummaryDocs

    Args:
        items List[Dict] : a list of MoleculeDocs in dict form; the formula
            used for logging is read from the first entry

    Returns:
        [dict] : a list of new summary docs (empty if ``items`` is empty)
    """

    def _group_docs(docs: List[Dict[str, Any]], by_method: bool = False):
        """
        Helper function to group docs by solvent (and optionally by method).

        Docs without a solvent are skipped; when grouping by method, docs
        without a method are skipped too. Later docs overwrite earlier ones
        that share the same solvent (and method).

        Returns a ``(grouped, by_method)`` tuple.
        """
        grouped: Dict[str, Any] = dict()

        for doc in docs:
            solvent = doc.get("solvent")
            method = doc.get("method")
            if not solvent:
                # Need to group by solvent
                continue
            if by_method and method is None:
                # Trying to group by method, but no method present
                continue

            if by_method:
                grouped.setdefault(solvent, {})[method] = doc
            else:
                grouped[solvent] = doc

        return (grouped, by_method)

    if not items:
        # Nothing to summarise; indexing items[0] below would raise.
        return []

    formula = items[0]["formula_alphabetical"]
    mol_ids = [m["molecule_id"] for m in items]
    self.logger.debug(f"Processing {formula} : {mol_ids}")

    # (key in the amalgamated doc, store to query, group by method as well?)
    property_sources = (
        ("partial_charges", self.charges, True),
        ("partial_spins", self.spins, True),
        ("bonding", self.bonds, True),
        ("metal_binding", self.metal_binding, True),
        ("orbitals", self.orbitals, False),
        ("redox", self.redox, False),
        ("thermo", self.thermo, False),
        ("vibration", self.vibes, False),
    )

    summary_docs = list()

    for mol in items:
        mol_id = mol["molecule_id"]

        d: Dict[str, Any] = {"molecules": mol}
        for doc_key, store, by_method in property_sources:
            d[doc_key] = _group_docs(
                list(store.query({"molecule_id": mol_id})), by_method
            )

        # Drop entries that are empty dicts.
        # NOTE(review): _group_docs returns a tuple, so this check can only
        # ever match the "molecules" entry; confirm whether empty property
        # groups were meant to be pruned here as well.
        empty_keys = [k for k, v in d.items() if isinstance(v, dict) and len(v) == 0]
        for k in empty_keys:
            del d[k]

        summary_doc = MoleculeSummaryDoc.from_docs(molecule_id=mol_id, docs=d)
        summary_docs.append(summary_doc)

    self.logger.debug(f"Produced {len(summary_docs)} summary docs for {formula}")

    return jsanitize([doc.model_dump() for doc in summary_docs], allow_bson=True)
|
|
354
|
-
|
|
355
|
-
def update_targets(self, items: List[List[Dict]]):
    """
    Inserts the new documents into the summary collection

    Existing summary documents for the same molecule ids are removed
    before the new ones are written. Each document is stamped with the
    build timestamp under "_bt".

    Args:
        items [[dict]]: A list of lists of documents to update
    """

    docs = list(chain.from_iterable(items))  # type: ignore

    if not docs:
        # Check the flattened list: a batch made only of empty lists must
        # not trigger a remove/update round-trip with nothing in it.
        self.logger.info("No items to update")
        return

    # Add timestamp
    for item in docs:
        item.update({"_bt": self.timestamp})

    molecule_ids = list({item["molecule_id"] for item in docs})

    self.logger.info(f"Updating {len(docs)} summary documents")
    self.summary.remove_docs({self.summary.key: {"$in": molecule_ids}})
    self.summary.update(
        docs=docs,
        key=["molecule_id"],
    )
|