emmet-builders 0.84.2__py3-none-any.whl → 0.86.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emmet/builders/abinit/phonon.py +27 -25
- emmet/builders/abinit/sound_velocity.py +15 -11
- emmet/builders/feff/xas.py +1 -2
- emmet/builders/materials/absorption_spectrum.py +25 -14
- emmet/builders/materials/alloys.py +3 -4
- emmet/builders/materials/chemenv.py +2 -3
- emmet/builders/materials/corrected_entries.py +15 -9
- emmet/builders/materials/dielectric.py +19 -11
- emmet/builders/materials/elasticity.py +44 -33
- emmet/builders/materials/electrodes.py +24 -19
- emmet/builders/materials/electronic_structure.py +17 -17
- emmet/builders/materials/magnetism.py +11 -4
- emmet/builders/materials/optimade.py +7 -3
- emmet/builders/materials/piezoelectric.py +24 -21
- emmet/builders/materials/provenance.py +15 -12
- emmet/builders/materials/robocrys.py +2 -3
- emmet/builders/materials/substrates.py +9 -8
- emmet/builders/materials/summary.py +3 -3
- emmet/builders/materials/thermo.py +17 -11
- emmet/builders/matscholar/missing_compositions.py +12 -8
- emmet/builders/mobility/migration_graph.py +5 -5
- emmet/builders/settings.py +21 -17
- emmet/builders/utils.py +15 -10
- emmet/builders/vasp/materials.py +32 -16
- emmet/builders/vasp/task_validator.py +15 -11
- {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/METADATA +21 -36
- emmet_builders-0.86.0.dist-info/RECORD +41 -0
- {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/WHEEL +1 -1
- emmet/builders/materials/ml.py +0 -87
- emmet/builders/molecules/atomic.py +0 -589
- emmet/builders/molecules/bonds.py +0 -324
- emmet/builders/molecules/metal_binding.py +0 -526
- emmet/builders/molecules/orbitals.py +0 -288
- emmet/builders/molecules/redox.py +0 -496
- emmet/builders/molecules/summary.py +0 -383
- emmet/builders/molecules/thermo.py +0 -500
- emmet/builders/molecules/vibration.py +0 -278
- emmet/builders/qchem/__init__.py +0 -0
- emmet/builders/qchem/molecules.py +0 -734
- emmet_builders-0.84.2.dist-info/RECORD +0 -52
- /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
- {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/top_level.txt +0 -0
|
@@ -1,734 +0,0 @@
|
|
|
1
|
-
from datetime import datetime
|
|
2
|
-
from itertools import chain, groupby
|
|
3
|
-
from math import ceil
|
|
4
|
-
from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Union
|
|
5
|
-
|
|
6
|
-
import networkx as nx
|
|
7
|
-
|
|
8
|
-
from maggma.builders import Builder
|
|
9
|
-
from maggma.stores import Store
|
|
10
|
-
from maggma.utils import grouper
|
|
11
|
-
|
|
12
|
-
from emmet.builders.settings import EmmetBuildSettings
|
|
13
|
-
from emmet.core.utils import get_molecule_id, group_molecules, jsanitize, make_mol_graph
|
|
14
|
-
from emmet.core.qchem.molecule import (
|
|
15
|
-
best_lot,
|
|
16
|
-
evaluate_lot,
|
|
17
|
-
evaluate_task_entry,
|
|
18
|
-
MoleculeDoc,
|
|
19
|
-
)
|
|
20
|
-
from emmet.core.qchem.task import TaskDocument
|
|
21
|
-
from emmet.core.qchem.calc_types import LevelOfTheory, CalcType, TaskType
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
__author__ = "Evan Spotte-Smith <ewcspottesmith@lbl.gov>"


# Module-level settings instance. Its QCHEM_*_QUALITY_SCORES tables are read
# when evaluate_molecule is defined, because they are used as that function's
# default argument values.
SETTINGS = EmmetBuildSettings()
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def evaluate_molecule(
    mol_doc: MoleculeDoc,
    funct_scores: Dict[str, int] = SETTINGS.QCHEM_FUNCTIONAL_QUALITY_SCORES,
    basis_scores: Dict[str, int] = SETTINGS.QCHEM_BASIS_QUALITY_SCORES,
    solvent_scores: Dict[str, int] = SETTINGS.QCHEM_SOLVENT_MODEL_QUALITY_SCORES,
):
    """
    Build a sort key for ranking MoleculeDocs against one another.

    The key is a 4-tuple, compared element by element:
        1. ``-1 * int(mol_doc.deprecated)``
        2. summed score of the best level of theory found on the document
        3. summed score of the level of theory of the "molecule" origin task
           (0 when no such origin exists)
        4. the energy stored in the best entry for that best level of theory

    NOTE(review): under an ascending sort, a deprecated document (-1) ranks
    ahead of a non-deprecated one (0) - confirm that this is intended.

    :param mol_doc: Molecule to be evaluated
    :param funct_scores: Scores for various density functionals
    :param basis_scores: Scores for various basis sets
    :param solvent_scores: Scores for various implicit solvent models
    :return: tuple usable as a ``key`` for ``sorted``
    """

    score_tables = (funct_scores, basis_scores, solvent_scores)

    # Level of theory of the origin named exactly "molecule". If several
    # origins carry that name, the last one wins.
    optimization_lot = None
    for origin in mol_doc.origins:
        if origin.name != "molecule":
            continue
        optimization_lot = mol_doc.levels_of_theory[origin.task_id]
        if isinstance(optimization_lot, LevelOfTheory):
            # Score lookups work on the plain enum value.
            optimization_lot = optimization_lot.value

    optimization_eval = (
        [0]
        if optimization_lot is None
        else evaluate_lot(optimization_lot, *score_tables)
    )

    top_lot = best_lot(mol_doc, *score_tables)
    top_eval = evaluate_lot(top_lot, *score_tables)

    return (
        -1 * int(mol_doc.deprecated),
        sum(top_eval),
        sum(optimization_eval),
        mol_doc.best_entries[top_lot]["energy"],
    )
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def _optimizing_solvent(mol_doc):
|
|
73
|
-
"""
|
|
74
|
-
Returns which solvent was used to optimize this (associated) MoleculeDoc.
|
|
75
|
-
|
|
76
|
-
Args:
|
|
77
|
-
mol_doc: MoleculeDoc
|
|
78
|
-
|
|
79
|
-
Returns:
|
|
80
|
-
solvent (str)
|
|
81
|
-
|
|
82
|
-
"""
|
|
83
|
-
|
|
84
|
-
for origin in mol_doc.origins:
|
|
85
|
-
if origin.name.startswith("molecule"):
|
|
86
|
-
solvent = mol_doc.solvents[origin.task_id]
|
|
87
|
-
return solvent
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
class MoleculesAssociationBuilder(Builder):
    """
    The MoleculesAssociationBuilder matches Q-Chem task documents by composition
    and collects tasks associated with identical structures.
    The purpose of this builder is to group calculations in preparation for the
    MoleculesBuilder.

    The process is as follows:

        1.) Find all documents with the same formula
        2.) Select only task documents for the task_types we can select properties from
        3.) Aggregate task documents based on nuclear geometry
        4.) Create a MoleculeDoc from the group of task documents
    """

    def __init__(
        self,
        tasks: Store,
        assoc: Store,
        query: Optional[Dict] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        """
        Args:
            tasks: Store of task documents
            assoc: Store of associated molecules documents to prepare
            query: dictionary to limit tasks to be analyzed
            settings: EmmetSettings to use in the build process
        """

        self.tasks = tasks
        self.assoc = assoc
        self.query = query if query else dict()
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks], targets=[assoc], **kwargs)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        for field in (
            "task_id",
            "last_updated",
            "state",
            "formula_alphabetical",
            "smiles",
            "species_hash",
        ):
            self.tasks.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
        ):
            self.assoc.ensure_index(field)

    def _successful_tasks_query(self) -> Dict:
        """Return a copy of the user query restricted to successful tasks."""

        criteria = dict(self.query)
        criteria["state"] = "successful"
        return criteria

    def _find_unprocessed(self, criteria: Dict):
        """
        Find tasks matching ``criteria`` that are not yet in the assoc Store.

        Args:
            criteria: query applied to the tasks Store

        Returns:
            (task keys, species hashes): the set of task keys absent from the
            ``task_ids`` of the assoc Store, and the set of species hashes
            carried by those tasks
        """

        all_tasks = list(self.tasks.query(criteria, [self.tasks.key, "species_hash"]))

        processed_tasks = set(self.assoc.distinct("task_ids"))
        to_process_tasks = {d[self.tasks.key] for d in all_tasks} - processed_tasks
        to_process_hashes = {
            d["species_hash"]
            for d in all_tasks
            if d[self.tasks.key] in to_process_tasks
        }
        return to_process_tasks, to_process_hashes

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """Prechunk the molecule builder for distributed computation"""

        self.logger.info("Finding tasks to process")
        _, to_process_hashes = self._find_unprocessed(self._successful_tasks_query())

        N = ceil(len(to_process_hashes) / number_splits)

        for hash_chunk in grouper(to_process_hashes, N):
            yield {"query": {"species_hash": {"$in": list(hash_chunk)}}}

    def get_items(self) -> Iterator[List[TaskDocument]]:
        """
        Gets all items to process into molecules (and other) documents.
        This does no datetime checking; relying on whether
        task_ids are included in the molecules Store

        Returns:
            generator or list relevant tasks and molecules to process into documents
        """

        self.logger.info("Molecule association builder started")
        self.logger.info(
            f"Allowed task types: {[task_type.value for task_type in self.settings.QCHEM_ALLOWED_TASK_TYPES]}"
        )

        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        # Get all processed tasks
        temp_query = self._successful_tasks_query()

        self.logger.info("Finding tasks to process")
        to_process_tasks, to_process_hashes = self._find_unprocessed(temp_query)

        self.logger.info(f"Found {len(to_process_tasks)} unprocessed tasks")
        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")

        # Set total for builder bars to have a total
        self.total = len(to_process_hashes)

        projected_fields = [
            "last_updated",
            "task_id",
            "formula_alphabetical",
            "species_hash",
            "coord_hash",
            "smiles",
            "orig",
            "tags",
            "walltime",
            "cputime",
            "output",
            "calcs_reversed",
            "special_run_type",
            "custom_smd",
            "critic2",
        ]

        for shash in to_process_hashes:
            tasks_query = dict(temp_query)
            tasks_query["species_hash"] = shash
            tasks = list(
                self.tasks.query(criteria=tasks_query, properties=projected_fields)
            )
            to_yield = list()
            for t in tasks:
                # TODO: Validation
                # basic validation here ensures that tasks with invalid levels of
                # theory don't halt the build pipeline
                try:
                    task = TaskDocument(**t)
                    to_yield.append(task)
                except Exception as e:
                    # .get() so that a task without a task_id cannot make the
                    # log call itself raise from inside this handler.
                    self.logger.info(
                        f"Processing task {t.get('task_id')} failed with Exception - {e}"
                    )
                    continue

            yield to_yield

    def process_item(self, tasks: List[TaskDocument]) -> List[Dict]:
        """
        Process the tasks into a MoleculeDoc

        Args:
            tasks [TaskDocument] : a list of task docs

        Returns:
            [dict] : a list of new molecule docs
        """

        if not tasks:
            return list()
        shash = tasks[0].species_hash
        task_ids = [task.task_id for task in tasks]
        self.logger.debug(f"Processing {shash} : {task_ids}")
        molecules = list()

        for group in self.filter_and_group_tasks(tasks):
            try:
                doc = MoleculeDoc.from_tasks(group)
                molecules.append(doc)
            except Exception as e:
                # Keep the group visible downstream as a deprecated molecule
                # that carries the failure reason in its warnings.
                failed_ids = list({t_.task_id for t_ in group})
                doc = MoleculeDoc.construct_deprecated_molecule(group)
                doc.warnings.append(str(e))
                molecules.append(doc)
                self.logger.warning(
                    f"Failed making molecule for {failed_ids}."
                    f" Inserted as deprecated molecule: {doc.molecule_id}"
                )

        self.logger.debug(f"Produced {len(molecules)} molecules for {shash}")

        return jsanitize([mol.model_dump() for mol in molecules], allow_bson=True)

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new molecules into the molecules collection

        Args:
            items [[dict]]: A list of molecules to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        for item in docs:
            item.update({"_bt": self.timestamp})

        molecule_ids = list({item["molecule_id"] for item in docs})

        # Test the flattened docs, not the outer list: a batch made only of
        # empty sub-lists has nothing to write.
        if docs:
            self.logger.info(f"Updating {len(docs)} molecules")
            self.assoc.remove_docs({self.assoc.key: {"$in": molecule_ids}})
            self.assoc.update(
                docs=docs,
                key=["molecule_id"],
            )
        else:
            self.logger.info("No items to update")

    def filter_and_group_tasks(
        self, tasks: List[TaskDocument]
    ) -> Iterator[List[TaskDocument]]:
        """
        Groups tasks by identical structure

        Only tasks whose task_type is one of settings.QCHEM_ALLOWED_TASK_TYPES
        are considered. Each yielded list holds the tasks whose molecules were
        placed in the same group by group_molecules.
        """

        allowed_types = self.settings.QCHEM_ALLOWED_TASK_TYPES
        filtered_tasks = [
            task
            for task in tasks
            if any(allowed_type is task.task_type for allowed_type in allowed_types)
        ]

        molecules = list()

        for idx, task in enumerate(filtered_tasks):
            # Use the optimized geometry when present, else the initial one.
            m = task.output.optimized_molecule or task.output.initial_molecule
            # Tag the molecule with its task position so that groups of
            # molecules can be mapped back to tasks below.
            m.ind = idx  # type: ignore
            molecules.append(m)

        for group in group_molecules(molecules):
            yield [filtered_tasks[mol.ind] for mol in group]  # type: ignore
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
class MoleculesBuilder(Builder):
    """
    The MoleculesBuilder collects MoleculeDocs from the MoleculesAssociationBuilder
    and groups them by key properties (charge, spin multiplicity, bonding).
    Then, the best molecular structure is identified (based on electronic energy),
    and this document becomes the representative MoleculeDoc.

    The process is as follows:

        1.) Find all documents with the same formula
        2.) Group documents based on charge, spin, and bonding
        3.) Create a MoleculeDoc from the group of task documents
    """

    def __init__(
        self,
        assoc: Store,
        molecules: Store,
        query: Optional[Dict] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        """
        Args:
            assoc: Store of associated molecules documents, created by MoleculesAssociationBuilder
            molecules: Store of processed molecules documents
            query: dictionary to limit tasks to be analyzed
            settings: EmmetSettings to use in the build process
        """

        self.assoc = assoc
        self.molecules = molecules
        self.query = query if query else dict()
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[assoc], targets=[molecules], **kwargs)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        indexed_fields = (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
        )

        # Search index for associated molecules
        for field in indexed_fields:
            self.assoc.ensure_index(field)

        # Search index for molecules
        for field in indexed_fields:
            self.molecules.ensure_index(field)

    def _non_deprecated_query(self) -> Dict:
        """Return a copy of the user query restricted to non-deprecated docs."""

        criteria = dict(self.query)
        criteria["deprecated"] = False
        return criteria

    def _find_unprocessed(self, criteria: Dict):
        """
        Find associated docs matching ``criteria`` that still need processing.

        A species-level id of the form ``<species_hash>-<formula>-<charge>-<spin>``
        is built for each associated doc and compared with the ``molecule_id``
        values already present in the molecules Store.

        Args:
            criteria: query applied to the assoc Store

        Returns:
            (ids, formulas): the set of species-level ids absent from the
            molecules Store, and the set of formulas of the docs with those ids
        """

        all_assoc = list(
            self.assoc.query(
                criteria,
                [
                    self.assoc.key,
                    "formula_alphabetical",
                    "species_hash",
                    "charge",
                    "spin_multiplicity",
                ],
            )
        )

        # Should be using species hash, rather than coord hash, at this point
        processed_docs = set(self.molecules.distinct("molecule_id"))
        assoc_ids = set()

        xyz_species_id_map = dict()
        for d in all_assoc:
            this_id = "{}-{}-{}-{}".format(
                d["species_hash"],
                d["formula_alphabetical"].replace(" ", ""),
                str(int(d["charge"])).replace("-", "m"),
                str(int(d["spin_multiplicity"])),
            )
            assoc_ids.add(this_id)
            xyz_species_id_map[d[self.assoc.key]] = this_id
        to_process_docs = assoc_ids - processed_docs

        to_process_forms = {
            d["formula_alphabetical"]
            for d in all_assoc
            if xyz_species_id_map[d[self.assoc.key]] in to_process_docs
        }
        return to_process_docs, to_process_forms

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """Prechunk the molecule builder for distributed computation"""

        self.logger.info("Finding documents to process")
        _, to_process_forms = self._find_unprocessed(self._non_deprecated_query())

        N = ceil(len(to_process_forms) / number_splits)

        for formula_chunk in grouper(to_process_forms, N):
            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}

    def get_items(self) -> Iterator[List[Dict]]:
        """
        Gets all items to process into molecules (and other) documents.
        This does no datetime checking; relying on whether
        task_ids are included in the molecules Store

        Returns:
            generator or list relevant tasks and molecules to process into documents
        """

        self.logger.info("Molecules builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        # Get all processed molecules
        temp_query = self._non_deprecated_query()

        self.logger.info("Finding documents to process")
        to_process_docs, to_process_forms = self._find_unprocessed(temp_query)

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")

        # Set total for builder bars to have a total
        self.total = len(to_process_forms)

        for formula in to_process_forms:
            assoc_query = dict(temp_query)
            assoc_query["formula_alphabetical"] = formula
            yield list(self.assoc.query(criteria=assoc_query))

    def _combine_group(self, group: List[MoleculeDoc]) -> Optional[MoleculeDoc]:
        """
        Merge one group of associated docs into a single MoleculeDoc.

        The docs are bucketed by optimizing solvent; within each bucket the
        doc ranked first by evaluate_molecule is kept as a constituent and the
        others are recorded as similar molecules. The constituent of the first
        bucket is updated in place with the combined data and returned.

        Args:
            group: associated MoleculeDocs that belong to the same molecule

        Returns:
            the combined MoleculeDoc, or None if the group produced no bucket
        """

        docs_by_solvent = dict()
        mols_by_solvent = dict()
        mol_lots = dict()
        constituent_molecules = list()
        similar_molecules = list()

        base_doc: Optional[MoleculeDoc] = None

        # Grab best doc for each solvent
        # A doc is given a solvent based on how the molecule was optimized
        for solv, subgroup in groupby(
            sorted(group, key=_optimizing_solvent), key=_optimizing_solvent
        ):
            ranked = sorted(subgroup, key=evaluate_molecule)
            winner = ranked[0]
            docs_by_solvent[solv] = winner
            mols_by_solvent[solv] = winner.molecule
            mol_lots[solv] = winner.levels_of_theory[winner.origins[0].task_id]
            constituent_molecules.append(winner.molecule_id)

            for runner_up in ranked[1:]:
                if runner_up.molecule_id not in constituent_molecules:
                    similar_molecules.append(runner_up.molecule_id)

            if base_doc is None:
                base_doc = winner

        if base_doc is None:
            return None

        task_ids = list()
        calc_types = dict()
        task_types = dict()
        levels_of_theory = dict()
        solvents = dict()
        lot_solvents = dict()
        unique_calc_types: Set[Union[str, CalcType]] = set()
        unique_task_types: Set[Union[str, TaskType]] = set()
        unique_levels_of_theory: Set[Union[str, LevelOfTheory]] = set()
        unique_solvents: Set[str] = set()
        unique_lot_solvents: Set[str] = set()
        origins = list()
        entries = list()
        best_entries: Dict[str, Any] = dict()

        # Compile data on each constituent doc
        for doc in docs_by_solvent.values():
            task_ids.extend(doc.task_ids)
            calc_types.update(doc.calc_types)
            task_types.update(doc.task_types)
            levels_of_theory.update(doc.levels_of_theory)
            solvents.update(doc.solvents)
            lot_solvents.update(doc.lot_solvents)
            unique_calc_types.update(doc.unique_calc_types)
            unique_task_types.update(doc.unique_task_types)
            unique_levels_of_theory.update(doc.unique_levels_of_theory)
            unique_solvents.update(doc.unique_solvents)
            unique_lot_solvents.update(doc.unique_lot_solvents)
            origins.extend(doc.origins)
            entries.extend(doc.entries)

            # Per level-of-theory/solvent key, keep the entry that evaluates
            # lowest; on a tie the earlier entry stays.
            for lot_solv, entry in doc.best_entries.items():
                if lot_solv in best_entries:
                    current_eval = evaluate_task_entry(best_entries[lot_solv])
                    this_eval = evaluate_task_entry(entry)
                    if this_eval < current_eval:
                        best_entries[lot_solv] = entry
                else:
                    best_entries[lot_solv] = entry

        # Assign new doc info
        base_doc.molecule_id = get_molecule_id(base_doc.molecule, node_attr="specie")
        base_doc.molecules = mols_by_solvent
        base_doc.molecule_levels_of_theory = mol_lots
        base_doc.task_ids = task_ids
        base_doc.calc_types = calc_types
        base_doc.task_types = task_types
        base_doc.levels_of_theory = levels_of_theory
        base_doc.solvents = solvents
        base_doc.lot_solvents = lot_solvents
        base_doc.unique_calc_types = unique_calc_types
        base_doc.unique_task_types = unique_task_types
        base_doc.unique_levels_of_theory = unique_levels_of_theory
        base_doc.unique_solvents = unique_solvents
        base_doc.unique_lot_solvents = unique_lot_solvents
        base_doc.origins = origins
        base_doc.entries = entries
        base_doc.best_entries = best_entries
        base_doc.constituent_molecules = constituent_molecules
        base_doc.similar_molecules = similar_molecules

        return base_doc

    def process_item(self, items: List[Dict]) -> List[Dict]:
        """
        Process associated molecule docs into combined MoleculeDocs

        Args:
            items List[Dict] : a list of associated molecule docs, all taken
                from a single formula by get_items

        Returns:
            [dict] : a list of new molecule docs
        """

        # An empty batch has no formula to report and nothing to combine.
        if not items:
            return list()

        assoc = [MoleculeDoc(**item) for item in items]
        formula = assoc[0].formula_alphabetical
        mol_ids = [a.molecule_id for a in assoc]
        self.logger.debug(f"Processing {formula} : {mol_ids}")

        complete_mol_docs = list()

        # Need to combine many variables of the various constituent associated docs
        # into one MoleculeDoc, where the best associated doc for each solvent is taken
        for group in self.group_mol_docs(assoc):
            # Maybe all are disconnected and therefore none get grouped?
            if len(group) == 0:
                continue

            combined = self._combine_group(group)
            if combined is not None:
                complete_mol_docs.append(combined)

        self.logger.debug(f"Produced {len(complete_mol_docs)} molecules for {formula}")

        return jsanitize(
            [mol.model_dump() for mol in complete_mol_docs], allow_bson=True
        )

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new molecules into the molecules collection

        Args:
            items [[dict]]: A list of molecules to update
        """

        self.logger.debug(f"Updating {len(items)} molecules")

        docs = list(chain.from_iterable(items))  # type: ignore

        # Add the build timestamp and point every entry at its molecule id
        for item in docs:
            molid = item["molecule_id"]

            item.update({"_bt": self.timestamp})

            for entry in item["entries"]:
                entry["entry_id"] = molid

        molecule_ids = list({item["molecule_id"] for item in docs})

        # Test the flattened docs, not the outer list: a batch made only of
        # empty sub-lists has nothing to write.
        if docs:
            self.logger.info(f"Updating {len(docs)} molecules")
            self.molecules.remove_docs({self.molecules.key: {"$in": molecule_ids}})
            self.molecules.update(
                docs=docs,
                key=["molecule_id"],
            )
        else:
            self.logger.info("No items to update")

    def group_mol_docs(self, assoc: List[MoleculeDoc]) -> Iterator[List[MoleculeDoc]]:
        """
        Groups molecules by:
            - charge
            - spin multiplicity
            - species hash, keeping only molecules whose graph is connected

        Molecules are assumed to be grouped by formula already.
        """

        def charge_spin(mol_doc):
            return (mol_doc.charge, mol_doc.spin_multiplicity)

        # Group by charge and spin
        for _, group in groupby(sorted(assoc, key=charge_spin), key=charge_spin):
            # Insertion-ordered map of species hash -> docs, so groups are
            # yielded in order of first appearance.
            docs_by_hash: Dict[str, Any] = dict()
            for mol_doc in group:
                mol_graph = make_mol_graph(mol_doc.molecule)

                # Unconnected molecule graphs are discarded at this step
                # TODO: What about molecules that would be connected under a
                # TODO: different bonding scheme?
                # TODO: MAKE ClusterBuilder FOR THIS PURPOSE
                if nx.is_connected(mol_graph.graph.to_undirected()):
                    docs_by_hash.setdefault(mol_doc.species_hash, []).append(mol_doc)

            self.logger.debug(f"Unique hashes: {list(docs_by_hash)}")

            for mol_docs in docs_by_hash.values():
                yield mol_docs
|