emmet-builders 0.84.3rc2-py3-none-any.whl → 0.84.3rc3-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in the public registry. It is provided for informational purposes only.
Potentially problematic release: this version of emmet-builders has been flagged as potentially problematic.
- emmet/builders/molecules/atomic.py +46 -48
- emmet/builders/molecules/bonds.py +24 -24
- emmet/builders/molecules/electric.py +282 -0
- emmet/builders/molecules/metal_binding.py +20 -21
- emmet/builders/molecules/orbitals.py +23 -23
- emmet/builders/molecules/redox.py +27 -27
- emmet/builders/molecules/summary.py +36 -21
- emmet/builders/molecules/thermo.py +23 -23
- emmet/builders/molecules/trajectory.py +525 -0
- emmet/builders/molecules/vibration.py +23 -23
- emmet/builders/qchem/molecules.py +21 -15
- {emmet_builders-0.84.3rc2.dist-info → emmet_builders-0.84.3rc3.dist-info}/METADATA +1 -1
- {emmet_builders-0.84.3rc2.dist-info → emmet_builders-0.84.3rc3.dist-info}/RECORD +15 -13
- {emmet_builders-0.84.3rc2.dist-info → emmet_builders-0.84.3rc3.dist-info}/WHEEL +0 -0
- {emmet_builders-0.84.3rc2.dist-info → emmet_builders-0.84.3rc3.dist-info}/top_level.txt +0 -0
emmet/builders/molecules/thermo.py

@@ -84,7 +84,7 @@ class ThermoBuilder(Builder):
     single-point energy corrections.

     Before any documents are constructed, the following steps are taken:
-    1. Gather MoleculeDocs by formula
+    1. Gather MoleculeDocs by species hash
     2. For each doc, identify tasks with thermodynamic information such as
     zero-point energy, enthalpy, and entropy. Collect these "documents
     including complete thermodynamics" (DICTs).
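This docstring edit summarizes the substantive change in this release: the molecule builders now group MoleculeDocs by a structure-aware species hash rather than by alphabetical formula, so distinct isomers that share a formula are no longer processed together. As a rough illustration of what a species hash buys you, here is a hypothetical sketch using networkx's Weisfeiler-Lehman graph hash over a pymatgen molecule graph; emmet's actual species_hash implementation is not shown in this diff and may differ:

    # Hypothetical sketch -- NOT emmet's actual species_hash implementation.
    import networkx as nx
    from pymatgen.core.structure import Molecule
    from pymatgen.analysis.graphs import MoleculeGraph
    from pymatgen.analysis.local_env import OpenBabelNN  # assumes openbabel is installed

    water = Molecule(
        ["O", "H", "H"],
        [[0.00, 0.00, 0.00], [0.00, 0.00, 0.96], [0.93, 0.00, -0.24]],
    )
    mg = MoleculeGraph.with_local_env_strategy(water, OpenBabelNN())

    # Hash the bonding graph: molecules with the same connectivity and species
    # get the same hash, while a bare formula cannot separate isomers.
    g = nx.Graph(mg.graph)
    print(nx.weisfeiler_lehman_graph_hash(g, node_attr="specie"))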
@@ -148,12 +148,14 @@ class ThermoBuilder(Builder):
         self.tasks.ensure_index("last_updated")
         self.tasks.ensure_index("state")
         self.tasks.ensure_index("formula_alphabetical")
+        self.tasks.ensure_index("species_hash")

         # Search index for molecules
         self.molecules.ensure_index("molecule_id")
         self.molecules.ensure_index("last_updated")
         self.molecules.ensure_index("task_ids")
         self.molecules.ensure_index("formula_alphabetical")
+        self.molecules.ensure_index("species_hash")

         # Search index for thermo
         self.thermo.ensure_index("molecule_id")
@@ -172,23 +174,23 @@ class ThermoBuilder(Builder):

         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
+            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
         )

         processed_docs = set([e for e in self.thermo.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_forms = {
-            d["formula_alphabetical"]
+        to_process_hashes = {
+            d["species_hash"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }

-        N = ceil(len(to_process_forms) / number_splits)
+        N = ceil(len(to_process_hashes) / number_splits)

-        for formula_chunk in grouper(to_process_forms, N):
-            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
+        for hash_chunk in grouper(to_process_hashes, N):
+            query = dict(temp_query)
+            query["species_hash"] = {"$in": list(hash_chunk)}
+            yield {"query": query}

     def get_items(self) -> Iterator[List[Dict]]:
         """
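For context on the new prechunk logic: the set of unprocessed species hashes is split into number_splits roughly equal chunks, and one "$in" sub-query is emitted per chunk so distributed workers each take a slice. A minimal standalone sketch of that arithmetic (the hash strings are made up, and the chunker below only approximates what maggma's grouper does):

    from math import ceil

    def chunks(items, n):
        # Yield successive lists of at most n items (approximates maggma's grouper).
        for i in range(0, len(items), n):
            yield items[i : i + n]

    to_process_hashes = ["a1b2", "c3d4", "e5f6", "0789", "beef"]
    number_splits = 2
    N = ceil(len(to_process_hashes) / number_splits)  # 3 hashes per chunk

    for hash_chunk in chunks(to_process_hashes, N):
        query = {"deprecated": False, "species_hash": {"$in": hash_chunk}}
        print(query)  # one sub-query per distributed worker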
@@ -213,28 +215,26 @@ class ThermoBuilder(Builder):

         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
+            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
         )

         processed_docs = set([e for e in self.thermo.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_forms = {
-            d["formula_alphabetical"]
+        to_process_hashes = {
+            d["species_hash"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }

         self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
+        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")

         # Set total for builder bars to have a total
-        self.total = len(to_process_forms)
+        self.total = len(to_process_hashes)

-        for formula in to_process_forms:
+        for shash in to_process_hashes:
             mol_query = dict(temp_query)
-            mol_query["formula_alphabetical"] = formula
+            mol_query["species_hash"] = shash
             molecules = list(self.molecules.query(criteria=mol_query))

             yield molecules
@@ -273,9 +273,9 @@ class ThermoBuilder(Builder):
             return doc

         mols = [MoleculeDoc(**item) for item in items]
-        formula = mols[0].formula_alphabetical
+        shash = mols[0].species_hash
         mol_ids = [m.molecule_id for m in mols]
-        self.logger.debug(f"Processing {formula} : {mol_ids}")
+        self.logger.debug(f"Processing {shash} : {mol_ids}")

         thermo_docs = list()

@@ -334,7 +334,7 @@ class ThermoBuilder(Builder):
                 tdoc = self.tasks.query_one(
                     {
                         "task_id": task,
-                        "formula_alphabetical": formula,
+                        "species_hash": shash,
                         "orig": {"$exists": True},
                     }
                 )
@@ -344,7 +344,7 @@ class ThermoBuilder(Builder):
                 tdoc = self.tasks.query_one(
                     {
                         "task_id": int(task),
-                        "formula_alphabetical": formula,
+                        "species_hash": shash,
                         "orig": {"$exists": True},
                     }
                 )
@@ -465,7 +465,7 @@ class ThermoBuilder(Builder):
                 sorted(with_eval_e, key=lambda x: (x[1], x[2]))[0][0]
             )

-        self.logger.debug(f"Produced {len(thermo_docs)} thermo docs for {formula}")
+        self.logger.debug(f"Produced {len(thermo_docs)} thermo docs for {shash}")

         return jsanitize([doc.model_dump() for doc in thermo_docs], allow_bson=True)

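Taken together, the thermo.py changes rekey every stage of the builder (prechunking, item retrieval in get_items, and the per-task lookups in process_item) on species_hash instead of formula_alphabetical. For orientation, a hedged sketch of how such a builder is typically wired up and run; the in-memory stores and store names are illustrative, and the sketch assumes ThermoBuilder's constructor takes the three stores indexed above:

    # Illustrative wiring only; production deployments use MongoDB-backed
    # stores and usually drive builders through maggma's mrun CLI.
    from maggma.stores import MemoryStore
    from emmet.builders.molecules.thermo import ThermoBuilder

    tasks = MemoryStore(collection_name="tasks", key="task_id")
    molecules = MemoryStore(collection_name="molecules", key="molecule_id")
    thermo = MemoryStore(collection_name="thermo", key="molecule_id")

    builder = ThermoBuilder(tasks=tasks, molecules=molecules, thermo=thermo)
    builder.run()  # get_items -> process_item -> update_targets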
emmet/builders/molecules/trajectory.py (new file)

@@ -0,0 +1,525 @@
from collections import defaultdict
from datetime import datetime
from itertools import chain
from math import ceil
from typing import Optional, Iterable, Iterator, List, Dict

from maggma.builders import Builder
from maggma.core import Store
from maggma.utils import grouper

from emmet.core.qchem.task import TaskDocument
from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
from emmet.core.molecules.trajectory import ForcesDoc, TrajectoryDoc
from emmet.core.utils import jsanitize
from emmet.builders.settings import EmmetBuildSettings


__author__ = "Evan Spotte-Smith"

SETTINGS = EmmetBuildSettings()


class ForcesBuilder(Builder):
    """
    The ForcesBuilder extracts the highest-quality force data from a
    MoleculeDoc (lowest electronic energy, highest level of theory for
    each solvent available).

    The process is as follows:
        1. Gather MoleculeDocs by species hash
        2. For each doc, sort tasks by solvent
        3. For each solvent, grab the best TaskDoc (doc with force
           information that has the highest level of theory with lowest
           electronic energy for the molecule)
        4. Convert TaskDoc to ForcesDoc
    """

    def __init__(
        self,
        tasks: Store,
        molecules: Store,
        forces: Store,
        query: Optional[Dict] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        self.tasks = tasks
        self.molecules = molecules
        self.forces = forces
        self.query = query if query else dict()
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks, molecules], targets=[forces], **kwargs)
        # Uncomment in case of issue with mrun not connecting automatically to collections
        # for i in [self.tasks, self.molecules, self.forces]:
        #     try:
        #         i.connect()
        #     except Exception as e:
        #         print("Could not connect,", e)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        self.tasks.ensure_index("task_id")
        self.tasks.ensure_index("last_updated")
        self.tasks.ensure_index("state")
        self.tasks.ensure_index("formula_alphabetical")
        self.tasks.ensure_index("species_hash")

        # Search index for molecules
        self.molecules.ensure_index("molecule_id")
        self.molecules.ensure_index("last_updated")
        self.molecules.ensure_index("task_ids")
        self.molecules.ensure_index("formula_alphabetical")
        self.molecules.ensure_index("species_hash")

        # Search index for force properties
        self.forces.ensure_index("molecule_id")
        self.forces.ensure_index("task_id")
        self.forces.ensure_index("solvent")
        self.forces.ensure_index("lot_solvent")
        self.forces.ensure_index("property_id")
        self.forces.ensure_index("last_updated")
        self.forces.ensure_index("formula_alphabetical")

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """Prechunk the builder for distributed computation"""

        temp_query = dict(self.query)
        temp_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        all_mols = list(
            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
        )

        processed_docs = set([e for e in self.forces.distinct("molecule_id")])
        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
        to_process_hashes = {
            d["species_hash"]
            for d in all_mols
            if d[self.molecules.key] in to_process_docs
        }

        N = ceil(len(to_process_hashes) / number_splits)

        for hash_chunk in grouper(to_process_hashes, N):
            query = dict(temp_query)
            query["species_hash"] = {"$in": list(hash_chunk)}
            yield {"query": query}

    def get_items(self) -> Iterator[List[Dict]]:
        """
        Gets all items to process into force documents.
        This does no datetime checking, relying instead on whether
        task_ids are included in the forces Store.

        Returns:
            generator or list of relevant tasks and molecules to process into documents
        """

        self.logger.info("Forces builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        # Get all processed molecules
        temp_query = dict(self.query)
        temp_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        all_mols = list(
            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
        )

        processed_docs = set([e for e in self.forces.distinct("molecule_id")])
        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
        to_process_hashes = {
            d["species_hash"]
            for d in all_mols
            if d[self.molecules.key] in to_process_docs
        }

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")

        # Set total for builder bars to have a total
        self.total = len(to_process_hashes)

        for shash in to_process_hashes:
            mol_query = dict(temp_query)
            mol_query["species_hash"] = shash
            molecules = list(self.molecules.query(criteria=mol_query))

            yield molecules

    def process_item(self, items: List[Dict]) -> List[Dict]:
        """
        Process the tasks into ForcesDocs

        Args:
            items List[Dict] : a list of MoleculeDocs in dict form

        Returns:
            [dict] : a list of new forces docs
        """

        mols = [MoleculeDoc(**item) for item in items]
        shash = mols[0].species_hash
        mol_ids = [m.molecule_id for m in mols]
        self.logger.debug(f"Processing {shash} : {mol_ids}")

        force_docs = list()

        for mol in mols:
            force_entries = [
                e
                for e in mol.entries
                if e["charge"] == mol.charge and e["task_type"] == "Force"
            ]

            # Organize by solvent environment
            by_solvent = defaultdict(list)
            for entry in force_entries:
                by_solvent[entry["solvent"]].append(entry)

            for solvent, entries in by_solvent.items():
                # No force calculations
                if len(entries) == 0:
                    continue
                else:
                    best = sorted(
                        entries,
                        key=lambda x: (
                            sum(evaluate_lot(x["level_of_theory"])),
                            x["energy"],
                        ),
                    )[0]
                    task = best["task_id"]

                    tdoc = self.tasks.query_one(
                        {
                            "task_id": task,
                            "species_hash": shash,
                            "orig": {"$exists": True},
                        }
                    )

                    if tdoc is None:
                        try:
                            tdoc = self.tasks.query_one(
                                {
                                    "task_id": int(task),
                                    "species_hash": shash,
                                    "orig": {"$exists": True},
                                }
                            )
                        except ValueError:
                            tdoc = None

                    if tdoc is None:
                        continue

                    task_doc = TaskDocument(**tdoc)

                    if task_doc is None:
                        continue

                    force_doc = ForcesDoc.from_task(
                        task_doc, molecule_id=mol.molecule_id, deprecated=False
                    )
                    force_docs.append(force_doc)

        self.logger.debug(f"Produced {len(force_docs)} force docs for {shash}")

        return jsanitize([doc.model_dump() for doc in force_docs], allow_bson=True)

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new force docs into the forces collection

        Args:
            items [[dict]]: A list of documents to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Add timestamp
        for item in docs:
            item.update(
                {
                    "_bt": self.timestamp,
                }
            )

        molecule_ids = list({item["molecule_id"] for item in docs})

        if len(items) > 0:
            self.logger.info(f"Updating {len(docs)} force documents")
            self.forces.remove_docs({self.forces.key: {"$in": molecule_ids}})
            self.forces.update(
                docs=docs,
                key=["molecule_id", "solvent"],
            )
        else:
            self.logger.info("No items to update")

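A note on the selection step shared by both new builders: for each solvent, entries are sorted by the tuple (sum(evaluate_lot(level_of_theory)), energy) and the first element is taken. evaluate_lot, imported from emmet.core.qchem.molecule, scores a level of theory such that lower values are better, so the sort prefers the highest level of theory and breaks ties with the lowest electronic energy. A standalone sketch of the same pattern with a stand-in scorer (the entries and scoring table below are hypothetical); TrajectoryBuilder, which follows, applies it to geometry optimization entries:

    # Stand-in for emmet's evaluate_lot: lower scores mean a better
    # functional / basis / solvation treatment.
    def fake_evaluate_lot(lot: str) -> tuple:
        ranking = {
            "wB97X-V/def2-TZVPPD/SMD": (0, 0, 0),
            "wB97X-D/def2-SVPD/PCM": (1, 2, 1),
        }
        return ranking[lot]

    entries = [
        {"task_id": "t-1", "level_of_theory": "wB97X-D/def2-SVPD/PCM", "energy": -76.40},
        {"task_id": "t-2", "level_of_theory": "wB97X-V/def2-TZVPPD/SMD", "energy": -76.38},
        {"task_id": "t-3", "level_of_theory": "wB97X-V/def2-TZVPPD/SMD", "energy": -76.41},
    ]

    # Highest level of theory wins first; electronic energy breaks ties.
    best = sorted(
        entries,
        key=lambda x: (sum(fake_evaluate_lot(x["level_of_theory"])), x["energy"]),
    )[0]
    print(best["task_id"])  # -> "t-3"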
class TrajectoryBuilder(Builder):
    """
    The TrajectoryBuilder extracts the highest-quality optimization trajectory data
    from a MoleculeDoc. In general, this means that the geometry optimization
    calculation(s) corresponding to the structure(s) that make up this molecule
    will be used to extract optimization trajectories.

    The process is as follows:
        1. Gather MoleculeDocs by species hash
        2. For each doc, sort tasks by solvent
        3. For each solvent, grab the best TaskDoc (geometry optimization
           calculation that has the highest level of theory with lowest
           electronic energy for the molecule)
        4. Convert TaskDoc to TrajectoryDoc
    """

    def __init__(
        self,
        tasks: Store,
        molecules: Store,
        trajectories: Store,
        query: Optional[Dict] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        self.tasks = tasks
        self.molecules = molecules
        self.trajectories = trajectories
        self.query = query if query else dict()
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks, molecules], targets=[trajectories], **kwargs)
        # Uncomment in case of issue with mrun not connecting automatically to collections
        # for i in [self.tasks, self.molecules, self.trajectories]:
        #     try:
        #         i.connect()
        #     except Exception as e:
        #         print("Could not connect,", e)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        self.tasks.ensure_index("task_id")
        self.tasks.ensure_index("last_updated")
        self.tasks.ensure_index("state")
        self.tasks.ensure_index("formula_alphabetical")
        self.tasks.ensure_index("species_hash")

        # Search index for molecules
        self.molecules.ensure_index("molecule_id")
        self.molecules.ensure_index("last_updated")
        self.molecules.ensure_index("task_ids")
        self.molecules.ensure_index("formula_alphabetical")
        self.molecules.ensure_index("species_hash")

        # Search index for geometry optimization trajectory properties
        self.trajectories.ensure_index("molecule_id")
        self.trajectories.ensure_index("task_id")
        self.trajectories.ensure_index("solvent")
        self.trajectories.ensure_index("lot_solvent")
        self.trajectories.ensure_index("property_id")
        self.trajectories.ensure_index("last_updated")
        self.trajectories.ensure_index("formula_alphabetical")

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """Prechunk the builder for distributed computation"""

        temp_query = dict(self.query)
        temp_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        all_mols = list(
            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
        )

        processed_docs = set([e for e in self.trajectories.distinct("molecule_id")])
        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
        to_process_hashes = {
            d["species_hash"]
            for d in all_mols
            if d[self.molecules.key] in to_process_docs
        }

        N = ceil(len(to_process_hashes) / number_splits)

        for hash_chunk in grouper(to_process_hashes, N):
            yield {"query": {"species_hash": {"$in": list(hash_chunk)}}}

    def get_items(self) -> Iterator[List[Dict]]:
        """
        Gets all items to process into trajectory documents.
        This does no datetime checking, relying instead on whether
        task_ids are included in the trajectories Store.

        Returns:
            generator or list of relevant tasks and molecules to process into documents
        """

        self.logger.info("Trajectories builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        # Get all processed molecules
        temp_query = dict(self.query)
        temp_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        all_mols = list(
            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
        )

        processed_docs = set([e for e in self.trajectories.distinct("molecule_id")])
        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
        to_process_hashes = {
            d["species_hash"]
            for d in all_mols
            if d[self.molecules.key] in to_process_docs
        }

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")

        # Set total for builder bars to have a total
        self.total = len(to_process_hashes)

        for shash in to_process_hashes:
            mol_query = dict(temp_query)
            mol_query["species_hash"] = shash
            molecules = list(self.molecules.query(criteria=mol_query))

            yield molecules

    def process_item(self, items: List[Dict]) -> List[Dict]:
        """
        Process the tasks into TrajectoryDocs

        Args:
            items List[Dict] : a list of MoleculeDocs in dict form

        Returns:
            [dict] : a list of new trajectory docs
        """

        mols = [MoleculeDoc(**item) for item in items]
        shash = mols[0].species_hash
        mol_ids = [m.molecule_id for m in mols]
        self.logger.debug(f"Processing {shash} : {mol_ids}")

        trajectory_docs = list()

        for mol in mols:
            entries = mol.best_entries

            # Organize by solvent environment
            by_solvent = defaultdict(list)
            for entry in entries.values():
                by_solvent[entry["solvent"]].append(entry)

            for solvent, entries in by_solvent.items():
                # No "best" entry - shouldn't happen, but just in case
                if len(entries) == 0:
                    continue
                else:
                    # In case there are multiple optimized structures with the same solvent but different LOT
                    best = sorted(
                        entries,
                        key=lambda x: (
                            sum(evaluate_lot(x["level_of_theory"])),
                            x["energy"],
                        ),
                    )[0]
                    task = best["task_id"]

                    tdoc = self.tasks.query_one(
                        {
                            "task_id": task,
                            "species_hash": shash,
                            "orig": {"$exists": True},
                        }
                    )

                    if tdoc is None:
                        try:
                            tdoc = self.tasks.query_one(
                                {
                                    "task_id": int(task),
                                    "species_hash": shash,
                                    "orig": {"$exists": True},
                                }
                            )
                        except ValueError:
                            tdoc = None

                    if tdoc is None:
                        continue

                    task_doc = TaskDocument(**tdoc)

                    if task_doc is None:
                        continue

                    trajectory_doc = TrajectoryDoc.from_task(
                        task_doc, molecule_id=mol.molecule_id, deprecated=False
                    )
                    trajectory_docs.append(trajectory_doc)

        self.logger.debug(
            f"Produced {len(trajectory_docs)} trajectory docs for {shash}"
        )

        return jsanitize([doc.model_dump() for doc in trajectory_docs], allow_bson=True)

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new trajectory docs into the trajectories collection

        Args:
            items [[dict]]: A list of documents to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Add timestamp
        for item in docs:
            item.update(
                {
                    "_bt": self.timestamp,
                }
            )

        molecule_ids = list({item["molecule_id"] for item in docs})

        if len(items) > 0:
            self.logger.info(f"Updating {len(docs)} trajectory documents")
            self.trajectories.remove_docs(
                {self.trajectories.key: {"$in": molecule_ids}}
            )
            self.trajectories.update(
                docs=docs,
                key=["molecule_id", "solvent"],
            )
        else:
            self.logger.info("No items to update")
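Finally, both new builders deduplicate on write: update_targets removes any existing documents for the affected molecule_ids and then upserts with the compound key ["molecule_id", "solvent"], leaving exactly one force or trajectory document per molecule-solvent pair. A minimal sketch of that remove-then-upsert pattern against a maggma MemoryStore (the data is made up):

    # Minimal demonstration of remove-then-upsert on a compound key.
    from maggma.stores import MemoryStore

    store = MemoryStore(collection_name="trajectories", key="molecule_id")
    store.connect()

    docs = [
        {"molecule_id": "mol-1", "solvent": "NONE", "energy": -1.0},
        {"molecule_id": "mol-1", "solvent": "SMD(WATER)", "energy": -1.1},
    ]

    # Drop stale documents for these molecules, then write the fresh ones.
    store.remove_docs({store.key: {"$in": ["mol-1"]}})
    store.update(docs=docs, key=["molecule_id", "solvent"])

    print(store.count())  # -> 2, one doc per (molecule_id, solvent) pair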