emmet-builders 0.84.2rc8__py3-none-any.whl → 0.84.2rc10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of emmet-builders might be problematic.

@@ -1,525 +0,0 @@
-from collections import defaultdict
-from datetime import datetime
-from itertools import chain
-from math import ceil
-from typing import Optional, Iterable, Iterator, List, Dict
-
-from maggma.builders import Builder
-from maggma.core import Store
-from maggma.utils import grouper
-
-from emmet.core.qchem.task import TaskDocument
-from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
-from emmet.core.molecules.trajectory import ForcesDoc, TrajectoryDoc
-from emmet.core.utils import jsanitize
-from emmet.builders.settings import EmmetBuildSettings
-
-
-__author__ = "Evan Spotte-Smith"
-
-SETTINGS = EmmetBuildSettings()
-
-
-class ForcesBuilder(Builder):
-    """
-    The ForcesBuilder extracts the highest-quality force data from a
-    MoleculeDoc (lowest electronic energy, highest level of theory for
-    each solvent available).
-
-    The process is as follows:
-        1. Gather MoleculeDocs by species hash
-        2. For each doc, sort tasks by solvent
-        3. For each solvent, grab the best TaskDoc (doc with force
-            information that has the highest level of theory with lowest
-            electronic energy for the molecule)
-        4. Convert TaskDoc to ForcesDoc
-
-    """
-
-    def __init__(
-        self,
-        tasks: Store,
-        molecules: Store,
-        forces: Store,
-        query: Optional[Dict] = None,
-        settings: Optional[EmmetBuildSettings] = None,
-        **kwargs,
-    ):
-        self.tasks = tasks
-        self.molecules = molecules
-        self.forces = forces
-        self.query = query if query else dict()
-        self.settings = EmmetBuildSettings.autoload(settings)
-        self.kwargs = kwargs
-
-        super().__init__(sources=[tasks, molecules], targets=[forces], **kwargs)
-        # Uncomment in case of issue with mrun not connecting automatically to collections
-        # for i in [self.tasks, self.molecules, self.forces]:
-        #     try:
-        #         i.connect()
-        #     except Exception as e:
-        #         print("Could not connect,", e)
-
-    def ensure_indexes(self):
-        """
-        Ensures indices on the collections needed for building
-        """
-
-        # Basic search index for tasks
-        self.tasks.ensure_index("task_id")
-        self.tasks.ensure_index("last_updated")
-        self.tasks.ensure_index("state")
-        self.tasks.ensure_index("formula_alphabetical")
-        self.tasks.ensure_index("species_hash")
-
-        # Search index for molecules
-        self.molecules.ensure_index("molecule_id")
-        self.molecules.ensure_index("last_updated")
-        self.molecules.ensure_index("task_ids")
-        self.molecules.ensure_index("formula_alphabetical")
-        self.molecules.ensure_index("species_hash")
-
-        # Search index for force properties
-        self.forces.ensure_index("molecule_id")
-        self.forces.ensure_index("task_id")
-        self.forces.ensure_index("solvent")
-        self.forces.ensure_index("lot_solvent")
-        self.forces.ensure_index("property_id")
-        self.forces.ensure_index("last_updated")
-        self.forces.ensure_index("formula_alphabetical")
-
-    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
-        """Prechunk the builder for distributed computation"""
-
-        temp_query = dict(self.query)
-        temp_query["deprecated"] = False
-
-        self.logger.info("Finding documents to process")
-        all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
-        )
-
-        processed_docs = set([e for e in self.forces.distinct("molecule_id")])
-        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
-            for d in all_mols
-            if d[self.molecules.key] in to_process_docs
-        }
-
-        N = ceil(len(to_process_hashes) / number_splits)
-
-        for hash_chunk in grouper(to_process_hashes, N):
-            query = dict(temp_query)
-            query["species_hash"] = {"$in": list(hash_chunk)}
-            yield {"query": query}
-
-    def get_items(self) -> Iterator[List[Dict]]:
-        """
-        Gets all items to process into force documents.
-        This does no datetime checking; relying on whether
-        task_ids are included in the forces Store
-
-        Returns:
-            generator or list of relevant tasks and molecules to process into documents
-        """
-
-        self.logger.info("Forces builder started")
-        self.logger.info("Setting indexes")
-        self.ensure_indexes()
-
-        # Save timestamp to mark buildtime
-        self.timestamp = datetime.utcnow()
-
-        # Get all processed molecules
-        temp_query = dict(self.query)
-        temp_query["deprecated"] = False
-
-        self.logger.info("Finding documents to process")
-        all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
-        )
-
-        processed_docs = set([e for e in self.forces.distinct("molecule_id")])
-        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
-            for d in all_mols
-            if d[self.molecules.key] in to_process_docs
-        }
-
-        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
-
-        # Set total for builder bars to have a total
-        self.total = len(to_process_hashes)
-
-        for shash in to_process_hashes:
-            mol_query = dict(temp_query)
-            mol_query["species_hash"] = shash
-            molecules = list(self.molecules.query(criteria=mol_query))
-
-            yield molecules
-
-    def process_item(self, items: List[Dict]) -> List[Dict]:
-        """
-        Process the tasks into ForcesDoc
-
-        Args:
-            items List[Dict] : a list of MoleculeDocs in dict form
-
-        Returns:
-            [dict] : a list of new forces docs
-        """
-
-        mols = [MoleculeDoc(**item) for item in items]
-        shash = mols[0].species_hash
-        mol_ids = [m.molecule_id for m in mols]
-        self.logger.debug(f"Processing {shash} : {mol_ids}")
-
-        force_docs = list()
-
-        for mol in mols:
-            force_entries = [
-                e
-                for e in mol.entries
-                if e["charge"] == mol.charge and e["task_type"] == "Force"
-            ]
-
-            # Organize by solvent environment
-            by_solvent = defaultdict(list)
-            for entry in force_entries:
-                by_solvent[entry["solvent"]].append(entry)
-
-            for solvent, entries in by_solvent.items():
-                # No force calculations
-                if len(entries) == 0:
-                    continue
-                else:
-                    best = sorted(
-                        entries,
-                        key=lambda x: (
-                            sum(evaluate_lot(x["level_of_theory"])),
-                            x["energy"],
-                        ),
-                    )[0]
-                    task = best["task_id"]
-
-                    tdoc = self.tasks.query_one(
-                        {
-                            "task_id": task,
-                            "species_hash": shash,
-                            "orig": {"$exists": True},
-                        }
-                    )
-
-                    if tdoc is None:
-                        try:
-                            tdoc = self.tasks.query_one(
-                                {
-                                    "task_id": int(task),
-                                    "species_hash": shash,
-                                    "orig": {"$exists": True},
-                                }
-                            )
-                        except ValueError:
-                            tdoc = None
-
-                    if tdoc is None:
-                        continue
-
-                    task_doc = TaskDocument(**tdoc)
-
-                    if task_doc is None:
-                        continue
-
-                    force_doc = ForcesDoc.from_task(
-                        task_doc, molecule_id=mol.molecule_id, deprecated=False
-                    )
-                    force_docs.append(force_doc)
-
-        self.logger.debug(f"Produced {len(force_docs)} force docs for {shash}")
-
-        return jsanitize([doc.model_dump() for doc in force_docs], allow_bson=True)
-
-    def update_targets(self, items: List[List[Dict]]):
-        """
-        Inserts the new force docs into the forces collection
-
-        Args:
-            items [[dict]]: A list of documents to update
-        """
-
-        docs = list(chain.from_iterable(items))  # type: ignore
-
-        # Add timestamp
-        for item in docs:
-            item.update(
-                {
-                    "_bt": self.timestamp,
-                }
-            )
-
-        molecule_ids = list({item["molecule_id"] for item in docs})
-
-        if len(items) > 0:
-            self.logger.info(f"Updating {len(docs)} force documents")
-            self.forces.remove_docs({self.forces.key: {"$in": molecule_ids}})
-            self.forces.update(
-                docs=docs,
-                key=["molecule_id", "solvent"],
-            )
-        else:
-            self.logger.info("No items to update")
-
-
-class TrajectoryBuilder(Builder):
-    """
-    The TrajectoryBuilder extracts the highest-quality optimization trajectory data from a
-    MoleculeDoc. In general, this will mean that the geometry optimization calculation(s)
-    corresponding to the structure(s) that make up this molecule will be used to extract
-    optimization trajectories.
-
-    The process is as follows:
-        1. Gather MoleculeDocs by species hash
-        2. For each doc, sort tasks by solvent
-        3. For each solvent, grab the best TaskDoc (geometry optimization calculation
-            that has the highest level of theory with lowest
-            electronic energy for the molecule)
-        4. Convert TaskDoc to TrajectoryDoc
-    """
-
-    def __init__(
-        self,
-        tasks: Store,
-        molecules: Store,
-        trajectories: Store,
-        query: Optional[Dict] = None,
-        settings: Optional[EmmetBuildSettings] = None,
-        **kwargs,
-    ):
-        self.tasks = tasks
-        self.molecules = molecules
-        self.trajectories = trajectories
-        self.query = query if query else dict()
-        self.settings = EmmetBuildSettings.autoload(settings)
-        self.kwargs = kwargs
-
-        super().__init__(sources=[tasks, molecules], targets=[trajectories], **kwargs)
-        # Uncomment in case of issue with mrun not connecting automatically to collections
-        # for i in [self.tasks, self.molecules, self.trajectories]:
-        #     try:
-        #         i.connect()
-        #     except Exception as e:
-        #         print("Could not connect,", e)
-
-    def ensure_indexes(self):
-        """
-        Ensures indices on the collections needed for building
-        """
-
-        # Basic search index for tasks
-        self.tasks.ensure_index("task_id")
-        self.tasks.ensure_index("last_updated")
-        self.tasks.ensure_index("state")
-        self.tasks.ensure_index("formula_alphabetical")
-        self.tasks.ensure_index("species_hash")
-
-        # Search index for molecules
-        self.molecules.ensure_index("molecule_id")
-        self.molecules.ensure_index("last_updated")
-        self.molecules.ensure_index("task_ids")
-        self.molecules.ensure_index("formula_alphabetical")
-        self.molecules.ensure_index("species_hash")
-
-        # Search index for geometry optimization trajectory properties
-        self.trajectories.ensure_index("molecule_id")
-        self.trajectories.ensure_index("task_id")
-        self.trajectories.ensure_index("solvent")
-        self.trajectories.ensure_index("lot_solvent")
-        self.trajectories.ensure_index("property_id")
-        self.trajectories.ensure_index("last_updated")
-        self.trajectories.ensure_index("formula_alphabetical")
-
-    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
-        """Prechunk the builder for distributed computation"""
-
-        temp_query = dict(self.query)
-        temp_query["deprecated"] = False
-
-        self.logger.info("Finding documents to process")
-        all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
-        )
-
-        processed_docs = set([e for e in self.trajectories.distinct("molecule_id")])
-        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
-            for d in all_mols
-            if d[self.molecules.key] in to_process_docs
-        }
-
-        N = ceil(len(to_process_hashes) / number_splits)
-
-        for hash_chunk in grouper(to_process_hashes, N):
-            yield {"query": {"species_hash": {"$in": list(hash_chunk)}}}
-
-    def get_items(self) -> Iterator[List[Dict]]:
-        """
-        Gets all items to process into trajectory documents.
-        This does no datetime checking; relying on whether
-        task_ids are included in the trajectories Store
-
-        Returns:
-            generator or list of relevant tasks and molecules to process into documents
-        """
-
-        self.logger.info("Trajectories builder started")
-        self.logger.info("Setting indexes")
-        self.ensure_indexes()
-
-        # Save timestamp to mark buildtime
-        self.timestamp = datetime.utcnow()
-
-        # Get all processed molecules
-        temp_query = dict(self.query)
-        temp_query["deprecated"] = False
-
-        self.logger.info("Finding documents to process")
-        all_mols = list(
-            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
-        )
-
-        processed_docs = set([e for e in self.trajectories.distinct("molecule_id")])
-        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-        to_process_hashes = {
-            d["species_hash"]
-            for d in all_mols
-            if d[self.molecules.key] in to_process_docs
-        }
-
-        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
-
-        # Set total for builder bars to have a total
-        self.total = len(to_process_hashes)
-
-        for shash in to_process_hashes:
-            mol_query = dict(temp_query)
-            mol_query["species_hash"] = shash
-            molecules = list(self.molecules.query(criteria=mol_query))
-
-            yield molecules
-
-    def process_item(self, items: List[Dict]) -> List[Dict]:
-        """
-        Process the tasks into TrajectoryDocs
-
-        Args:
-            items List[Dict] : a list of MoleculeDocs in dict form
-
-        Returns:
-            [dict] : a list of new trajectory docs
-        """
-
-        mols = [MoleculeDoc(**item) for item in items]
-        shash = mols[0].species_hash
-        mol_ids = [m.molecule_id for m in mols]
-        self.logger.debug(f"Processing {shash} : {mol_ids}")
-
-        trajectory_docs = list()
-
-        for mol in mols:
-            entries = mol.best_entries
-
-            # Organize by solvent environment
-            by_solvent = defaultdict(list)
-            for entry in entries.values():
-                by_solvent[entry["solvent"]].append(entry)
-
-            for solvent, entries in by_solvent.items():
-                # No "best" entry - shouldn't happen, but just in case
-                if len(entries) == 0:
-                    continue
-                else:
-                    # In case there are multiple optimized structures with the same solvent but different LOT
-                    best = sorted(
-                        entries,
-                        key=lambda x: (
-                            sum(evaluate_lot(x["level_of_theory"])),
-                            x["energy"],
-                        ),
-                    )[0]
-                    task = best["task_id"]
-
-                    tdoc = self.tasks.query_one(
-                        {
-                            "task_id": task,
-                            "species_hash": shash,
-                            "orig": {"$exists": True},
-                        }
-                    )
-
-                    if tdoc is None:
-                        try:
-                            tdoc = self.tasks.query_one(
-                                {
-                                    "task_id": int(task),
-                                    "species_hash": shash,
-                                    "orig": {"$exists": True},
-                                }
-                            )
-                        except ValueError:
-                            tdoc = None
-
-                    if tdoc is None:
-                        continue
-
-                    task_doc = TaskDocument(**tdoc)
-
-                    if task_doc is None:
-                        continue
-
-                    trajectory_doc = TrajectoryDoc.from_task(
-                        task_doc, molecule_id=mol.molecule_id, deprecated=False
-                    )
-                    trajectory_docs.append(trajectory_doc)
-
-        self.logger.debug(
-            f"Produced {len(trajectory_docs)} trajectory docs for {shash}"
-        )
-
-        return jsanitize([doc.model_dump() for doc in trajectory_docs], allow_bson=True)
-
-    def update_targets(self, items: List[List[Dict]]):
-        """
-        Inserts the new trajectory docs into the trajectories collection
-
-        Args:
-            items [[dict]]: A list of documents to update
-        """
-
-        docs = list(chain.from_iterable(items))  # type: ignore
-
-        # Add timestamp
-        for item in docs:
-            item.update(
-                {
-                    "_bt": self.timestamp,
-                }
-            )
-
-        molecule_ids = list({item["molecule_id"] for item in docs})
-
-        if len(items) > 0:
-            self.logger.info(f"Updating {len(docs)} trajectory documents")
-            self.trajectories.remove_docs(
-                {self.trajectories.key: {"$in": molecule_ids}}
-            )
-            self.trajectories.update(
-                docs=docs,
-                key=["molecule_id", "solvent"],
-            )
-        else:
-            self.logger.info("No items to update")
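
For reference, the core step both removed builders share is the per-solvent "best entry" selection in process_item: group a molecule's entries by solvent, then keep the entry with the lowest summed level-of-theory score, breaking ties by lowest electronic energy. The sketch below reproduces that selection with plain dicts; the evaluate_lot stand-in and the level-of-theory strings are illustrative assumptions, not the real emmet implementations.

from collections import defaultdict
from typing import Dict, List, Tuple

# Hypothetical stand-in for emmet.core.qchem.molecule.evaluate_lot: it maps a
# level-of-theory string to a tuple of scores where lower means better. The
# real scoring in emmet is more detailed; these values are only for the demo.
def evaluate_lot(level_of_theory: str) -> Tuple[int, ...]:
    ranking = {
        "wB97M-V/def2-QZVPPD/SMD": (0,),
        "wB97X-D/def2-TZVPPD/SMD": (1,),
    }
    return ranking.get(level_of_theory, (2,))

def best_entry_per_solvent(entries: List[Dict]) -> Dict[str, Dict]:
    """For each solvent, keep the entry with the lowest summed LOT score,
    breaking ties by the lowest electronic energy (as in process_item)."""
    by_solvent = defaultdict(list)
    for entry in entries:
        by_solvent[entry["solvent"]].append(entry)
    return {
        solvent: min(
            group,
            key=lambda x: (sum(evaluate_lot(x["level_of_theory"])), x["energy"]),
        )
        for solvent, group in by_solvent.items()
    }

if __name__ == "__main__":
    # Toy entries shaped like the dicts the builders read from MoleculeDoc.entries
    demo = [
        {"task_id": "a", "solvent": "SMD(water)",
         "level_of_theory": "wB97X-D/def2-TZVPPD/SMD", "energy": -76.41},
        {"task_id": "b", "solvent": "SMD(water)",
         "level_of_theory": "wB97M-V/def2-QZVPPD/SMD", "energy": -76.40},
        {"task_id": "c", "solvent": "VACUUM",
         "level_of_theory": "wB97X-D/def2-TZVPPD/SMD", "energy": -76.39},
    ]
    for solvent, entry in best_entry_per_solvent(demo).items():
        print(solvent, "->", entry["task_id"])  # SMD(water) -> b, VACUUM -> c

The same (LOT score, energy) sort key appears in both builders; the per-solvent winner is the task whose TaskDocument is then converted into a ForcesDoc or TrajectoryDoc.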