emmet-builders 0.84.2rc6__py3-none-any.whl → 0.84.2rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of emmet-builders might be problematic. Click here for more details.

@@ -84,7 +84,7 @@ class ThermoBuilder(Builder):
84
84
  single-point energy corrections.
85
85
 
86
86
  Before any documents are constructed, the following steps are taken:
87
- 1. Gather MoleculeDocs by formula
87
+ 1. Gather MoleculeDocs by species hash
88
88
  2. For each doc, identify tasks with thermodynamic information such as
89
89
  zero-point energy, enthalpy, and entropy. Collect these "documents
90
90
  including complete thermodynamics" (DICTs).
@@ -148,12 +148,14 @@ class ThermoBuilder(Builder):
148
148
  self.tasks.ensure_index("last_updated")
149
149
  self.tasks.ensure_index("state")
150
150
  self.tasks.ensure_index("formula_alphabetical")
151
+ self.tasks.ensure_index("species_hash")
151
152
 
152
153
  # Search index for molecules
153
154
  self.molecules.ensure_index("molecule_id")
154
155
  self.molecules.ensure_index("last_updated")
155
156
  self.molecules.ensure_index("task_ids")
156
157
  self.molecules.ensure_index("formula_alphabetical")
158
+ self.molecules.ensure_index("species_hash")
157
159
 
158
160
  # Search index for thermo
159
161
  self.thermo.ensure_index("molecule_id")
@@ -172,23 +174,23 @@ class ThermoBuilder(Builder):
172
174
 
173
175
  self.logger.info("Finding documents to process")
174
176
  all_mols = list(
175
- self.molecules.query(
176
- temp_query, [self.molecules.key, "formula_alphabetical"]
177
- )
177
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
178
178
  )
179
179
 
180
180
  processed_docs = set([e for e in self.thermo.distinct("molecule_id")])
181
181
  to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
182
- to_process_forms = {
183
- d["formula_alphabetical"]
182
+ to_process_hashes = {
183
+ d["species_hash"]
184
184
  for d in all_mols
185
185
  if d[self.molecules.key] in to_process_docs
186
186
  }
187
187
 
188
- N = ceil(len(to_process_forms) / number_splits)
188
+ N = ceil(len(to_process_hashes) / number_splits)
189
189
 
190
- for formula_chunk in grouper(to_process_forms, N):
191
- yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
190
+ for hash_chunk in grouper(to_process_hashes, N):
191
+ query = dict(temp_query)
192
+ query["species_hash"] = {"$in": list(hash_chunk)}
193
+ yield {"query": query}
192
194
 
193
195
  def get_items(self) -> Iterator[List[Dict]]:
194
196
  """
@@ -213,28 +215,26 @@ class ThermoBuilder(Builder):
213
215
 
214
216
  self.logger.info("Finding documents to process")
215
217
  all_mols = list(
216
- self.molecules.query(
217
- temp_query, [self.molecules.key, "formula_alphabetical"]
218
- )
218
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
219
219
  )
220
220
 
221
221
  processed_docs = set([e for e in self.thermo.distinct("molecule_id")])
222
222
  to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
223
- to_process_forms = {
224
- d["formula_alphabetical"]
223
+ to_process_hashes = {
224
+ d["species_hash"]
225
225
  for d in all_mols
226
226
  if d[self.molecules.key] in to_process_docs
227
227
  }
228
228
 
229
229
  self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
230
- self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
230
+ self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
231
231
 
232
232
  # Set total for builder bars to have a total
233
- self.total = len(to_process_forms)
233
+ self.total = len(to_process_hashes)
234
234
 
235
- for formula in to_process_forms:
235
+ for shash in to_process_hashes:
236
236
  mol_query = dict(temp_query)
237
- mol_query["formula_alphabetical"] = formula
237
+ mol_query["species_hash"] = shash
238
238
  molecules = list(self.molecules.query(criteria=mol_query))
239
239
 
240
240
  yield molecules
@@ -273,9 +273,9 @@ class ThermoBuilder(Builder):
273
273
  return doc
274
274
 
275
275
  mols = [MoleculeDoc(**item) for item in items]
276
- formula = mols[0].formula_alphabetical
276
+ shash = mols[0].species_hash
277
277
  mol_ids = [m.molecule_id for m in mols]
278
- self.logger.debug(f"Processing {formula} : {mol_ids}")
278
+ self.logger.debug(f"Processing {shash} : {mol_ids}")
279
279
 
280
280
  thermo_docs = list()
281
281
 
@@ -334,7 +334,7 @@ class ThermoBuilder(Builder):
334
334
  tdoc = self.tasks.query_one(
335
335
  {
336
336
  "task_id": task,
337
- "formula_alphabetical": formula,
337
+ "species_hash": shash,
338
338
  "orig": {"$exists": True},
339
339
  }
340
340
  )
@@ -344,7 +344,7 @@ class ThermoBuilder(Builder):
344
344
  tdoc = self.tasks.query_one(
345
345
  {
346
346
  "task_id": int(task),
347
- "formula_alphabetical": formula,
347
+ "species_hash": shash,
348
348
  "orig": {"$exists": True},
349
349
  }
350
350
  )
@@ -465,7 +465,7 @@ class ThermoBuilder(Builder):
465
465
  sorted(with_eval_e, key=lambda x: (x[1], x[2]))[0][0]
466
466
  )
467
467
 
468
- self.logger.debug(f"Produced {len(thermo_docs)} thermo docs for {formula}")
468
+ self.logger.debug(f"Produced {len(thermo_docs)} thermo docs for {shash}")
469
469
 
470
470
  return jsanitize([doc.model_dump() for doc in thermo_docs], allow_bson=True)
471
471
 
@@ -0,0 +1,525 @@
1
+ from collections import defaultdict
2
+ from datetime import datetime
3
+ from itertools import chain
4
+ from math import ceil
5
+ from typing import Optional, Iterable, Iterator, List, Dict
6
+
7
+ from maggma.builders import Builder
8
+ from maggma.core import Store
9
+ from maggma.utils import grouper
10
+
11
+ from emmet.core.qchem.task import TaskDocument
12
+ from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
13
+ from emmet.core.molecules.trajectory import ForcesDoc, TrajectoryDoc
14
+ from emmet.core.utils import jsanitize
15
+ from emmet.builders.settings import EmmetBuildSettings
16
+
17
+
18
+ __author__ = "Evan Spotte-Smith"
19
+
20
+ SETTINGS = EmmetBuildSettings()
21
+
22
+
23
+ class ForcesBuilder(Builder):
24
+ """
25
+ The ForcesBuilder extracts the highest-quality force data from a
26
+ MoleculeDoc (lowest electronic energy, highest level of theory for
27
+ each solvent available).
28
+
29
+ The process is as follows:
30
+ 1. Gather MoleculeDocs by species hash
31
+ 2. For each doc, sort tasks by solvent
32
+ 3. For each solvent, grab the best TaskDoc (doc with force
33
+ information that has the highest level of theory with lowest
34
+ electronic energy for the molecule)
35
+ 4. Convert TaskDoc to ForcesDoc
36
+
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ tasks: Store,
42
+ molecules: Store,
43
+ forces: Store,
44
+ query: Optional[Dict] = None,
45
+ settings: Optional[EmmetBuildSettings] = None,
46
+ **kwargs,
47
+ ):
48
+ self.tasks = tasks
49
+ self.molecules = molecules
50
+ self.forces = forces
51
+ self.query = query if query else dict()
52
+ self.settings = EmmetBuildSettings.autoload(settings)
53
+ self.kwargs = kwargs
54
+
55
+ super().__init__(sources=[tasks, molecules], targets=[forces], **kwargs)
56
+ # Uncomment in case of issue with mrun not connecting automatically to collections
57
+ # for i in [self.tasks, self.molecules, self.forces]:
58
+ # try:
59
+ # i.connect()
60
+ # except Exception as e:
61
+ # print("Could not connect,", e)
62
+
63
+ def ensure_indexes(self):
64
+ """
65
+ Ensures indices on the collections needed for building
66
+ """
67
+
68
+ # Basic search index for tasks
69
+ self.tasks.ensure_index("task_id")
70
+ self.tasks.ensure_index("last_updated")
71
+ self.tasks.ensure_index("state")
72
+ self.tasks.ensure_index("formula_alphabetical")
73
+ self.tasks.ensure_index("species_hash")
74
+
75
+ # Search index for molecules
76
+ self.molecules.ensure_index("molecule_id")
77
+ self.molecules.ensure_index("last_updated")
78
+ self.molecules.ensure_index("task_ids")
79
+ self.molecules.ensure_index("formula_alphabetical")
80
+ self.molecules.ensure_index("species_hash")
81
+
82
+ # Search index for force properties
83
+ self.forces.ensure_index("molecule_id")
84
+ self.forces.ensure_index("task_id")
85
+ self.forces.ensure_index("solvent")
86
+ self.forces.ensure_index("lot_solvent")
87
+ self.forces.ensure_index("property_id")
88
+ self.forces.ensure_index("last_updated")
89
+ self.forces.ensure_index("formula_alphabetical")
90
+
91
+ def prechunk(self, number_splits: int) -> Iterable[Dict]: # pragma: no cover
92
+ """Prechunk the builder for distributed computation"""
93
+
94
+ temp_query = dict(self.query)
95
+ temp_query["deprecated"] = False
96
+
97
+ self.logger.info("Finding documents to process")
98
+ all_mols = list(
99
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
100
+ )
101
+
102
+ processed_docs = set([e for e in self.forces.distinct("molecule_id")])
103
+ to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
104
+ to_process_hashes = {
105
+ d["species_hash"]
106
+ for d in all_mols
107
+ if d[self.molecules.key] in to_process_docs
108
+ }
109
+
110
+ N = ceil(len(to_process_hashes) / number_splits)
111
+
112
+ for hash_chunk in grouper(to_process_hashes, N):
113
+ query = dict(temp_query)
114
+ query["species_hash"] = {"$in": list(hash_chunk)}
115
+ yield {"query": query}
116
+
117
+ def get_items(self) -> Iterator[List[Dict]]:
118
+ """
119
+ Gets all items to process into force documents.
120
+ This does no datetime checking; relying on whether
121
+ task_ids are included in the forces Store
122
+
123
+ Returns:
124
+ generator or list of relevant tasks and molecules to process into documents
125
+ """
126
+
127
+ self.logger.info("Forces builder started")
128
+ self.logger.info("Setting indexes")
129
+ self.ensure_indexes()
130
+
131
+ # Save timestamp to mark buildtime
132
+ self.timestamp = datetime.utcnow()
133
+
134
+ # Get all processed molecules
135
+ temp_query = dict(self.query)
136
+ temp_query["deprecated"] = False
137
+
138
+ self.logger.info("Finding documents to process")
139
+ all_mols = list(
140
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
141
+ )
142
+
143
+ processed_docs = set([e for e in self.forces.distinct("molecule_id")])
144
+ to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
145
+ to_process_hashes = {
146
+ d["species_hash"]
147
+ for d in all_mols
148
+ if d[self.molecules.key] in to_process_docs
149
+ }
150
+
151
+ self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
152
+ self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
153
+
154
+ # Set total for builder bars to have a total
155
+ self.total = len(to_process_hashes)
156
+
157
+ for shash in to_process_hashes:
158
+ mol_query = dict(temp_query)
159
+ mol_query["species_hash"] = shash
160
+ molecules = list(self.molecules.query(criteria=mol_query))
161
+
162
+ yield molecules
163
+
164
+ def process_item(self, items: List[Dict]) -> List[Dict]:
165
+ """
166
+ Process the tasks into ForcesDoc
167
+
168
+ Args:
169
+ items List[Dict] : a list of MoleculeDocs in dict form
170
+
171
+ Returns:
172
+ [dict] : a list of new forces docs
173
+ """
174
+
175
+ mols = [MoleculeDoc(**item) for item in items]
176
+ shash = mols[0].species_hash
177
+ mol_ids = [m.molecule_id for m in mols]
178
+ self.logger.debug(f"Processing {shash} : {mol_ids}")
179
+
180
+ force_docs = list()
181
+
182
+ for mol in mols:
183
+ force_entries = [
184
+ e
185
+ for e in mol.entries
186
+ if e["charge"] == mol.charge and e["task_type"] == "Force"
187
+ ]
188
+
189
+ # Organize by solvent environment
190
+ by_solvent = defaultdict(list)
191
+ for entry in force_entries:
192
+ by_solvent[entry["solvent"]].append(entry)
193
+
194
+ for solvent, entries in by_solvent.items():
195
+ # No force calculations
196
+ if len(entries) == 0:
197
+ continue
198
+ else:
199
+ best = sorted(
200
+ entries,
201
+ key=lambda x: (
202
+ sum(evaluate_lot(x["level_of_theory"])),
203
+ x["energy"],
204
+ ),
205
+ )[0]
206
+ task = best["task_id"]
207
+
208
+ tdoc = self.tasks.query_one(
209
+ {
210
+ "task_id": task,
211
+ "species_hash": shash,
212
+ "orig": {"$exists": True},
213
+ }
214
+ )
215
+
216
+ if tdoc is None:
217
+ try:
218
+ tdoc = self.tasks.query_one(
219
+ {
220
+ "task_id": int(task),
221
+ "species_hash": shash,
222
+ "orig": {"$exists": True},
223
+ }
224
+ )
225
+ except ValueError:
226
+ tdoc = None
227
+
228
+ if tdoc is None:
229
+ continue
230
+
231
+ task_doc = TaskDocument(**tdoc)
232
+
233
+ if task_doc is None:
234
+ continue
235
+
236
+ force_doc = ForcesDoc.from_task(
237
+ task_doc, molecule_id=mol.molecule_id, deprecated=False
238
+ )
239
+ force_docs.append(force_doc)
240
+
241
+ self.logger.debug(f"Produced {len(force_docs)} force docs for {shash}")
242
+
243
+ return jsanitize([doc.model_dump() for doc in force_docs], allow_bson=True)
244
+
245
+ def update_targets(self, items: List[List[Dict]]):
246
+ """
247
+ Inserts the new force docs into the forces collection
248
+
249
+ Args:
250
+ items [[dict]]: A list of documents to update
251
+ """
252
+
253
+ docs = list(chain.from_iterable(items)) # type: ignore
254
+
255
+ # Add timestamp
256
+ for item in docs:
257
+ item.update(
258
+ {
259
+ "_bt": self.timestamp,
260
+ }
261
+ )
262
+
263
+ molecule_ids = list({item["molecule_id"] for item in docs})
264
+
265
+ if len(items) > 0:
266
+ self.logger.info(f"Updating {len(docs)} force documents")
267
+ self.forces.remove_docs({self.forces.key: {"$in": molecule_ids}})
268
+ self.forces.update(
269
+ docs=docs,
270
+ key=["molecule_id", "solvent"],
271
+ )
272
+ else:
273
+ self.logger.info("No items to update")
274
+
275
+
276
+ class TrajectoryBuilder(Builder):
277
+ """
278
+ The TrajectoryBuilder extracts the highest-quality optimization trajectory data from a
279
+ MoleculeDoc. In general, this will mean that the geometry optimization calculation(s)
280
+ corresponding to the structure(s) that make up this molecule will be used to extract
281
+ optimization trajectories.
282
+
283
+ The process is as follows:
284
+ 1. Gather MoleculeDocs by species hash
285
+ 2. For each doc, sort tasks by solvent
286
+ 3. For each solvent, grab the best TaskDoc (geometry optimization calculation
287
+ that has the highest level of theory with lowest
288
+ electronic energy for the molecule)
289
+ 4. Convert TaskDoc to TrajectoryDoc
290
+ """
291
+
292
+ def __init__(
293
+ self,
294
+ tasks: Store,
295
+ molecules: Store,
296
+ trajectories: Store,
297
+ query: Optional[Dict] = None,
298
+ settings: Optional[EmmetBuildSettings] = None,
299
+ **kwargs,
300
+ ):
301
+ self.tasks = tasks
302
+ self.molecules = molecules
303
+ self.trajectories = trajectories
304
+ self.query = query if query else dict()
305
+ self.settings = EmmetBuildSettings.autoload(settings)
306
+ self.kwargs = kwargs
307
+
308
+ super().__init__(sources=[tasks, molecules], targets=[trajectories], **kwargs)
309
+ # Uncomment in case of issue with mrun not connecting automatically to collections
310
+ # for i in [self.tasks, self.molecules, self.trajectories]:
311
+ # try:
312
+ # i.connect()
313
+ # except Exception as e:
314
+ # print("Could not connect,", e)
315
+
316
+ def ensure_indexes(self):
317
+ """
318
+ Ensures indices on the collections needed for building
319
+ """
320
+
321
+ # Basic search index for tasks
322
+ self.tasks.ensure_index("task_id")
323
+ self.tasks.ensure_index("last_updated")
324
+ self.tasks.ensure_index("state")
325
+ self.tasks.ensure_index("formula_alphabetical")
326
+ self.tasks.ensure_index("species_hash")
327
+
328
+ # Search index for molecules
329
+ self.molecules.ensure_index("molecule_id")
330
+ self.molecules.ensure_index("last_updated")
331
+ self.molecules.ensure_index("task_ids")
332
+ self.molecules.ensure_index("formula_alphabetical")
333
+ self.molecules.ensure_index("species_hash")
334
+
335
+ # Search index for geometry optimization trajectory properties
336
+ self.trajectories.ensure_index("molecule_id")
337
+ self.trajectories.ensure_index("task_id")
338
+ self.trajectories.ensure_index("solvent")
339
+ self.trajectories.ensure_index("lot_solvent")
340
+ self.trajectories.ensure_index("property_id")
341
+ self.trajectories.ensure_index("last_updated")
342
+ self.trajectories.ensure_index("formula_alphabetical")
343
+
344
+ def prechunk(self, number_splits: int) -> Iterable[Dict]: # pragma: no cover
345
+ """Prechunk the builder for distributed computation"""
346
+
347
+ temp_query = dict(self.query)
348
+ temp_query["deprecated"] = False
349
+
350
+ self.logger.info("Finding documents to process")
351
+ all_mols = list(
352
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
353
+ )
354
+
355
+ processed_docs = set([e for e in self.trajectories.distinct("molecule_id")])
356
+ to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
357
+ to_process_hashes = {
358
+ d["species_hash"]
359
+ for d in all_mols
360
+ if d[self.molecules.key] in to_process_docs
361
+ }
362
+
363
+ N = ceil(len(to_process_hashes) / number_splits)
364
+
365
+ for hash_chunk in grouper(to_process_hashes, N):
366
+ yield {"query": {"species_hash": {"$in": list(hash_chunk)}}}
367
+
368
+ def get_items(self) -> Iterator[List[Dict]]:
369
+ """
370
+ Gets all items to process into trajectory documents.
371
+ This does no datetime checking; relying on whether
372
+ task_ids are included in the trajectories Store
373
+
374
+ Returns:
375
+ generator or list of relevant tasks and molecules to process into documents
376
+ """
377
+
378
+ self.logger.info("Trajectories builder started")
379
+ self.logger.info("Setting indexes")
380
+ self.ensure_indexes()
381
+
382
+ # Save timestamp to mark buildtime
383
+ self.timestamp = datetime.utcnow()
384
+
385
+ # Get all processed molecules
386
+ temp_query = dict(self.query)
387
+ temp_query["deprecated"] = False
388
+
389
+ self.logger.info("Finding documents to process")
390
+ all_mols = list(
391
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
392
+ )
393
+
394
+ processed_docs = set([e for e in self.trajectories.distinct("molecule_id")])
395
+ to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
396
+ to_process_hashes = {
397
+ d["species_hash"]
398
+ for d in all_mols
399
+ if d[self.molecules.key] in to_process_docs
400
+ }
401
+
402
+ self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
403
+ self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
404
+
405
+ # Set total for builder bars to have a total
406
+ self.total = len(to_process_hashes)
407
+
408
+ for shash in to_process_hashes:
409
+ mol_query = dict(temp_query)
410
+ mol_query["species_hash"] = shash
411
+ molecules = list(self.molecules.query(criteria=mol_query))
412
+
413
+ yield molecules
414
+
415
+ def process_item(self, items: List[Dict]) -> List[Dict]:
416
+ """
417
+ Process the tasks into TrajectoryDocs
418
+
419
+ Args:
420
+ items List[Dict] : a list of MoleculeDocs in dict form
421
+
422
+ Returns:
423
+ [dict] : a list of new trajectory docs
424
+ """
425
+
426
+ mols = [MoleculeDoc(**item) for item in items]
427
+ shash = mols[0].species_hash
428
+ mol_ids = [m.molecule_id for m in mols]
429
+ self.logger.debug(f"Processing {shash} : {mol_ids}")
430
+
431
+ trajectory_docs = list()
432
+
433
+ for mol in mols:
434
+ entries = mol.best_entries
435
+
436
+ # Organize by solvent environment
437
+ by_solvent = defaultdict(list)
438
+ for entry in entries.values():
439
+ by_solvent[entry["solvent"]].append(entry)
440
+
441
+ for solvent, entries in by_solvent.items():
442
+ # No "best" entry - shouldn't happen, but just in case
443
+ if len(entries) == 0:
444
+ continue
445
+ else:
446
+ # In case there are multiple optimized structures with the same solvent but different LOT
447
+ best = sorted(
448
+ entries,
449
+ key=lambda x: (
450
+ sum(evaluate_lot(x["level_of_theory"])),
451
+ x["energy"],
452
+ ),
453
+ )[0]
454
+ task = best["task_id"]
455
+
456
+ tdoc = self.tasks.query_one(
457
+ {
458
+ "task_id": task,
459
+ "species_hash": shash,
460
+ "orig": {"$exists": True},
461
+ }
462
+ )
463
+
464
+ if tdoc is None:
465
+ try:
466
+ tdoc = self.tasks.query_one(
467
+ {
468
+ "task_id": int(task),
469
+ "species_hash": shash,
470
+ "orig": {"$exists": True},
471
+ }
472
+ )
473
+ except ValueError:
474
+ tdoc = None
475
+
476
+ if tdoc is None:
477
+ continue
478
+
479
+ task_doc = TaskDocument(**tdoc)
480
+
481
+ if task_doc is None:
482
+ continue
483
+
484
+ trajectory_doc = TrajectoryDoc.from_task(
485
+ task_doc, molecule_id=mol.molecule_id, deprecated=False
486
+ )
487
+ trajectory_docs.append(trajectory_doc)
488
+
489
+ self.logger.debug(
490
+ f"Produced {len(trajectory_docs)} trajectory docs for {shash}"
491
+ )
492
+
493
+ return jsanitize([doc.model_dump() for doc in trajectory_docs], allow_bson=True)
494
+
495
+ def update_targets(self, items: List[List[Dict]]):
496
+ """
497
+ Inserts the new trajectory docs into the trajectories collection
498
+
499
+ Args:
500
+ items [[dict]]: A list of documents to update
501
+ """
502
+
503
+ docs = list(chain.from_iterable(items)) # type: ignore
504
+
505
+ # Add timestamp
506
+ for item in docs:
507
+ item.update(
508
+ {
509
+ "_bt": self.timestamp,
510
+ }
511
+ )
512
+
513
+ molecule_ids = list({item["molecule_id"] for item in docs})
514
+
515
+ if len(items) > 0:
516
+ self.logger.info(f"Updating {len(docs)} trajectory documents")
517
+ self.trajectories.remove_docs(
518
+ {self.trajectories.key: {"$in": molecule_ids}}
519
+ )
520
+ self.trajectories.update(
521
+ docs=docs,
522
+ key=["molecule_id", "solvent"],
523
+ )
524
+ else:
525
+ self.logger.info("No items to update")