emmet-builders 0.84.10rc2__py3-none-any.whl → 0.85.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of emmet-builders might be problematic. Click here for more details.

Files changed (33):
  1. emmet/builders/abinit/phonon.py +12 -14
  2. emmet/builders/abinit/sound_velocity.py +1 -1
  3. emmet/builders/materials/absorption_spectrum.py +16 -10
  4. emmet/builders/materials/dielectric.py +10 -7
  5. emmet/builders/materials/elasticity.py +12 -9
  6. emmet/builders/materials/electrodes.py +1 -1
  7. emmet/builders/materials/electronic_structure.py +1 -1
  8. emmet/builders/materials/magnetism.py +2 -1
  9. emmet/builders/materials/piezoelectric.py +23 -19
  10. emmet/builders/materials/provenance.py +3 -4
  11. emmet/builders/settings.py +14 -9
  12. emmet/builders/utils.py +5 -4
  13. emmet/builders/vasp/materials.py +11 -4
  14. emmet/builders/vasp/task_validator.py +3 -1
  15. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0rc0.dist-info}/METADATA +7 -30
  16. emmet_builders-0.85.0rc0.dist-info/RECORD +41 -0
  17. emmet/builders/materials/ml.py +0 -101
  18. emmet/builders/molecules/atomic.py +0 -592
  19. emmet/builders/molecules/bonds.py +0 -329
  20. emmet/builders/molecules/electric.py +0 -287
  21. emmet/builders/molecules/metal_binding.py +0 -528
  22. emmet/builders/molecules/orbitals.py +0 -292
  23. emmet/builders/molecules/redox.py +0 -502
  24. emmet/builders/molecules/summary.py +0 -406
  25. emmet/builders/molecules/thermo.py +0 -505
  26. emmet/builders/molecules/trajectory.py +0 -530
  27. emmet/builders/molecules/vibration.py +0 -282
  28. emmet/builders/qchem/__init__.py +0 -0
  29. emmet/builders/qchem/molecules.py +0 -745
  30. emmet_builders-0.84.10rc2.dist-info/RECORD +0 -54
  31. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  32. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0rc0.dist-info}/WHEEL +0 -0
  33. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0rc0.dist-info}/top_level.txt +0 -0
@@ -1,745 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from datetime import datetime
4
- from itertools import chain, groupby
5
- from math import ceil
6
-
7
- import networkx as nx
8
- from maggma.builders import Builder
9
- from maggma.stores import Store
10
- from maggma.utils import grouper
11
-
12
- from emmet.builders.settings import EmmetBuildSettings
13
- from emmet.core.qchem.calc_types import CalcType, LevelOfTheory, TaskType
14
- from emmet.core.qchem.molecule import (
15
- MoleculeDoc,
16
- best_lot,
17
- evaluate_lot,
18
- evaluate_task_entry,
19
- )
20
- from emmet.core.qchem.task import TaskDocument
21
- from emmet.core.utils import get_molecule_id, group_molecules, jsanitize, make_mol_graph
22
-
23
- from typing import TYPE_CHECKING
24
-
25
- if TYPE_CHECKING:
26
- from collections.abc import Iterable, Iterator
27
- from typing import Any
28
-
29
- __author__ = "Evan Spotte-Smith <ewcspottesmith@lbl.gov>"
30
-
31
-
32
- SETTINGS = EmmetBuildSettings()
33
-
34
-
35
def evaluate_molecule(
    mol_doc: MoleculeDoc,
    funct_scores: dict[str, int] = SETTINGS.QCHEM_FUNCTIONAL_QUALITY_SCORES,
    basis_scores: dict[str, int] = SETTINGS.QCHEM_BASIS_QUALITY_SCORES,
    solvent_scores: dict[str, int] = SETTINGS.QCHEM_SOLVENT_MODEL_QUALITY_SCORES,
):
    """
    Build a sort key used to rank MoleculeDocs against each other.

    :param mol_doc: Molecule to be evaluated
    :param funct_scores: Scores for various density functionals
    :param basis_scores: Scores for various basis sets
    :param solvent_scores: Scores for various implicit solvent models
    :return: 4-tuple of
        (-1 if the document is deprecated else 0,
         summed score of the best level of theory of the document,
         summed score of the level of theory of the "molecule" origin
         (0 when there is no such origin),
         energy of the best entry at the best level of theory)
    """

    score_tables = (funct_scores, basis_scores, solvent_scores)

    # Level of theory of the task that provided the geometry. If several
    # origins are named "molecule", the last one wins.
    geometry_lot = None
    for origin in mol_doc.origins:
        if origin.name != "molecule":
            continue
        geometry_lot = mol_doc.levels_of_theory[origin.task_id]
        if isinstance(geometry_lot, LevelOfTheory):
            geometry_lot = geometry_lot.value

    if geometry_lot is not None:
        geometry_scores = evaluate_lot(geometry_lot, *score_tables)
    else:
        geometry_scores = [0]

    top_lot = best_lot(mol_doc, *score_tables)
    top_scores = evaluate_lot(top_lot, *score_tables)

    deprecation_rank = -1 * int(mol_doc.deprecated)
    top_energy = mol_doc.best_entries[top_lot]["energy"]

    return (deprecation_rank, sum(top_scores), sum(geometry_scores), top_energy)
75
-
76
-
77
def _optimizing_solvent(mol_doc):
    """
    Look up the solvent recorded for the task that produced this
    (associated) MoleculeDoc's structure.

    Args:
        mol_doc: MoleculeDoc

    Returns:
        solvent (str): the entry of ``mol_doc.solvents`` for the first origin
            whose name starts with "molecule"; None if there is no such origin

    """

    structure_origin = next(
        (origin for origin in mol_doc.origins if origin.name.startswith("molecule")),
        None,
    )
    if structure_origin is None:
        return None
    return mol_doc.solvents[structure_origin.task_id]
93
-
94
-
95
class MoleculesAssociationBuilder(Builder):
    """
    The MoleculesAssociationBuilder matches Q-Chem task documents by composition
    and collects tasks associated with identical structures.
    The purpose of this builder is to group calculations in preparation for the
    MoleculesBuilder.

    The process is as follows:

        1.) Find all documents with the same formula
        2.) Select only task documents for the task_types we can select properties from
        3.) Aggregate task documents based on nuclear geometry
        4.) Create a MoleculeDoc from the group of task documents
    """

    def __init__(
        self,
        tasks: Store,
        assoc: Store,
        query: dict | None = None,
        settings: EmmetBuildSettings | None = None,
        **kwargs,
    ):
        """
        Args:
            tasks: Store of task documents
            assoc: Store of associated molecules documents to prepare
            query: dictionary to limit tasks to be analyzed
            settings: EmmetSettings to use in the build process
        """

        self.tasks = tasks
        self.assoc = assoc
        self.query = query if query else dict()
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks], targets=[assoc], **kwargs)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        for field in (
            "task_id",
            "last_updated",
            "state",
            "formula_alphabetical",
            "smiles",
            "species_hash",
            "coord_hash",
        ):
            self.tasks.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
        ):
            self.assoc.ensure_index(field)

    def _successful_tasks_query(self) -> dict:
        """Return a copy of the user query restricted to successful tasks."""

        criteria = dict(self.query)
        criteria["state"] = "successful"
        return criteria

    def _find_unprocessed(self, criteria: dict) -> tuple[set, set]:
        """
        Find the tasks matching ``criteria`` that are not yet referenced by any
        document in the association Store.

        Args:
            criteria: query applied to the tasks Store

        Returns:
            (task keys still to process, species hashes of those tasks)
        """

        self.logger.info("Finding tasks to process")
        all_tasks = list(self.tasks.query(criteria, [self.tasks.key, "species_hash"]))

        processed_tasks = set(self.assoc.distinct("task_ids"))
        to_process_tasks = {d[self.tasks.key] for d in all_tasks} - processed_tasks
        to_process_hashes = {
            d["species_hash"]
            for d in all_tasks
            if d[self.tasks.key] in to_process_tasks
        }

        return to_process_tasks, to_process_hashes

    def prechunk(self, number_splits: int) -> Iterable[dict]:  # pragma: no cover
        """Prechunk the molecule builder for distributed computation"""

        temp_query = self._successful_tasks_query()
        _, to_process_hashes = self._find_unprocessed(temp_query)

        N = ceil(len(to_process_hashes) / number_splits)

        for hash_chunk in grouper(to_process_hashes, N):
            query = dict(temp_query)
            query["species_hash"] = {"$in": list(hash_chunk)}
            yield {"query": query}

    def get_items(self) -> Iterator[list[TaskDocument]]:
        """
        Gets all items to process into molecules (and other) documents.
        This does no datetime checking; relying on whether
        task_ids are included in the molecules Store

        Returns:
            generator of lists of TaskDocuments, one list per unprocessed
            species hash
        """

        self.logger.info("Molecule association builder started")
        self.logger.info(
            f"Allowed task types: {[task_type.value for task_type in self.settings.QCHEM_ALLOWED_TASK_TYPES]}"
        )

        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        # Get all processed tasks
        temp_query = self._successful_tasks_query()
        to_process_tasks, to_process_hashes = self._find_unprocessed(temp_query)

        self.logger.info(f"Found {len(to_process_tasks)} unprocessed tasks")
        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")

        # Set total for builder bars to have a total
        self.total = len(to_process_hashes)

        projected_fields = [
            "last_updated",
            "task_id",
            "formula_alphabetical",
            "species_hash",
            "coord_hash",
            "smiles",
            "orig",
            "tags",
            "walltime",
            "cputime",
            "output",
            "calcs_reversed",
            "special_run_type",
            "custom_smd",
            "critic2",
        ]

        for shash in to_process_hashes:
            tasks_query = dict(temp_query)
            tasks_query["species_hash"] = shash
            tasks = list(
                self.tasks.query(criteria=tasks_query, properties=projected_fields)
            )
            to_yield = list()
            for t in tasks:
                # TODO: Validation
                # basic validation here ensures that tasks with invalid levels of
                # theory don't halt the build pipeline
                try:
                    task = TaskDocument(**t)
                    to_yield.append(task)
                except Exception as e:
                    self.logger.info(
                        f"Processing task {t['task_id']} failed with Exception - {e}"
                    )
                    continue

            yield to_yield

    def process_item(self, tasks: list[TaskDocument]) -> list[dict]:
        """
        Process the tasks into a MoleculeDoc

        Args:
            tasks [TaskDocument] : a list of task docs

        Returns:
            [dict] : a list of new molecule docs
        """

        if not tasks:
            return list()
        shash = tasks[0].species_hash
        task_ids = [task.task_id for task in tasks]
        self.logger.debug(f"Processing {shash} : {task_ids}")
        molecules = list()

        for group in self.filter_and_group_tasks(tasks):
            try:
                doc = MoleculeDoc.from_tasks(group)
                molecules.append(doc)
            except Exception as e:
                # Keep the group visible downstream as a deprecated molecule
                # carrying the failure message instead of dropping it.
                failed_ids = list({t_.task_id for t_ in group})
                doc = MoleculeDoc.construct_deprecated_molecule(group)
                doc.warnings.append(str(e))
                molecules.append(doc)
                self.logger.warning(
                    f"Failed making molecule for {failed_ids}."
                    f" Inserted as deprecated molecule: {doc.molecule_id}"
                )

        self.logger.debug(f"Produced {len(molecules)} molecules for {shash}")

        return jsanitize([mol.model_dump() for mol in molecules], allow_bson=True)

    def update_targets(self, items: list[list[dict]]):
        """
        Inserts the new molecules into the molecules collection

        Args:
            items [[dict]]: A list of molecules to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Check the flattened documents rather than ``items``: a batch made up
        # only of empty per-item lists has nothing to write.
        if not docs:
            self.logger.info("No items to update")
            return

        for item in docs:
            item.update({"_bt": self.timestamp})

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} molecules")
        self.assoc.remove_docs({self.assoc.key: {"$in": molecule_ids}})
        self.assoc.update(
            docs=docs,
            key=["molecule_id"],
        )

    def filter_and_group_tasks(
        self, tasks: list[TaskDocument]
    ) -> Iterator[list[TaskDocument]]:
        """
        Groups tasks by identical structure

        Only tasks whose task_type is one of the allowed task types from the
        settings are considered.
        """

        filtered_tasks = [
            task
            for task in tasks
            if any(
                allowed_type is task.task_type
                for allowed_type in self.settings.QCHEM_ALLOWED_TASK_TYPES
            )
        ]

        molecules = list()

        for idx, task in enumerate(filtered_tasks):
            # Prefer the optimized geometry, falling back to the initial one
            if task.output.optimized_molecule:
                m = task.output.optimized_molecule
            else:
                m = task.output.initial_molecule
            # Tag each molecule with the index of its task so that the
            # grouping result can be mapped back onto tasks.
            m.ind = idx  # type: ignore
            molecules.append(m)

        grouped_molecules = group_molecules(molecules)
        for group in grouped_molecules:
            grouped_tasks = [filtered_tasks[mol.ind] for mol in group]  # type: ignore
            yield grouped_tasks
351
-
352
-
353
class MoleculesBuilder(Builder):
    """
    The MoleculesBuilder collects MoleculeDocs from the MoleculesAssociationBuilder
    and groups them by key properties (charge, spin multiplicity, bonding).
    Then, the best molecular structure is identified (based on electronic energy),
    and this document becomes the representative MoleculeDoc.

    The process is as follows:

        1.) Find all documents with the same formula
        2.) Group documents based on charge, spin, and bonding
        3.) Create a MoleculeDoc from the group of task documents
    """

    def __init__(
        self,
        assoc: Store,
        molecules: Store,
        query: dict | None = None,
        settings: EmmetBuildSettings | None = None,
        **kwargs,
    ):
        """
        Args:
            assoc: Store of associated molecules documents, created by MoleculesAssociationBuilder
            molecules: Store of processed molecules documents
            query: dictionary to limit tasks to be analyzed
            settings: EmmetSettings to use in the build process
        """

        self.assoc = assoc
        self.molecules = molecules
        self.query = query if query else dict()
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[assoc], targets=[molecules], **kwargs)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Search index for associated molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
            "species_hash",
        ):
            self.assoc.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
        ):
            self.molecules.ensure_index(field)

    def _non_deprecated_query(self) -> dict:
        """Return a copy of the user query restricted to non-deprecated docs."""

        criteria = dict(self.query)
        criteria["deprecated"] = False
        return criteria

    def _find_unprocessed(self, criteria: dict) -> tuple[set, set]:
        """
        Find the associated documents matching ``criteria`` whose
        species-hash-based ID is not yet a molecule_id in the molecules Store.

        Args:
            criteria: query applied to the association Store

        Returns:
            (IDs still to process, species hashes of the matching documents)
        """

        self.logger.info("Finding documents to process")
        all_assoc = list(
            self.assoc.query(
                criteria,
                [
                    self.assoc.key,
                    "formula_alphabetical",
                    "species_hash",
                    "charge",
                    "spin_multiplicity",
                ],
            )
        )

        # Should be using species hash, rather than coord hash, at this point
        processed_docs = set(list(self.molecules.distinct("molecule_id")))
        assoc_ids = set()

        xyz_species_id_map = dict()
        for d in all_assoc:
            this_id = "{}-{}-{}-{}".format(
                d["species_hash"],
                d["formula_alphabetical"].replace(" ", ""),
                str(int(d["charge"])).replace("-", "m"),
                str(int(d["spin_multiplicity"])),
            )
            assoc_ids.add(this_id)
            xyz_species_id_map[d[self.assoc.key]] = this_id
        to_process_docs = assoc_ids - processed_docs

        to_process_hashes = {
            d["species_hash"]
            for d in all_assoc
            if xyz_species_id_map[d[self.assoc.key]] in to_process_docs
        }

        return to_process_docs, to_process_hashes

    def prechunk(self, number_splits: int) -> Iterable[dict]:  # pragma: no cover
        """Prechunk the molecule builder for distributed computation"""

        temp_query = self._non_deprecated_query()
        _, to_process_hashes = self._find_unprocessed(temp_query)

        N = ceil(len(to_process_hashes) / number_splits)

        for hash_chunk in grouper(to_process_hashes, N):
            query = dict(temp_query)
            query["species_hash"] = {"$in": list(hash_chunk)}
            yield {"query": query}

    def get_items(self) -> Iterator[list[dict]]:
        """
        Gets all items to process into molecules (and other) documents.
        This does no datetime checking; relying on whether
        the document IDs are included in the molecules Store

        Returns:
            generator of lists of associated molecule documents, one list per
            unprocessed species hash
        """

        self.logger.info("Molecules builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        # Get all processed molecules
        temp_query = self._non_deprecated_query()
        to_process_docs, to_process_hashes = self._find_unprocessed(temp_query)

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")

        # Set total for builder bars to have a total
        self.total = len(to_process_hashes)

        for shash in to_process_hashes:
            assoc_query = dict(temp_query)
            assoc_query["species_hash"] = shash
            assoc = list(self.assoc.query(criteria=assoc_query))

            yield assoc

    def _merge_group(self, group: list[MoleculeDoc]) -> MoleculeDoc | None:
        """
        Combine a group of associated MoleculeDocs into a single MoleculeDoc,
        taking the best associated doc for each optimizing solvent.

        The first per-solvent best doc is used as the base document and is
        modified in place.

        Args:
            group: associated docs belonging to the same molecule

        Returns:
            the merged document, or None when no base document could be chosen
        """

        docs_by_solvent = dict()
        mols_by_solvent = dict()
        mol_lots = dict()
        constituent_molecules = list()
        similar_molecules = list()

        base_doc: MoleculeDoc | None = None

        # Grab best doc for each solvent
        # A doc is given a solvent based on how the molecule was optimized
        for solv, subgroup in groupby(
            sorted(group, key=_optimizing_solvent), key=_optimizing_solvent
        ):
            sorted_docs = sorted(subgroup, key=evaluate_molecule)
            best_doc = sorted_docs[0]
            docs_by_solvent[solv] = best_doc
            mols_by_solvent[solv] = best_doc.molecule
            mol_lots[solv] = best_doc.levels_of_theory[best_doc.origins[0].task_id]
            constituent_molecules.append(best_doc.molecule_id)

            for m in sorted_docs[1:]:
                if m.molecule_id not in constituent_molecules:
                    similar_molecules.append(m.molecule_id)

            if base_doc is None:
                base_doc = best_doc

        if base_doc is None:
            return None

        task_ids = list()
        calc_types = dict()
        task_types = dict()
        levels_of_theory = dict()
        solvents = dict()
        lot_solvents = dict()
        unique_calc_types: set[str | CalcType] = set()
        unique_task_types: set[str | TaskType] = set()
        unique_levels_of_theory: set[str | LevelOfTheory] = set()
        unique_solvents: set[str] = set()
        unique_lot_solvents: set[str] = set()
        origins = list()
        entries = list()
        best_entries: dict[str, Any] = dict()

        # Compile data on each constituent doc
        for doc in docs_by_solvent.values():
            task_ids.extend(doc.task_ids)
            calc_types.update(doc.calc_types)
            task_types.update(doc.task_types)
            levels_of_theory.update(doc.levels_of_theory)
            solvents.update(doc.solvents)
            lot_solvents.update(doc.lot_solvents)
            unique_calc_types.update(doc.unique_calc_types)
            unique_task_types.update(doc.unique_task_types)
            unique_levels_of_theory.update(doc.unique_levels_of_theory)
            unique_solvents.update(doc.unique_solvents)
            unique_lot_solvents.update(doc.unique_lot_solvents)
            origins.extend(doc.origins)
            entries.extend(doc.entries)

            # Per level-of-theory/solvent key, keep the entry that
            # evaluate_task_entry ranks lowest.
            for lot_solv, entry in doc.best_entries.items():
                if lot_solv in best_entries:
                    current_eval = evaluate_task_entry(best_entries[lot_solv])
                    this_eval = evaluate_task_entry(entry)
                    if this_eval < current_eval:
                        best_entries[lot_solv] = entry
                else:
                    best_entries[lot_solv] = entry

        # Assign new doc info
        base_doc.molecule_id = get_molecule_id(base_doc.molecule, node_attr="specie")
        base_doc.molecules = mols_by_solvent
        base_doc.molecule_levels_of_theory = mol_lots
        base_doc.task_ids = task_ids
        base_doc.calc_types = calc_types
        base_doc.task_types = task_types
        base_doc.levels_of_theory = levels_of_theory
        base_doc.solvents = solvents
        base_doc.lot_solvents = lot_solvents
        base_doc.unique_calc_types = unique_calc_types
        base_doc.unique_task_types = unique_task_types
        base_doc.unique_levels_of_theory = unique_levels_of_theory
        base_doc.unique_solvents = unique_solvents
        base_doc.unique_lot_solvents = unique_lot_solvents
        base_doc.origins = origins
        base_doc.entries = entries
        base_doc.best_entries = best_entries
        base_doc.constituent_molecules = constituent_molecules
        base_doc.similar_molecules = similar_molecules

        return base_doc

    def process_item(self, items: list[dict]) -> list[dict]:
        """
        Process associated molecule documents into complete MoleculeDocs

        Args:
            items list[dict] : a list of associated molecule docs

        Returns:
            [dict] : a list of new molecule docs
        """

        # Nothing to merge; also avoids indexing into an empty list below
        if not items:
            return list()

        assoc = [MoleculeDoc(**item) for item in items]
        shash = assoc[0].species_hash
        mol_ids = [a.molecule_id for a in assoc]
        self.logger.debug(f"Processing {shash} : {mol_ids}")

        complete_mol_docs = list()

        # Need to combine many variables of the various constituent associated docs
        # into one MoleculeDoc, where the best associated doc for each solvent is taken
        for group in self.group_mol_docs(assoc):
            # Maybe all are disconnected and therefore none get grouped?
            if len(group) == 0:
                continue

            merged = self._merge_group(group)
            if merged is not None:
                complete_mol_docs.append(merged)

        self.logger.debug(f"Produced {len(complete_mol_docs)} molecules for {shash}")

        return jsanitize(
            [mol.model_dump() for mol in complete_mol_docs], allow_bson=True
        )

    def update_targets(self, items: list[list[dict]]):
        """
        Inserts the new molecules into the molecules collection

        Args:
            items [[dict]]: A list of molecules to update
        """

        self.logger.debug(f"Updating {len(items)} molecules")

        docs = list(chain.from_iterable(items))  # type: ignore

        # Check the flattened documents rather than ``items``: a batch made up
        # only of empty per-item lists has nothing to write.
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp and point every entry at its molecule's ID
        for item in docs:
            molid = item["molecule_id"]

            item.update({"_bt": self.timestamp})

            for entry in item["entries"]:
                entry["entry_id"] = molid

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} molecules")
        self.molecules.remove_docs({self.molecules.key: {"$in": molecule_ids}})
        self.molecules.update(
            docs=docs,
            key=["molecule_id"],
        )

    def group_mol_docs(self, assoc: list[MoleculeDoc]) -> Iterator[list[MoleculeDoc]]:
        """
        Groups molecules by:
            - charge
            - spin multiplicity
            - species hash

        Documents whose molecule graph is not connected are discarded.
        """

        # Molecules are already grouped by formula

        def charge_spin(mol_doc):
            return (mol_doc.charge, mol_doc.spin_multiplicity)

        # Group by charge and spin
        for c_s, group in groupby(sorted(assoc, key=charge_spin), key=charge_spin):
            # species hash -> docs; dict insertion order keeps first-seen order
            subgroups: dict[Any, list[MoleculeDoc]] = dict()
            for mol_doc in group:
                mol_graph = make_mol_graph(mol_doc.molecule)
                mol_hash = mol_doc.species_hash

                # Unconnected molecule graphs are discarded at this step
                # TODO: What about molecules that would be connected under a different
                # TODO: bonding scheme?
                # TODO: MAKE ClusterBuilder FOR THIS PURPOSE
                if nx.is_connected(mol_graph.graph.to_undirected()):
                    subgroups.setdefault(mol_hash, []).append(mol_doc)

            self.logger.debug(f"Unique hashes: {list(subgroups)}")

            for mol_docs in subgroups.values():
                yield mol_docs