emmet-builders 0.78.3__py3-none-any.whl → 0.86.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. emmet/builders/abinit/phonon.py +47 -47
  2. emmet/builders/abinit/sound_velocity.py +15 -11
  3. emmet/builders/feff/xas.py +1 -2
  4. emmet/builders/materials/absorption_spectrum.py +25 -14
  5. emmet/builders/materials/alloys.py +10 -11
  6. emmet/builders/materials/chemenv.py +2 -3
  7. emmet/builders/materials/corrected_entries.py +21 -15
  8. emmet/builders/materials/dielectric.py +19 -11
  9. emmet/builders/materials/elasticity.py +44 -33
  10. emmet/builders/materials/electrodes.py +35 -28
  11. emmet/builders/materials/electronic_structure.py +17 -17
  12. emmet/builders/materials/magnetism.py +11 -4
  13. emmet/builders/materials/optimade.py +7 -3
  14. emmet/builders/materials/piezoelectric.py +24 -21
  15. emmet/builders/materials/provenance.py +16 -13
  16. emmet/builders/materials/robocrys.py +2 -3
  17. emmet/builders/materials/substrates.py +9 -8
  18. emmet/builders/materials/summary.py +3 -3
  19. emmet/builders/materials/thermo.py +17 -11
  20. emmet/builders/matscholar/missing_compositions.py +12 -8
  21. emmet/builders/mobility/migration_graph.py +5 -5
  22. emmet/builders/settings.py +21 -17
  23. emmet/builders/utils.py +101 -12
  24. emmet/builders/vasp/materials.py +40 -51
  25. emmet/builders/vasp/mp_potcar_stats.json.gz +0 -0
  26. emmet/builders/vasp/task_validator.py +25 -36
  27. emmet_builders-0.86.0.dist-info/METADATA +37 -0
  28. emmet_builders-0.86.0.dist-info/RECORD +41 -0
  29. {emmet_builders-0.78.3.dist-info → emmet_builders-0.86.0.dist-info}/WHEEL +1 -1
  30. emmet/builders/materials/ml.py +0 -87
  31. emmet/builders/molecules/atomic.py +0 -589
  32. emmet/builders/molecules/bonds.py +0 -324
  33. emmet/builders/molecules/metal_binding.py +0 -526
  34. emmet/builders/molecules/orbitals.py +0 -288
  35. emmet/builders/molecules/redox.py +0 -496
  36. emmet/builders/molecules/summary.py +0 -383
  37. emmet/builders/molecules/thermo.py +0 -500
  38. emmet/builders/molecules/vibration.py +0 -278
  39. emmet/builders/qchem/__init__.py +0 -0
  40. emmet/builders/qchem/molecules.py +0 -734
  41. emmet_builders-0.78.3.dist-info/METADATA +0 -47
  42. emmet_builders-0.78.3.dist-info/RECORD +0 -51
  43. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  44. {emmet_builders-0.78.3.dist-info → emmet_builders-0.86.0.dist-info}/top_level.txt +0 -0
@@ -1,734 +0,0 @@
1
- from datetime import datetime
2
- from itertools import chain, groupby
3
- from math import ceil
4
- from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Union
5
-
6
- import networkx as nx
7
-
8
- from maggma.builders import Builder
9
- from maggma.stores import Store
10
- from maggma.utils import grouper
11
-
12
- from emmet.builders.settings import EmmetBuildSettings
13
- from emmet.core.utils import get_molecule_id, group_molecules, jsanitize, make_mol_graph
14
- from emmet.core.qchem.molecule import (
15
- best_lot,
16
- evaluate_lot,
17
- evaluate_task_entry,
18
- MoleculeDoc,
19
- )
20
- from emmet.core.qchem.task import TaskDocument
21
- from emmet.core.qchem.calc_types import LevelOfTheory, CalcType, TaskType
22
-
23
-
24
- __author__ = "Evan Spotte-Smith <ewcspottesmith@lbl.gov>"
25
-
26
-
27
- SETTINGS = EmmetBuildSettings()
28
-
29
-
30
def evaluate_molecule(
    mol_doc: MoleculeDoc,
    funct_scores: Dict[str, int] = SETTINGS.QCHEM_FUNCTIONAL_QUALITY_SCORES,
    basis_scores: Dict[str, int] = SETTINGS.QCHEM_BASIS_QUALITY_SCORES,
    solvent_scores: Dict[str, int] = SETTINGS.QCHEM_SOLVENT_MODEL_QUALITY_SCORES,
):
    """
    Build the sort key used to rank MoleculeDocs against one another.

    The key is a 4-tuple, compared element by element:
        - -1 if the document is deprecated, 0 otherwise
        - summed ``evaluate_lot`` score of the level of theory chosen by
          ``best_lot``
        - summed ``evaluate_lot`` score of the level of theory of the task
          behind the "molecule" origin (``[0]`` is summed when no such
          origin exists)
        - the ``energy`` value of the best entry at the best level of theory

    NOTE(review): the caller in this module sorts ascending with this key and
    keeps the first element, so a deprecated document would rank ahead of a
    non-deprecated one -- confirm that this is intended.

    :param mol_doc: Molecule to be evaluated
    :param funct_scores: Scores for various density functionals
    :param basis_scores: Scores for various basis sets
    :param solvent_scores: Scores for various implicit solvent models
    :return: tuple suitable for use as a ``sorted`` key
    """

    score_tables = (funct_scores, basis_scores, solvent_scores)

    # There is deliberately no ``break``: if several origins are named
    # "molecule", the last one determines the optimization level of theory.
    optimization_lot = None
    for origin in mol_doc.origins:
        if origin.name != "molecule":
            continue
        optimization_lot = mol_doc.levels_of_theory[origin.task_id]
        if isinstance(optimization_lot, LevelOfTheory):
            optimization_lot = optimization_lot.value

    if optimization_lot is not None:
        optimization_scores = evaluate_lot(optimization_lot, *score_tables)
    else:
        optimization_scores = [0]

    top_lot = best_lot(mol_doc, *score_tables)
    top_scores = evaluate_lot(top_lot, *score_tables)

    return (
        -1 * int(mol_doc.deprecated),
        sum(top_scores),
        sum(optimization_scores),
        mol_doc.best_entries[top_lot]["energy"],
    )
70
-
71
-
72
- def _optimizing_solvent(mol_doc):
73
- """
74
- Returns which solvent was used to optimize this (associated) MoleculeDoc.
75
-
76
- Args:
77
- mol_doc: MoleculeDoc
78
-
79
- Returns:
80
- solvent (str)
81
-
82
- """
83
-
84
- for origin in mol_doc.origins:
85
- if origin.name.startswith("molecule"):
86
- solvent = mol_doc.solvents[origin.task_id]
87
- return solvent
88
-
89
-
90
class MoleculesAssociationBuilder(Builder):
    """
    The MoleculesAssociationBuilder matches Q-Chem task documents by composition
    and collects tasks associated with identical structures.
    The purpose of this builder is to group calculations in preparation for the
    MoleculesBuilder.

    The process is as follows:

        1.) Find all successful task documents sharing a species hash
        2.) Select only task documents for the task_types we can select properties from
        3.) Aggregate task documents with ``group_molecules``
        4.) Create a MoleculeDoc from each group of task documents
    """

    def __init__(
        self,
        tasks: Store,
        assoc: Store,
        query: Optional[Dict] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        """
        Args:
            tasks: Store of task documents
            assoc: Store of associated molecules documents to prepare
            query: dictionary to limit tasks to be analyzed
            settings: EmmetSettings to use in the build process
        """

        self.tasks = tasks
        self.assoc = assoc
        self.query = query if query else dict()
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks], targets=[assoc], **kwargs)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        for field in (
            "task_id",
            "last_updated",
            "state",
            "formula_alphabetical",
            "smiles",
            "species_hash",
        ):
            self.tasks.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
        ):
            self.assoc.ensure_index(field)

    def _find_unprocessed_tasks(self, criteria: Dict):
        """
        Determine which tasks matching ``criteria`` are not yet referenced by
        any document in the association store.

        Shared by ``prechunk`` and ``get_items`` so both use the same
        definition of "unprocessed".

        Args:
            criteria: query applied to the tasks store

        Returns:
            (set of unprocessed task keys, set of their species hashes)
        """

        all_tasks = list(self.tasks.query(criteria, [self.tasks.key, "species_hash"]))

        processed_tasks = set(self.assoc.distinct("task_ids"))
        to_process_tasks = {d[self.tasks.key] for d in all_tasks} - processed_tasks
        to_process_hashes = {
            d["species_hash"]
            for d in all_tasks
            if d[self.tasks.key] in to_process_tasks
        }

        return to_process_tasks, to_process_hashes

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """
        Prechunk the molecule builder for distributed computation

        Args:
            number_splits: number of chunks to divide the unprocessed
                species hashes into

        Returns:
            generator of dicts, each holding a "query" restricting
            species_hash to one chunk
        """

        temp_query = dict(self.query)
        temp_query["state"] = "successful"

        self.logger.info("Finding tasks to process")
        _, to_process_hashes = self._find_unprocessed_tasks(temp_query)

        N = ceil(len(to_process_hashes) / number_splits)

        for hash_chunk in grouper(to_process_hashes, N):
            yield {"query": {"species_hash": {"$in": list(hash_chunk)}}}

    def get_items(self) -> Iterator[List[TaskDocument]]:
        """
        Gets all items to process into molecules (and other) documents.
        This does no datetime checking; relying on whether
        task_ids are included in the molecules Store

        Returns:
            generator yielding, per unprocessed species hash, the list of
            successfully parsed TaskDocuments with that hash
        """

        self.logger.info("Molecule association builder started")
        self.logger.info(
            f"Allowed task types: {[task_type.value for task_type in self.settings.QCHEM_ALLOWED_TASK_TYPES]}"
        )

        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        # Only successful tasks are considered
        temp_query = dict(self.query)
        temp_query["state"] = "successful"

        self.logger.info("Finding tasks to process")
        to_process_tasks, to_process_hashes = self._find_unprocessed_tasks(temp_query)

        self.logger.info(f"Found {len(to_process_tasks)} unprocessed tasks")
        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")

        # Set total for builder bars to have a total
        self.total = len(to_process_hashes)

        projected_fields = [
            "last_updated",
            "task_id",
            "formula_alphabetical",
            "species_hash",
            "coord_hash",
            "smiles",
            "orig",
            "tags",
            "walltime",
            "cputime",
            "output",
            "calcs_reversed",
            "special_run_type",
            "custom_smd",
            "critic2",
        ]

        for shash in to_process_hashes:
            tasks_query = dict(temp_query)
            tasks_query["species_hash"] = shash
            tasks = list(
                self.tasks.query(criteria=tasks_query, properties=projected_fields)
            )
            to_yield = list()
            for t in tasks:
                # TODO: Validation
                # basic validation here ensures that tasks with invalid levels of
                # theory don't halt the build pipeline
                try:
                    task = TaskDocument(**t)
                    to_yield.append(task)
                except Exception as e:
                    # ``.get`` keeps a document without a task_id from raising
                    # a KeyError inside this handler.
                    self.logger.info(
                        f"Processing task {t.get('task_id')} failed with Exception - {e}"
                    )
                    continue

            yield to_yield

    def process_item(self, tasks: List[TaskDocument]) -> List[Dict]:
        """
        Process the tasks into MoleculeDocs

        A group for which ``MoleculeDoc.from_tasks`` raises is still emitted,
        as a deprecated molecule carrying the exception text as a warning.

        Args:
            tasks [TaskDocument] : a list of task docs

        Returns:
            [dict] : a list of new molecule docs
        """

        if len(tasks) == 0:
            return list()
        shash = tasks[0].species_hash
        task_ids = [task.task_id for task in tasks]
        self.logger.debug(f"Processing {shash} : {task_ids}")
        molecules = list()

        for group in self.filter_and_group_tasks(tasks):
            try:
                doc = MoleculeDoc.from_tasks(group)
                molecules.append(doc)
            except Exception as e:
                failed_ids = list({t_.task_id for t_ in group})
                doc = MoleculeDoc.construct_deprecated_molecule(group)
                doc.warnings.append(str(e))
                molecules.append(doc)
                self.logger.warning(
                    f"Failed making molecule for {failed_ids}."
                    f" Inserted as deprecated molecule: {doc.molecule_id}"
                )

        self.logger.debug(f"Produced {len(molecules)} molecules for {shash}")

        return jsanitize([mol.model_dump() for mol in molecules], allow_bson=True)

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new molecules into the molecules collection

        Existing documents with the same molecule ids are removed before the
        new ones are written.

        Args:
            items [[dict]]: A list of molecules to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        for item in docs:
            item.update({"_bt": self.timestamp})

        molecule_ids = list({item["molecule_id"] for item in docs})

        # Guard on the flattened documents rather than on ``items``: a batch
        # made only of empty lists carries nothing to write.
        if docs:
            self.logger.info(f"Updating {len(docs)} molecules")
            self.assoc.remove_docs({self.assoc.key: {"$in": molecule_ids}})
            self.assoc.update(
                docs=docs,
                key=["molecule_id"],
            )
        else:
            self.logger.info("No items to update")

    def filter_and_group_tasks(
        self, tasks: List[TaskDocument]
    ) -> Iterator[List[TaskDocument]]:
        """
        Drops tasks whose task_type is not allowed by the settings, then
        groups the remaining tasks with ``group_molecules``.

        Each task is represented by its optimized molecule when it has one,
        otherwise by its initial molecule.

        Args:
            tasks: task docs to filter and group

        Returns:
            generator of lists of task docs, one list per molecule group
        """

        filtered_tasks = [
            task
            for task in tasks
            if any(
                allowed_type is task.task_type
                for allowed_type in self.settings.QCHEM_ALLOWED_TASK_TYPES
            )
        ]

        molecules = list()

        for idx, task in enumerate(filtered_tasks):
            if task.output.optimized_molecule:
                m = task.output.optimized_molecule
            else:
                m = task.output.initial_molecule
            # Tag each molecule with the index of its task so the grouped
            # molecules can be mapped back to tasks below.
            m.ind = idx  # type: ignore
            molecules.append(m)

        grouped_molecules = group_molecules(molecules)
        for group in grouped_molecules:
            grouped_tasks = [filtered_tasks[mol.ind] for mol in group]  # type: ignore
            yield grouped_tasks
343
-
344
-
345
class MoleculesBuilder(Builder):
    """
    The MoleculesBuilder collects MoleculeDocs from the MoleculesAssociationBuilder
    and groups them by key properties (charge, spin multiplicity, species hash).
    Then, for each solvent, the best associated document is identified (ranked
    with ``evaluate_molecule``), and the documents are merged into one
    representative MoleculeDoc.

    The process is as follows:

        1.) Find all documents with the same formula
        2.) Group documents based on charge, spin, and species hash
        3.) Create a MoleculeDoc from each group of associated documents
    """

    def __init__(
        self,
        assoc: Store,
        molecules: Store,
        query: Optional[Dict] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        """
        Args:
            assoc: Store of associated molecules documents, created by MoleculesAssociationBuilder
            molecules: Store of processed molecules documents
            query: dictionary to limit tasks to be analyzed
            settings: EmmetSettings to use in the build process
        """

        self.assoc = assoc
        self.molecules = molecules
        self.query = query if query else dict()
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[assoc], targets=[molecules], **kwargs)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        indexed_fields = (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
        )

        # Search index for associated molecules
        for field in indexed_fields:
            self.assoc.ensure_index(field)

        # Search index for molecules
        for field in indexed_fields:
            self.molecules.ensure_index(field)

    def _find_unprocessed_formulas(self, criteria: Dict):
        """
        Determine which associated documents matching ``criteria`` have no
        counterpart in the molecules store yet.

        Shared by ``prechunk`` and ``get_items`` so both use the same
        definition of "unprocessed".

        Args:
            criteria: query applied to the association store

        Returns:
            (set of unprocessed composite ids, set of their formulas)
        """

        all_assoc = list(
            self.assoc.query(
                criteria,
                [
                    self.assoc.key,
                    "formula_alphabetical",
                    "species_hash",
                    "charge",
                    "spin_multiplicity",
                ],
            )
        )

        # Should be using species hash, rather than coord hash, at this point
        processed_docs = set(self.molecules.distinct("molecule_id"))
        assoc_ids = set()

        # Maps each associated document's key to the composite id
        # "<species_hash>-<formula>-<charge>-<spin>", which is what gets
        # compared against the molecule ids already in the molecules store.
        xyz_species_id_map = dict()
        for d in all_assoc:
            this_id = "{}-{}-{}-{}".format(
                d["species_hash"],
                d["formula_alphabetical"].replace(" ", ""),
                str(int(d["charge"])).replace("-", "m"),
                str(int(d["spin_multiplicity"])),
            )
            assoc_ids.add(this_id)
            xyz_species_id_map[d[self.assoc.key]] = this_id
        to_process_docs = assoc_ids - processed_docs

        to_process_forms = {
            d["formula_alphabetical"]
            for d in all_assoc
            if xyz_species_id_map[d[self.assoc.key]] in to_process_docs
        }

        return to_process_docs, to_process_forms

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """
        Prechunk the molecule builder for distributed computation

        Args:
            number_splits: number of chunks to divide the unprocessed
                formulas into

        Returns:
            generator of dicts, each holding a "query" restricting
            formula_alphabetical to one chunk
        """

        temp_query = dict(self.query)
        temp_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        _, to_process_forms = self._find_unprocessed_formulas(temp_query)

        N = ceil(len(to_process_forms) / number_splits)

        for formula_chunk in grouper(to_process_forms, N):
            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}

    def get_items(self) -> Iterator[List[Dict]]:
        """
        Gets all items to process into molecules (and other) documents.
        This does no datetime checking; relying on whether
        the composite ids are included in the molecules Store

        Returns:
            generator yielding, per unprocessed formula, the list of
            non-deprecated associated documents with that formula
        """

        self.logger.info("Molecules builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        # Only non-deprecated associated documents are considered
        temp_query = dict(self.query)
        temp_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        to_process_docs, to_process_forms = self._find_unprocessed_formulas(
            temp_query
        )

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")

        # Set total for builder bars to have a total
        self.total = len(to_process_forms)

        for formula in to_process_forms:
            assoc_query = dict(temp_query)
            assoc_query["formula_alphabetical"] = formula
            assoc = list(self.assoc.query(criteria=assoc_query))

            yield assoc

    def _merge_group(self, group: List[MoleculeDoc]) -> Optional[MoleculeDoc]:
        """
        Combine one group of associated documents into a single MoleculeDoc.

        The best document for each optimizing solvent is selected; the first
        of these (in sorted solvent order) is used as the base document and
        is mutated in place to carry the merged data of all of them.

        Args:
            group: associated docs that belong to the same molecule

        Returns:
            the merged MoleculeDoc, or None if the group produced no base doc
        """

        docs_by_solvent = dict()
        mols_by_solvent = dict()
        mol_lots = dict()
        constituent_molecules = list()
        similar_molecules = list()

        base_doc: Optional[MoleculeDoc] = None

        # Grab best doc for each solvent
        # A doc is given a solvent based on how the molecule was optimized
        for solv, subgroup in groupby(
            sorted(group, key=_optimizing_solvent), key=_optimizing_solvent
        ):
            sorted_docs = sorted(subgroup, key=evaluate_molecule)
            best_doc = sorted_docs[0]
            docs_by_solvent[solv] = best_doc
            mols_by_solvent[solv] = best_doc.molecule
            mol_lots[solv] = best_doc.levels_of_theory[best_doc.origins[0].task_id]
            constituent_molecules.append(best_doc.molecule_id)

            # Every non-best doc is recorded as "similar" unless it is
            # already listed as a constituent.
            for m in sorted_docs[1:]:
                if m.molecule_id not in constituent_molecules:
                    similar_molecules.append(m.molecule_id)

            if base_doc is None:
                base_doc = best_doc

        if base_doc is None:
            return None

        task_ids = list()
        calc_types = dict()
        task_types = dict()
        levels_of_theory = dict()
        solvents = dict()
        lot_solvents = dict()
        unique_calc_types: Set[Union[str, CalcType]] = set()
        unique_task_types: Set[Union[str, TaskType]] = set()
        unique_levels_of_theory: Set[Union[str, LevelOfTheory]] = set()
        unique_solvents: Set[str] = set()
        unique_lot_solvents: Set[str] = set()
        origins = list()
        entries = list()
        best_entries: Dict[str, Any] = dict()

        # Compile data on each constituent doc
        for doc in docs_by_solvent.values():
            task_ids.extend(doc.task_ids)
            calc_types.update(doc.calc_types)
            task_types.update(doc.task_types)
            levels_of_theory.update(doc.levels_of_theory)
            solvents.update(doc.solvents)
            lot_solvents.update(doc.lot_solvents)
            unique_calc_types.update(doc.unique_calc_types)
            unique_task_types.update(doc.unique_task_types)
            unique_levels_of_theory.update(doc.unique_levels_of_theory)
            unique_solvents.update(doc.unique_solvents)
            unique_lot_solvents.update(doc.unique_lot_solvents)
            origins.extend(doc.origins)
            entries.extend(doc.entries)

            # For each level-of-theory/solvent key keep the entry with the
            # lowest ``evaluate_task_entry`` value; ties keep the earlier one.
            for lot_solv, entry in doc.best_entries.items():
                if lot_solv in best_entries:
                    current_eval = evaluate_task_entry(best_entries[lot_solv])
                    this_eval = evaluate_task_entry(entry)
                    if this_eval < current_eval:
                        best_entries[lot_solv] = entry
                else:
                    best_entries[lot_solv] = entry

        # Assign new doc info
        base_doc.molecule_id = get_molecule_id(base_doc.molecule, node_attr="specie")
        base_doc.molecules = mols_by_solvent
        base_doc.molecule_levels_of_theory = mol_lots
        base_doc.task_ids = task_ids
        base_doc.calc_types = calc_types
        base_doc.task_types = task_types
        base_doc.levels_of_theory = levels_of_theory
        base_doc.solvents = solvents
        base_doc.lot_solvents = lot_solvents
        base_doc.unique_calc_types = unique_calc_types
        base_doc.unique_task_types = unique_task_types
        base_doc.unique_levels_of_theory = unique_levels_of_theory
        base_doc.unique_solvents = unique_solvents
        base_doc.unique_lot_solvents = unique_lot_solvents
        base_doc.origins = origins
        base_doc.entries = entries
        base_doc.best_entries = best_entries
        base_doc.constituent_molecules = constituent_molecules
        base_doc.similar_molecules = similar_molecules

        return base_doc

    def process_item(self, items: List[Dict]) -> List[Dict]:
        """
        Process the associated documents into merged MoleculeDocs

        Args:
            items List[Dict] : a list of associated molecule docs sharing
                one formula

        Returns:
            [dict] : a list of new molecule docs
        """

        # Nothing to merge; also avoids indexing into an empty list below.
        if not items:
            return list()

        assoc = [MoleculeDoc(**item) for item in items]
        formula = assoc[0].formula_alphabetical
        mol_ids = [a.molecule_id for a in assoc]
        self.logger.debug(f"Processing {formula} : {mol_ids}")

        complete_mol_docs = list()

        # Need to combine many variables of the various constituent associated docs
        # into one MoleculeDoc, where the best associated doc for each solvent is taken
        for group in self.group_mol_docs(assoc):
            # Maybe all are disconnected and therefore none get grouped?
            if len(group) == 0:
                continue

            merged = self._merge_group(group)
            if merged is not None:
                complete_mol_docs.append(merged)

        self.logger.debug(f"Produced {len(complete_mol_docs)} molecules for {formula}")

        return jsanitize(
            [mol.model_dump() for mol in complete_mol_docs], allow_bson=True
        )

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new molecules into the molecules collection

        Existing documents with the same molecule ids are removed before the
        new ones are written.

        Args:
            items [[dict]]: A list of molecules to update
        """

        self.logger.debug(f"Updating {len(items)} molecules")

        docs = list(chain.from_iterable(items))  # type: ignore

        # Add timestamp, and point every entry at its molecule id
        for item in docs:
            molid = item["molecule_id"]

            item.update({"_bt": self.timestamp})

            for entry in item["entries"]:
                entry["entry_id"] = molid

        molecule_ids = list({item["molecule_id"] for item in docs})

        # Guard on the flattened documents rather than on ``items``: a batch
        # made only of empty lists carries nothing to write.
        if docs:
            self.logger.info(f"Updating {len(docs)} molecules")
            self.molecules.remove_docs({self.molecules.key: {"$in": molecule_ids}})
            self.molecules.update(
                docs=docs,
                key=["molecule_id"],
            )
        else:
            self.logger.info("No items to update")

    def group_mol_docs(self, assoc: List[MoleculeDoc]) -> Iterator[List[MoleculeDoc]]:
        """
        Groups molecules by:
            - charge
            - spin multiplicity
            - species hash

        Documents whose molecule graph (as built by ``make_mol_graph``) is not
        connected are discarded.

        Args:
            assoc: associated docs, expected to share one formula

        Returns:
            generator of lists of MoleculeDocs, one list per group
        """

        # Molecules are already grouped by formula

        def charge_spin(mol_doc):
            return (mol_doc.charge, mol_doc.spin_multiplicity)

        # Group by charge and spin
        for _, group in groupby(sorted(assoc, key=charge_spin), key=charge_spin):
            # Insertion-ordered mapping of species hash -> docs with that hash
            docs_by_hash: Dict[Any, List[MoleculeDoc]] = dict()
            for mol_doc in group:
                mol_graph = make_mol_graph(mol_doc.molecule)

                # Unconnected molecule graphs are discarded at this step
                # TODO: What about molecules that would be connected under a different
                # TODO: bonding scheme? For now, ¯\_(ツ)_/¯
                # TODO: MAKE ClusterBuilder FOR THIS PURPOSE
                if not nx.is_connected(mol_graph.graph.to_undirected()):
                    continue

                docs_by_hash.setdefault(mol_doc.species_hash, []).append(mol_doc)

            self.logger.debug(f"Unique hashes: {list(docs_by_hash)}")

            yield from docs_by_hash.values()