emmet-builders 0.84.10rc2__py3-none-any.whl → 0.85.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of emmet-builders might be problematic. Click here for more details.
- emmet/builders/abinit/phonon.py +12 -14
- emmet/builders/abinit/sound_velocity.py +1 -1
- emmet/builders/materials/absorption_spectrum.py +16 -10
- emmet/builders/materials/alloys.py +1 -1
- emmet/builders/materials/corrected_entries.py +1 -1
- emmet/builders/materials/dielectric.py +10 -7
- emmet/builders/materials/elasticity.py +12 -9
- emmet/builders/materials/electrodes.py +1 -1
- emmet/builders/materials/electronic_structure.py +1 -1
- emmet/builders/materials/magnetism.py +2 -1
- emmet/builders/materials/piezoelectric.py +23 -19
- emmet/builders/materials/provenance.py +3 -4
- emmet/builders/materials/summary.py +1 -1
- emmet/builders/settings.py +14 -9
- emmet/builders/utils.py +5 -4
- emmet/builders/vasp/materials.py +11 -4
- emmet/builders/vasp/task_validator.py +3 -1
- {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/METADATA +7 -30
- emmet_builders-0.85.0.dist-info/RECORD +41 -0
- emmet/builders/materials/ml.py +0 -101
- emmet/builders/molecules/atomic.py +0 -592
- emmet/builders/molecules/bonds.py +0 -329
- emmet/builders/molecules/electric.py +0 -287
- emmet/builders/molecules/metal_binding.py +0 -528
- emmet/builders/molecules/orbitals.py +0 -292
- emmet/builders/molecules/redox.py +0 -502
- emmet/builders/molecules/summary.py +0 -406
- emmet/builders/molecules/thermo.py +0 -505
- emmet/builders/molecules/trajectory.py +0 -530
- emmet/builders/molecules/vibration.py +0 -282
- emmet/builders/qchem/__init__.py +0 -0
- emmet/builders/qchem/molecules.py +0 -745
- emmet_builders-0.84.10rc2.dist-info/RECORD +0 -54
- /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
- {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/WHEEL +0 -0
- {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/top_level.txt +0 -0
emmet/builders/materials/ml.py
DELETED
|
@@ -1,101 +0,0 @@
|
|
|
1
|
-
from importlib.metadata import version
|
|
2
|
-
|
|
3
|
-
from maggma.builders.map_builder import MapBuilder
|
|
4
|
-
from maggma.core import Store
|
|
5
|
-
|
|
6
|
-
try:
|
|
7
|
-
from matcalc import PESCalculator
|
|
8
|
-
|
|
9
|
-
matcalc_installed = True
|
|
10
|
-
except ImportError:
|
|
11
|
-
matcalc_installed = False
|
|
12
|
-
|
|
13
|
-
from pymatgen.core import Structure
|
|
14
|
-
|
|
15
|
-
from emmet.core.ml import MLDoc
|
|
16
|
-
from emmet.core.utils import jsanitize
|
|
17
|
-
|
|
18
|
-
try:
|
|
19
|
-
from ase.calculators.calculator import Calculator
|
|
20
|
-
|
|
21
|
-
ase_installed = True
|
|
22
|
-
except ImportError:
|
|
23
|
-
ase_installed = False
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
class MLBuilder(MapBuilder):
    """Map builder that produces one MLDoc per material using an ML potential."""

    def __init__(
        self,
        materials: Store,
        ml_potential: Store,
        # Quoted on purpose: this module does not use postponed annotations, so
        # an unquoted `str | Calculator` is evaluated when the class body runs.
        # If the optional ase import failed, `Calculator` is undefined and the
        # module would die with NameError before the ImportError below could
        # ever be raised.
        model: "str | Calculator",
        model_kwargs: dict | None = None,
        prop_kwargs: dict | None = None,
        provenance: dict | None = None,
        **kwargs,
    ):
        """Machine learning interatomic potential builder.

        Args:
            materials (Store): Materials to use as input structures.
            ml_potential (Store): Where to save MLDoc documents to.
            model (str | Calculator): ASE calculator or name of model to use as ML
                potential. See matcalc.utils.UNIVERSAL_CALCULATORS for recognized names.
            model_kwargs (dict, optional): Additional kwargs to pass to the calculator.
                Defaults to None.
            prop_kwargs (dict[str, dict], optional): Separate kwargs passed to each matcalc
                PropCalc class. Recognized keys are RelaxCalc, ElasticityCalc, PhononCalc, EOSCalc.
                Defaults to None.
            provenance (dict, optional): Additional provenance information to include in
                MLDocs. Will be saved in each document so use sparingly. Defaults to None.
                Set to {} to disable default provenance model, version, matcalc_version.
            **kwargs: Forwarded unchanged to ``MapBuilder.__init__``.

        Raises:
            ImportError: If either matcalc or ase could not be imported.
        """

        if not matcalc_installed or not ase_installed:
            raise ImportError("Please `pip install matcalc` to use the MLBuilder.")

        self.materials = materials
        self.ml_potential = ml_potential
        self.kwargs = kwargs
        self.model = PESCalculator.load_universal(model, **(model_kwargs or {}))
        self.prop_kwargs = prop_kwargs or {}

        # An explicitly empty dict switches default provenance off; None (the
        # default) and any non-empty dict both get the defaults merged in.
        if provenance == {}:
            self.provenance = {}
        else:
            model_name = (
                model if isinstance(model, str) else type(model).__name__
            ).lower()
            # Map calculator class names / model names onto the distribution
            # whose installed version should be recorded.
            model_name = {"chgnetcalculator": "chgnet"}.get(model_name, model_name)
            pkg_name = {"m3gnet": "matgl"}.get(model_name, model_name)
            # NOTE: importlib.metadata.version raises PackageNotFoundError when
            # no installed distribution is called `pkg_name`.
            self.provenance = dict(
                model=model_name,
                version=version(pkg_name),
                matcalc_version=version("matcalc"),
                **(provenance or {}),
            )

        # Enforce that we key on material_id
        self.materials.key = "material_id"
        self.ml_potential.key = "material_id"
        super().__init__(
            source=materials,
            target=ml_potential,
            projection=["structure", "deprecated"],
            **kwargs,
        )

    def unary_function(self, item):
        """Build the MLDoc for a single materials document.

        Args:
            item (dict): Source document; its "structure", "material_id" and
                "deprecated" entries are read.

        Returns:
            The MLDoc passed through ``jsanitize(..., allow_bson=True)``.
        """
        struct = Structure.from_dict(item["structure"])
        mp_id, deprecated = item["material_id"], item["deprecated"]

        doc = MLDoc(
            structure=struct,
            material_id=mp_id,
            calculator=self.model,
            prop_kwargs=self.prop_kwargs,
            deprecated=deprecated,
            **self.provenance,
        )

        return jsanitize(doc, allow_bson=True)
|
|
@@ -1,592 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from collections import defaultdict
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from itertools import chain
|
|
6
|
-
from math import ceil
|
|
7
|
-
|
|
8
|
-
from maggma.builders import Builder
|
|
9
|
-
from maggma.core import Store
|
|
10
|
-
from maggma.utils import grouper
|
|
11
|
-
|
|
12
|
-
from emmet.builders.settings import EmmetBuildSettings
|
|
13
|
-
from emmet.core.molecules.atomic import (
|
|
14
|
-
CHARGES_METHODS,
|
|
15
|
-
SPINS_METHODS,
|
|
16
|
-
PartialChargesDoc,
|
|
17
|
-
PartialSpinsDoc,
|
|
18
|
-
)
|
|
19
|
-
from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
|
|
20
|
-
from emmet.core.qchem.task import TaskDocument
|
|
21
|
-
from emmet.core.utils import jsanitize
|
|
22
|
-
|
|
23
|
-
from typing import TYPE_CHECKING
|
|
24
|
-
|
|
25
|
-
if TYPE_CHECKING:
|
|
26
|
-
from collections.abc import Iterable, Iterator
|
|
27
|
-
|
|
28
|
-
__author__ = "Evan Spotte-Smith"
|
|
29
|
-
|
|
30
|
-
SETTINGS = EmmetBuildSettings()
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
class PartialChargesBuilder(Builder):
    """
    The PartialChargesBuilder extracts partial charges data from a MoleculeDoc.

    Various methods can be used to define partial charges, including:
        - Mulliken
        - Restrained Electrostatic Potential (RESP)
        - Critic2
        - Natural Bonding Orbital (NBO) population analysis

    This builder will attempt to build documents for each molecule, in each solvent,
    with each method. For each molecule-solvent-method combination, the
    highest-quality data available (based on level of theory and electronic
    energy) will be used.

    The process is as follows:
        1. Gather MoleculeDocs by species hash
        2. For each molecule, group all tasks by solvent.
        3. For each solvent, sort tasks by level of theory and electronic energy
        4. For each method:
            4.1. Find task docs with necessary data to calculate partial charges by that method
            4.2. Take best (defined by level of theory and electronic energy) task
            4.3. Convert TaskDoc to PartialChargesDoc
    """

    def __init__(
        self,
        tasks: Store,
        molecules: Store,
        charges: Store,
        query: dict | None = None,
        methods: list | None = None,
        settings: EmmetBuildSettings | None = None,
        **kwargs,
    ):
        """
        Args:
            tasks: Store queried for the task documents behind each entry.
            molecules: Store of molecule documents to process.
            charges: Store the partial charges documents are written to.
            query: Extra criteria applied to the molecules Store; an empty
                dict is used when omitted or falsy.
            methods: Methods to build documents for; falls back to
                CHARGES_METHODS when omitted or falsy.
            settings: Passed to ``EmmetBuildSettings.autoload``.
            **kwargs: Forwarded unchanged to ``Builder.__init__``.
        """
        self.tasks = tasks
        self.molecules = molecules
        self.charges = charges
        self.query = query if query else dict()
        self.methods = methods if methods else CHARGES_METHODS
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks, molecules], targets=[charges], **kwargs)
        # Uncomment in case of issue with mrun not connecting automatically to collections
        # for i in [self.tasks, self.molecules, self.charges]:
        #     try:
        #         i.connect()
        #     except Exception as e:
        #         print("Could not connect,", e)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        for field in (
            "task_id",
            "last_updated",
            "state",
            "formula_alphabetical",
            "species_hash",
        ):
            self.tasks.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
            "species_hash",
        ):
            self.molecules.ensure_index(field)

        # Search index for charges
        for field in (
            "molecule_id",
            "task_id",
            "method",
            "solvent",
            "lot_solvent",
            "property_id",
            "last_updated",
            "formula_alphabetical",
        ):
            self.charges.ensure_index(field)

    def _find_unprocessed(self) -> tuple[dict, set, set]:
        """Work out which molecules have no document in the charges Store yet.

        Shared by ``prechunk`` and ``get_items`` so both select the same work.

        Returns:
            A tuple ``(base_query, to_process_docs, to_process_hashes)``: the
            user query with ``deprecated=False`` added, the molecule keys that
            do not appear as a ``molecule_id`` in the charges Store, and the
            species hashes of those molecules.
        """
        base_query = dict(self.query)
        base_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        all_mols = list(
            self.molecules.query(base_query, [self.molecules.key, "species_hash"])
        )

        processed_docs = set(self.charges.distinct("molecule_id"))
        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
        to_process_hashes = {
            d["species_hash"]
            for d in all_mols
            if d[self.molecules.key] in to_process_docs
        }
        return base_query, to_process_docs, to_process_hashes

    def _get_task_doc(self, task_id, species_hash) -> dict | None:
        """Fetch the raw task document for ``task_id``, or None if not found.

        The lookup is tried with ``task_id`` as given and, if that finds
        nothing, once more with ``int(task_id)``. Only tasks carrying an
        "orig" field are matched.
        """
        tdoc = self.tasks.query_one(
            {
                "task_id": task_id,
                "species_hash": species_hash,
                "orig": {"$exists": True},
            }
        )

        if tdoc is None:
            try:
                tdoc = self.tasks.query_one(
                    {
                        "task_id": int(task_id),
                        "species_hash": species_hash,
                        "orig": {"$exists": True},
                    }
                )
            except ValueError:
                # task_id is not an integer-like string; nothing more to try.
                tdoc = None

        return tdoc

    def prechunk(self, number_splits: int) -> Iterable[dict]:  # pragma: no cover
        """Prechunk the builder for distributed computation"""

        base_query, _, to_process_hashes = self._find_unprocessed()

        N = ceil(len(to_process_hashes) / number_splits)

        for hash_chunk in grouper(to_process_hashes, N):
            query = dict(base_query)
            query["species_hash"] = {"$in": list(hash_chunk)}
            yield {"query": query}

    def get_items(self) -> Iterator[list[dict]]:
        """
        Gets all items to process into partial charges documents.
        This does no datetime checking; it relies on whether
        molecule_ids are already present in the charges Store

        Returns:
            generator yielding, for each unprocessed species hash, the list of
            molecule documents sharing that hash
        """

        self.logger.info("Partial charges builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        base_query, to_process_docs, to_process_hashes = self._find_unprocessed()

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")

        # Set total for builder bars to have a total
        self.total = len(to_process_hashes)

        for shash in to_process_hashes:
            mol_query = dict(base_query)
            mol_query["species_hash"] = shash
            molecules = list(self.molecules.query(criteria=mol_query))

            yield molecules

    def process_item(self, items: list[dict]) -> list[dict]:
        """
        Process the tasks into PartialChargesDocs

        Args:
            items list[dict] : a list of MoleculeDocs in dict form

        Returns:
            [dict] : a list of new partial charges docs
        """

        # An empty batch has no species hash to report and nothing to build.
        if not items:
            return []

        mols = [MoleculeDoc(**item) for item in items]
        shash = mols[0].species_hash
        mol_ids = [m.molecule_id for m in mols]
        self.logger.debug(f"Processing {shash} : {mol_ids}")

        charges_docs = list()

        for mol in mols:
            correct_charge_spin = [
                e
                for e in mol.entries
                if e["charge"] == mol.charge
                and e["spin_multiplicity"] == mol.spin_multiplicity
            ]

            # Organize by solvent environment
            by_solvent = defaultdict(list)
            for entry in correct_charge_spin:
                by_solvent[entry["solvent"]].append(entry)

            for solvent, entries in by_solvent.items():
                # Ascending sort, so the entry considered best comes first.
                sorted_entries = sorted(
                    entries,
                    key=lambda x: (
                        sum(evaluate_lot(x["level_of_theory"])),
                        x["energy"],
                    ),
                )

                for method in self.methods:
                    # For each method, grab entries that have the relevant data
                    relevant_entries = [
                        e
                        for e in sorted_entries
                        if e.get(method) is not None
                        or e["output"].get(method) is not None
                    ]

                    if len(relevant_entries) == 0:
                        continue

                    # Grab task document of best entry
                    best_entry = relevant_entries[0]
                    tdoc = self._get_task_doc(best_entry["task_id"], shash)

                    if tdoc is None:
                        continue

                    task_doc = TaskDocument(**tdoc)

                    doc = PartialChargesDoc.from_task(
                        task_doc,
                        molecule_id=mol.molecule_id,
                        preferred_methods=[method],
                        deprecated=False,
                    )

                    charges_docs.append(doc)

        self.logger.debug(f"Produced {len(charges_docs)} charges docs for {shash}")

        return jsanitize([doc.model_dump() for doc in charges_docs], allow_bson=True)

    def update_targets(self, items: list[list[dict]]):
        """
        Inserts the new documents into the charges collection

        Args:
            items [[dict]]: A list of documents to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Decide on the flattened documents: a batch made only of empty
        # per-item lists has nothing to write either.
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp
        for item in docs:
            item.update(
                {
                    "_bt": self.timestamp,
                }
            )

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} partial charges documents")
        self.charges.remove_docs({self.charges.key: {"$in": molecule_ids}})
        # Neither molecule_id nor method need to be unique, but the combination must be
        self.charges.update(
            docs=docs,
            key=["molecule_id", "method", "solvent"],
        )
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
class PartialSpinsBuilder(Builder):
    """
    The PartialSpinsBuilder extracts partial spin data from a MoleculeDoc.

    Various methods can be used to define partial atomic spins, including:
        - Mulliken
        - Natural Bonding Orbital (NBO) population analysis

    This builder will attempt to build documents for each molecule, in each solvent,
    with each method. For each molecule-solvent-method combination, the
    highest-quality data available (based on level of theory and electronic
    energy) will be used.

    The process is as follows:
        1. Gather MoleculeDocs by species_hash
        2. For each molecule, group all tasks by solvent.
        3. For each solvent, sort tasks by level of theory and electronic energy
        4. For each method:
            4.1. Find task docs with necessary data to calculate partial spins by that method
            4.2. Take best (defined by level of theory and electronic energy) task
            4.3. Convert TaskDoc to PartialSpinsDoc
    """

    def __init__(
        self,
        tasks: Store,
        molecules: Store,
        spins: Store,
        query: dict | None = None,
        methods: list | None = None,
        settings: EmmetBuildSettings | None = None,
        **kwargs,
    ):
        """
        Args:
            tasks: Store queried for the task documents behind each entry.
            molecules: Store of molecule documents to process.
            spins: Store the partial spins documents are written to.
            query: Extra criteria applied to the molecules Store; an empty
                dict is used when omitted or falsy.
            methods: Methods to build documents for; falls back to
                SPINS_METHODS when omitted or falsy.
            settings: Passed to ``EmmetBuildSettings.autoload``.
            **kwargs: Forwarded unchanged to ``Builder.__init__``.
        """
        self.tasks = tasks
        self.molecules = molecules
        self.spins = spins
        self.query = query if query else dict()
        self.methods = methods if methods else SPINS_METHODS
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks, molecules], targets=[spins], **kwargs)
        # Uncomment in case of issue with mrun not connecting automatically to collections
        # for i in [self.tasks, self.molecules, self.spins]:
        #     try:
        #         i.connect()
        #     except Exception as e:
        #         print("Could not connect,", e)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        for field in (
            "task_id",
            "last_updated",
            "state",
            "formula_alphabetical",
            "species_hash",
        ):
            self.tasks.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
            "species_hash",
        ):
            self.molecules.ensure_index(field)

        # Search index for spins
        for field in (
            "molecule_id",
            "task_id",
            "method",
            "solvent",
            "lot_solvent",
            "property_id",
            "last_updated",
            "formula_alphabetical",
        ):
            self.spins.ensure_index(field)

    def _find_unprocessed(self) -> tuple[dict, set, set]:
        """Work out which molecules have no document in the spins Store yet.

        Shared by ``prechunk`` and ``get_items`` so both select the same work.

        Returns:
            A tuple ``(base_query, to_process_docs, to_process_hashes)``: the
            user query with ``deprecated=False`` added, the molecule keys that
            do not appear as a ``molecule_id`` in the spins Store, and the
            species hashes of those molecules.
        """
        base_query = dict(self.query)
        base_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        all_mols = list(
            self.molecules.query(base_query, [self.molecules.key, "species_hash"])
        )

        processed_docs = set(self.spins.distinct("molecule_id"))
        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
        to_process_hashes = {
            d["species_hash"]
            for d in all_mols
            if d[self.molecules.key] in to_process_docs
        }
        return base_query, to_process_docs, to_process_hashes

    def _get_task_doc(self, task_id, species_hash) -> dict | None:
        """Fetch the raw task document for ``task_id``, or None if not found.

        The lookup is tried with ``task_id`` as given and, if that finds
        nothing, once more with ``int(task_id)``. Only tasks carrying an
        "orig" field are matched.
        """
        tdoc = self.tasks.query_one(
            {
                "task_id": task_id,
                "species_hash": species_hash,
                "orig": {"$exists": True},
            }
        )

        if tdoc is None:
            try:
                tdoc = self.tasks.query_one(
                    {
                        "task_id": int(task_id),
                        "species_hash": species_hash,
                        "orig": {"$exists": True},
                    }
                )
            except ValueError:
                # task_id is not an integer-like string; nothing more to try.
                tdoc = None

        return tdoc

    def prechunk(self, number_splits: int) -> Iterable[dict]:  # pragma: no cover
        """Prechunk the builder for distributed computation"""

        base_query, _, to_process_hashes = self._find_unprocessed()

        N = ceil(len(to_process_hashes) / number_splits)

        for hash_chunk in grouper(to_process_hashes, N):
            query = dict(base_query)
            query["species_hash"] = {"$in": list(hash_chunk)}
            yield {"query": query}

    def get_items(self) -> Iterator[list[dict]]:
        """
        Gets all items to process into partial spins documents.
        This does no datetime checking; it relies on whether
        molecule_ids are already present in the spins Store

        Returns:
            generator yielding, for each unprocessed species hash, the list of
            molecule documents sharing that hash
        """

        self.logger.info("Partial spins builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        base_query, to_process_docs, to_process_hashes = self._find_unprocessed()

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")

        # Set total for builder bars to have a total
        self.total = len(to_process_hashes)

        for shash in to_process_hashes:
            mol_query = dict(base_query)
            mol_query["species_hash"] = shash
            molecules = list(self.molecules.query(criteria=mol_query))

            yield molecules

    def process_item(self, items: list[dict]) -> list[dict]:
        """
        Process the tasks into PartialSpinsDocs

        Args:
            items list[dict] : a list of MoleculeDocs in dict form

        Returns:
            [dict] : a list of new partial spins docs
        """

        # An empty batch has no species hash to report and nothing to build.
        if not items:
            return []

        mols = [MoleculeDoc(**item) for item in items]
        shash = mols[0].species_hash
        mol_ids = [m.molecule_id for m in mols]
        self.logger.debug(f"Processing {shash} : {mol_ids}")

        spins_docs = list()

        for mol in mols:
            # Molecule with spin multiplicity 1 has no partial spins
            if mol.spin_multiplicity == 1:
                continue

            correct_charge_spin = [
                e
                for e in mol.entries
                if e["charge"] == mol.charge
                and e["spin_multiplicity"] == mol.spin_multiplicity
            ]

            # Organize by solvent environment
            by_solvent = defaultdict(list)
            for entry in correct_charge_spin:
                by_solvent[entry["solvent"]].append(entry)

            for solvent, entries in by_solvent.items():
                # Ascending sort, so the entry considered best comes first.
                sorted_entries = sorted(
                    entries,
                    key=lambda x: (
                        sum(evaluate_lot(x["level_of_theory"])),
                        x["energy"],
                    ),
                )

                for method in self.methods:
                    # For each method, grab entries that have the relevant data
                    relevant_entries = [
                        e
                        for e in sorted_entries
                        if e.get(method) is not None
                        or e["output"].get(method) is not None
                    ]

                    if len(relevant_entries) == 0:
                        continue

                    # Grab task document of best entry
                    best_entry = relevant_entries[0]
                    tdoc = self._get_task_doc(best_entry["task_id"], shash)

                    if tdoc is None:
                        continue

                    task_doc = TaskDocument(**tdoc)

                    doc = PartialSpinsDoc.from_task(
                        task_doc,
                        molecule_id=mol.molecule_id,
                        preferred_methods=[method],
                        deprecated=False,
                    )

                    spins_docs.append(doc)

        self.logger.debug(f"Produced {len(spins_docs)} partial spins docs for {shash}")

        return jsanitize([doc.model_dump() for doc in spins_docs], allow_bson=True)

    def update_targets(self, items: list[list[dict]]):
        """
        Inserts the new documents into the spins collection

        Args:
            items [[dict]]: A list of documents to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Decide on the flattened documents: a batch made only of empty
        # per-item lists has nothing to write either.
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp
        for item in docs:
            item.update(
                {
                    "_bt": self.timestamp,
                }
            )

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} partial spins documents")
        self.spins.remove_docs({self.spins.key: {"$in": molecule_ids}})
        # Neither molecule_id nor method need to be unique, but the combination must be
        self.spins.update(
            docs=docs,
            key=["molecule_id", "method", "solvent"],
        )
|