emmet-builders 0.84.2rc7__py3-none-any.whl → 0.84.2rc9__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Potentially problematic release.
This version of emmet-builders might be problematic. Click here for more details.
- emmet/builders/molecules/atomic.py +48 -46
- emmet/builders/molecules/bonds.py +24 -24
- emmet/builders/molecules/metal_binding.py +21 -20
- emmet/builders/molecules/orbitals.py +23 -23
- emmet/builders/molecules/redox.py +27 -27
- emmet/builders/molecules/summary.py +21 -36
- emmet/builders/molecules/thermo.py +23 -23
- emmet/builders/molecules/vibration.py +23 -23
- emmet/builders/qchem/molecules.py +15 -21
- emmet/builders/vasp/mp_potcar_stats.json.gz +0 -0
- {emmet_builders-0.84.2rc7.dist-info → emmet_builders-0.84.2rc9.dist-info}/METADATA +1 -1
- {emmet_builders-0.84.2rc7.dist-info → emmet_builders-0.84.2rc9.dist-info}/RECORD +14 -16
- emmet/builders/molecules/electric.py +0 -282
- emmet/builders/molecules/trajectory.py +0 -525
- {emmet_builders-0.84.2rc7.dist-info → emmet_builders-0.84.2rc9.dist-info}/WHEEL +0 -0
- {emmet_builders-0.84.2rc7.dist-info → emmet_builders-0.84.2rc9.dist-info}/top_level.txt +0 -0
|
@@ -30,7 +30,7 @@ class RedoxBuilder(Builder):
|
|
|
30
30
|
from a MoleculeDoc (lowest electronic energy, highest level of theory).
|
|
31
31
|
|
|
32
32
|
The process is as follows:
|
|
33
|
-
1. Gather MoleculeDocs by
|
|
33
|
+
1. Gather MoleculeDocs by formula
|
|
34
34
|
2. Further group based on (covalent) isomorphism and charge
|
|
35
35
|
3. For each MoleculeDoc:
|
|
36
36
|
3a. Identify relevant MoleculeThermoDocs
|
|
@@ -81,14 +81,12 @@ class RedoxBuilder(Builder):
|
|
|
81
81
|
self.tasks.ensure_index("last_updated")
|
|
82
82
|
self.tasks.ensure_index("state")
|
|
83
83
|
self.tasks.ensure_index("formula_alphabetical")
|
|
84
|
-
self.tasks.ensure_index("species_hash")
|
|
85
84
|
|
|
86
85
|
# Search index for molecules
|
|
87
86
|
self.molecules.ensure_index("molecule_id")
|
|
88
87
|
self.molecules.ensure_index("last_updated")
|
|
89
88
|
self.molecules.ensure_index("task_ids")
|
|
90
89
|
self.molecules.ensure_index("formula_alphabetical")
|
|
91
|
-
self.molecules.ensure_index("species_hash")
|
|
92
90
|
|
|
93
91
|
# Search index for thermo
|
|
94
92
|
self.thermo.ensure_index("molecule_id")
|
|
@@ -115,23 +113,23 @@ class RedoxBuilder(Builder):
|
|
|
115
113
|
|
|
116
114
|
self.logger.info("Finding documents to process")
|
|
117
115
|
all_mols = list(
|
|
118
|
-
self.molecules.query(
|
|
116
|
+
self.molecules.query(
|
|
117
|
+
temp_query, [self.molecules.key, "formula_alphabetical"]
|
|
118
|
+
)
|
|
119
119
|
)
|
|
120
120
|
|
|
121
121
|
processed_docs = set([e for e in self.redox.distinct("molecule_id")])
|
|
122
122
|
to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
|
|
123
|
-
|
|
124
|
-
d["
|
|
123
|
+
to_process_forms = {
|
|
124
|
+
d["formula_alphabetical"]
|
|
125
125
|
for d in all_mols
|
|
126
126
|
if d[self.molecules.key] in to_process_docs
|
|
127
127
|
}
|
|
128
128
|
|
|
129
|
-
N = ceil(len(
|
|
129
|
+
N = ceil(len(to_process_forms) / number_splits)
|
|
130
130
|
|
|
131
|
-
for
|
|
132
|
-
query
|
|
133
|
-
query["species_hash"] = {"$in": list(hash_chunk)}
|
|
134
|
-
yield {"query": query}
|
|
131
|
+
for formula_chunk in grouper(to_process_forms, N):
|
|
132
|
+
yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
|
|
135
133
|
|
|
136
134
|
def get_items(self) -> Iterator[List[Dict]]:
|
|
137
135
|
"""
|
|
@@ -156,26 +154,28 @@ class RedoxBuilder(Builder):
|
|
|
156
154
|
|
|
157
155
|
self.logger.info("Finding documents to process")
|
|
158
156
|
all_mols = list(
|
|
159
|
-
self.molecules.query(
|
|
157
|
+
self.molecules.query(
|
|
158
|
+
temp_query, [self.molecules.key, "formula_alphabetical"]
|
|
159
|
+
)
|
|
160
160
|
)
|
|
161
161
|
|
|
162
162
|
processed_docs = set([e for e in self.redox.distinct("molecule_id")])
|
|
163
163
|
to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
|
|
164
|
-
|
|
165
|
-
d["
|
|
164
|
+
to_process_forms = {
|
|
165
|
+
d["formula_alphabetical"]
|
|
166
166
|
for d in all_mols
|
|
167
167
|
if d[self.molecules.key] in to_process_docs
|
|
168
168
|
}
|
|
169
169
|
|
|
170
170
|
self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
|
|
171
|
-
self.logger.info(f"Found {len(
|
|
171
|
+
self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
|
|
172
172
|
|
|
173
173
|
# Set total for builder bars to have a total
|
|
174
|
-
self.total = len(
|
|
174
|
+
self.total = len(to_process_forms)
|
|
175
175
|
|
|
176
|
-
for
|
|
176
|
+
for formula in to_process_forms:
|
|
177
177
|
mol_query = dict(temp_query)
|
|
178
|
-
mol_query["
|
|
178
|
+
mol_query["formula_alphabetical"] = formula
|
|
179
179
|
molecules = list(self.molecules.query(criteria=mol_query))
|
|
180
180
|
|
|
181
181
|
yield molecules
|
|
@@ -192,9 +192,9 @@ class RedoxBuilder(Builder):
|
|
|
192
192
|
"""
|
|
193
193
|
|
|
194
194
|
mols = [MoleculeDoc(**item) for item in items]
|
|
195
|
-
|
|
195
|
+
formula = mols[0].formula_alphabetical
|
|
196
196
|
mol_ids = [m.molecule_id for m in mols]
|
|
197
|
-
self.logger.debug(f"Processing {
|
|
197
|
+
self.logger.debug(f"Processing {formula} : {mol_ids}")
|
|
198
198
|
|
|
199
199
|
redox_docs = list()
|
|
200
200
|
|
|
@@ -220,7 +220,7 @@ class RedoxBuilder(Builder):
|
|
|
220
220
|
e["task_id"]
|
|
221
221
|
for e in gg.entries
|
|
222
222
|
if e["charge"] == gg.charge + 1
|
|
223
|
-
and e["task_type"]
|
|
223
|
+
and e["task_type"] == "Single Point"
|
|
224
224
|
and e["output"].get("final_energy")
|
|
225
225
|
]
|
|
226
226
|
ie_tasks = list()
|
|
@@ -228,7 +228,7 @@ class RedoxBuilder(Builder):
|
|
|
228
228
|
tdoc = self.tasks.query_one(
|
|
229
229
|
{
|
|
230
230
|
"task_id": i,
|
|
231
|
-
"
|
|
231
|
+
"formula_alphabetical": formula,
|
|
232
232
|
"orig": {"$exists": True},
|
|
233
233
|
}
|
|
234
234
|
)
|
|
@@ -238,7 +238,7 @@ class RedoxBuilder(Builder):
|
|
|
238
238
|
tdoc = self.tasks.query_one(
|
|
239
239
|
{
|
|
240
240
|
"task_id": int(i),
|
|
241
|
-
"
|
|
241
|
+
"formula_alphabetical": formula,
|
|
242
242
|
"orig": {"$exists": True},
|
|
243
243
|
}
|
|
244
244
|
)
|
|
@@ -254,7 +254,7 @@ class RedoxBuilder(Builder):
|
|
|
254
254
|
e["task_id"]
|
|
255
255
|
for e in gg.entries
|
|
256
256
|
if e["charge"] == gg.charge - 1
|
|
257
|
-
and e["task_type"]
|
|
257
|
+
and e["task_type"] == "Single Point"
|
|
258
258
|
and e["output"].get("final_energy")
|
|
259
259
|
]
|
|
260
260
|
ea_tasks = list()
|
|
@@ -262,7 +262,7 @@ class RedoxBuilder(Builder):
|
|
|
262
262
|
tdoc = self.tasks.query_one(
|
|
263
263
|
{
|
|
264
264
|
"task_id": i,
|
|
265
|
-
"
|
|
265
|
+
"formula_alphabetical": formula,
|
|
266
266
|
"orig": {"$exists": True},
|
|
267
267
|
}
|
|
268
268
|
)
|
|
@@ -272,7 +272,7 @@ class RedoxBuilder(Builder):
|
|
|
272
272
|
tdoc = self.tasks.query_one(
|
|
273
273
|
{
|
|
274
274
|
"task_id": int(i),
|
|
275
|
-
"
|
|
275
|
+
"formula_alphabetical": formula,
|
|
276
276
|
"orig": {"$exists": True},
|
|
277
277
|
}
|
|
278
278
|
)
|
|
@@ -354,7 +354,7 @@ class RedoxBuilder(Builder):
|
|
|
354
354
|
)
|
|
355
355
|
)
|
|
356
356
|
|
|
357
|
-
self.logger.debug(f"Produced {len(redox_docs)} redox docs for {
|
|
357
|
+
self.logger.debug(f"Produced {len(redox_docs)} redox docs for {formula}")
|
|
358
358
|
|
|
359
359
|
return jsanitize(
|
|
360
360
|
[doc.model_dump() for doc in redox_docs if doc is not None], allow_bson=True
|
|
@@ -36,7 +36,6 @@ class SummaryBuilder(Builder):
|
|
|
36
36
|
charges: Store,
|
|
37
37
|
spins: Store,
|
|
38
38
|
bonds: Store,
|
|
39
|
-
multipoles: Store,
|
|
40
39
|
metal_binding: Store,
|
|
41
40
|
orbitals: Store,
|
|
42
41
|
redox: Store,
|
|
@@ -51,7 +50,6 @@ class SummaryBuilder(Builder):
|
|
|
51
50
|
self.charges = charges
|
|
52
51
|
self.spins = spins
|
|
53
52
|
self.bonds = bonds
|
|
54
|
-
self.multipoles = multipoles
|
|
55
53
|
self.metal_binding = metal_binding
|
|
56
54
|
self.orbitals = orbitals
|
|
57
55
|
self.redox = redox
|
|
@@ -68,7 +66,6 @@ class SummaryBuilder(Builder):
|
|
|
68
66
|
charges,
|
|
69
67
|
spins,
|
|
70
68
|
bonds,
|
|
71
|
-
multipoles,
|
|
72
69
|
metal_binding,
|
|
73
70
|
orbitals,
|
|
74
71
|
redox,
|
|
@@ -84,7 +81,6 @@ class SummaryBuilder(Builder):
|
|
|
84
81
|
# self.charges,
|
|
85
82
|
# self.spins,
|
|
86
83
|
# self.bonds,
|
|
87
|
-
# self.multipoles,
|
|
88
84
|
# self.metal_binding,
|
|
89
85
|
# self.orbitals,
|
|
90
86
|
# self.redox,
|
|
@@ -107,7 +103,6 @@ class SummaryBuilder(Builder):
|
|
|
107
103
|
self.molecules.ensure_index("last_updated")
|
|
108
104
|
self.molecules.ensure_index("task_ids")
|
|
109
105
|
self.molecules.ensure_index("formula_alphabetical")
|
|
110
|
-
self.molecules.ensure_index("species_hash")
|
|
111
106
|
|
|
112
107
|
# Search index for charges
|
|
113
108
|
self.charges.ensure_index("molecule_id")
|
|
@@ -139,15 +134,6 @@ class SummaryBuilder(Builder):
|
|
|
139
134
|
self.bonds.ensure_index("last_updated")
|
|
140
135
|
self.bonds.ensure_index("formula_alphabetical")
|
|
141
136
|
|
|
142
|
-
# Search index for multipoles
|
|
143
|
-
self.multipoles.ensure_index("molecule_id")
|
|
144
|
-
self.multipoles.ensure_index("task_id")
|
|
145
|
-
self.multipoles.ensure_index("solvent")
|
|
146
|
-
self.multipoles.ensure_index("lot_solvent")
|
|
147
|
-
self.multipoles.ensure_index("property_id")
|
|
148
|
-
self.multipoles.ensure_index("last_updated")
|
|
149
|
-
self.multipoles.ensure_index("formula_alphabetical")
|
|
150
|
-
|
|
151
137
|
# Search index for metal_binding
|
|
152
138
|
self.metal_binding.ensure_index("molecule_id")
|
|
153
139
|
self.metal_binding.ensure_index("solvent")
|
|
@@ -206,23 +192,23 @@ class SummaryBuilder(Builder):
|
|
|
206
192
|
|
|
207
193
|
self.logger.info("Finding documents to process")
|
|
208
194
|
all_mols = list(
|
|
209
|
-
self.molecules.query(
|
|
195
|
+
self.molecules.query(
|
|
196
|
+
temp_query, [self.molecules.key, "formula_alphabetical"]
|
|
197
|
+
)
|
|
210
198
|
)
|
|
211
199
|
|
|
212
200
|
processed_docs = set([e for e in self.summary.distinct("molecule_id")])
|
|
213
201
|
to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
|
|
214
|
-
|
|
215
|
-
d["
|
|
202
|
+
to_process_forms = {
|
|
203
|
+
d["formula_alphabetical"]
|
|
216
204
|
for d in all_mols
|
|
217
205
|
if d[self.molecules.key] in to_process_docs
|
|
218
206
|
}
|
|
219
207
|
|
|
220
|
-
N = ceil(len(
|
|
208
|
+
N = ceil(len(to_process_forms) / number_splits)
|
|
221
209
|
|
|
222
|
-
for
|
|
223
|
-
query
|
|
224
|
-
query["species_hash"] = {"$in": list(hash_chunk)}
|
|
225
|
-
yield {"query": query}
|
|
210
|
+
for formula_chunk in grouper(to_process_forms, N):
|
|
211
|
+
yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
|
|
226
212
|
|
|
227
213
|
def get_items(self) -> Iterator[List[Dict]]:
|
|
228
214
|
"""
|
|
@@ -247,26 +233,28 @@ class SummaryBuilder(Builder):
|
|
|
247
233
|
|
|
248
234
|
self.logger.info("Finding documents to process")
|
|
249
235
|
all_mols = list(
|
|
250
|
-
self.molecules.query(
|
|
236
|
+
self.molecules.query(
|
|
237
|
+
temp_query, [self.molecules.key, "formula_alphabetical"]
|
|
238
|
+
)
|
|
251
239
|
)
|
|
252
240
|
|
|
253
241
|
processed_docs = set([e for e in self.summary.distinct("molecule_id")])
|
|
254
242
|
to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
|
|
255
|
-
|
|
256
|
-
d["
|
|
243
|
+
to_process_forms = {
|
|
244
|
+
d["formula_alphabetical"]
|
|
257
245
|
for d in all_mols
|
|
258
246
|
if d[self.molecules.key] in to_process_docs
|
|
259
247
|
}
|
|
260
248
|
|
|
261
249
|
self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
|
|
262
|
-
self.logger.info(f"Found {len(
|
|
250
|
+
self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
|
|
263
251
|
|
|
264
252
|
# Set total for builder bars to have a total
|
|
265
|
-
self.total = len(
|
|
253
|
+
self.total = len(to_process_forms)
|
|
266
254
|
|
|
267
|
-
for
|
|
255
|
+
for formula in to_process_forms:
|
|
268
256
|
mol_query = dict(temp_query)
|
|
269
|
-
mol_query["
|
|
257
|
+
mol_query["formula_alphabetical"] = formula
|
|
270
258
|
molecules = list(self.molecules.query(criteria=mol_query))
|
|
271
259
|
|
|
272
260
|
yield molecules
|
|
@@ -304,12 +292,12 @@ class SummaryBuilder(Builder):
|
|
|
304
292
|
else:
|
|
305
293
|
grouped[solvent][method] = doc
|
|
306
294
|
|
|
307
|
-
return grouped
|
|
295
|
+
return (grouped, by_method)
|
|
308
296
|
|
|
309
297
|
mols = items
|
|
310
|
-
|
|
298
|
+
formula = mols[0]["formula_alphabetical"]
|
|
311
299
|
mol_ids = [m["molecule_id"] for m in mols]
|
|
312
|
-
self.logger.debug(f"Processing {
|
|
300
|
+
self.logger.debug(f"Processing {formula} : {mol_ids}")
|
|
313
301
|
|
|
314
302
|
summary_docs = list()
|
|
315
303
|
|
|
@@ -330,9 +318,6 @@ class SummaryBuilder(Builder):
|
|
|
330
318
|
"metal_binding": _group_docs(
|
|
331
319
|
list(self.metal_binding.query({"molecule_id": mol_id})), True
|
|
332
320
|
),
|
|
333
|
-
"multipole_moments": _group_docs(
|
|
334
|
-
list(self.multipoles.query({"molecule_id": mol_id})), False
|
|
335
|
-
),
|
|
336
321
|
"orbitals": _group_docs(
|
|
337
322
|
list(self.orbitals.query({"molecule_id": mol_id})), False
|
|
338
323
|
),
|
|
@@ -363,7 +348,7 @@ class SummaryBuilder(Builder):
|
|
|
363
348
|
summary_doc = MoleculeSummaryDoc.from_docs(molecule_id=mol_id, docs=d)
|
|
364
349
|
summary_docs.append(summary_doc)
|
|
365
350
|
|
|
366
|
-
self.logger.debug(f"Produced {len(summary_docs)} summary docs for {
|
|
351
|
+
self.logger.debug(f"Produced {len(summary_docs)} summary docs for {formula}")
|
|
367
352
|
|
|
368
353
|
return jsanitize([doc.model_dump() for doc in summary_docs], allow_bson=True)
|
|
369
354
|
|
|
@@ -84,7 +84,7 @@ class ThermoBuilder(Builder):
|
|
|
84
84
|
single-point energy corrections.
|
|
85
85
|
|
|
86
86
|
Before any documents are constructed, the following steps are taken:
|
|
87
|
-
1. Gather MoleculeDocs by
|
|
87
|
+
1. Gather MoleculeDocs by formula
|
|
88
88
|
2. For each doc, identify tasks with thermodynamic information such as
|
|
89
89
|
zero-point energy, enthalpy, and entropy. Collect these "documents
|
|
90
90
|
including complete thermodynamics" (DICTs).
|
|
@@ -148,14 +148,12 @@ class ThermoBuilder(Builder):
|
|
|
148
148
|
self.tasks.ensure_index("last_updated")
|
|
149
149
|
self.tasks.ensure_index("state")
|
|
150
150
|
self.tasks.ensure_index("formula_alphabetical")
|
|
151
|
-
self.tasks.ensure_index("species_hash")
|
|
152
151
|
|
|
153
152
|
# Search index for molecules
|
|
154
153
|
self.molecules.ensure_index("molecule_id")
|
|
155
154
|
self.molecules.ensure_index("last_updated")
|
|
156
155
|
self.molecules.ensure_index("task_ids")
|
|
157
156
|
self.molecules.ensure_index("formula_alphabetical")
|
|
158
|
-
self.molecules.ensure_index("species_hash")
|
|
159
157
|
|
|
160
158
|
# Search index for thermo
|
|
161
159
|
self.thermo.ensure_index("molecule_id")
|
|
@@ -174,23 +172,23 @@ class ThermoBuilder(Builder):
|
|
|
174
172
|
|
|
175
173
|
self.logger.info("Finding documents to process")
|
|
176
174
|
all_mols = list(
|
|
177
|
-
self.molecules.query(
|
|
175
|
+
self.molecules.query(
|
|
176
|
+
temp_query, [self.molecules.key, "formula_alphabetical"]
|
|
177
|
+
)
|
|
178
178
|
)
|
|
179
179
|
|
|
180
180
|
processed_docs = set([e for e in self.thermo.distinct("molecule_id")])
|
|
181
181
|
to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
|
|
182
|
-
|
|
183
|
-
d["
|
|
182
|
+
to_process_forms = {
|
|
183
|
+
d["formula_alphabetical"]
|
|
184
184
|
for d in all_mols
|
|
185
185
|
if d[self.molecules.key] in to_process_docs
|
|
186
186
|
}
|
|
187
187
|
|
|
188
|
-
N = ceil(len(
|
|
188
|
+
N = ceil(len(to_process_forms) / number_splits)
|
|
189
189
|
|
|
190
|
-
for
|
|
191
|
-
query
|
|
192
|
-
query["species_hash"] = {"$in": list(hash_chunk)}
|
|
193
|
-
yield {"query": query}
|
|
190
|
+
for formula_chunk in grouper(to_process_forms, N):
|
|
191
|
+
yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
|
|
194
192
|
|
|
195
193
|
def get_items(self) -> Iterator[List[Dict]]:
|
|
196
194
|
"""
|
|
@@ -215,26 +213,28 @@ class ThermoBuilder(Builder):
|
|
|
215
213
|
|
|
216
214
|
self.logger.info("Finding documents to process")
|
|
217
215
|
all_mols = list(
|
|
218
|
-
self.molecules.query(
|
|
216
|
+
self.molecules.query(
|
|
217
|
+
temp_query, [self.molecules.key, "formula_alphabetical"]
|
|
218
|
+
)
|
|
219
219
|
)
|
|
220
220
|
|
|
221
221
|
processed_docs = set([e for e in self.thermo.distinct("molecule_id")])
|
|
222
222
|
to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
|
|
223
|
-
|
|
224
|
-
d["
|
|
223
|
+
to_process_forms = {
|
|
224
|
+
d["formula_alphabetical"]
|
|
225
225
|
for d in all_mols
|
|
226
226
|
if d[self.molecules.key] in to_process_docs
|
|
227
227
|
}
|
|
228
228
|
|
|
229
229
|
self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
|
|
230
|
-
self.logger.info(f"Found {len(
|
|
230
|
+
self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
|
|
231
231
|
|
|
232
232
|
# Set total for builder bars to have a total
|
|
233
|
-
self.total = len(
|
|
233
|
+
self.total = len(to_process_forms)
|
|
234
234
|
|
|
235
|
-
for
|
|
235
|
+
for formula in to_process_forms:
|
|
236
236
|
mol_query = dict(temp_query)
|
|
237
|
-
mol_query["
|
|
237
|
+
mol_query["formula_alphabetical"] = formula
|
|
238
238
|
molecules = list(self.molecules.query(criteria=mol_query))
|
|
239
239
|
|
|
240
240
|
yield molecules
|
|
@@ -273,9 +273,9 @@ class ThermoBuilder(Builder):
|
|
|
273
273
|
return doc
|
|
274
274
|
|
|
275
275
|
mols = [MoleculeDoc(**item) for item in items]
|
|
276
|
-
|
|
276
|
+
formula = mols[0].formula_alphabetical
|
|
277
277
|
mol_ids = [m.molecule_id for m in mols]
|
|
278
|
-
self.logger.debug(f"Processing {
|
|
278
|
+
self.logger.debug(f"Processing {formula} : {mol_ids}")
|
|
279
279
|
|
|
280
280
|
thermo_docs = list()
|
|
281
281
|
|
|
@@ -334,7 +334,7 @@ class ThermoBuilder(Builder):
|
|
|
334
334
|
tdoc = self.tasks.query_one(
|
|
335
335
|
{
|
|
336
336
|
"task_id": task,
|
|
337
|
-
"
|
|
337
|
+
"formula_alphabetical": formula,
|
|
338
338
|
"orig": {"$exists": True},
|
|
339
339
|
}
|
|
340
340
|
)
|
|
@@ -344,7 +344,7 @@ class ThermoBuilder(Builder):
|
|
|
344
344
|
tdoc = self.tasks.query_one(
|
|
345
345
|
{
|
|
346
346
|
"task_id": int(task),
|
|
347
|
-
"
|
|
347
|
+
"formula_alphabetical": formula,
|
|
348
348
|
"orig": {"$exists": True},
|
|
349
349
|
}
|
|
350
350
|
)
|
|
@@ -465,7 +465,7 @@ class ThermoBuilder(Builder):
|
|
|
465
465
|
sorted(with_eval_e, key=lambda x: (x[1], x[2]))[0][0]
|
|
466
466
|
)
|
|
467
467
|
|
|
468
|
-
self.logger.debug(f"Produced {len(thermo_docs)} thermo docs for {
|
|
468
|
+
self.logger.debug(f"Produced {len(thermo_docs)} thermo docs for {formula}")
|
|
469
469
|
|
|
470
470
|
return jsanitize([doc.model_dump() for doc in thermo_docs], allow_bson=True)
|
|
471
471
|
|
|
@@ -27,7 +27,7 @@ class VibrationBuilder(Builder):
|
|
|
27
27
|
each solvent available).
|
|
28
28
|
|
|
29
29
|
The process is as follows:
|
|
30
|
-
1. Gather MoleculeDocs by
|
|
30
|
+
1. Gather MoleculeDocs by formula
|
|
31
31
|
2. For each doc, sort tasks by solvent
|
|
32
32
|
3. For each solvent, grab the best TaskDoc (doc with vibrational
|
|
33
33
|
information that has the highest level of theory with lowest
|
|
@@ -73,14 +73,12 @@ class VibrationBuilder(Builder):
|
|
|
73
73
|
self.tasks.ensure_index("last_updated")
|
|
74
74
|
self.tasks.ensure_index("state")
|
|
75
75
|
self.tasks.ensure_index("formula_alphabetical")
|
|
76
|
-
self.tasks.ensure_index("species_hash")
|
|
77
76
|
|
|
78
77
|
# Search index for molecules
|
|
79
78
|
self.molecules.ensure_index("molecule_id")
|
|
80
79
|
self.molecules.ensure_index("last_updated")
|
|
81
80
|
self.molecules.ensure_index("task_ids")
|
|
82
81
|
self.molecules.ensure_index("formula_alphabetical")
|
|
83
|
-
self.molecules.ensure_index("species_hash")
|
|
84
82
|
|
|
85
83
|
# Search index for vibrational properties
|
|
86
84
|
self.vibes.ensure_index("molecule_id")
|
|
@@ -99,23 +97,23 @@ class VibrationBuilder(Builder):
|
|
|
99
97
|
|
|
100
98
|
self.logger.info("Finding documents to process")
|
|
101
99
|
all_mols = list(
|
|
102
|
-
self.molecules.query(
|
|
100
|
+
self.molecules.query(
|
|
101
|
+
temp_query, [self.molecules.key, "formula_alphabetical"]
|
|
102
|
+
)
|
|
103
103
|
)
|
|
104
104
|
|
|
105
105
|
processed_docs = set([e for e in self.vibes.distinct("molecule_id")])
|
|
106
106
|
to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
|
|
107
|
-
|
|
108
|
-
d["
|
|
107
|
+
to_process_forms = {
|
|
108
|
+
d["formula_alphabetical"]
|
|
109
109
|
for d in all_mols
|
|
110
110
|
if d[self.molecules.key] in to_process_docs
|
|
111
111
|
}
|
|
112
112
|
|
|
113
|
-
N = ceil(len(
|
|
113
|
+
N = ceil(len(to_process_forms) / number_splits)
|
|
114
114
|
|
|
115
|
-
for
|
|
116
|
-
query
|
|
117
|
-
query["species_hash"] = {"$in": list(hash_chunk)}
|
|
118
|
-
yield {"query": query}
|
|
115
|
+
for formula_chunk in grouper(to_process_forms, N):
|
|
116
|
+
yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
|
|
119
117
|
|
|
120
118
|
def get_items(self) -> Iterator[List[Dict]]:
|
|
121
119
|
"""
|
|
@@ -140,26 +138,28 @@ class VibrationBuilder(Builder):
|
|
|
140
138
|
|
|
141
139
|
self.logger.info("Finding documents to process")
|
|
142
140
|
all_mols = list(
|
|
143
|
-
self.molecules.query(
|
|
141
|
+
self.molecules.query(
|
|
142
|
+
temp_query, [self.molecules.key, "formula_alphabetical"]
|
|
143
|
+
)
|
|
144
144
|
)
|
|
145
145
|
|
|
146
146
|
processed_docs = set([e for e in self.vibes.distinct("molecule_id")])
|
|
147
147
|
to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
|
|
148
|
-
|
|
149
|
-
d["
|
|
148
|
+
to_process_forms = {
|
|
149
|
+
d["formula_alphabetical"]
|
|
150
150
|
for d in all_mols
|
|
151
151
|
if d[self.molecules.key] in to_process_docs
|
|
152
152
|
}
|
|
153
153
|
|
|
154
154
|
self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
|
|
155
|
-
self.logger.info(f"Found {len(
|
|
155
|
+
self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
|
|
156
156
|
|
|
157
157
|
# Set total for builder bars to have a total
|
|
158
|
-
self.total = len(
|
|
158
|
+
self.total = len(to_process_forms)
|
|
159
159
|
|
|
160
|
-
for
|
|
160
|
+
for formula in to_process_forms:
|
|
161
161
|
mol_query = dict(temp_query)
|
|
162
|
-
mol_query["
|
|
162
|
+
mol_query["formula_alphabetical"] = formula
|
|
163
163
|
molecules = list(self.molecules.query(criteria=mol_query))
|
|
164
164
|
|
|
165
165
|
yield molecules
|
|
@@ -176,9 +176,9 @@ class VibrationBuilder(Builder):
|
|
|
176
176
|
"""
|
|
177
177
|
|
|
178
178
|
mols = [MoleculeDoc(**item) for item in items]
|
|
179
|
-
|
|
179
|
+
formula = mols[0].formula_alphabetical
|
|
180
180
|
mol_ids = [m.molecule_id for m in mols]
|
|
181
|
-
self.logger.debug(f"Processing {
|
|
181
|
+
self.logger.debug(f"Processing {formula} : {mol_ids}")
|
|
182
182
|
|
|
183
183
|
vibe_docs = list()
|
|
184
184
|
|
|
@@ -213,7 +213,7 @@ class VibrationBuilder(Builder):
|
|
|
213
213
|
tdoc = self.tasks.query_one(
|
|
214
214
|
{
|
|
215
215
|
"task_id": task,
|
|
216
|
-
"
|
|
216
|
+
"formula_alphabetical": formula,
|
|
217
217
|
"orig": {"$exists": True},
|
|
218
218
|
}
|
|
219
219
|
)
|
|
@@ -223,7 +223,7 @@ class VibrationBuilder(Builder):
|
|
|
223
223
|
tdoc = self.tasks.query_one(
|
|
224
224
|
{
|
|
225
225
|
"task_id": int(task),
|
|
226
|
-
"
|
|
226
|
+
"formula_alphabetical": formula,
|
|
227
227
|
"orig": {"$exists": True},
|
|
228
228
|
}
|
|
229
229
|
)
|
|
@@ -243,7 +243,7 @@ class VibrationBuilder(Builder):
|
|
|
243
243
|
)
|
|
244
244
|
vibe_docs.append(vibe_doc)
|
|
245
245
|
|
|
246
|
-
self.logger.debug(f"Produced {len(vibe_docs)} vibration docs for {
|
|
246
|
+
self.logger.debug(f"Produced {len(vibe_docs)} vibration docs for {formula}")
|
|
247
247
|
|
|
248
248
|
return jsanitize([doc.model_dump() for doc in vibe_docs], allow_bson=True)
|
|
249
249
|
|