emmet-builders 0.84.2rc6__py3-none-any.whl → 0.84.2rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emmet/builders/molecules/atomic.py +46 -48
- emmet/builders/molecules/bonds.py +24 -24
- emmet/builders/molecules/electric.py +282 -0
- emmet/builders/molecules/metal_binding.py +20 -21
- emmet/builders/molecules/orbitals.py +23 -23
- emmet/builders/molecules/redox.py +27 -27
- emmet/builders/molecules/summary.py +36 -21
- emmet/builders/molecules/thermo.py +23 -23
- emmet/builders/molecules/trajectory.py +525 -0
- emmet/builders/molecules/vibration.py +23 -23
- emmet/builders/qchem/molecules.py +21 -15
- emmet/builders/vasp/mp_potcar_stats.json.gz +0 -0
- {emmet_builders-0.84.2rc6.dist-info → emmet_builders-0.84.2rc8.dist-info}/METADATA +1 -1
- {emmet_builders-0.84.2rc6.dist-info → emmet_builders-0.84.2rc8.dist-info}/RECORD +16 -14
- {emmet_builders-0.84.2rc6.dist-info → emmet_builders-0.84.2rc8.dist-info}/WHEEL +0 -0
- {emmet_builders-0.84.2rc6.dist-info → emmet_builders-0.84.2rc8.dist-info}/top_level.txt +0 -0
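The hunks below belong to emmet/builders/molecules/metal_binding.py (MetalBindingBuilder), orbitals.py (OrbitalBuilder), redox.py (RedoxBuilder), and summary.py (SummaryBuilder). The unifying change is that these builders now chunk and group work by the molecules' `species_hash` field instead of `formula_alphabetical`, with matching `ensure_index("species_hash")` calls; SummaryBuilder additionally gains a `multipoles` store, and two new builder modules (electric.py, trajectory.py) are added. The snippet below is not emmet code; it is a standalone sketch of the shape of the chunk queries that the updated `prechunk` methods yield, with made-up hash values and a simplified stand-in for the `grouper` helper used in the diff.

```python
# Standalone sketch (not emmet code): the new prechunk output splits work by
# "species_hash" values. `fake_hashes` and `number_splits` are illustrative only.
from itertools import zip_longest
from math import ceil


def grouper(iterable, n):
    """Yield chunks of size n (simplified stand-in for the grouper used in the diff)."""
    args = [iter(iterable)] * n
    for chunk in zip_longest(*args, fillvalue=None):
        yield [x for x in chunk if x is not None]


fake_hashes = [f"hash{i:03d}" for i in range(10)]  # pretend species hashes
number_splits = 3
n = ceil(len(fake_hashes) / number_splits)

for hash_chunk in grouper(fake_hashes, n):
    # Each dict mirrors what the updated prechunk() methods yield for a distributed run
    print({"query": {"species_hash": {"$in": list(hash_chunk)}}})
```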
emmet/builders/molecules/metal_binding.py

@@ -44,7 +44,7 @@ class MetalBindingBuilder(Builder):
     will be used.
 
     The process is as follows:
-        1. Gather MoleculeDocs by
+        1. Gather MoleculeDocs by species hash
         2. For each molecule, first identify if there are any metals. If not, then no MetalBindingDoc can be made.
            If so, then identify the possible solvents that can be used to generate MetalBindingDocs
         3. For each combination of Molecule ID and solvent, search for additional documents:
@@ -111,6 +111,7 @@ class MetalBindingBuilder(Builder):
         self.molecules.ensure_index("last_updated")
         self.molecules.ensure_index("task_ids")
         self.molecules.ensure_index("formula_alphabetical")
+        self.molecules.ensure_index("species_hash")
 
         # Search index for charges
         self.charges.ensure_index("molecule_id")
@@ -168,23 +169,23 @@ class MetalBindingBuilder(Builder):
 
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
+            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
         )
 
         processed_docs = set([e for e in self.metal_binding.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-
-            d["
+        to_process_hashes = {
+            d["species_hash"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
 
-        N = ceil(len(
+        N = ceil(len(to_process_hashes) / number_splits)
 
-        for
-
+        for hash_chunk in grouper(to_process_hashes, N):
+            query = dict(temp_query)
+            query["species_hash"] = {"$in": list(hash_chunk)}
+            yield {"query": query}
 
     def get_items(self) -> Iterator[List[Dict]]:
         """
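A minimal pure-Python sketch of the bookkeeping the new prechunk does before chunking, using mock documents (the field and variable names mirror the diff; the data is invented). Note that a hash stays in scope as long as at least one molecule carrying it is still unprocessed.

```python
# Mock data, not emmet code: how to_process_hashes is derived from all_mols.
all_mols = [
    {"molecule_id": "mol-1", "species_hash": "aaa111"},
    {"molecule_id": "mol-2", "species_hash": "aaa111"},  # same hash, different molecule
    {"molecule_id": "mol-3", "species_hash": "bbb222"},
]
processed_docs = {"mol-2"}  # molecule_ids that already have target docs

to_process_docs = {d["molecule_id"] for d in all_mols} - processed_docs
to_process_hashes = {
    d["species_hash"] for d in all_mols if d["molecule_id"] in to_process_docs
}

print(to_process_docs)    # contains 'mol-1' and 'mol-3'
print(to_process_hashes)  # contains 'aaa111' and 'bbb222' -- mol-1 keeps 'aaa111' in scope
```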
@@ -207,28 +208,26 @@ class MetalBindingBuilder(Builder):
 
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
+            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
        )
 
         processed_docs = set([e for e in self.metal_binding.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-
-            d["
+        to_process_hashes = {
+            d["species_hash"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
 
         self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(
+        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
 
         # Set total for builder bars to have a total
-        self.total = len(
+        self.total = len(to_process_hashes)
 
-        for
+        for shash in to_process_hashes:
             mol_query = dict(temp_query)
-            mol_query["
+            mol_query["species_hash"] = shash
             molecules = list(self.molecules.query(criteria=mol_query))
 
             yield molecules
@@ -245,9 +244,9 @@ class MetalBindingBuilder(Builder):
         """
 
         mols = [MoleculeDoc(**item) for item in items]
-
+        shash = mols[0].species_hash
         mol_ids = [m.molecule_id for m in mols]
-        self.logger.debug(f"Processing {
+        self.logger.debug(f"Processing {shash} : {mol_ids}")
 
         binding_docs = list()
 
@@ -487,7 +486,7 @@ class MetalBindingBuilder(Builder):
             binding_docs.append(doc)
 
         self.logger.debug(
-            f"Produced {len(binding_docs)} metal binding docs for {
+            f"Produced {len(binding_docs)} metal binding docs for {shash}"
         )
 
         return jsanitize([doc.model_dump() for doc in binding_docs], allow_bson=True)
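A simplified, self-contained sketch of the get_items / process_items handoff under the new grouping: get_items yields one batch of molecules per species hash, and process_items reads the batch's hash off the first item (as in `shash = mols[0].species_hash` above). Mock dicts stand in for MoleculeDoc objects and Stores.

```python
# Sketch only: one batch per species_hash, hash taken from the first item of each batch.
from collections import defaultdict
from typing import Dict, Iterator, List

MOLECULES = [
    {"molecule_id": "mol-1", "species_hash": "aaa111"},
    {"molecule_id": "mol-2", "species_hash": "aaa111"},
    {"molecule_id": "mol-3", "species_hash": "bbb222"},
]


def get_items() -> Iterator[List[Dict]]:
    by_hash: Dict[str, List[Dict]] = defaultdict(list)
    for mol in MOLECULES:
        by_hash[mol["species_hash"]].append(mol)
    for shash in by_hash:  # mirrors: for shash in to_process_hashes
        yield by_hash[shash]


def process_items(items: List[Dict]) -> None:
    shash = items[0]["species_hash"]  # mirrors: shash = mols[0].species_hash
    mol_ids = [m["molecule_id"] for m in items]
    print(f"Processing {shash} : {mol_ids}")


for batch in get_items():
    process_items(batch)
```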
emmet/builders/molecules/orbitals.py

@@ -27,7 +27,7 @@ class OrbitalBuilder(Builder):
     each solvent available).
 
     The process is as follows:
-        1. Gather MoleculeDocs by
+        1. Gather MoleculeDocs by species hash
         2. For each doc, sort tasks by solvent
         3. For each solvent, grab the best TaskDoc (including NBO data using
            the highest level of theory with lowest electronic energy for the
@@ -69,12 +69,14 @@ class OrbitalBuilder(Builder):
         self.tasks.ensure_index("last_updated")
         self.tasks.ensure_index("state")
         self.tasks.ensure_index("formula_alphabetical")
+        self.tasks.ensure_index("species_hash")
 
         # Search index for molecules
         self.molecules.ensure_index("molecule_id")
         self.molecules.ensure_index("last_updated")
         self.molecules.ensure_index("task_ids")
         self.molecules.ensure_index("formula_alphabetical")
+        self.molecules.ensure_index("species_hash")
 
         # Search index for orbitals
         self.orbitals.ensure_index("molecule_id")
@@ -93,23 +95,23 @@ class OrbitalBuilder(Builder):
 
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
+            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
         )
 
         processed_docs = set([e for e in self.orbitals.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-
-            d["
+        to_process_hashes = {
+            d["species_hash"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
 
-        N = ceil(len(
+        N = ceil(len(to_process_hashes) / number_splits)
 
-        for
-
+        for hash_chunk in grouper(to_process_hashes, N):
+            query = dict(temp_query)
+            query["species_hash"] = {"$in": list(hash_chunk)}
+            yield {"query": query}
 
     def get_items(self) -> Iterator[List[Dict]]:
         """
@@ -134,28 +136,26 @@ class OrbitalBuilder(Builder):
 
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
+            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
         )
 
         processed_docs = set([e for e in self.orbitals.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-
-            d["
+        to_process_hashes = {
+            d["species_hash"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
 
         self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(
+        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
 
         # Set total for builder bars to have a total
-        self.total = len(
+        self.total = len(to_process_hashes)
 
-        for
+        for shash in to_process_hashes:
             mol_query = dict(temp_query)
-            mol_query["
+            mol_query["species_hash"] = shash
             molecules = list(self.molecules.query(criteria=mol_query))
 
             yield molecules
@@ -172,9 +172,9 @@ class OrbitalBuilder(Builder):
         """
 
         mols = [MoleculeDoc(**item) for item in items]
-
+        shash = mols[0].species_hash
         mol_ids = [m.molecule_id for m in mols]
-        self.logger.info(f"Processing {
+        self.logger.info(f"Processing {shash} : {mol_ids}")
 
         orbital_docs = list()
 
@@ -221,7 +221,7 @@ class OrbitalBuilder(Builder):
             tdoc = self.tasks.query_one(
                 {
                     "task_id": task,
-                    "
+                    "species_hash": shash,
                     "orig": {"$exists": True},
                 }
             )
@@ -231,7 +231,7 @@ class OrbitalBuilder(Builder):
             tdoc = self.tasks.query_one(
                 {
                     "task_id": int(task),
-                    "
+                    "species_hash": shash,
                     "orig": {"$exists": True},
                 }
             )
@@ -253,7 +253,7 @@ class OrbitalBuilder(Builder):
             if orbital_doc is not None:
                 orbital_docs.append(orbital_doc)
 
-        self.logger.debug(f"Produced {len(orbital_docs)} orbital docs for {
+        self.logger.debug(f"Produced {len(orbital_docs)} orbital docs for {shash}")
 
         return jsanitize([doc.model_dump() for doc in orbital_docs], allow_bson=True)
 
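The two query_one hunks above constrain the task lookup by the batch's species hash and retry the task_id as an int. A minimal standalone sketch of that pattern is below; `fake_query_one` is a stand-in for `Store.query_one`, and for brevity it ignores the `"orig": {"$exists": True}` operator that the real query passes to MongoDB.

```python
# Sketch of the lookup pattern: string task_id first, then int, always filtered on
# the species hash. FAKE_TASKS and fake_query_one are illustrative stand-ins.
from typing import Dict, Optional

FAKE_TASKS = [
    {"task_id": 42, "species_hash": "aaa111", "orig": {"molecule": "..."}},
]


def fake_query_one(criteria: Dict) -> Optional[Dict]:
    # Simplified matcher; the "$exists" operator from the real query is not emulated.
    for doc in FAKE_TASKS:
        if doc["task_id"] == criteria["task_id"] and doc["species_hash"] == criteria["species_hash"]:
            return doc
    return None


def lookup_task(task, shash: str) -> Optional[Dict]:
    tdoc = fake_query_one({"task_id": task, "species_hash": shash, "orig": {"$exists": True}})
    if tdoc is None:
        tdoc = fake_query_one({"task_id": int(task), "species_hash": shash, "orig": {"$exists": True}})
    return tdoc


print(lookup_task("42", "aaa111"))  # string id misses, int fallback finds the task
```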
emmet/builders/molecules/redox.py

@@ -30,7 +30,7 @@ class RedoxBuilder(Builder):
     from a MoleculeDoc (lowest electronic energy, highest level of theory).
 
     The process is as follows:
-        1. Gather MoleculeDocs by
+        1. Gather MoleculeDocs by species hash
         2. Further group based on (covalent) isomorphism and charge
         3. For each MoleculeDoc:
             3a. Identify relevant MoleculeThermoDocs
@@ -81,12 +81,14 @@ class RedoxBuilder(Builder):
         self.tasks.ensure_index("last_updated")
         self.tasks.ensure_index("state")
         self.tasks.ensure_index("formula_alphabetical")
+        self.tasks.ensure_index("species_hash")
 
         # Search index for molecules
         self.molecules.ensure_index("molecule_id")
         self.molecules.ensure_index("last_updated")
         self.molecules.ensure_index("task_ids")
         self.molecules.ensure_index("formula_alphabetical")
+        self.molecules.ensure_index("species_hash")
 
         # Search index for thermo
         self.thermo.ensure_index("molecule_id")
@@ -113,23 +115,23 @@ class RedoxBuilder(Builder):
 
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
+            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
         )
 
         processed_docs = set([e for e in self.redox.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-
-            d["
+        to_process_hashes = {
+            d["species_hash"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
 
-        N = ceil(len(
+        N = ceil(len(to_process_hashes) / number_splits)
 
-        for
-
+        for hash_chunk in grouper(to_process_hashes, N):
+            query = dict(temp_query)
+            query["species_hash"] = {"$in": list(hash_chunk)}
+            yield {"query": query}
 
     def get_items(self) -> Iterator[List[Dict]]:
         """
@@ -154,28 +156,26 @@ class RedoxBuilder(Builder):
 
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
+            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
         )
 
         processed_docs = set([e for e in self.redox.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-
-            d["
+        to_process_hashes = {
+            d["species_hash"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
 
         self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(
+        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
 
         # Set total for builder bars to have a total
-        self.total = len(
+        self.total = len(to_process_hashes)
 
-        for
+        for shash in to_process_hashes:
             mol_query = dict(temp_query)
-            mol_query["
+            mol_query["species_hash"] = shash
             molecules = list(self.molecules.query(criteria=mol_query))
 
             yield molecules
@@ -192,9 +192,9 @@ class RedoxBuilder(Builder):
         """
 
         mols = [MoleculeDoc(**item) for item in items]
-
+        shash = mols[0].species_hash
         mol_ids = [m.molecule_id for m in mols]
-        self.logger.debug(f"Processing {
+        self.logger.debug(f"Processing {shash} : {mol_ids}")
 
         redox_docs = list()
 
@@ -220,7 +220,7 @@ class RedoxBuilder(Builder):
                 e["task_id"]
                 for e in gg.entries
                 if e["charge"] == gg.charge + 1
-                and e["task_type"]
+                and e["task_type"] in ["Single Point", "Force"]
                 and e["output"].get("final_energy")
             ]
             ie_tasks = list()
@@ -228,7 +228,7 @@ class RedoxBuilder(Builder):
                 tdoc = self.tasks.query_one(
                     {
                         "task_id": i,
-                        "
+                        "species_hash": shash,
                         "orig": {"$exists": True},
                     }
                 )
@@ -238,7 +238,7 @@ class RedoxBuilder(Builder):
                 tdoc = self.tasks.query_one(
                     {
                         "task_id": int(i),
-                        "
+                        "species_hash": shash,
                         "orig": {"$exists": True},
                     }
                 )
@@ -254,7 +254,7 @@ class RedoxBuilder(Builder):
                 e["task_id"]
                 for e in gg.entries
                 if e["charge"] == gg.charge - 1
-                and e["task_type"]
+                and e["task_type"] in ["Single Point", "Force"]
                 and e["output"].get("final_energy")
             ]
             ea_tasks = list()
@@ -262,7 +262,7 @@ class RedoxBuilder(Builder):
                 tdoc = self.tasks.query_one(
                     {
                         "task_id": i,
-                        "
+                        "species_hash": shash,
                         "orig": {"$exists": True},
                     }
                 )
@@ -272,7 +272,7 @@ class RedoxBuilder(Builder):
                 tdoc = self.tasks.query_one(
                     {
                         "task_id": int(i),
-                        "
+                        "species_hash": shash,
                         "orig": {"$exists": True},
                     }
                 )
@@ -354,7 +354,7 @@ class RedoxBuilder(Builder):
                 )
             )
 
-        self.logger.debug(f"Produced {len(redox_docs)} redox docs for {
+        self.logger.debug(f"Produced {len(redox_docs)} redox docs for {shash}")
 
         return jsanitize(
             [doc.model_dump() for doc in redox_docs if doc is not None], allow_bson=True
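In the RedoxBuilder hunks at -220 and -254, the candidate entries for ionization-energy and electron-affinity tasks are now explicitly required to have `task_type` in `["Single Point", "Force"]`, in addition to the charge offset and a present final energy. A small sketch of that filter on mock entries:

```python
# Mock entries only; group_charge plays the role of gg.charge in the diff.
entries = [
    {"task_id": 1, "charge": 1, "task_type": "Single Point", "output": {"final_energy": -76.4}},
    {"task_id": 2, "charge": 1, "task_type": "Frequency Analysis", "output": {"final_energy": -76.3}},
    {"task_id": 3, "charge": 0, "task_type": "Force", "output": {"final_energy": -76.5}},
]
group_charge = 0

ie_candidates = [
    e["task_id"]
    for e in entries
    if e["charge"] == group_charge + 1
    and e["task_type"] in ["Single Point", "Force"]
    and e["output"].get("final_energy")
]
print(ie_candidates)  # [1] -- task 2 is excluded by the task_type check, task 3 by charge
```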
emmet/builders/molecules/summary.py

@@ -36,6 +36,7 @@ class SummaryBuilder(Builder):
         charges: Store,
         spins: Store,
         bonds: Store,
+        multipoles: Store,
         metal_binding: Store,
         orbitals: Store,
         redox: Store,
@@ -50,6 +51,7 @@ class SummaryBuilder(Builder):
         self.charges = charges
         self.spins = spins
         self.bonds = bonds
+        self.multipoles = multipoles
         self.metal_binding = metal_binding
         self.orbitals = orbitals
         self.redox = redox
@@ -66,6 +68,7 @@ class SummaryBuilder(Builder):
                 charges,
                 spins,
                 bonds,
+                multipoles,
                 metal_binding,
                 orbitals,
                 redox,
@@ -81,6 +84,7 @@ class SummaryBuilder(Builder):
                 # self.charges,
                 # self.spins,
                 # self.bonds,
+                # self.multipoles,
                 # self.metal_binding,
                 # self.orbitals,
                 # self.redox,
@@ -103,6 +107,7 @@ class SummaryBuilder(Builder):
         self.molecules.ensure_index("last_updated")
         self.molecules.ensure_index("task_ids")
         self.molecules.ensure_index("formula_alphabetical")
+        self.molecules.ensure_index("species_hash")
 
         # Search index for charges
         self.charges.ensure_index("molecule_id")
@@ -134,6 +139,15 @@ class SummaryBuilder(Builder):
         self.bonds.ensure_index("last_updated")
         self.bonds.ensure_index("formula_alphabetical")
 
+        # Search index for multipoles
+        self.multipoles.ensure_index("molecule_id")
+        self.multipoles.ensure_index("task_id")
+        self.multipoles.ensure_index("solvent")
+        self.multipoles.ensure_index("lot_solvent")
+        self.multipoles.ensure_index("property_id")
+        self.multipoles.ensure_index("last_updated")
+        self.multipoles.ensure_index("formula_alphabetical")
+
         # Search index for metal_binding
         self.metal_binding.ensure_index("molecule_id")
         self.metal_binding.ensure_index("solvent")
@@ -192,23 +206,23 @@ class SummaryBuilder(Builder):
 
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
+            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
         )
 
         processed_docs = set([e for e in self.summary.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-
-            d["
+        to_process_hashes = {
+            d["species_hash"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
 
-        N = ceil(len(
+        N = ceil(len(to_process_hashes) / number_splits)
 
-        for
-
+        for hash_chunk in grouper(to_process_hashes, N):
+            query = dict(temp_query)
+            query["species_hash"] = {"$in": list(hash_chunk)}
+            yield {"query": query}
 
     def get_items(self) -> Iterator[List[Dict]]:
         """
@@ -233,28 +247,26 @@ class SummaryBuilder(Builder):
 
         self.logger.info("Finding documents to process")
         all_mols = list(
-            self.molecules.query(
-                temp_query, [self.molecules.key, "formula_alphabetical"]
-            )
+            self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
         )
 
         processed_docs = set([e for e in self.summary.distinct("molecule_id")])
         to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
-
-            d["
+        to_process_hashes = {
+            d["species_hash"]
             for d in all_mols
             if d[self.molecules.key] in to_process_docs
         }
 
         self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
-        self.logger.info(f"Found {len(
+        self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
 
         # Set total for builder bars to have a total
-        self.total = len(
+        self.total = len(to_process_hashes)
 
-        for
+        for shash in to_process_hashes:
             mol_query = dict(temp_query)
-            mol_query["
+            mol_query["species_hash"] = shash
             molecules = list(self.molecules.query(criteria=mol_query))
 
             yield molecules
@@ -292,12 +304,12 @@ class SummaryBuilder(Builder):
                 else:
                     grouped[solvent][method] = doc
 
-            return
+            return grouped
 
         mols = items
-
+        shash = mols[0]["species_hash"]
         mol_ids = [m["molecule_id"] for m in mols]
-        self.logger.debug(f"Processing {
+        self.logger.debug(f"Processing {shash} : {mol_ids}")
 
         summary_docs = list()
 
@@ -318,6 +330,9 @@ class SummaryBuilder(Builder):
                 "metal_binding": _group_docs(
                     list(self.metal_binding.query({"molecule_id": mol_id})), True
                 ),
+                "multipole_moments": _group_docs(
+                    list(self.multipoles.query({"molecule_id": mol_id})), False
+                ),
                 "orbitals": _group_docs(
                     list(self.orbitals.query({"molecule_id": mol_id})), False
                 ),
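The new `"multipole_moments"` entry feeds multipole documents into the same per-solvent grouping used by the other properties. The sketch below is not the actual emmet helper; it is a rough guess at what a `_group_docs`-style function could look like, based only on what the hunks show (keying by solvent, a nested per-method branch as in `grouped[solvent][method] = doc`, and the rc8 fix of returning `grouped` instead of a bare `return`). The meaning of the boolean flag passed above is an assumption, as is the mock `total_dipole` field.

```python
# Hedged sketch only; not the real _group_docs.
from collections import defaultdict
from typing import Dict, List


def group_docs(docs: List[Dict], by_method: bool) -> Dict:
    grouped: Dict = defaultdict(dict)
    for doc in docs:
        solvent = doc.get("solvent", "NONE")
        if by_method:
            # nested per-method grouping, as in the `grouped[solvent][method] = doc` branch
            grouped[solvent][doc.get("method", "unknown")] = doc
        else:
            grouped[solvent] = doc
    return grouped  # rc8 fixes the helper to return the mapping rather than None


multipole_docs = [
    {"molecule_id": "mol-1", "solvent": "WATER", "total_dipole": 1.85},
    {"molecule_id": "mol-1", "solvent": "NONE", "total_dipole": 1.70},
]
print(dict(group_docs(multipole_docs, by_method=False)))
```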
@@ -348,7 +363,7 @@ class SummaryBuilder(Builder):
             summary_doc = MoleculeSummaryDoc.from_docs(molecule_id=mol_id, docs=d)
             summary_docs.append(summary_doc)
 
-        self.logger.debug(f"Produced {len(summary_docs)} summary docs for {
+        self.logger.debug(f"Produced {len(summary_docs)} summary docs for {shash}")
 
         return jsanitize([doc.model_dump() for doc in summary_docs], allow_bson=True)
 
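Because queries and chunking now run on `species_hash`, collections used with these builders benefit from an index on that field. The builders add it themselves through `ensure_indexes()` (the `ensure_index("species_hash")` calls above); the snippet below is only a hedged sketch of the equivalent standalone call for a separately managed collection, with placeholder database and collection names.

```python
# Placeholder connection details; the index key is the one the diff introduces.
from maggma.stores import MongoStore

molecules = MongoStore(
    database="emmet_molecules_db",   # placeholder
    collection_name="molecules",     # placeholder
)
molecules.connect()
molecules.ensure_index("species_hash")  # same call the builders now make in ensure_indexes()
molecules.close()
```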