emmet-builders 0.84.10rc2__py3-none-any.whl → 0.85.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of emmet-builders might be problematic. Click here for more details.

Files changed (36) hide show
  1. emmet/builders/abinit/phonon.py +12 -14
  2. emmet/builders/abinit/sound_velocity.py +1 -1
  3. emmet/builders/materials/absorption_spectrum.py +16 -10
  4. emmet/builders/materials/alloys.py +1 -1
  5. emmet/builders/materials/corrected_entries.py +1 -1
  6. emmet/builders/materials/dielectric.py +10 -7
  7. emmet/builders/materials/elasticity.py +12 -9
  8. emmet/builders/materials/electrodes.py +1 -1
  9. emmet/builders/materials/electronic_structure.py +1 -1
  10. emmet/builders/materials/magnetism.py +2 -1
  11. emmet/builders/materials/piezoelectric.py +23 -19
  12. emmet/builders/materials/provenance.py +3 -4
  13. emmet/builders/materials/summary.py +1 -1
  14. emmet/builders/settings.py +14 -9
  15. emmet/builders/utils.py +5 -4
  16. emmet/builders/vasp/materials.py +11 -4
  17. emmet/builders/vasp/task_validator.py +3 -1
  18. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/METADATA +7 -30
  19. emmet_builders-0.85.0.dist-info/RECORD +41 -0
  20. emmet/builders/materials/ml.py +0 -101
  21. emmet/builders/molecules/atomic.py +0 -592
  22. emmet/builders/molecules/bonds.py +0 -329
  23. emmet/builders/molecules/electric.py +0 -287
  24. emmet/builders/molecules/metal_binding.py +0 -528
  25. emmet/builders/molecules/orbitals.py +0 -292
  26. emmet/builders/molecules/redox.py +0 -502
  27. emmet/builders/molecules/summary.py +0 -406
  28. emmet/builders/molecules/thermo.py +0 -505
  29. emmet/builders/molecules/trajectory.py +0 -530
  30. emmet/builders/molecules/vibration.py +0 -282
  31. emmet/builders/qchem/__init__.py +0 -0
  32. emmet/builders/qchem/molecules.py +0 -745
  33. emmet_builders-0.84.10rc2.dist-info/RECORD +0 -54
  34. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  35. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/WHEEL +0 -0
  36. {emmet_builders-0.84.10rc2.dist-info → emmet_builders-0.85.0.dist-info}/top_level.txt +0 -0
@@ -1,406 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from datetime import datetime
4
- from itertools import chain
5
- from math import ceil
6
-
7
- from maggma.builders import Builder
8
- from maggma.core import Store
9
- from maggma.utils import grouper
10
-
11
- from emmet.builders.settings import EmmetBuildSettings
12
- from emmet.core.molecules.summary import MoleculeSummaryDoc
13
- from emmet.core.utils import jsanitize
14
-
15
-
16
- from typing import TYPE_CHECKING
17
-
18
- if TYPE_CHECKING:
19
- from collections.abc import Iterable, Iterator
20
- from typing import Any
21
-
22
- # from monty.serialization import loadfn, dumpfn
23
-
24
-
25
- __author__ = "Evan Spotte-Smith"
26
-
27
- SETTINGS = EmmetBuildSettings()
28
-
29
-
30
class SummaryBuilder(Builder):
    """
    The SummaryBuilder collects all property documents and gathers their properties
    into a single MoleculeSummaryDoc

    The process is as follows:
        1. Gather MoleculeDocs by species hash
        2. For each doc, grab the relevant property docs
        3. Convert property docs to MoleculeSummaryDoc
    """

    def __init__(
        self,
        molecules: Store,
        charges: Store,
        spins: Store,
        bonds: Store,
        multipoles: Store,
        metal_binding: Store,
        orbitals: Store,
        redox: Store,
        thermo: Store,
        vibes: Store,
        summary: Store,
        query: dict | None = None,
        settings: EmmetBuildSettings | None = None,
        **kwargs,
    ):
        """
        Args:
            molecules: source Store of molecule documents; drives the build.
            charges: source Store read into the "partial_charges" summary entry.
            spins: source Store read into the "partial_spins" summary entry.
            bonds: source Store read into the "bonding" summary entry.
            multipoles: source Store read into the "multipole_moments" entry.
            metal_binding: source Store read into the "metal_binding" entry.
            orbitals: source Store read into the "orbitals" summary entry.
            redox: source Store read into the "redox" summary entry.
            thermo: source Store read into the "thermo" summary entry.
            vibes: source Store read into the "vibration" summary entry.
            summary: target Store that receives the summary documents.
            query: extra criteria applied when selecting molecules; an empty
                dict is used when omitted or falsy.
            settings: build settings, resolved via EmmetBuildSettings.autoload.
            **kwargs: forwarded unchanged to the parent Builder.
        """
        self.molecules = molecules
        self.charges = charges
        self.spins = spins
        self.bonds = bonds
        self.multipoles = multipoles
        self.metal_binding = metal_binding
        self.orbitals = orbitals
        self.redox = redox
        self.thermo = thermo
        self.vibes = vibes
        self.summary = summary
        self.query = query if query else {}
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(
            sources=[
                molecules,
                charges,
                spins,
                bonds,
                multipoles,
                metal_binding,
                orbitals,
                redox,
                thermo,
                vibes,
            ],
            targets=[summary],
            **kwargs,
        )
        # NOTE: if mrun does not connect to the collections automatically, a
        # known workaround is to call .connect() here on every source store and
        # on the summary store, reporting (not raising) any connection error.

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Fields shared by every property collection, in creation order.
        shared = (
            "solvent",
            "lot_solvent",
            "property_id",
            "last_updated",
            "formula_alphabetical",
        )
        # Property collections whose documents are also distinguished by method.
        with_method = ("molecule_id", "method", "task_id") + shared
        # Property collections keyed by task only.
        with_task = ("molecule_id", "task_id") + shared

        # (store, fields) pairs; the order of stores and of fields is the order
        # in which ensure_index is called.
        index_plan = (
            (
                self.molecules,
                (
                    "molecule_id",
                    "last_updated",
                    "task_ids",
                    "formula_alphabetical",
                    "species_hash",
                ),
            ),
            (self.charges, with_method),
            (self.spins, with_method),
            (self.bonds, with_method),
            (self.multipoles, with_task),
            # metal_binding has no task_id index and indexes method last.
            (self.metal_binding, ("molecule_id",) + shared + ("method",)),
            (self.orbitals, with_task),
            (self.redox, with_task),
            (self.thermo, with_task),
            (self.vibes, with_task),
            (self.summary, ("molecule_id", "last_updated", "formula_alphabetical")),
        )

        for store, fields in index_plan:
            for field in fields:
                store.ensure_index(field)

    def _find_unprocessed(self) -> tuple[dict, set, set]:
        """
        Work out which molecules still need a summary document.

        Returns:
            A 3-tuple ``(base_query, pending_ids, pending_hashes)``: the user
            query restricted to non-deprecated molecules, the molecule keys
            absent from the summary store, and the species hashes of those
            molecules.
        """

        base_query = dict(self.query)
        base_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        mol_key = self.molecules.key
        all_mols = list(self.molecules.query(base_query, [mol_key, "species_hash"]))

        already_built = set(self.summary.distinct("molecule_id"))
        pending_ids = {d[mol_key] for d in all_mols} - already_built
        pending_hashes = {
            d["species_hash"] for d in all_mols if d[mol_key] in pending_ids
        }

        return base_query, pending_ids, pending_hashes

    def prechunk(self, number_splits: int) -> Iterable[dict]:  # pragma: no cover
        """
        Prechunk the builder for distributed computation

        Args:
            number_splits: number of chunks to spread the pending species
                hashes over; must be at least 1.

        Yields:
            dicts of the form ``{"query": ...}``, each restricting the builder
            to one chunk of species hashes.

        Raises:
            ValueError: if ``number_splits`` is smaller than 1.
        """

        if number_splits < 1:
            raise ValueError(
                f"number_splits must be a positive integer, got {number_splits}"
            )

        base_query, _, pending_hashes = self._find_unprocessed()

        # Nothing pending: avoid handing a chunk size of zero to grouper.
        if not pending_hashes:
            return

        chunk_size = ceil(len(pending_hashes) / number_splits)

        for hash_chunk in grouper(pending_hashes, chunk_size):
            query = dict(base_query)
            query["species_hash"] = {"$in": list(hash_chunk)}
            yield {"query": query}

    def get_items(self) -> Iterator[list[dict]]:
        """
        Gets all items to process into summary documents.
        This does no datetime checking; relying on whether
        molecule ids are included in the summary Store

        Returns:
            generator of lists of molecule documents, one list per species hash
        """

        from datetime import timezone

        self.logger.info("Summary builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime. Kept as a naive UTC datetime, the
        # same value datetime.utcnow() produced, without the deprecated call.
        self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None)

        base_query, pending_ids, pending_hashes = self._find_unprocessed()

        self.logger.info(f"Found {len(pending_ids)} unprocessed documents")
        self.logger.info(f"Found {len(pending_hashes)} unprocessed hashes")

        # Set total for builder bars to have a total
        self.total = len(pending_hashes)

        for shash in pending_hashes:
            mol_query = dict(base_query)
            mol_query["species_hash"] = shash
            yield list(self.molecules.query(criteria=mol_query))

    @staticmethod
    def _group_docs(docs: Iterable[dict[str, Any]], by_method: bool = False):
        """
        Group property docs by solvent, and optionally by method within solvent.

        Docs without a truthy "solvent" are skipped; when ``by_method`` is set,
        docs whose "method" is None are skipped too. A later doc with the same
        key replaces an earlier one.

        Returns:
            ``{solvent: doc}`` or, with ``by_method``, ``{solvent: {method: doc}}``
        """
        grouped: dict[str, Any] = {}

        for doc in docs:
            solvent = doc.get("solvent")
            if not solvent:
                # Need to group by solvent
                continue

            if not by_method:
                grouped[solvent] = doc
                continue

            method = doc.get("method")
            if method is None:
                # Trying to group by method, but no method present
                continue
            grouped.setdefault(solvent, {})[method] = doc

        return grouped

    def process_item(self, items: list[dict]) -> list[dict]:
        """
        Process the molecules into MoleculeSummaryDocs

        Args:
            items list[dict] : a list of MoleculeDocs in dict form, all sharing
                one species hash

        Returns:
            [dict] : a list of new summary docs (empty if ``items`` is empty)
        """

        # An empty batch has no species hash to report and nothing to build.
        if not items:
            return []

        shash = items[0]["species_hash"]
        mol_ids = [m["molecule_id"] for m in items]
        self.logger.debug(f"Processing {shash} : {mol_ids}")

        # (summary key, source store, group by method as well as solvent)
        property_sources = (
            ("partial_charges", self.charges, True),
            ("partial_spins", self.spins, True),
            ("bonding", self.bonds, True),
            ("metal_binding", self.metal_binding, True),
            ("multipole_moments", self.multipoles, False),
            ("orbitals", self.orbitals, False),
            ("redox", self.redox, False),
            ("thermo", self.thermo, False),
            ("vibration", self.vibes, False),
        )

        summary_docs = []

        for mol in items:
            mol_id = mol["molecule_id"]

            d: dict[str, Any] = {"molecules": mol}
            for name, store, by_method in property_sources:
                grouped = self._group_docs(
                    store.query({"molecule_id": mol_id}), by_method
                )
                # Properties with no usable docs are left out entirely.
                if grouped:
                    d[name] = grouped

            # For debugging, `d` can be dumped here with monty's
            # dumpfn(d, f"{mol_id}.json.gz").

            summary_doc = MoleculeSummaryDoc.from_docs(molecule_id=mol_id, docs=d)
            summary_docs.append(summary_doc)

        self.logger.debug(f"Produced {len(summary_docs)} summary docs for {shash}")

        return jsanitize([doc.model_dump() for doc in summary_docs], allow_bson=True)

    def update_targets(self, items: list[list[dict]]):
        """
        Inserts the new documents into the summary collection

        Existing summary documents for the same molecule ids are removed before
        the new ones are written. Each document is stamped with the build
        timestamp recorded by get_items under the "_bt" key.

        Args:
            items [[dict]]: A list of documents to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Decide on the flattened docs: `items` may hold only empty lists.
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp
        for doc in docs:
            doc["_bt"] = self.timestamp

        molecule_ids = list({doc["molecule_id"] for doc in docs})

        self.logger.info(f"Updating {len(docs)} summary documents")
        self.summary.remove_docs({self.summary.key: {"$in": molecule_ids}})
        self.summary.update(
            docs=docs,
            key=["molecule_id"],
        )