emmet-builders 0.84.2__py3-none-any.whl → 0.86.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. emmet/builders/abinit/phonon.py +27 -25
  2. emmet/builders/abinit/sound_velocity.py +15 -11
  3. emmet/builders/feff/xas.py +1 -2
  4. emmet/builders/materials/absorption_spectrum.py +25 -14
  5. emmet/builders/materials/alloys.py +3 -4
  6. emmet/builders/materials/chemenv.py +2 -3
  7. emmet/builders/materials/corrected_entries.py +15 -9
  8. emmet/builders/materials/dielectric.py +19 -11
  9. emmet/builders/materials/elasticity.py +44 -33
  10. emmet/builders/materials/electrodes.py +24 -19
  11. emmet/builders/materials/electronic_structure.py +17 -17
  12. emmet/builders/materials/magnetism.py +11 -4
  13. emmet/builders/materials/optimade.py +7 -3
  14. emmet/builders/materials/piezoelectric.py +24 -21
  15. emmet/builders/materials/provenance.py +15 -12
  16. emmet/builders/materials/robocrys.py +2 -3
  17. emmet/builders/materials/substrates.py +9 -8
  18. emmet/builders/materials/summary.py +3 -3
  19. emmet/builders/materials/thermo.py +17 -11
  20. emmet/builders/matscholar/missing_compositions.py +12 -8
  21. emmet/builders/mobility/migration_graph.py +5 -5
  22. emmet/builders/settings.py +21 -17
  23. emmet/builders/utils.py +15 -10
  24. emmet/builders/vasp/materials.py +32 -16
  25. emmet/builders/vasp/task_validator.py +15 -11
  26. {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/METADATA +21 -36
  27. emmet_builders-0.86.0.dist-info/RECORD +41 -0
  28. {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/WHEEL +1 -1
  29. emmet/builders/materials/ml.py +0 -87
  30. emmet/builders/molecules/atomic.py +0 -589
  31. emmet/builders/molecules/bonds.py +0 -324
  32. emmet/builders/molecules/metal_binding.py +0 -526
  33. emmet/builders/molecules/orbitals.py +0 -288
  34. emmet/builders/molecules/redox.py +0 -496
  35. emmet/builders/molecules/summary.py +0 -383
  36. emmet/builders/molecules/thermo.py +0 -500
  37. emmet/builders/molecules/vibration.py +0 -278
  38. emmet/builders/qchem/__init__.py +0 -0
  39. emmet/builders/qchem/molecules.py +0 -734
  40. emmet_builders-0.84.2.dist-info/RECORD +0 -52
  41. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  42. {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/top_level.txt +0 -0
@@ -1,383 +0,0 @@
1
- from datetime import datetime
2
- from itertools import chain
3
- from math import ceil
4
- from typing import Any, Optional, Iterable, Iterator, List, Dict
5
-
6
- # from monty.serialization import loadfn, dumpfn
7
-
8
- from maggma.builders import Builder
9
- from maggma.core import Store
10
- from maggma.utils import grouper
11
-
12
- from emmet.core.molecules.summary import MoleculeSummaryDoc
13
- from emmet.core.utils import jsanitize
14
- from emmet.builders.settings import EmmetBuildSettings
15
-
16
-
17
# Original author of this builder module.
__author__ = "Evan Spotte-Smith"

# Module-level settings object, instantiated once at import time.
# NOTE(review): nothing in the visible part of this module reads SETTINGS
# (SummaryBuilder loads its own settings via EmmetBuildSettings.autoload);
# confirm no external importer relies on it before removing.
SETTINGS = EmmetBuildSettings()
20
-
21
-
22
class SummaryBuilder(Builder):
    """
    The SummaryBuilder collects all property documents and gathers their properties
    into a single MoleculeSummaryDoc

    The process is as follows:
        1. Gather MoleculeDocs by formula
        2. For each doc, grab the relevant property docs
        3. Convert property docs to MoleculeSummaryDoc
    """

    def __init__(
        self,
        molecules: Store,
        charges: Store,
        spins: Store,
        bonds: Store,
        metal_binding: Store,
        orbitals: Store,
        redox: Store,
        thermo: Store,
        vibes: Store,
        summary: Store,
        query: Optional[Dict] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        """
        Args:
            molecules: source Store of molecule documents
            charges: source Store of partial-charge documents
            spins: source Store of partial-spin documents
            bonds: source Store of bonding documents
            metal_binding: source Store of metal-binding documents
            orbitals: source Store of orbital documents
            redox: source Store of redox documents
            thermo: source Store of thermo documents
            vibes: source Store of vibrational-property documents
            summary: target Store that receives the summary documents
            query: extra criteria applied when selecting molecules to process
            settings: build settings, resolved through EmmetBuildSettings.autoload
            **kwargs: forwarded unchanged to the maggma Builder constructor
        """
        self.molecules = molecules
        self.charges = charges
        self.spins = spins
        self.bonds = bonds
        self.metal_binding = metal_binding
        self.orbitals = orbitals
        self.redox = redox
        self.thermo = thermo
        self.vibes = vibes
        self.summary = summary
        self.query = query or {}
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(
            sources=[
                molecules,
                charges,
                spins,
                bonds,
                metal_binding,
                orbitals,
                redox,
                thermo,
                vibes,
            ],
            targets=[summary],
            **kwargs,
        )
        # If mrun does not connect to the collections automatically, call
        # `.connect()` on every source store and on `self.summary` here.

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Fields shared by every property store, in their original order.
        property_fields = (
            "solvent",
            "lot_solvent",
            "property_id",
            "last_updated",
            "formula_alphabetical",
        )
        with_method = ("molecule_id", "method", "task_id") + property_fields
        without_method = ("molecule_id", "task_id") + property_fields

        index_plan = (
            (
                self.molecules,
                ("molecule_id", "last_updated", "task_ids", "formula_alphabetical"),
            ),
            (self.charges, with_method),
            (self.spins, with_method),
            (self.bonds, with_method),
            # metal_binding is indexed on "method" but not on "task_id"
            (self.metal_binding, ("molecule_id",) + property_fields + ("method",)),
            (self.orbitals, without_method),
            (self.redox, without_method),
            (self.thermo, without_method),
            (self.vibes, without_method),
            (self.summary, ("molecule_id", "last_updated", "formula_alphabetical")),
        )

        for store, fields in index_plan:
            for field in fields:
                store.ensure_index(field)

    def _find_unprocessed(self) -> tuple:
        """
        Find the non-deprecated molecules that have no summary document yet.

        Returns:
            A 3-tuple ``(criteria, molecule_ids, formulas)``: the molecule query
            that was used (``self.query`` plus ``deprecated: False``), the set of
            molecule keys absent from the summary store, and the set of
            ``formula_alphabetical`` values those molecules belong to.
        """
        criteria = dict(self.query)
        criteria["deprecated"] = False

        self.logger.info("Finding documents to process")
        id_field = self.molecules.key
        all_mols = list(
            self.molecules.query(criteria, [id_field, "formula_alphabetical"])
        )

        already_built = set(self.summary.distinct("molecule_id"))
        pending_ids = {m[id_field] for m in all_mols} - already_built
        pending_formulas = {
            m["formula_alphabetical"] for m in all_mols if m[id_field] in pending_ids
        }

        return criteria, pending_ids, pending_formulas

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """
        Prechunk the builder for distributed computation

        Args:
            number_splits: number of chunks to divide the pending formulas into

        Yields:
            dicts of the form
            ``{"query": {"formula_alphabetical": {"$in": [...]}}}``,
            one per chunk of unprocessed formulas
        """

        _, _, to_process_forms = self._find_unprocessed()

        if not to_process_forms:
            # Nothing pending: avoid asking grouper for chunks of size zero.
            return

        chunk_size = ceil(len(to_process_forms) / number_splits)

        for formula_chunk in grouper(to_process_forms, chunk_size):
            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}

    def get_items(self) -> Iterator[List[Dict]]:
        """
        Gets all items to process into summary documents.
        This does no datetime checking; it relies on whether
        molecule IDs are already present in the summary Store

        Returns:
            generator of lists of molecule documents, one list per
            unprocessed formula
        """

        self.logger.info("Summary builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12; it is
        # kept because a tz-aware replacement could change the stored "_bt" value.
        self.timestamp = datetime.utcnow()

        temp_query, to_process_docs, to_process_forms = self._find_unprocessed()

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")

        # Set total for builder bars to have a total
        self.total = len(to_process_forms)

        for formula in to_process_forms:
            mol_query = dict(temp_query)
            mol_query["formula_alphabetical"] = formula
            yield list(self.molecules.query(criteria=mol_query))

    @staticmethod
    def _group_docs(docs: List[Dict[str, Any]], by_method: bool = False):
        """
        Group property docs by solvent, and optionally by method within solvent.

        Docs without a truthy "solvent" are skipped; when grouping by method,
        docs whose "method" is None are skipped as well. If several docs share
        the same key, the last one wins.

        Returns:
            ``(grouped, by_method)`` where ``grouped`` is ``{solvent: doc}`` or,
            when ``by_method`` is True, ``{solvent: {method: doc}}``
        """
        grouped: Dict[str, Any] = {}

        for doc in docs:
            solvent = doc.get("solvent")
            if not solvent:
                continue

            if not by_method:
                grouped[solvent] = doc
                continue

            method = doc.get("method")
            if method is None:
                continue
            grouped.setdefault(solvent, {})[method] = doc

        return (grouped, by_method)

    def process_item(self, items: List[Dict]) -> List[Dict]:
        """
        Process the molecules into MoleculeSummaryDocs

        Args:
            items List[Dict] : a list of MoleculeDocs in dict form, all sharing
                the same formula

        Returns:
            [dict] : a list of new summary docs
        """

        mols = items
        if not mols:
            # An empty group has no formula to report and nothing to build.
            return []

        formula = mols[0]["formula_alphabetical"]
        mol_ids = [m["molecule_id"] for m in mols]
        self.logger.debug(f"Processing {formula} : {mol_ids}")

        # (summary key, source store, group by method as well as solvent)
        property_sources = (
            ("partial_charges", self.charges, True),
            ("partial_spins", self.spins, True),
            ("bonding", self.bonds, True),
            ("metal_binding", self.metal_binding, True),
            ("orbitals", self.orbitals, False),
            ("redox", self.redox, False),
            ("thermo", self.thermo, False),
            ("vibration", self.vibes, False),
        )

        summary_docs = []

        for mol in mols:
            mol_id = mol["molecule_id"]

            d: Dict[str, Any] = {"molecules": mol}
            for key, store, by_method in property_sources:
                d[key] = self._group_docs(
                    list(store.query({"molecule_id": mol_id})), by_method
                )

            # Drop entries that are empty dicts.
            # NOTE(review): _group_docs returns (grouped, by_method) tuples, so
            # this filter can only ever match the "molecules" entry. Confirm
            # whether empty property groups were meant to be pruned here.
            d = {
                k: v for k, v in d.items() if not (isinstance(v, dict) and len(v) == 0)
            }

            summary_doc = MoleculeSummaryDoc.from_docs(molecule_id=mol_id, docs=d)
            summary_docs.append(summary_doc)

        self.logger.debug(f"Produced {len(summary_docs)} summary docs for {formula}")

        return jsanitize([doc.model_dump() for doc in summary_docs], allow_bson=True)

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new documents into the summary collection

        Args:
            items [[dict]]: A list of lists of documents to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Decide on the flattened docs, so a batch of empty lists is a no-op.
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp
        for item in docs:
            item.update({"_bt": self.timestamp})

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} summary documents")
        # Remove any existing summary docs for these molecules before inserting.
        self.summary.remove_docs({self.summary.key: {"$in": molecule_ids}})
        self.summary.update(
            docs=docs,
            key=["molecule_id"],
        )