emmet-builders 0.84.2__py3-none-any.whl → 0.86.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. emmet/builders/abinit/phonon.py +27 -25
  2. emmet/builders/abinit/sound_velocity.py +15 -11
  3. emmet/builders/feff/xas.py +1 -2
  4. emmet/builders/materials/absorption_spectrum.py +25 -14
  5. emmet/builders/materials/alloys.py +3 -4
  6. emmet/builders/materials/chemenv.py +2 -3
  7. emmet/builders/materials/corrected_entries.py +15 -9
  8. emmet/builders/materials/dielectric.py +19 -11
  9. emmet/builders/materials/elasticity.py +44 -33
  10. emmet/builders/materials/electrodes.py +24 -19
  11. emmet/builders/materials/electronic_structure.py +17 -17
  12. emmet/builders/materials/magnetism.py +11 -4
  13. emmet/builders/materials/optimade.py +7 -3
  14. emmet/builders/materials/piezoelectric.py +24 -21
  15. emmet/builders/materials/provenance.py +15 -12
  16. emmet/builders/materials/robocrys.py +2 -3
  17. emmet/builders/materials/substrates.py +9 -8
  18. emmet/builders/materials/summary.py +3 -3
  19. emmet/builders/materials/thermo.py +17 -11
  20. emmet/builders/matscholar/missing_compositions.py +12 -8
  21. emmet/builders/mobility/migration_graph.py +5 -5
  22. emmet/builders/settings.py +21 -17
  23. emmet/builders/utils.py +15 -10
  24. emmet/builders/vasp/materials.py +32 -16
  25. emmet/builders/vasp/task_validator.py +15 -11
  26. {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/METADATA +21 -36
  27. emmet_builders-0.86.0.dist-info/RECORD +41 -0
  28. {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/WHEEL +1 -1
  29. emmet/builders/materials/ml.py +0 -87
  30. emmet/builders/molecules/atomic.py +0 -589
  31. emmet/builders/molecules/bonds.py +0 -324
  32. emmet/builders/molecules/metal_binding.py +0 -526
  33. emmet/builders/molecules/orbitals.py +0 -288
  34. emmet/builders/molecules/redox.py +0 -496
  35. emmet/builders/molecules/summary.py +0 -383
  36. emmet/builders/molecules/thermo.py +0 -500
  37. emmet/builders/molecules/vibration.py +0 -278
  38. emmet/builders/qchem/__init__.py +0 -0
  39. emmet/builders/qchem/molecules.py +0 -734
  40. emmet_builders-0.84.2.dist-info/RECORD +0 -52
  41. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  42. {emmet_builders-0.84.2.dist-info → emmet_builders-0.86.0.dist-info}/top_level.txt +0 -0
@@ -1,324 +0,0 @@
1
- from collections import defaultdict
2
- from datetime import datetime
3
- from itertools import chain
4
- from math import ceil
5
- from typing import Optional, Iterable, Iterator, List, Dict
6
-
7
- from maggma.builders import Builder
8
- from maggma.core import Store
9
- from maggma.utils import grouper
10
-
11
- from emmet.core.qchem.task import TaskDocument
12
- from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
13
- from emmet.core.molecules.bonds import MoleculeBondingDoc, BOND_METHODS
14
- from emmet.core.utils import jsanitize
15
- from emmet.builders.settings import EmmetBuildSettings
16
-
17
-
18
- __author__ = "Evan Spotte-Smith"
19
-
20
- SETTINGS = EmmetBuildSettings()
21
-
22
-
23
class BondingBuilder(Builder):
    """
    The BondingBuilder defines the bonds in a MoleculeDoc.

    Various methods can be used to define bonding, including:
        - OpenBabelNN + metal_edge_extender: Combining the bond detection
            algorithms in OpenBabel (OpenBabelNN in pymatgen) with a heuristic
            to add metal coordinate bonds (metal_edge_extender in pymatgen)
        - critic2: Using critical points of the electron density to define bonds
        - nbo: Using Natural Bonding Orbital analysis to define bonds and other
            interatomic interactions

    NOTE: Only NBO7 can be used to generate bonding. Bonding (especially when
        metals are involved) is unreliable with earlier versions of NBO!

    This builder will attempt to build documents for each molecule, in each
    solvent, with each method. For each molecule-solvent-method combination,
    the highest-quality data available (based on level of theory and
    electronic energy) will be used.

    The process is as follows:
        1. Gather MoleculeDocs by formula
        2. For each molecule, group all tasks by solvent.
        3. For each solvent, sort tasks by level of theory and electronic energy
        4. For each method:
            4.1. Find task docs with necessary data to define bonding by that
                method
            4.2. Take best (defined by level of theory and electronic energy)
                task
            4.3. Convert TaskDoc to MoleculeBondingDoc
    """

    def __init__(
        self,
        tasks: Store,
        molecules: Store,
        bonds: Store,
        query: Optional[Dict] = None,
        methods: Optional[List] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        """
        Args:
            tasks: Store of task documents (source)
            molecules: Store of molecule documents (source)
            bonds: Store that receives the bonding documents (target)
            query: Extra criteria used to restrict which molecules are
                processed; an empty dict is used when omitted
            methods: Bonding methods to build documents for; falls back to
                BOND_METHODS when omitted or empty
            settings: Build settings, resolved through
                EmmetBuildSettings.autoload
            **kwargs: Forwarded unchanged to the Builder base class
        """
        self.tasks = tasks
        self.molecules = molecules
        self.bonds = bonds
        self.query = query if query else dict()
        self.methods = methods if methods else BOND_METHODS
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks, molecules], targets=[bonds], **kwargs)
        # Uncomment in case of issue with mrun not connecting automatically to collections
        # for i in [self.tasks, self.molecules, self.bonds]:
        #     try:
        #         i.connect()
        #     except Exception as e:
        #         print("Could not connect,", e)

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        for field in ("task_id", "last_updated", "state", "formula_alphabetical"):
            self.tasks.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
        ):
            self.molecules.ensure_index(field)

        # Search index for bonds
        for field in (
            "molecule_id",
            "method",
            "task_id",
            "solvent",
            "lot_solvent",
            "property_id",
            "last_updated",
            "formula_alphabetical",
        ):
            self.bonds.ensure_index(field)

    def _find_unprocessed(self) -> tuple:
        """
        Work out which molecules still lack bonding documents.

        Shared by prechunk and get_items so both select work identically.

        Returns:
            A 3-tuple (base_query, molecule_ids, formulas) where base_query is
            self.query restricted to non-deprecated molecules, molecule_ids is
            the set of molecule keys with no entry in the bonds Store, and
            formulas is the set of alphabetical formulas of those molecules.
        """

        base_query = dict(self.query)
        base_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        all_mols = list(
            self.molecules.query(
                base_query, [self.molecules.key, "formula_alphabetical"]
            )
        )

        # A molecule counts as processed once any bonding doc refers to it
        processed_docs = set(self.bonds.distinct("molecule_id"))
        to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
        to_process_forms = {
            d["formula_alphabetical"]
            for d in all_mols
            if d[self.molecules.key] in to_process_docs
        }

        return base_query, to_process_docs, to_process_forms

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """
        Prechunk the builder for distributed computation

        Args:
            number_splits: Number of chunks the pending formulas should be
                divided into

        Returns:
            generator of dicts, each holding a "query" that selects one chunk
            of formulas
        """

        _, _, to_process_forms = self._find_unprocessed()

        # Nothing pending: avoid asking grouper for chunks of size zero
        if not to_process_forms:
            return

        N = ceil(len(to_process_forms) / number_splits)

        for formula_chunk in grouper(to_process_forms, N):
            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}

    def get_items(self) -> Iterator[List[Dict]]:
        """
        Gets all items to process into bonding documents.
        This does no datetime checking; relying on whether
        molecule_ids are included in the bonds Store

        Returns:
            generator of lists of molecule documents (one list per formula)
            to process into bonding documents
        """

        self.logger.info("Bonding builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime
        self.timestamp = datetime.utcnow()

        # Get all unprocessed molecules
        temp_query, to_process_docs, to_process_forms = self._find_unprocessed()

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")

        # Set total for builder bars to have a total
        self.total = len(to_process_forms)

        for formula in to_process_forms:
            mol_query = dict(temp_query)
            mol_query["formula_alphabetical"] = formula
            molecules = list(self.molecules.query(criteria=mol_query))

            yield molecules

    @staticmethod
    def _entries_for_method(sorted_entries: List[Dict], method: str) -> List[Dict]:
        """
        Filter entries down to those usable for one bonding method.

        Args:
            sorted_entries: Entries of a molecule in one solvent, best first
            method: Name of the bonding method

        Returns:
            The usable entries, in the same order as sorted_entries
        """

        if method == "OpenBabelNN + metal_edge_extender":
            # This is sort of silly. Since, at the MoleculeDoc level,
            # the structures have to be identical, bonding defined
            # using heuristic methods like OpenBabel should always
            # be identical.
            # TODO: Decide if only one OpenBabelNN + m_e_e doc
            # TODO: should be allowed.
            return sorted_entries

        relevant_entries = [
            e
            for e in sorted_entries
            if e.get(method) is not None or e["output"].get(method) is not None
        ]

        if method == "nbo":
            # Only allow NBO7 to be used. No earlier versions can be
            # relied upon for bonding
            # NOTE(review): either rem flag below is taken as evidence of
            # NBO7 -- confirm that "run_nbo6" cannot indicate an older NBO.
            relevant_entries = [
                e
                for e in relevant_entries
                if e["orig"]["rem"].get("run_nbo6", False)
                or e["orig"]["rem"].get("nbo_external", False)
            ]

        return relevant_entries

    def _fetch_task(self, task_id, formula: str) -> Optional[Dict]:
        """
        Look up the task document behind an entry.

        Args:
            task_id: Task identifier as recorded on the entry
            formula: Alphabetical formula the task must match

        Returns:
            The task document as a dict, or None if no match was found
        """

        tdoc = self.tasks.query_one(
            {
                "task_id": task_id,
                "formula_alphabetical": formula,
                "orig": {"$exists": True},
            }
        )
        if tdoc is not None:
            return tdoc

        # Retry with the id coerced to int
        # (presumably some task collections store integer ids -- confirm)
        try:
            int_task_id = int(task_id)
        except (ValueError, TypeError):
            return None

        return self.tasks.query_one(
            {
                "task_id": int_task_id,
                "formula_alphabetical": formula,
                "orig": {"$exists": True},
            }
        )

    def process_item(self, items: List[Dict]) -> List[Dict]:
        """
        Process the molecules of one formula into MoleculeBondingDocs

        Args:
            items List[Dict] : a list of MoleculeDocs in dict form

        Returns:
            [dict] : a list of new bonding docs
        """

        # An empty batch has no formula to report and nothing to build
        if not items:
            return []

        mols = [MoleculeDoc(**item) for item in items]
        formula = mols[0].formula_alphabetical
        mol_ids = [m.molecule_id for m in mols]
        self.logger.debug(f"Processing {formula} : {mol_ids}")

        bonding_docs = []

        for mol in mols:
            # Organize by solvent environment, keeping only entries whose
            # charge and spin multiplicity match the molecule itself
            by_solvent = defaultdict(list)
            for entry in mol.entries:
                if (
                    entry["charge"] == mol.charge
                    and entry["spin_multiplicity"] == mol.spin_multiplicity
                ):
                    by_solvent[entry["solvent"]].append(entry)

            for entries in by_solvent.values():
                # Ascending sort: the first element is the preferred entry
                sorted_entries = sorted(
                    entries,
                    key=lambda x: (
                        sum(evaluate_lot(x["level_of_theory"])),
                        x["energy"],
                    ),
                )

                for method in self.methods:
                    # For each method, grab entries that have the relevant data
                    relevant_entries = self._entries_for_method(
                        sorted_entries, method
                    )
                    if not relevant_entries:
                        continue

                    # Grab task document of best entry
                    best_entry = relevant_entries[0]
                    tdoc = self._fetch_task(best_entry["task_id"], formula)
                    if tdoc is None:
                        continue

                    doc = MoleculeBondingDoc.from_task(
                        TaskDocument(**tdoc),
                        molecule_id=mol.molecule_id,
                        preferred_methods=[method],
                        deprecated=False,
                    )
                    bonding_docs.append(doc)

        self.logger.debug(f"Produced {len(bonding_docs)} bonding docs for {formula}")

        return jsanitize([doc.model_dump() for doc in bonding_docs], allow_bson=True)

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new documents into the bonds collection

        Args:
            items [[dict]]: A list of lists of documents to update
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Check the flattened docs: a batch made only of empty lists has
        # nothing to write either
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp
        for item in docs:
            item.update(
                {
                    "_bt": self.timestamp,
                }
            )

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} bonding documents")
        # NOTE(review): molecule ids are matched against the bonds Store key;
        # this only clears stale docs if that key is "molecule_id" -- confirm.
        self.bonds.remove_docs({self.bonds.key: {"$in": molecule_ids}})
        # Neither molecule_id nor method need to be unique, but the combination must be
        self.bonds.update(
            docs=docs,
            key=["molecule_id", "method", "solvent"],
        )