emmet-builders 0.78.3__py3-none-any.whl → 0.86.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. emmet/builders/abinit/phonon.py +47 -47
  2. emmet/builders/abinit/sound_velocity.py +15 -11
  3. emmet/builders/feff/xas.py +1 -2
  4. emmet/builders/materials/absorption_spectrum.py +25 -14
  5. emmet/builders/materials/alloys.py +10 -11
  6. emmet/builders/materials/chemenv.py +2 -3
  7. emmet/builders/materials/corrected_entries.py +21 -15
  8. emmet/builders/materials/dielectric.py +19 -11
  9. emmet/builders/materials/elasticity.py +44 -33
  10. emmet/builders/materials/electrodes.py +35 -28
  11. emmet/builders/materials/electronic_structure.py +17 -17
  12. emmet/builders/materials/magnetism.py +11 -4
  13. emmet/builders/materials/optimade.py +7 -3
  14. emmet/builders/materials/piezoelectric.py +24 -21
  15. emmet/builders/materials/provenance.py +16 -13
  16. emmet/builders/materials/robocrys.py +2 -3
  17. emmet/builders/materials/substrates.py +9 -8
  18. emmet/builders/materials/summary.py +3 -3
  19. emmet/builders/materials/thermo.py +17 -11
  20. emmet/builders/matscholar/missing_compositions.py +12 -8
  21. emmet/builders/mobility/migration_graph.py +5 -5
  22. emmet/builders/settings.py +21 -17
  23. emmet/builders/utils.py +101 -12
  24. emmet/builders/vasp/materials.py +40 -51
  25. emmet/builders/vasp/mp_potcar_stats.json.gz +0 -0
  26. emmet/builders/vasp/task_validator.py +25 -36
  27. emmet_builders-0.86.0.dist-info/METADATA +37 -0
  28. emmet_builders-0.86.0.dist-info/RECORD +41 -0
  29. {emmet_builders-0.78.3.dist-info → emmet_builders-0.86.0.dist-info}/WHEEL +1 -1
  30. emmet/builders/materials/ml.py +0 -87
  31. emmet/builders/molecules/atomic.py +0 -589
  32. emmet/builders/molecules/bonds.py +0 -324
  33. emmet/builders/molecules/metal_binding.py +0 -526
  34. emmet/builders/molecules/orbitals.py +0 -288
  35. emmet/builders/molecules/redox.py +0 -496
  36. emmet/builders/molecules/summary.py +0 -383
  37. emmet/builders/molecules/thermo.py +0 -500
  38. emmet/builders/molecules/vibration.py +0 -278
  39. emmet/builders/qchem/__init__.py +0 -0
  40. emmet/builders/qchem/molecules.py +0 -734
  41. emmet_builders-0.78.3.dist-info/METADATA +0 -47
  42. emmet_builders-0.78.3.dist-info/RECORD +0 -51
  43. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  44. {emmet_builders-0.78.3.dist-info → emmet_builders-0.86.0.dist-info}/top_level.txt +0 -0
@@ -1,589 +0,0 @@
1
- from collections import defaultdict
2
- from datetime import datetime
3
- from itertools import chain
4
- from math import ceil
5
- from typing import Optional, Iterable, Iterator, List, Dict
6
-
7
- from maggma.builders import Builder
8
- from maggma.core import Store
9
- from maggma.utils import grouper
10
-
11
- from emmet.core.qchem.task import TaskDocument
12
- from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
13
- from emmet.core.molecules.atomic import (
14
- PartialChargesDoc,
15
- PartialSpinsDoc,
16
- CHARGES_METHODS,
17
- SPINS_METHODS,
18
- )
19
- from emmet.core.utils import jsanitize
20
- from emmet.builders.settings import EmmetBuildSettings
21
-
22
-
23
# Author credit for this module.
__author__ = "Evan Spotte-Smith"

# Module-level settings object, instantiated once when the module is imported.
# The builders below load their own settings via EmmetBuildSettings.autoload().
SETTINGS = EmmetBuildSettings()
26
-
27
-
28
class PartialChargesBuilder(Builder):
    """
    The PartialChargesBuilder extracts partial charges data from a MoleculeDoc.

    Various methods can be used to define partial charges, including:
        - Mulliken
        - Restrained Electrostatic Potential (RESP)
        - Critic2
        - Natural Bonding Orbital (NBO) population analysis

    This builder will attempt to build documents for each molecule, in each solvent,
    with each method. For each molecule-solvent-method combination, the
    highest-quality data available (based on level of theory and electronic
    energy) will be used.

    The process is as follows:
        1. Gather MoleculeDocs by formula
        2. For each molecule, group all tasks by solvent.
        3. For each solvent, sort tasks by level of theory and electronic energy
        4. For each method:
            4.1. Find task docs with necessary data to calculate partial charges by that method
            4.2. Take best (defined by level of theory and electronic energy) task
            4.3. Convert TaskDoc to PartialChargesDoc
    """

    def __init__(
        self,
        tasks: Store,
        molecules: Store,
        charges: Store,
        query: Optional[Dict] = None,
        methods: Optional[List] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        """
        Args:
            tasks: source Store of task documents
            molecules: source Store of molecule documents
            charges: target Store that receives the partial charges documents
            query: extra criteria applied when selecting molecules
                (defaults to no extra criteria)
            methods: partial-charge methods to build; falls back to
                CHARGES_METHODS when omitted or empty
            settings: build settings, resolved through EmmetBuildSettings.autoload
            **kwargs: forwarded unchanged to the Builder constructor
        """
        self.tasks = tasks
        self.molecules = molecules
        self.charges = charges
        self.query = query if query else dict()
        self.methods = methods if methods else CHARGES_METHODS
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks, molecules], targets=[charges], **kwargs)
        # NOTE: if mrun does not connect to the collections automatically,
        # call .connect() on self.tasks, self.molecules and self.charges here.

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        for field in ("task_id", "last_updated", "state", "formula_alphabetical"):
            self.tasks.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
        ):
            self.molecules.ensure_index(field)

        # Search index for charges
        for field in (
            "molecule_id",
            "task_id",
            "method",
            "solvent",
            "lot_solvent",
            "property_id",
            "last_updated",
            "formula_alphabetical",
        ):
            self.charges.ensure_index(field)

    def _find_unprocessed(self) -> tuple:
        """
        Work out which molecules still lack partial charges documents.

        Returns:
            A 3-tuple ``(base_query, to_process_docs, to_process_forms)``:
            the user query with ``deprecated: False`` added, the set of molecule
            keys that do not yet appear as ``molecule_id`` in the charges Store,
            and the set of ``formula_alphabetical`` values of those molecules.
        """
        base_query = dict(self.query)
        base_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        mol_key = self.molecules.key
        all_mols = list(
            self.molecules.query(base_query, [mol_key, "formula_alphabetical"])
        )

        processed_docs = set(self.charges.distinct("molecule_id"))
        to_process_docs = {d[mol_key] for d in all_mols} - processed_docs
        to_process_forms = {
            d["formula_alphabetical"] for d in all_mols if d[mol_key] in to_process_docs
        }
        return base_query, to_process_docs, to_process_forms

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """
        Prechunk the builder for distributed computation

        Args:
            number_splits: number of chunks the unprocessed formulas are divided into

        Yields:
            dicts of the form ``{"query": {"formula_alphabetical": {"$in": [...]}}}``
        """
        _, _, to_process_forms = self._find_unprocessed()

        # Nothing to do: avoid asking grouper for chunks of size zero.
        if not to_process_forms:
            return

        N = ceil(len(to_process_forms) / number_splits)

        for formula_chunk in grouper(to_process_forms, N):
            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}

    def get_items(self) -> Iterator[List[Dict]]:
        """
        Gets all items to process into partial charges documents.
        This does no datetime checking; it relies on whether a molecule's ID
        already appears as ``molecule_id`` in the charges Store.

        Returns:
            generator yielding, for each unprocessed formula, the list of
            molecule documents with that formula
        """

        self.logger.info("Partial charges builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime (read back in update_targets).
        # NOTE: datetime.utcnow() is deprecated as of Python 3.12; it is kept so
        # the stored "_bt" value stays a naive datetime.
        self.timestamp = datetime.utcnow()

        base_query, to_process_docs, to_process_forms = self._find_unprocessed()

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")

        # Set total for builder bars to have a total
        self.total = len(to_process_forms)

        for formula in to_process_forms:
            mol_query = dict(base_query)
            mol_query["formula_alphabetical"] = formula
            yield list(self.molecules.query(criteria=mol_query))

    @staticmethod
    def _has_method_data(entry: Dict, method: str) -> bool:
        """Return True if ``entry`` carries non-None data for ``method``, either
        at the top level or under its "output" dict (a missing "output" counts
        as no data)."""
        if entry.get(method) is not None:
            return True
        output = entry.get("output") or {}
        return output.get(method) is not None

    def _fetch_task(self, task_id, formula: str) -> Optional[Dict]:
        """Look up the task document for ``task_id``; if the ID as given is not
        found, retry with the ID converted to int. Returns None if no document
        matches or the ID cannot be converted."""
        criteria = {
            "task_id": task_id,
            "formula_alphabetical": formula,
            "orig": {"$exists": True},
        }
        tdoc = self.tasks.query_one(criteria)
        if tdoc is not None:
            return tdoc

        try:
            int_id = int(task_id)
        except ValueError:
            return None
        return self.tasks.query_one({**criteria, "task_id": int_id})

    def process_item(self, items: List[Dict]) -> List[Dict]:
        """
        Process the tasks into PartialChargesDocs

        Args:
            items List[Dict] : a list of MoleculeDocs in dict form, all sharing
                one formula (the formula of the first item is used for lookups)

        Returns:
            [dict] : a list of new partial charges docs
        """

        # An empty batch has no formula to report and nothing to build.
        if not items:
            return []

        mols = [MoleculeDoc(**item) for item in items]
        formula = mols[0].formula_alphabetical
        mol_ids = [m.molecule_id for m in mols]
        self.logger.debug(f"Processing {formula} : {mol_ids}")

        charges_docs = list()

        for mol in mols:
            # Keep only entries matching the molecule's charge and spin
            # multiplicity, organized by solvent environment.
            by_solvent = defaultdict(list)
            for entry in mol.entries:
                if (
                    entry["charge"] == mol.charge
                    and entry["spin_multiplicity"] == mol.spin_multiplicity
                ):
                    by_solvent[entry["solvent"]].append(entry)

            for entries in by_solvent.values():
                # Ascending sort: the first element is treated as the best entry.
                sorted_entries = sorted(
                    entries,
                    key=lambda x: (
                        sum(evaluate_lot(x["level_of_theory"])),
                        x["energy"],
                    ),
                )

                for method in self.methods:
                    # Best entry that actually has data for this method
                    best_entry = next(
                        (
                            e
                            for e in sorted_entries
                            if self._has_method_data(e, method)
                        ),
                        None,
                    )
                    if best_entry is None:
                        continue

                    tdoc = self._fetch_task(best_entry["task_id"], formula)
                    if tdoc is None:
                        continue

                    doc = PartialChargesDoc.from_task(
                        TaskDocument(**tdoc),
                        molecule_id=mol.molecule_id,
                        preferred_methods=[method],
                        deprecated=False,
                    )

                    charges_docs.append(doc)

        self.logger.debug(f"Produced {len(charges_docs)} charges docs for {formula}")

        return jsanitize([doc.model_dump() for doc in charges_docs], allow_bson=True)

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new documents into the charges collection

        Args:
            items [[dict]]: A list of lists of documents to update, one inner
                list per process_item call
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Check the flattened list: a batch made only of empty lists has
        # nothing to write either.
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp
        for item in docs:
            item.update({"_bt": self.timestamp})

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} partial charges documents")
        # NOTE(review): molecule IDs are matched against the store's key field;
        # presumably the charges Store is keyed on molecule_id -- confirm.
        self.charges.remove_docs({self.charges.key: {"$in": molecule_ids}})
        # Neither molecule_id nor method need to be unique, but the combination must be
        self.charges.update(
            docs=docs,
            key=["molecule_id", "method", "solvent"],
        )
308
-
309
-
310
class PartialSpinsBuilder(Builder):
    """
    The PartialSpinsBuilder extracts partial spin data from a MoleculeDoc.

    Various methods can be used to define partial atomic spins, including:
        - Mulliken
        - Natural Bonding Orbital (NBO) population analysis

    This builder will attempt to build documents for each molecule, in each solvent,
    with each method. For each molecule-solvent-method combination, the
    highest-quality data available (based on level of theory and electronic
    energy) will be used. Molecules with spin multiplicity 1 are skipped.

    The process is as follows:
        1. Gather MoleculeDocs by formula
        2. For each molecule, group all tasks by solvent.
        3. For each solvent, sort tasks by level of theory and electronic energy
        4. For each method:
            4.1. Find task docs with necessary data to calculate partial spins by that method
            4.2. Take best (defined by level of theory and electronic energy) task
            4.3. Convert TaskDoc to PartialSpinsDoc
    """

    def __init__(
        self,
        tasks: Store,
        molecules: Store,
        spins: Store,
        query: Optional[Dict] = None,
        methods: Optional[List] = None,
        settings: Optional[EmmetBuildSettings] = None,
        **kwargs,
    ):
        """
        Args:
            tasks: source Store of task documents
            molecules: source Store of molecule documents
            spins: target Store that receives the partial spins documents
            query: extra criteria applied when selecting molecules
                (defaults to no extra criteria)
            methods: partial-spin methods to build; falls back to
                SPINS_METHODS when omitted or empty
            settings: build settings, resolved through EmmetBuildSettings.autoload
            **kwargs: forwarded unchanged to the Builder constructor
        """
        self.tasks = tasks
        self.molecules = molecules
        self.spins = spins
        self.query = query if query else dict()
        self.methods = methods if methods else SPINS_METHODS
        self.settings = EmmetBuildSettings.autoload(settings)
        self.kwargs = kwargs

        super().__init__(sources=[tasks, molecules], targets=[spins], **kwargs)
        # NOTE: if mrun does not connect to the collections automatically,
        # call .connect() on self.tasks, self.molecules and self.spins here.

    def ensure_indexes(self):
        """
        Ensures indices on the collections needed for building
        """

        # Basic search index for tasks
        for field in ("task_id", "last_updated", "state", "formula_alphabetical"):
            self.tasks.ensure_index(field)

        # Search index for molecules
        for field in (
            "molecule_id",
            "last_updated",
            "task_ids",
            "formula_alphabetical",
        ):
            self.molecules.ensure_index(field)

        # Search index for spins
        for field in (
            "molecule_id",
            "task_id",
            "method",
            "solvent",
            "lot_solvent",
            "property_id",
            "last_updated",
            "formula_alphabetical",
        ):
            self.spins.ensure_index(field)

    def _find_unprocessed(self) -> tuple:
        """
        Work out which molecules still lack partial spins documents.

        Returns:
            A 3-tuple ``(base_query, to_process_docs, to_process_forms)``:
            the user query with ``deprecated: False`` added, the set of molecule
            keys that do not yet appear as ``molecule_id`` in the spins Store,
            and the set of ``formula_alphabetical`` values of those molecules.
        """
        base_query = dict(self.query)
        base_query["deprecated"] = False

        self.logger.info("Finding documents to process")
        mol_key = self.molecules.key
        all_mols = list(
            self.molecules.query(base_query, [mol_key, "formula_alphabetical"])
        )

        processed_docs = set(self.spins.distinct("molecule_id"))
        to_process_docs = {d[mol_key] for d in all_mols} - processed_docs
        to_process_forms = {
            d["formula_alphabetical"] for d in all_mols if d[mol_key] in to_process_docs
        }
        return base_query, to_process_docs, to_process_forms

    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
        """
        Prechunk the builder for distributed computation

        Args:
            number_splits: number of chunks the unprocessed formulas are divided into

        Yields:
            dicts of the form ``{"query": {"formula_alphabetical": {"$in": [...]}}}``
        """
        _, _, to_process_forms = self._find_unprocessed()

        # Nothing to do: avoid asking grouper for chunks of size zero.
        if not to_process_forms:
            return

        N = ceil(len(to_process_forms) / number_splits)

        for formula_chunk in grouper(to_process_forms, N):
            yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}

    def get_items(self) -> Iterator[List[Dict]]:
        """
        Gets all items to process into partial spins documents.
        This does no datetime checking; it relies on whether a molecule's ID
        already appears as ``molecule_id`` in the spins Store.

        Returns:
            generator yielding, for each unprocessed formula, the list of
            molecule documents with that formula
        """

        self.logger.info("Partial spins builder started")
        self.logger.info("Setting indexes")
        self.ensure_indexes()

        # Save timestamp to mark buildtime (read back in update_targets).
        # NOTE: datetime.utcnow() is deprecated as of Python 3.12; it is kept so
        # the stored "_bt" value stays a naive datetime.
        self.timestamp = datetime.utcnow()

        base_query, to_process_docs, to_process_forms = self._find_unprocessed()

        self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")

        # Set total for builder bars to have a total
        self.total = len(to_process_forms)

        for formula in to_process_forms:
            mol_query = dict(base_query)
            mol_query["formula_alphabetical"] = formula
            yield list(self.molecules.query(criteria=mol_query))

    @staticmethod
    def _has_method_data(entry: Dict, method: str) -> bool:
        """Return True if ``entry`` carries non-None data for ``method``, either
        at the top level or under its "output" dict (a missing "output" counts
        as no data)."""
        if entry.get(method) is not None:
            return True
        output = entry.get("output") or {}
        return output.get(method) is not None

    def _fetch_task(self, task_id, formula: str) -> Optional[Dict]:
        """Look up the task document for ``task_id``; if the ID as given is not
        found, retry with the ID converted to int. Returns None if no document
        matches or the ID cannot be converted."""
        criteria = {
            "task_id": task_id,
            "formula_alphabetical": formula,
            "orig": {"$exists": True},
        }
        tdoc = self.tasks.query_one(criteria)
        if tdoc is not None:
            return tdoc

        try:
            int_id = int(task_id)
        except ValueError:
            return None
        return self.tasks.query_one({**criteria, "task_id": int_id})

    def process_item(self, items: List[Dict]) -> List[Dict]:
        """
        Process the tasks into PartialSpinsDocs

        Args:
            items List[Dict] : a list of MoleculeDocs in dict form, all sharing
                one formula (the formula of the first item is used for lookups)

        Returns:
            [dict] : a list of new partial spins docs
        """

        # An empty batch has no formula to report and nothing to build.
        if not items:
            return []

        mols = [MoleculeDoc(**item) for item in items]
        formula = mols[0].formula_alphabetical
        mol_ids = [m.molecule_id for m in mols]
        self.logger.debug(f"Processing {formula} : {mol_ids}")

        spins_docs = list()

        for mol in mols:
            # Molecule with spin multiplicity 1 has no partial spins
            if mol.spin_multiplicity == 1:
                continue

            # Keep only entries matching the molecule's charge and spin
            # multiplicity, organized by solvent environment.
            by_solvent = defaultdict(list)
            for entry in mol.entries:
                if (
                    entry["charge"] == mol.charge
                    and entry["spin_multiplicity"] == mol.spin_multiplicity
                ):
                    by_solvent[entry["solvent"]].append(entry)

            for entries in by_solvent.values():
                # Ascending sort: the first element is treated as the best entry.
                sorted_entries = sorted(
                    entries,
                    key=lambda x: (
                        sum(evaluate_lot(x["level_of_theory"])),
                        x["energy"],
                    ),
                )

                for method in self.methods:
                    # Best entry that actually has data for this method
                    best_entry = next(
                        (
                            e
                            for e in sorted_entries
                            if self._has_method_data(e, method)
                        ),
                        None,
                    )
                    if best_entry is None:
                        continue

                    tdoc = self._fetch_task(best_entry["task_id"], formula)
                    if tdoc is None:
                        continue

                    doc = PartialSpinsDoc.from_task(
                        TaskDocument(**tdoc),
                        molecule_id=mol.molecule_id,
                        preferred_methods=[method],
                        deprecated=False,
                    )

                    spins_docs.append(doc)

        self.logger.debug(
            f"Produced {len(spins_docs)} partial spins docs for {formula}"
        )

        return jsanitize([doc.model_dump() for doc in spins_docs], allow_bson=True)

    def update_targets(self, items: List[List[Dict]]):
        """
        Inserts the new documents into the spins collection

        Args:
            items [[dict]]: A list of lists of documents to update, one inner
                list per process_item call
        """

        docs = list(chain.from_iterable(items))  # type: ignore

        # Check the flattened list: a batch made only of empty lists has
        # nothing to write either.
        if not docs:
            self.logger.info("No items to update")
            return

        # Add timestamp
        for item in docs:
            item.update({"_bt": self.timestamp})

        molecule_ids = list({item["molecule_id"] for item in docs})

        self.logger.info(f"Updating {len(docs)} partial spins documents")
        # NOTE(review): molecule IDs are matched against the store's key field;
        # presumably the spins Store is keyed on molecule_id -- confirm.
        self.spins.remove_docs({self.spins.key: {"$in": molecule_ids}})
        # Neither molecule_id nor method need to be unique, but the combination must be
        self.spins.update(
            docs=docs,
            key=["molecule_id", "method", "solvent"],
        )