emmet-builders 0.84.10rc1__py3-none-any.whl → 0.85.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of emmet-builders might be problematic. Click here for more details.

Files changed (37)
  1. emmet/builders/abinit/phonon.py +12 -14
  2. emmet/builders/abinit/sound_velocity.py +1 -1
  3. emmet/builders/materials/absorption_spectrum.py +16 -10
  4. emmet/builders/materials/alloys.py +1 -1
  5. emmet/builders/materials/corrected_entries.py +1 -1
  6. emmet/builders/materials/dielectric.py +10 -7
  7. emmet/builders/materials/elasticity.py +13 -9
  8. emmet/builders/materials/electrodes.py +1 -1
  9. emmet/builders/materials/electronic_structure.py +1 -1
  10. emmet/builders/materials/magnetism.py +2 -1
  11. emmet/builders/materials/piezoelectric.py +23 -19
  12. emmet/builders/materials/provenance.py +3 -4
  13. emmet/builders/materials/substrates.py +2 -2
  14. emmet/builders/materials/summary.py +3 -3
  15. emmet/builders/settings.py +14 -9
  16. emmet/builders/utils.py +5 -4
  17. emmet/builders/vasp/materials.py +11 -4
  18. emmet/builders/vasp/task_validator.py +3 -1
  19. {emmet_builders-0.84.10rc1.dist-info → emmet_builders-0.85.0.dist-info}/METADATA +7 -30
  20. emmet_builders-0.85.0.dist-info/RECORD +41 -0
  21. emmet/builders/materials/ml.py +0 -101
  22. emmet/builders/molecules/atomic.py +0 -592
  23. emmet/builders/molecules/bonds.py +0 -329
  24. emmet/builders/molecules/electric.py +0 -287
  25. emmet/builders/molecules/metal_binding.py +0 -528
  26. emmet/builders/molecules/orbitals.py +0 -292
  27. emmet/builders/molecules/redox.py +0 -502
  28. emmet/builders/molecules/summary.py +0 -406
  29. emmet/builders/molecules/thermo.py +0 -505
  30. emmet/builders/molecules/trajectory.py +0 -530
  31. emmet/builders/molecules/vibration.py +0 -282
  32. emmet/builders/qchem/__init__.py +0 -0
  33. emmet/builders/qchem/molecules.py +0 -745
  34. emmet_builders-0.84.10rc1.dist-info/RECORD +0 -54
  35. /emmet/builders/{molecules/__init__.py → py.typed} +0 -0
  36. {emmet_builders-0.84.10rc1.dist-info → emmet_builders-0.85.0.dist-info}/WHEEL +0 -0
  37. {emmet_builders-0.84.10rc1.dist-info → emmet_builders-0.85.0.dist-info}/top_level.txt +0 -0
@@ -1,530 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from collections import defaultdict
4
- from datetime import datetime
5
- from itertools import chain
6
- from math import ceil
7
-
8
- from maggma.builders import Builder
9
- from maggma.core import Store
10
- from maggma.utils import grouper
11
-
12
- from emmet.builders.settings import EmmetBuildSettings
13
- from emmet.core.molecules.trajectory import ForcesDoc, TrajectoryDoc
14
- from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
15
- from emmet.core.qchem.task import TaskDocument
16
- from emmet.core.utils import jsanitize
17
-
18
- from typing import TYPE_CHECKING
19
-
20
- if TYPE_CHECKING:
21
- from collections.abc import Iterable, Iterator
22
-
23
- __author__ = "Evan Spotte-Smith"
24
-
25
- SETTINGS = EmmetBuildSettings()
26
-
27
-
28
- class ForcesBuilder(Builder):
29
- """
30
- The ForcesBuilder extracts the highest-quality force data from a
31
- MoleculeDoc (lowest electronic energy, highest level of theory for
32
- each solvent available).
33
-
34
- The process is as follows:
35
- 1. Gather MoleculeDocs by species hash
36
- 2. For each doc, sort tasks by solvent
37
- 3. For each solvent, grab the best TaskDoc (doc with force
38
- information that has the highest level of theory with lowest
39
- electronic energy for the molecule)
40
- 4. Convert TaskDoc to ForcesDoc
41
-
42
- """
43
-
44
- def __init__(
45
- self,
46
- tasks: Store,
47
- molecules: Store,
48
- forces: Store,
49
- query: dict | None = None,
50
- settings: EmmetBuildSettings | None = None,
51
- **kwargs,
52
- ):
53
- self.tasks = tasks
54
- self.molecules = molecules
55
- self.forces = forces
56
- self.query = query if query else dict()
57
- self.settings = EmmetBuildSettings.autoload(settings)
58
- self.kwargs = kwargs
59
-
60
- super().__init__(sources=[tasks, molecules], targets=[forces], **kwargs)
61
- # Uncomment in case of issue with mrun not connecting automatically to collections
62
- # for i in [self.tasks, self.molecules, self.forces]:
63
- # try:
64
- # i.connect()
65
- # except Exception as e:
66
- # print("Could not connect,", e)
67
-
68
- def ensure_indexes(self):
69
- """
70
- Ensures indices on the collections needed for building
71
- """
72
-
73
- # Basic search index for tasks
74
- self.tasks.ensure_index("task_id")
75
- self.tasks.ensure_index("last_updated")
76
- self.tasks.ensure_index("state")
77
- self.tasks.ensure_index("formula_alphabetical")
78
- self.tasks.ensure_index("species_hash")
79
-
80
- # Search index for molecules
81
- self.molecules.ensure_index("molecule_id")
82
- self.molecules.ensure_index("last_updated")
83
- self.molecules.ensure_index("task_ids")
84
- self.molecules.ensure_index("formula_alphabetical")
85
- self.molecules.ensure_index("species_hash")
86
-
87
- # Search index for force properties
88
- self.forces.ensure_index("molecule_id")
89
- self.forces.ensure_index("task_id")
90
- self.forces.ensure_index("solvent")
91
- self.forces.ensure_index("lot_solvent")
92
- self.forces.ensure_index("property_id")
93
- self.forces.ensure_index("last_updated")
94
- self.forces.ensure_index("formula_alphabetical")
95
-
96
- def prechunk(self, number_splits: int) -> Iterable[dict]: # pragma: no cover
97
- """Prechunk the builder for distributed computation"""
98
-
99
- temp_query = dict(self.query)
100
- temp_query["deprecated"] = False
101
-
102
- self.logger.info("Finding documents to process")
103
- all_mols = list(
104
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
105
- )
106
-
107
- processed_docs = set([e for e in self.forces.distinct("molecule_id")])
108
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
109
- to_process_hashes = {
110
- d["species_hash"]
111
- for d in all_mols
112
- if d[self.molecules.key] in to_process_docs
113
- }
114
-
115
- N = ceil(len(to_process_hashes) / number_splits)
116
-
117
- for hash_chunk in grouper(to_process_hashes, N):
118
- query = dict(temp_query)
119
- query["species_hash"] = {"$in": list(hash_chunk)}
120
- yield {"query": query}
121
-
122
- def get_items(self) -> Iterator[list[dict]]:
123
- """
124
- Gets all items to process into force documents.
125
- This does no datetime checking; relying on on whether
126
- task_ids are included in the forces Store
127
-
128
- Returns:
129
- generator or list relevant tasks and molecules to process into documents
130
- """
131
-
132
- self.logger.info("Forces builder started")
133
- self.logger.info("Setting indexes")
134
- self.ensure_indexes()
135
-
136
- # Save timestamp to mark buildtime
137
- self.timestamp = datetime.utcnow()
138
-
139
- # Get all processed molecules
140
- temp_query = dict(self.query)
141
- temp_query["deprecated"] = False
142
-
143
- self.logger.info("Finding documents to process")
144
- all_mols = list(
145
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
146
- )
147
-
148
- processed_docs = set([e for e in self.forces.distinct("molecule_id")])
149
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
150
- to_process_hashes = {
151
- d["species_hash"]
152
- for d in all_mols
153
- if d[self.molecules.key] in to_process_docs
154
- }
155
-
156
- self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
157
- self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
158
-
159
- # Set total for builder bars to have a total
160
- self.total = len(to_process_hashes)
161
-
162
- for shash in to_process_hashes:
163
- mol_query = dict(temp_query)
164
- mol_query["species_hash"] = shash
165
- molecules = list(self.molecules.query(criteria=mol_query))
166
-
167
- yield molecules
168
-
169
- def process_item(self, items: list[dict]) -> list[dict]:
170
- """
171
- Process the tasks into ForcesDoc
172
-
173
- Args:
174
- items list[dict] : a list of MoleculeDocs in dict form
175
-
176
- Returns:
177
- [dict] : a list of new forces docs
178
- """
179
-
180
- mols = [MoleculeDoc(**item) for item in items]
181
- shash = mols[0].species_hash
182
- mol_ids = [m.molecule_id for m in mols]
183
- self.logger.debug(f"Processing {shash} : {mol_ids}")
184
-
185
- force_docs = list()
186
-
187
- for mol in mols:
188
- force_entries = [
189
- e
190
- for e in mol.entries
191
- if e["charge"] == mol.charge and e["task_type"] == "Force"
192
- ]
193
-
194
- # Organize by solvent environment
195
- by_solvent = defaultdict(list)
196
- for entry in force_entries:
197
- by_solvent[entry["solvent"]].append(entry)
198
-
199
- for solvent, entries in by_solvent.items():
200
- # No force calculations
201
- if len(entries) == 0:
202
- continue
203
- else:
204
- best = sorted(
205
- entries,
206
- key=lambda x: (
207
- sum(evaluate_lot(x["level_of_theory"])),
208
- x["energy"],
209
- ),
210
- )[0]
211
- task = best["task_id"]
212
-
213
- tdoc = self.tasks.query_one(
214
- {
215
- "task_id": task,
216
- "species_hash": shash,
217
- "orig": {"$exists": True},
218
- }
219
- )
220
-
221
- if tdoc is None:
222
- try:
223
- tdoc = self.tasks.query_one(
224
- {
225
- "task_id": int(task),
226
- "species_hash": shash,
227
- "orig": {"$exists": True},
228
- }
229
- )
230
- except ValueError:
231
- tdoc = None
232
-
233
- if tdoc is None:
234
- continue
235
-
236
- task_doc = TaskDocument(**tdoc)
237
-
238
- if task_doc is None:
239
- continue
240
-
241
- force_doc = ForcesDoc.from_task(
242
- task_doc, molecule_id=mol.molecule_id, deprecated=False
243
- )
244
- force_docs.append(force_doc)
245
-
246
- self.logger.debug(f"Produced {len(force_docs)} force docs for {shash}")
247
-
248
- return jsanitize([doc.model_dump() for doc in force_docs], allow_bson=True)
249
-
250
- def update_targets(self, items: list[list[dict]]):
251
- """
252
- Inserts the new force docs into the forces collection
253
-
254
- Args:
255
- items [[dict]]: A list of documents to update
256
- """
257
-
258
- docs = list(chain.from_iterable(items)) # type: ignore
259
-
260
- # Add timestamp
261
- for item in docs:
262
- item.update(
263
- {
264
- "_bt": self.timestamp,
265
- }
266
- )
267
-
268
- molecule_ids = list({item["molecule_id"] for item in docs})
269
-
270
- if len(items) > 0:
271
- self.logger.info(f"Updating {len(docs)} force documents")
272
- self.forces.remove_docs({self.forces.key: {"$in": molecule_ids}})
273
- self.forces.update(
274
- docs=docs,
275
- key=["molecule_id", "solvent"],
276
- )
277
- else:
278
- self.logger.info("No items to update")
279
-
280
-
281
- class TrajectoryBuilder(Builder):
282
- """
283
- The TrajectoryBuilder extracts the highest-quality optimization trajectory data from a
284
- MoleculeDoc. In general, this will mean that the geometry optimization calculation(s)
285
- corresponding to the structure(s) that make up this molecule will be used to extract
286
- optimization trajectories.
287
-
288
- The process is as follows:
289
- 1. Gather MoleculeDocs by species hash
290
- 2. For each doc, sort tasks by solvent
291
- 3. For each solvent, grab the best TaskDoc (geometry optimization calculation
292
- that has the highest level of theory with lowest
293
- electronic energy for the molecule)
294
- 4. Convert TaskDoc to TrajectoryDoc
295
- """
296
-
297
- def __init__(
298
- self,
299
- tasks: Store,
300
- molecules: Store,
301
- trajectories: Store,
302
- query: dict | None = None,
303
- settings: EmmetBuildSettings | None = None,
304
- **kwargs,
305
- ):
306
- self.tasks = tasks
307
- self.molecules = molecules
308
- self.trajectories = trajectories
309
- self.query = query if query else dict()
310
- self.settings = EmmetBuildSettings.autoload(settings)
311
- self.kwargs = kwargs
312
-
313
- super().__init__(sources=[tasks, molecules], targets=[trajectories], **kwargs)
314
- # Uncomment in case of issue with mrun not connecting automatically to collections
315
- # for i in [self.tasks, self.molecules, self.trajectories]:
316
- # try:
317
- # i.connect()
318
- # except Exception as e:
319
- # print("Could not connect,", e)
320
-
321
- def ensure_indexes(self):
322
- """
323
- Ensures indices on the collections needed for building
324
- """
325
-
326
- # Basic search index for tasks
327
- self.tasks.ensure_index("task_id")
328
- self.tasks.ensure_index("last_updated")
329
- self.tasks.ensure_index("state")
330
- self.tasks.ensure_index("formula_alphabetical")
331
- self.tasks.ensure_index("species_hash")
332
-
333
- # Search index for molecules
334
- self.molecules.ensure_index("molecule_id")
335
- self.molecules.ensure_index("last_updated")
336
- self.molecules.ensure_index("task_ids")
337
- self.molecules.ensure_index("formula_alphabetical")
338
- self.molecules.ensure_index("species_hash")
339
-
340
- # Search index for geometry optimization trajectory properties
341
- self.trajectories.ensure_index("molecule_id")
342
- self.trajectories.ensure_index("task_id")
343
- self.trajectories.ensure_index("solvent")
344
- self.trajectories.ensure_index("lot_solvent")
345
- self.trajectories.ensure_index("property_id")
346
- self.trajectories.ensure_index("last_updated")
347
- self.trajectories.ensure_index("formula_alphabetical")
348
-
349
- def prechunk(self, number_splits: int) -> Iterable[dict]: # pragma: no cover
350
- """Prechunk the builder for distributed computation"""
351
-
352
- temp_query = dict(self.query)
353
- temp_query["deprecated"] = False
354
-
355
- self.logger.info("Finding documents to process")
356
- all_mols = list(
357
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
358
- )
359
-
360
- processed_docs = set([e for e in self.trajectories.distinct("molecule_id")])
361
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
362
- to_process_hashes = {
363
- d["species_hash"]
364
- for d in all_mols
365
- if d[self.molecules.key] in to_process_docs
366
- }
367
-
368
- N = ceil(len(to_process_hashes) / number_splits)
369
-
370
- for hash_chunk in grouper(to_process_hashes, N):
371
- yield {"query": {"species_hash": {"$in": list(hash_chunk)}}}
372
-
373
- def get_items(self) -> Iterator[list[dict]]:
374
- """
375
- Gets all items to process into trajectory documents.
376
- This does no datetime checking; relying on on whether
377
- task_ids are included in the forces Store
378
-
379
- Returns:
380
- generator or list relevant tasks and molecules to process into documents
381
- """
382
-
383
- self.logger.info("Trajectories builder started")
384
- self.logger.info("Setting indexes")
385
- self.ensure_indexes()
386
-
387
- # Save timestamp to mark buildtime
388
- self.timestamp = datetime.utcnow()
389
-
390
- # Get all processed molecules
391
- temp_query = dict(self.query)
392
- temp_query["deprecated"] = False
393
-
394
- self.logger.info("Finding documents to process")
395
- all_mols = list(
396
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
397
- )
398
-
399
- processed_docs = set([e for e in self.trajectories.distinct("molecule_id")])
400
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
401
- to_process_hashes = {
402
- d["species_hash"]
403
- for d in all_mols
404
- if d[self.molecules.key] in to_process_docs
405
- }
406
-
407
- self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
408
- self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
409
-
410
- # Set total for builder bars to have a total
411
- self.total = len(to_process_hashes)
412
-
413
- for shash in to_process_hashes:
414
- mol_query = dict(temp_query)
415
- mol_query["species_hash"] = shash
416
- molecules = list(self.molecules.query(criteria=mol_query))
417
-
418
- yield molecules
419
-
420
- def process_item(self, items: list[dict]) -> list[dict]:
421
- """
422
- Process the tasks into TrajectoryDocs
423
-
424
- Args:
425
- items list[dict] : a list of MoleculeDocs in dict form
426
-
427
- Returns:
428
- [dict] : a list of new trajectory docs
429
- """
430
-
431
- mols = [MoleculeDoc(**item) for item in items]
432
- shash = mols[0].species_hash
433
- mol_ids = [m.molecule_id for m in mols]
434
- self.logger.debug(f"Processing {shash} : {mol_ids}")
435
-
436
- trajectory_docs = list()
437
-
438
- for mol in mols:
439
- entries = mol.best_entries
440
-
441
- # Organize by solvent environment
442
- by_solvent = defaultdict(list)
443
- for entry in entries.values():
444
- by_solvent[entry["solvent"]].append(entry)
445
-
446
- for solvent, entries in by_solvent.items():
447
- # No "best" entry - shouldn't happen, but just in case
448
- if len(entries) == 0:
449
- continue
450
- else:
451
- # In case there are multiple optimized structures with the same solvent but different LOT
452
- best = sorted(
453
- entries,
454
- key=lambda x: (
455
- sum(evaluate_lot(x["level_of_theory"])),
456
- x["energy"],
457
- ),
458
- )[0]
459
- task = best["task_id"]
460
-
461
- tdoc = self.tasks.query_one(
462
- {
463
- "task_id": task,
464
- "species_hash": shash,
465
- "orig": {"$exists": True},
466
- }
467
- )
468
-
469
- if tdoc is None:
470
- try:
471
- tdoc = self.tasks.query_one(
472
- {
473
- "task_id": int(task),
474
- "species_hash": shash,
475
- "orig": {"$exists": True},
476
- }
477
- )
478
- except ValueError:
479
- tdoc = None
480
-
481
- if tdoc is None:
482
- continue
483
-
484
- task_doc = TaskDocument(**tdoc)
485
-
486
- if task_doc is None:
487
- continue
488
-
489
- trajectory_doc = TrajectoryDoc.from_task(
490
- task_doc, molecule_id=mol.molecule_id, deprecated=False
491
- )
492
- trajectory_docs.append(trajectory_doc)
493
-
494
- self.logger.debug(
495
- f"Produced {len(trajectory_docs)} trajectory docs for {shash}"
496
- )
497
-
498
- return jsanitize([doc.model_dump() for doc in trajectory_docs], allow_bson=True)
499
-
500
- def update_targets(self, items: list[list[dict]]):
501
- """
502
- Inserts the new force docs into the trajectories collection
503
-
504
- Args:
505
- items [[dict]]: A list of documents to update
506
- """
507
-
508
- docs = list(chain.from_iterable(items)) # type: ignore
509
-
510
- # Add timestamp
511
- for item in docs:
512
- item.update(
513
- {
514
- "_bt": self.timestamp,
515
- }
516
- )
517
-
518
- molecule_ids = list({item["molecule_id"] for item in docs})
519
-
520
- if len(items) > 0:
521
- self.logger.info(f"Updating {len(docs)} trajectory documents")
522
- self.trajectories.remove_docs(
523
- {self.trajectories.key: {"$in": molecule_ids}}
524
- )
525
- self.trajectories.update(
526
- docs=docs,
527
- key=["molecule_id", "solvent"],
528
- )
529
- else:
530
- self.logger.info("No items to update")