emmet-builders 0.84.2rc7__py3-none-any.whl → 0.84.2rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of emmet-builders might be problematic. Click here for more details.

@@ -138,7 +138,6 @@ class MoleculesAssociationBuilder(Builder):
138
138
  self.tasks.ensure_index("formula_alphabetical")
139
139
  self.tasks.ensure_index("smiles")
140
140
  self.tasks.ensure_index("species_hash")
141
- self.tasks.ensure_index("coord_hash")
142
141
 
143
142
  # Search index for molecules
144
143
  self.assoc.ensure_index("molecule_id")
@@ -166,9 +165,7 @@ class MoleculesAssociationBuilder(Builder):
166
165
  N = ceil(len(to_process_hashes) / number_splits)
167
166
 
168
167
  for hash_chunk in grouper(to_process_hashes, N):
169
- query = dict(temp_query)
170
- query["species_hash"] = {"$in": list(hash_chunk)}
171
- yield {"query": query}
168
+ yield {"query": {"species_hash": {"$in": list(hash_chunk)}}}
172
169
 
173
170
  def get_items(self) -> Iterator[List[TaskDocument]]:
174
171
  """
@@ -393,7 +390,6 @@ class MoleculesBuilder(Builder):
393
390
  self.assoc.ensure_index("last_updated")
394
391
  self.assoc.ensure_index("task_ids")
395
392
  self.assoc.ensure_index("formula_alphabetical")
396
- self.assoc.ensure_index("species_hash")
397
393
 
398
394
  # Search index for molecules
399
395
  self.molecules.ensure_index("molecule_id")
@@ -437,18 +433,16 @@ class MoleculesBuilder(Builder):
437
433
  xyz_species_id_map[d[self.assoc.key]] = this_id
438
434
  to_process_docs = assoc_ids - processed_docs
439
435
 
440
- to_process_hashes = {
441
- d["species_hash"]
436
+ to_process_forms = {
437
+ d["formula_alphabetical"]
442
438
  for d in all_assoc
443
439
  if xyz_species_id_map[d[self.assoc.key]] in to_process_docs
444
440
  }
445
441
 
446
- N = ceil(len(to_process_hashes) / number_splits)
442
+ N = ceil(len(to_process_forms) / number_splits)
447
443
 
448
- for hash_chunk in grouper(to_process_hashes, N):
449
- query = dict(temp_query)
450
- query["species_hash"] = {"$in": list(hash_chunk)}
451
- yield {"query": query}
444
+ for formula_chunk in grouper(to_process_forms, N):
445
+ yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
452
446
 
453
447
  def get_items(self) -> Iterator[List[Dict]]:
454
448
  """
@@ -501,21 +495,21 @@ class MoleculesBuilder(Builder):
501
495
  xyz_species_id_map[d[self.assoc.key]] = this_id
502
496
  to_process_docs = assoc_ids - processed_docs
503
497
 
504
- to_process_hashes = {
505
- d["species_hash"]
498
+ to_process_forms = {
499
+ d["formula_alphabetical"]
506
500
  for d in all_assoc
507
501
  if xyz_species_id_map[d[self.assoc.key]] in to_process_docs
508
502
  }
509
503
 
510
504
  self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
511
- self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
505
+ self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
512
506
 
513
507
  # Set total for builder bars to have a total
514
- self.total = len(to_process_hashes)
508
+ self.total = len(to_process_forms)
515
509
 
516
- for shash in to_process_hashes:
510
+ for formula in to_process_forms:
517
511
  assoc_query = dict(temp_query)
518
- assoc_query["species_hash"] = shash
512
+ assoc_query["formula_alphabetical"] = formula
519
513
  assoc = list(self.assoc.query(criteria=assoc_query))
520
514
 
521
515
  yield assoc
@@ -532,9 +526,9 @@ class MoleculesBuilder(Builder):
532
526
  """
533
527
 
534
528
  assoc = [MoleculeDoc(**item) for item in items]
535
- shash = assoc[0].species_hash
529
+ formula = assoc[0].formula_alphabetical
536
530
  mol_ids = [a.molecule_id for a in assoc]
537
- self.logger.debug(f"Processing {shash} : {mol_ids}")
531
+ self.logger.debug(f"Processing {formula} : {mol_ids}")
538
532
 
539
533
  complete_mol_docs = list()
540
534
 
@@ -652,7 +646,7 @@ class MoleculesBuilder(Builder):
652
646
 
653
647
  complete_mol_docs.append(base_doc)
654
648
 
655
- self.logger.debug(f"Produced {len(complete_mol_docs)} molecules for {shash}")
649
+ self.logger.debug(f"Produced {len(complete_mol_docs)} molecules for {formula}")
656
650
 
657
651
  return jsanitize(
658
652
  [mol.model_dump() for mol in complete_mol_docs], allow_bson=True
Binary file
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: emmet-builders
3
- Version: 0.84.2rc7
3
+ Version: 0.84.2rc9
4
4
  Summary: Builders for the Emmet Library
5
5
  Home-page: https://github.com/materialsproject/emmet
6
6
  Author: The Materials Project
@@ -32,23 +32,21 @@ emmet/builders/matscholar/missing_compositions.py,sha256=RGQOEhfmJ6YMbjD4osLWqs7
32
32
  emmet/builders/mobility/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
33
  emmet/builders/mobility/migration_graph.py,sha256=WEXtPSn0UE5Q8mnvJ-T19FB3_LrZ3ojvNyRBs1PXWRg,3923
34
34
  emmet/builders/molecules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
- emmet/builders/molecules/atomic.py,sha256=DBG_FScwT7YU1GgI69yRrR3_wUoWBwiP1uBvbL7xP3Y,20839
36
- emmet/builders/molecules/bonds.py,sha256=aHp9U_LX3sZvdTIaZj2T8PG_wTFYqw6nOcYC-4MiI0E,12103
37
- emmet/builders/molecules/electric.py,sha256=ldoLSfIAjue6YQlyXkgJRXLcfh178goiarjI_f_EtH4,10065
38
- emmet/builders/molecules/metal_binding.py,sha256=kimYsmQWdmukRRX2_GgVywCNRhNHz_-fp2oMYJTVMrg,23308
39
- emmet/builders/molecules/orbitals.py,sha256=nIsmx0m6Zi402opHE6OoEljGgQ2714p1Gig8Py1IXrU,10060
40
- emmet/builders/molecules/redox.py,sha256=52er0zK_IVQ0UYUh7svml-zwlvOtbptz1D8ZYCglfI0,18448
41
- emmet/builders/molecules/summary.py,sha256=7KHsnc9PBstps6s-hK4mYrWOBHt26_PTPKsCSFw3018,13848
42
- emmet/builders/molecules/thermo.py,sha256=MutvuYJsU0Hj5Qaa_Z7qnikM6mkPSBPNE4RG8JK4qes,19874
43
- emmet/builders/molecules/trajectory.py,sha256=oKrmWtKJ6mC0d1uJRE7g72X97kkS7JQ7nMhupVOUEUU,18163
44
- emmet/builders/molecules/vibration.py,sha256=_FA-tRixghsJdlls6oO9U2abxCHWqWv5SucbxpP5mVQ,9520
35
+ emmet/builders/molecules/atomic.py,sha256=X590oMDIPaHJMk0Xuy_r4hATm1TEj0hKfJD6ofi1asg,20823
36
+ emmet/builders/molecules/bonds.py,sha256=5orXkCBgBBOqz1iGoacDUb5iGMsOPVmdmghtQLud9ZU,12086
37
+ emmet/builders/molecules/metal_binding.py,sha256=gyL5Nu1pmu0ZJq4HgTxwLh8_1696pJiMnrFggFESnjo,23317
38
+ emmet/builders/molecules/orbitals.py,sha256=W_7_3zz9bFfHQZgAMdp3PSSt4PDH4DVZVervHPrv1Pk,10041
39
+ emmet/builders/molecules/redox.py,sha256=HHmj-nFMTEV7qq3g3GM2lB5RdLUMBE-xOIZogIgmORc,18427
40
+ emmet/builders/molecules/summary.py,sha256=I9-4-oKoUSg5sxvr-CHYVIuCyD48mpV9rsMno4pbbOk,13198
41
+ emmet/builders/molecules/thermo.py,sha256=DizVM9rLXo7AhHW3cq0Bo6vO1OI6YtK4PeIVixmt47g,19855
42
+ emmet/builders/molecules/vibration.py,sha256=9LNeKh8BHck-ooW4XzAZAFeio2u6bDwdsUV1aA5XVb4,9501
45
43
  emmet/builders/qchem/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
- emmet/builders/qchem/molecules.py,sha256=qI8WFOJ69FMPu9hxGzoFR4V3y4qO7UhtzVZwG9AWPpw,26382
44
+ emmet/builders/qchem/molecules.py,sha256=CZyVQzjfb-_gAS997BFbd9xkKwvwPWrquNH0Aoy6oY8,26217
47
45
  emmet/builders/vasp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
46
  emmet/builders/vasp/materials.py,sha256=5bjP-W5-gmSjDzmcHdF7bviwgk4ywUceCL4FcF9Ya9c,12700
49
- emmet/builders/vasp/mp_potcar_stats.json.gz,sha256=RD6gbZEmmmKQYRKpFtEKHzncGO1WsLYMPjn3wvONrIc,291869
47
+ emmet/builders/vasp/mp_potcar_stats.json.gz,sha256=x3bn4gSMj1U_3bR2qKIaBtbJlYT-EJgoUIMFTA9bvaU,338957
50
48
  emmet/builders/vasp/task_validator.py,sha256=bmRTDiOWof4rpHVg3ksoxocN9xxieYu7IE-ylMjYOVs,2922
51
- emmet_builders-0.84.2rc7.dist-info/METADATA,sha256=QqGWpUDogkBmmZbyiHsduf3OWrUR3lfjmlBZigf8Kzk,2162
52
- emmet_builders-0.84.2rc7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
53
- emmet_builders-0.84.2rc7.dist-info/top_level.txt,sha256=6GcpbmWPeFhNCTfDFilb8GQ4T1UQu4z9c5jpobjwE-Q,6
54
- emmet_builders-0.84.2rc7.dist-info/RECORD,,
49
+ emmet_builders-0.84.2rc9.dist-info/METADATA,sha256=KApWsvZ71L5eRNIbqjPK3TE1eP5Sb5uYVU-3bQ5AJBM,2162
50
+ emmet_builders-0.84.2rc9.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
51
+ emmet_builders-0.84.2rc9.dist-info/top_level.txt,sha256=6GcpbmWPeFhNCTfDFilb8GQ4T1UQu4z9c5jpobjwE-Q,6
52
+ emmet_builders-0.84.2rc9.dist-info/RECORD,,
@@ -1,282 +0,0 @@
1
- from collections import defaultdict
2
- from datetime import datetime
3
- from itertools import chain
4
- from math import ceil
5
- from typing import Optional, Iterable, Iterator, List, Dict
6
-
7
- from maggma.builders import Builder
8
- from maggma.core import Store
9
- from maggma.utils import grouper
10
-
11
- from emmet.core.qchem.task import TaskDocument
12
- from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
13
- from emmet.core.molecules.electric import ElectricMultipoleDoc
14
- from emmet.core.utils import jsanitize
15
- from emmet.builders.settings import EmmetBuildSettings
16
-
17
-
18
- __author__ = "Evan Spotte-Smith"
19
-
20
- SETTINGS = EmmetBuildSettings()
21
-
22
-
23
- class ElectricMultipoleBuilder(Builder):
24
- """
25
- The ElectricMultipoleBuilder defines the electric multipole properties of a MoleculeDoc.
26
-
27
- This builder will attempt to build documents for each molecule, in each solvent.
28
- For each molecule-solvent combination, the highest-quality
29
- data available (based on level of theory and electronic energy) will be used.
30
-
31
- The process is as follows:
32
- 1. Gather MoleculeDocs by species hash
33
- 2. For each molecule, group all tasks by solvent.
34
- 3. For each solvent, grab the best TaskDoc (doc with electric dipole/multipole information
35
- that has the highest level of theory with the lowest electronic energy) for the molecule
36
- 4. Convert TaskDoc to ElectricMultipoleDoc
37
- """
38
-
39
- def __init__(
40
- self,
41
- tasks: Store,
42
- molecules: Store,
43
- multipoles: Store,
44
- query: Optional[Dict] = None,
45
- settings: Optional[EmmetBuildSettings] = None,
46
- **kwargs,
47
- ):
48
- self.tasks = tasks
49
- self.molecules = molecules
50
- self.multipoles = multipoles
51
- self.query = query if query else dict()
52
- self.settings = EmmetBuildSettings.autoload(settings)
53
- self.kwargs = kwargs
54
-
55
- super().__init__(sources=[tasks, molecules], targets=[multipoles], **kwargs)
56
- # Uncomment in case of issue with mrun not connecting automatically to collections
57
- # for i in [self.tasks, self.molecules, self.multipoles]:
58
- # try:
59
- # i.connect()
60
- # except Exception as e:
61
- # print("Could not connect,", e)
62
-
63
- def ensure_indexes(self):
64
- """
65
- Ensures indices on the collections needed for building
66
- """
67
-
68
- # Basic search index for tasks
69
- self.tasks.ensure_index("task_id")
70
- self.tasks.ensure_index("last_updated")
71
- self.tasks.ensure_index("state")
72
- self.tasks.ensure_index("formula_alphabetical")
73
- self.tasks.ensure_index("species_hash")
74
-
75
- # Search index for molecules
76
- self.molecules.ensure_index("molecule_id")
77
- self.molecules.ensure_index("last_updated")
78
- self.molecules.ensure_index("task_ids")
79
- self.molecules.ensure_index("formula_alphabetical")
80
- self.molecules.ensure_index("species_hash")
81
-
82
- # Search index for electric
83
- self.multipoles.ensure_index("method")
84
- self.multipoles.ensure_index("molecule_id")
85
- self.multipoles.ensure_index("task_id")
86
- self.multipoles.ensure_index("solvent")
87
- self.multipoles.ensure_index("lot_solvent")
88
- self.multipoles.ensure_index("property_id")
89
- self.multipoles.ensure_index("last_updated")
90
- self.multipoles.ensure_index("formula_alphabetical")
91
-
92
- def prechunk(self, number_splits: int) -> Iterable[Dict]: # pragma: no cover
93
- """Prechunk the builder for distributed computation"""
94
-
95
- temp_query = dict(self.query)
96
- temp_query["deprecated"] = False
97
-
98
- self.logger.info("Finding documents to process")
99
- all_mols = list(
100
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
101
- )
102
-
103
- processed_docs = set([e for e in self.multipoles.distinct("molecule_id")])
104
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
105
- to_process_hashes = {
106
- d["species_hash"]
107
- for d in all_mols
108
- if d[self.molecules.key] in to_process_docs
109
- }
110
-
111
- N = ceil(len(to_process_hashes) / number_splits)
112
-
113
- for hash_chunk in grouper(to_process_hashes, N):
114
- query = dict(temp_query)
115
- query["species_hash"] = {"$in": list(hash_chunk)}
116
- yield {"query": query}
117
-
118
- def get_items(self) -> Iterator[List[Dict]]:
119
- """
120
- Gets all items to process into multipole documents.
121
- This does no datetime checking; relying on whether
122
- task_ids are included in the multipoles Store
123
-
124
- Returns:
125
- generator or list of relevant tasks and molecules to process into documents
126
- """
127
-
128
- self.logger.info("Electric multipoles builder started")
129
- self.logger.info("Setting indexes")
130
- self.ensure_indexes()
131
-
132
- # Save timestamp to mark buildtime
133
- self.timestamp = datetime.utcnow()
134
-
135
- # Get all processed molecules
136
- temp_query = dict(self.query)
137
- temp_query["deprecated"] = False
138
-
139
- self.logger.info("Finding documents to process")
140
- all_mols = list(
141
- self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
142
- )
143
-
144
- processed_docs = set([e for e in self.multipoles.distinct("molecule_id")])
145
- to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
146
- to_process_hashes = {
147
- d["species_hash"]
148
- for d in all_mols
149
- if d[self.molecules.key] in to_process_docs
150
- }
151
-
152
- self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
153
- self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
154
-
155
- # Set total for builder bars to have a total
156
- self.total = len(to_process_hashes)
157
-
158
- for shash in to_process_hashes:
159
- mol_query = dict(temp_query)
160
- mol_query["species_hash"] = shash
161
- molecules = list(self.molecules.query(criteria=mol_query))
162
-
163
- yield molecules
164
-
165
- def process_item(self, items: List[Dict]) -> List[Dict]:
166
- """
167
- Process the tasks into ElectricMultipoleDocs
168
-
169
- Args:
170
- tasks List[Dict] : a list of MoleculeDocs in dict form
171
-
172
- Returns:
173
- [dict] : a list of new electric multipole docs
174
- """
175
-
176
- mols = [MoleculeDoc(**item) for item in items]
177
- shash = mols[0].species_hash
178
- mol_ids = [m.molecule_id for m in mols]
179
- self.logger.debug(f"Processing {shash} : {mol_ids}")
180
-
181
- multipole_docs = list()
182
-
183
- for mol in mols:
184
- # Relevant tasks are those with the correct charge and spin
185
- # for which there are AT LEAST electric dipoles present
186
- # (ideally, multipole information would also be present)
187
- multip_entries = [
188
- e
189
- for e in mol.entries
190
- if e["charge"] == mol.charge
191
- and e["spin_multiplicity"] == mol.spin_multiplicity
192
- and (e["output"].get("dipoles") is not None)
193
- ]
194
-
195
- # Organize by solvent environment
196
- by_solvent = defaultdict(list)
197
- for entry in multip_entries:
198
- by_solvent[entry["solvent"]].append(entry)
199
-
200
- for solvent, entries in by_solvent.items():
201
- # No documents with enthalpy and entropy
202
- if len(entries) == 0:
203
- continue
204
- else:
205
- best = sorted(
206
- entries,
207
- key=lambda x: (
208
- sum(evaluate_lot(x["level_of_theory"])),
209
- x["energy"],
210
- ),
211
- )[0]
212
- task = best["task_id"]
213
-
214
- tdoc = self.tasks.query_one(
215
- {
216
- "task_id": task,
217
- "species_hash": shash,
218
- "orig": {"$exists": True},
219
- }
220
- )
221
-
222
- if tdoc is None:
223
- try:
224
- tdoc = self.tasks.query_one(
225
- {
226
- "task_id": int(task),
227
- "species_hash": shash,
228
- "orig": {"$exists": True},
229
- }
230
- )
231
- except ValueError:
232
- tdoc = None
233
-
234
- if tdoc is None:
235
- continue
236
-
237
- task_doc = TaskDocument(**tdoc)
238
-
239
- if task_doc is None:
240
- continue
241
-
242
- multipole_doc = ElectricMultipoleDoc.from_task(
243
- task_doc, molecule_id=mol.molecule_id, deprecated=False
244
- )
245
- multipole_docs.append(multipole_doc)
246
-
247
- self.logger.debug(
248
- f"Produced {len(multipole_docs)} electric multipole docs for {shash}"
249
- )
250
-
251
- return jsanitize([doc.model_dump() for doc in multipole_docs], allow_bson=True)
252
-
253
- def update_targets(self, items: List[List[Dict]]):
254
- """
255
- Inserts the new documents into the multipoles collection
256
-
257
- Args:
258
- items [[dict]]: A list of documents to update
259
- """
260
-
261
- docs = list(chain.from_iterable(items)) # type: ignore
262
-
263
- # Add timestamp
264
- for item in docs:
265
- item.update(
266
- {
267
- "_bt": self.timestamp,
268
- }
269
- )
270
-
271
- molecule_ids = list({item["molecule_id"] for item in docs})
272
-
273
- if len(items) > 0:
274
- self.logger.info(f"Updating {len(docs)} electric multipole documents")
275
- self.multipoles.remove_docs({self.multipoles.key: {"$in": molecule_ids}})
276
- # Neither molecule_id nor method need to be unique, but the combination must be
277
- self.multipoles.update(
278
- docs=docs,
279
- key=["molecule_id", "solvent"],
280
- )
281
- else:
282
- self.logger.info("No items to update")