emmet-builders 0.84.3rc2__py3-none-any.whl → 0.84.3rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of emmet-builders might be problematic. Click here for more details.

@@ -41,7 +41,7 @@ class PartialChargesBuilder(Builder):
41
41
  energy) will be used.
42
42
 
43
43
  The process is as follows:
44
- 1. Gather MoleculeDocs by formula
44
+ 1. Gather MoleculeDocs by species hash
45
45
  2. For each molecule, group all tasks by solvent.
46
46
  3. For each solvent, sort tasks by level of theory and electronic energy
47
47
  4. For each method:
@@ -86,12 +86,14 @@ class PartialChargesBuilder(Builder):
86
86
  self.tasks.ensure_index("last_updated")
87
87
  self.tasks.ensure_index("state")
88
88
  self.tasks.ensure_index("formula_alphabetical")
89
+ self.tasks.ensure_index("species_hash")
89
90
 
90
91
  # Search index for molecules
91
92
  self.molecules.ensure_index("molecule_id")
92
93
  self.molecules.ensure_index("last_updated")
93
94
  self.molecules.ensure_index("task_ids")
94
95
  self.molecules.ensure_index("formula_alphabetical")
96
+ self.molecules.ensure_index("species_hash")
95
97
 
96
98
  # Search index for charges
97
99
  self.charges.ensure_index("molecule_id")
@@ -111,23 +113,23 @@ class PartialChargesBuilder(Builder):
111
113
 
112
114
  self.logger.info("Finding documents to process")
113
115
  all_mols = list(
114
- self.molecules.query(
115
- temp_query, [self.molecules.key, "formula_alphabetical"]
116
- )
116
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
117
117
  )
118
118
 
119
119
  processed_docs = set([e for e in self.charges.distinct("molecule_id")])
120
120
  to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
121
- to_process_forms = {
122
- d["formula_alphabetical"]
121
+ to_process_hashes = {
122
+ d["species_hash"]
123
123
  for d in all_mols
124
124
  if d[self.molecules.key] in to_process_docs
125
125
  }
126
126
 
127
- N = ceil(len(to_process_forms) / number_splits)
127
+ N = ceil(len(to_process_hashes) / number_splits)
128
128
 
129
- for formula_chunk in grouper(to_process_forms, N):
130
- yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
129
+ for hash_chunk in grouper(to_process_hashes, N):
130
+ query = dict(temp_query)
131
+ query["species_hash"] = {"$in": list(hash_chunk)}
132
+ yield {"query": query}
131
133
 
132
134
  def get_items(self) -> Iterator[List[Dict]]:
133
135
  """
@@ -152,28 +154,26 @@ class PartialChargesBuilder(Builder):
152
154
 
153
155
  self.logger.info("Finding documents to process")
154
156
  all_mols = list(
155
- self.molecules.query(
156
- temp_query, [self.molecules.key, "formula_alphabetical"]
157
- )
157
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
158
158
  )
159
159
 
160
160
  processed_docs = set([e for e in self.charges.distinct("molecule_id")])
161
161
  to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
162
- to_process_forms = {
163
- d["formula_alphabetical"]
162
+ to_process_hashes = {
163
+ d["species_hash"]
164
164
  for d in all_mols
165
165
  if d[self.molecules.key] in to_process_docs
166
166
  }
167
167
 
168
168
  self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
169
- self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
169
+ self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
170
170
 
171
171
  # Set total for builder bars to have a total
172
- self.total = len(to_process_forms)
172
+ self.total = len(to_process_hashes)
173
173
 
174
- for formula in to_process_forms:
174
+ for shash in to_process_hashes:
175
175
  mol_query = dict(temp_query)
176
- mol_query["formula_alphabetical"] = formula
176
+ mol_query["species_hash"] = shash
177
177
  molecules = list(self.molecules.query(criteria=mol_query))
178
178
 
179
179
  yield molecules
@@ -190,9 +190,9 @@ class PartialChargesBuilder(Builder):
190
190
  """
191
191
 
192
192
  mols = [MoleculeDoc(**item) for item in items]
193
- formula = mols[0].formula_alphabetical
193
+ shash = mols[0].species_hash
194
194
  mol_ids = [m.molecule_id for m in mols]
195
- self.logger.debug(f"Processing {formula} : {mol_ids}")
195
+ self.logger.debug(f"Processing {shash} : {mol_ids}")
196
196
 
197
197
  charges_docs = list()
198
198
 
@@ -237,7 +237,7 @@ class PartialChargesBuilder(Builder):
237
237
  tdoc = self.tasks.query_one(
238
238
  {
239
239
  "task_id": task,
240
- "formula_alphabetical": formula,
240
+ "species_hash": shash,
241
241
  "orig": {"$exists": True},
242
242
  }
243
243
  )
@@ -247,7 +247,7 @@ class PartialChargesBuilder(Builder):
247
247
  tdoc = self.tasks.query_one(
248
248
  {
249
249
  "task_id": int(task),
250
- "formula_alphabetical": formula,
250
+ "species_hash": shash,
251
251
  "orig": {"$exists": True},
252
252
  }
253
253
  )
@@ -271,7 +271,7 @@ class PartialChargesBuilder(Builder):
271
271
 
272
272
  charges_docs.append(doc)
273
273
 
274
- self.logger.debug(f"Produced {len(charges_docs)} charges docs for {formula}")
274
+ self.logger.debug(f"Produced {len(charges_docs)} charges docs for {shash}")
275
275
 
276
276
  return jsanitize([doc.model_dump() for doc in charges_docs], allow_bson=True)
277
277
 
@@ -320,7 +320,7 @@ class PartialSpinsBuilder(Builder):
320
320
  data available (based on level of theory and electronic energy) will be used.
321
321
 
322
322
  The process is as follows:
323
- 1. Gather MoleculeDocs by formula
323
+ 1. Gather MoleculeDocs by species_hash
324
324
  2. For each molecule, group all tasks by solvent.
325
325
  3. For each solvent, sort tasks by level of theory and electronic energy
326
326
  4. For each method:
@@ -365,12 +365,14 @@ class PartialSpinsBuilder(Builder):
365
365
  self.tasks.ensure_index("last_updated")
366
366
  self.tasks.ensure_index("state")
367
367
  self.tasks.ensure_index("formula_alphabetical")
368
+ self.tasks.ensure_index("species_hash")
368
369
 
369
370
  # Search index for molecules
370
371
  self.molecules.ensure_index("molecule_id")
371
372
  self.molecules.ensure_index("last_updated")
372
373
  self.molecules.ensure_index("task_ids")
373
374
  self.molecules.ensure_index("formula_alphabetical")
375
+ self.molecules.ensure_index("species_hash")
374
376
 
375
377
  # Search index for spins
376
378
  self.spins.ensure_index("molecule_id")
@@ -390,23 +392,23 @@ class PartialSpinsBuilder(Builder):
390
392
 
391
393
  self.logger.info("Finding documents to process")
392
394
  all_mols = list(
393
- self.molecules.query(
394
- temp_query, [self.molecules.key, "formula_alphabetical"]
395
- )
395
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
396
396
  )
397
397
 
398
398
  processed_docs = set([e for e in self.spins.distinct("molecule_id")])
399
399
  to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
400
- to_process_forms = {
401
- d["formula_alphabetical"]
400
+ to_process_hashes = {
401
+ d["species_hash"]
402
402
  for d in all_mols
403
403
  if d[self.molecules.key] in to_process_docs
404
404
  }
405
405
 
406
- N = ceil(len(to_process_forms) / number_splits)
406
+ N = ceil(len(to_process_hashes) / number_splits)
407
407
 
408
- for formula_chunk in grouper(to_process_forms, N):
409
- yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
408
+ for hash_chunk in grouper(to_process_hashes, N):
409
+ query = dict(temp_query)
410
+ query["species_hash"] = {"$in": list(hash_chunk)}
411
+ yield {"query": query}
410
412
 
411
413
  def get_items(self) -> Iterator[List[Dict]]:
412
414
  """
@@ -431,28 +433,26 @@ class PartialSpinsBuilder(Builder):
431
433
 
432
434
  self.logger.info("Finding documents to process")
433
435
  all_mols = list(
434
- self.molecules.query(
435
- temp_query, [self.molecules.key, "formula_alphabetical"]
436
- )
436
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
437
437
  )
438
438
 
439
439
  processed_docs = set([e for e in self.spins.distinct("molecule_id")])
440
440
  to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
441
- to_process_forms = {
442
- d["formula_alphabetical"]
441
+ to_process_hashes = {
442
+ d["species_hash"]
443
443
  for d in all_mols
444
444
  if d[self.molecules.key] in to_process_docs
445
445
  }
446
446
 
447
447
  self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
448
- self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
448
+ self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
449
449
 
450
450
  # Set total for builder bars to have a total
451
- self.total = len(to_process_forms)
451
+ self.total = len(to_process_hashes)
452
452
 
453
- for formula in to_process_forms:
453
+ for shash in to_process_hashes:
454
454
  mol_query = dict(temp_query)
455
- mol_query["formula_alphabetical"] = formula
455
+ mol_query["species_hash"] = shash
456
456
  molecules = list(self.molecules.query(criteria=mol_query))
457
457
 
458
458
  yield molecules
@@ -469,9 +469,9 @@ class PartialSpinsBuilder(Builder):
469
469
  """
470
470
 
471
471
  mols = [MoleculeDoc(**item) for item in items]
472
- formula = mols[0].formula_alphabetical
472
+ shash = mols[0].species_hash
473
473
  mol_ids = [m.molecule_id for m in mols]
474
- self.logger.debug(f"Processing {formula} : {mol_ids}")
474
+ self.logger.debug(f"Processing {shash} : {mol_ids}")
475
475
 
476
476
  spins_docs = list()
477
477
 
@@ -520,7 +520,7 @@ class PartialSpinsBuilder(Builder):
520
520
  tdoc = self.tasks.query_one(
521
521
  {
522
522
  "task_id": task,
523
- "formula_alphabetical": formula,
523
+ "species_hash": shash,
524
524
  "orig": {"$exists": True},
525
525
  }
526
526
  )
@@ -530,7 +530,7 @@ class PartialSpinsBuilder(Builder):
530
530
  tdoc = self.tasks.query_one(
531
531
  {
532
532
  "task_id": int(task),
533
- "formula_alphabetical": formula,
533
+ "species_hash": shash,
534
534
  "orig": {"$exists": True},
535
535
  }
536
536
  )
@@ -551,9 +551,7 @@ class PartialSpinsBuilder(Builder):
551
551
 
552
552
  spins_docs.append(doc)
553
553
 
554
- self.logger.debug(
555
- f"Produced {len(spins_docs)} partial spins docs for {formula}"
556
- )
554
+ self.logger.debug(f"Produced {len(spins_docs)} partial spins docs for {shash}")
557
555
 
558
556
  return jsanitize([doc.model_dump() for doc in spins_docs], allow_bson=True)
559
557
 
@@ -40,7 +40,7 @@ class BondingBuilder(Builder):
40
40
  data available (based on level of theory and electronic energy) will be used.
41
41
 
42
42
  The process is as follows:
43
- 1. Gather MoleculeDocs by formula
43
+ 1. Gather MoleculeDocs by species hash
44
44
  2. For each molecule, group all tasks by solvent.
45
45
  3. For each solvent, sort tasks by level of theory and electronic energy
46
46
  4. For each method:
@@ -85,12 +85,14 @@ class BondingBuilder(Builder):
85
85
  self.tasks.ensure_index("last_updated")
86
86
  self.tasks.ensure_index("state")
87
87
  self.tasks.ensure_index("formula_alphabetical")
88
+ self.tasks.ensure_index("species_hash")
88
89
 
89
90
  # Search index for molecules
90
91
  self.molecules.ensure_index("molecule_id")
91
92
  self.molecules.ensure_index("last_updated")
92
93
  self.molecules.ensure_index("task_ids")
93
94
  self.molecules.ensure_index("formula_alphabetical")
95
+ self.molecules.ensure_index("species_hash")
94
96
 
95
97
  # Search index for bonds
96
98
  self.bonds.ensure_index("molecule_id")
@@ -110,23 +112,23 @@ class BondingBuilder(Builder):
110
112
 
111
113
  self.logger.info("Finding documents to process")
112
114
  all_mols = list(
113
- self.molecules.query(
114
- temp_query, [self.molecules.key, "formula_alphabetical"]
115
- )
115
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
116
116
  )
117
117
 
118
118
  processed_docs = set([e for e in self.bonds.distinct("molecule_id")])
119
119
  to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
120
- to_process_forms = {
121
- d["formula_alphabetical"]
120
+ to_process_hashes = {
121
+ d["species_hash"]
122
122
  for d in all_mols
123
123
  if d[self.molecules.key] in to_process_docs
124
124
  }
125
125
 
126
- N = ceil(len(to_process_forms) / number_splits)
126
+ N = ceil(len(to_process_hashes) / number_splits)
127
127
 
128
- for formula_chunk in grouper(to_process_forms, N):
129
- yield {"query": {"formula_alphabetical": {"$in": list(formula_chunk)}}}
128
+ for hash_chunk in grouper(to_process_hashes, N):
129
+ query = dict(temp_query)
130
+ query["species_hash"] = {"$in": list(hash_chunk)}
131
+ yield {"query": query}
130
132
 
131
133
  def get_items(self) -> Iterator[List[Dict]]:
132
134
  """
@@ -151,28 +153,26 @@ class BondingBuilder(Builder):
151
153
 
152
154
  self.logger.info("Finding documents to process")
153
155
  all_mols = list(
154
- self.molecules.query(
155
- temp_query, [self.molecules.key, "formula_alphabetical"]
156
- )
156
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
157
157
  )
158
158
 
159
159
  processed_docs = set([e for e in self.bonds.distinct("molecule_id")])
160
160
  to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
161
- to_process_forms = {
162
- d["formula_alphabetical"]
161
+ to_process_hashes = {
162
+ d["species_hash"]
163
163
  for d in all_mols
164
164
  if d[self.molecules.key] in to_process_docs
165
165
  }
166
166
 
167
167
  self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
168
- self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
168
+ self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
169
169
 
170
170
  # Set total for builder bars to have a total
171
- self.total = len(to_process_forms)
171
+ self.total = len(to_process_hashes)
172
172
 
173
- for formula in to_process_forms:
173
+ for shash in to_process_hashes:
174
174
  mol_query = dict(temp_query)
175
- mol_query["formula_alphabetical"] = formula
175
+ mol_query["species_hash"] = shash
176
176
  molecules = list(self.molecules.query(criteria=mol_query))
177
177
 
178
178
  yield molecules
@@ -189,9 +189,9 @@ class BondingBuilder(Builder):
189
189
  """
190
190
 
191
191
  mols = [MoleculeDoc(**item) for item in items]
192
- formula = mols[0].formula_alphabetical
192
+ shash = mols[0].species_hash
193
193
  mol_ids = [m.molecule_id for m in mols]
194
- self.logger.debug(f"Processing {formula} : {mol_ids}")
194
+ self.logger.debug(f"Processing {shash} : {mol_ids}")
195
195
 
196
196
  bonding_docs = list()
197
197
 
@@ -255,7 +255,7 @@ class BondingBuilder(Builder):
255
255
  tdoc = self.tasks.query_one(
256
256
  {
257
257
  "task_id": task,
258
- "formula_alphabetical": formula,
258
+ "species_hash": shash,
259
259
  "orig": {"$exists": True},
260
260
  }
261
261
  )
@@ -265,7 +265,7 @@ class BondingBuilder(Builder):
265
265
  tdoc = self.tasks.query_one(
266
266
  {
267
267
  "task_id": int(task),
268
- "formula_alphabetical": formula,
268
+ "species_hash": shash,
269
269
  "orig": {"$exists": True},
270
270
  }
271
271
  )
@@ -288,13 +288,13 @@ class BondingBuilder(Builder):
288
288
  )
289
289
  bonding_docs.append(doc)
290
290
 
291
- self.logger.debug(f"Produced {len(bonding_docs)} bonding docs for {formula}")
291
+ self.logger.debug(f"Produced {len(bonding_docs)} bonding docs for {shash}")
292
292
 
293
293
  return jsanitize([doc.model_dump() for doc in bonding_docs], allow_bson=True)
294
294
 
295
295
  def update_targets(self, items: List[List[Dict]]):
296
296
  """
297
- Inserts the new documents into the charges collection
297
+ Inserts the new documents into the bonds collection
298
298
 
299
299
  Args:
300
300
  items [[dict]]: A list of documents to update
@@ -0,0 +1,282 @@
1
+ from collections import defaultdict
2
+ from datetime import datetime
3
+ from itertools import chain
4
+ from math import ceil
5
+ from typing import Optional, Iterable, Iterator, List, Dict
6
+
7
+ from maggma.builders import Builder
8
+ from maggma.core import Store
9
+ from maggma.utils import grouper
10
+
11
+ from emmet.core.qchem.task import TaskDocument
12
+ from emmet.core.qchem.molecule import MoleculeDoc, evaluate_lot
13
+ from emmet.core.molecules.electric import ElectricMultipoleDoc
14
+ from emmet.core.utils import jsanitize
15
+ from emmet.builders.settings import EmmetBuildSettings
16
+
17
+
18
+ __author__ = "Evan Spotte-Smith"
19
+
20
+ SETTINGS = EmmetBuildSettings()
21
+
22
+
23
+ class ElectricMultipoleBuilder(Builder):
24
+ """
25
+ The ElectricMultipoleBuilder defines the electric multipole properties of a MoleculeDoc.
26
+
27
+ This builder will attempt to build documents for each molecule, in each solvent.
28
+ For each molecule-solvent combination, the highest-quality
29
+ data available (based on level of theory and electronic energy) will be used.
30
+
31
+ The process is as follows:
32
+ 1. Gather MoleculeDocs by species hash
33
+ 2. For each molecule, group all tasks by solvent.
34
+ 3. For each solvent, grab the best TaskDoc (doc with electric dipole/multipole information
35
+ that has the highest level of theory with the lowest electronic energy) for the molecule
36
+ 4. Convert TaskDoc to ElectricMultipoleDoc
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ tasks: Store,
42
+ molecules: Store,
43
+ multipoles: Store,
44
+ query: Optional[Dict] = None,
45
+ settings: Optional[EmmetBuildSettings] = None,
46
+ **kwargs,
47
+ ):
48
+ self.tasks = tasks
49
+ self.molecules = molecules
50
+ self.multipoles = multipoles
51
+ self.query = query if query else dict()
52
+ self.settings = EmmetBuildSettings.autoload(settings)
53
+ self.kwargs = kwargs
54
+
55
+ super().__init__(sources=[tasks, molecules], targets=[multipoles], **kwargs)
56
+ # Uncomment in case of issue with mrun not connecting automatically to collections
57
+ # for i in [self.tasks, self.molecules, self.multipoles]:
58
+ # try:
59
+ # i.connect()
60
+ # except Exception as e:
61
+ # print("Could not connect,", e)
62
+
63
+ def ensure_indexes(self):
64
+ """
65
+ Ensures indices on the collections needed for building
66
+ """
67
+
68
+ # Basic search index for tasks
69
+ self.tasks.ensure_index("task_id")
70
+ self.tasks.ensure_index("last_updated")
71
+ self.tasks.ensure_index("state")
72
+ self.tasks.ensure_index("formula_alphabetical")
73
+ self.tasks.ensure_index("species_hash")
74
+
75
+ # Search index for molecules
76
+ self.molecules.ensure_index("molecule_id")
77
+ self.molecules.ensure_index("last_updated")
78
+ self.molecules.ensure_index("task_ids")
79
+ self.molecules.ensure_index("formula_alphabetical")
80
+ self.molecules.ensure_index("species_hash")
81
+
82
+ # Search index for electric
83
+ self.multipoles.ensure_index("method")
84
+ self.multipoles.ensure_index("molecule_id")
85
+ self.multipoles.ensure_index("task_id")
86
+ self.multipoles.ensure_index("solvent")
87
+ self.multipoles.ensure_index("lot_solvent")
88
+ self.multipoles.ensure_index("property_id")
89
+ self.multipoles.ensure_index("last_updated")
90
+ self.multipoles.ensure_index("formula_alphabetical")
91
+
92
+ def prechunk(self, number_splits: int) -> Iterable[Dict]: # pragma: no cover
93
+ """Prechunk the builder for distributed computation"""
94
+
95
+ temp_query = dict(self.query)
96
+ temp_query["deprecated"] = False
97
+
98
+ self.logger.info("Finding documents to process")
99
+ all_mols = list(
100
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
101
+ )
102
+
103
+ processed_docs = set([e for e in self.multipoles.distinct("molecule_id")])
104
+ to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
105
+ to_process_hashes = {
106
+ d["species_hash"]
107
+ for d in all_mols
108
+ if d[self.molecules.key] in to_process_docs
109
+ }
110
+
111
+ N = ceil(len(to_process_hashes) / number_splits)
112
+
113
+ for hash_chunk in grouper(to_process_hashes, N):
114
+ query = dict(temp_query)
115
+ query["species_hash"] = {"$in": list(hash_chunk)}
116
+ yield {"query": query}
117
+
118
+ def get_items(self) -> Iterator[List[Dict]]:
119
+ """
120
+ Gets all items to process into multipole documents.
121
+ This does no datetime checking, relying instead on whether
122
+ molecule_ids are included in the multipoles Store
123
+
124
+ Returns:
125
+ generator or list of relevant tasks and molecules to process into documents
126
+ """
127
+
128
+ self.logger.info("Electric multipoles builder started")
129
+ self.logger.info("Setting indexes")
130
+ self.ensure_indexes()
131
+
132
+ # Save timestamp to mark buildtime
133
+ self.timestamp = datetime.utcnow()
134
+
135
+ # Get all processed molecules
136
+ temp_query = dict(self.query)
137
+ temp_query["deprecated"] = False
138
+
139
+ self.logger.info("Finding documents to process")
140
+ all_mols = list(
141
+ self.molecules.query(temp_query, [self.molecules.key, "species_hash"])
142
+ )
143
+
144
+ processed_docs = set([e for e in self.multipoles.distinct("molecule_id")])
145
+ to_process_docs = {d[self.molecules.key] for d in all_mols} - processed_docs
146
+ to_process_hashes = {
147
+ d["species_hash"]
148
+ for d in all_mols
149
+ if d[self.molecules.key] in to_process_docs
150
+ }
151
+
152
+ self.logger.info(f"Found {len(to_process_docs)} unprocessed documents")
153
+ self.logger.info(f"Found {len(to_process_hashes)} unprocessed hashes")
154
+
155
+ # Set total for builder bars to have a total
156
+ self.total = len(to_process_hashes)
157
+
158
+ for shash in to_process_hashes:
159
+ mol_query = dict(temp_query)
160
+ mol_query["species_hash"] = shash
161
+ molecules = list(self.molecules.query(criteria=mol_query))
162
+
163
+ yield molecules
164
+
165
+ def process_item(self, items: List[Dict]) -> List[Dict]:
166
+ """
167
+ Process the tasks into ElectricMultipoleDocs
168
+
169
+ Args:
170
+ items List[Dict] : a list of MoleculeDocs in dict form
171
+
172
+ Returns:
173
+ [dict] : a list of new electric multipole docs
174
+ """
175
+
176
+ mols = [MoleculeDoc(**item) for item in items]
177
+ shash = mols[0].species_hash
178
+ mol_ids = [m.molecule_id for m in mols]
179
+ self.logger.debug(f"Processing {shash} : {mol_ids}")
180
+
181
+ multipole_docs = list()
182
+
183
+ for mol in mols:
184
+ # Relevant tasks are those with the correct charge and spin
185
+ # for which there are AT LEAST electric dipoles present
186
+ # (ideally, multipole information would also be present)
187
+ multip_entries = [
188
+ e
189
+ for e in mol.entries
190
+ if e["charge"] == mol.charge
191
+ and e["spin_multiplicity"] == mol.spin_multiplicity
192
+ and (e["output"].get("dipoles") is not None)
193
+ ]
194
+
195
+ # Organize by solvent environment
196
+ by_solvent = defaultdict(list)
197
+ for entry in multip_entries:
198
+ by_solvent[entry["solvent"]].append(entry)
199
+
200
+ for solvent, entries in by_solvent.items():
201
+ # No entries with electric dipole information for this solvent
202
+ if len(entries) == 0:
203
+ continue
204
+ else:
205
+ best = sorted(
206
+ entries,
207
+ key=lambda x: (
208
+ sum(evaluate_lot(x["level_of_theory"])),
209
+ x["energy"],
210
+ ),
211
+ )[0]
212
+ task = best["task_id"]
213
+
214
+ tdoc = self.tasks.query_one(
215
+ {
216
+ "task_id": task,
217
+ "species_hash": shash,
218
+ "orig": {"$exists": True},
219
+ }
220
+ )
221
+
222
+ if tdoc is None:
223
+ try:
224
+ tdoc = self.tasks.query_one(
225
+ {
226
+ "task_id": int(task),
227
+ "species_hash": shash,
228
+ "orig": {"$exists": True},
229
+ }
230
+ )
231
+ except ValueError:
232
+ tdoc = None
233
+
234
+ if tdoc is None:
235
+ continue
236
+
237
+ task_doc = TaskDocument(**tdoc)
238
+
239
+ if task_doc is None:
240
+ continue
241
+
242
+ multipole_doc = ElectricMultipoleDoc.from_task(
243
+ task_doc, molecule_id=mol.molecule_id, deprecated=False
244
+ )
245
+ multipole_docs.append(multipole_doc)
246
+
247
+ self.logger.debug(
248
+ f"Produced {len(multipole_docs)} electric multipole docs for {shash}"
249
+ )
250
+
251
+ return jsanitize([doc.model_dump() for doc in multipole_docs], allow_bson=True)
252
+
253
+ def update_targets(self, items: List[List[Dict]]):
254
+ """
255
+ Inserts the new documents into the multipoles collection
256
+
257
+ Args:
258
+ items [[dict]]: A list of documents to update
259
+ """
260
+
261
+ docs = list(chain.from_iterable(items)) # type: ignore
262
+
263
+ # Add timestamp
264
+ for item in docs:
265
+ item.update(
266
+ {
267
+ "_bt": self.timestamp,
268
+ }
269
+ )
270
+
271
+ molecule_ids = list({item["molecule_id"] for item in docs})
272
+
273
+ if len(items) > 0:
274
+ self.logger.info(f"Updating {len(docs)} electric multipole documents")
275
+ self.multipoles.remove_docs({self.multipoles.key: {"$in": molecule_ids}})
276
+ # Neither molecule_id nor solvent needs to be unique, but the combination must be
277
+ self.multipoles.update(
278
+ docs=docs,
279
+ key=["molecule_id", "solvent"],
280
+ )
281
+ else:
282
+ self.logger.info("No items to update")