digichem-core 6.0.3__py3-none-any.whl → 6.10.1__py3-none-any.whl

digichem/parse/util.py CHANGED
@@ -12,7 +12,8 @@ import warnings
 # IMPORTANT: Do not replace multiprocessing pools with pathos, the latter is too buggy for production ATM (26-05-2023).
 import multiprocessing
 
-from configurables.misc import is_iter
+from configurables.misc import is_iter, is_int
+from configurables.defres import Default, defres
 
 # Digichem imports.
 from digichem.exception.base import Digichem_exception
@@ -36,6 +37,8 @@ custom_parsing_formats = {
     "json": Json_multi_parser,
 }
 
+archive_formats = list(itertools.chain(*[extensions for name, extensions, desc in shutil.get_unpack_formats()]))
+
 def find_log_files_from_hint(hint):
     """
     Find output (log) files from a given hint.
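Note: the new module-level archive_formats constant is evaluated once at import time, replacing the old archive_formats() classmethod (rewritten later in this diff) that rebuilt the list on every call. A sketch of what it evaluates to on a stock CPython install; the exact list depends on which compression modules are available:

    >>> import itertools, shutil
    >>> list(itertools.chain(*[exts for name, exts, desc in shutil.get_unpack_formats()]))
    ['.tar.bz2', '.tbz2', '.tar.gz', '.tgz', '.tar', '.tar.xz', '.txz', '.zip']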
@@ -57,7 +60,7 @@ def find_log_files_from_hint(hint):
         # Remove any 'digichem.log' files as we know these are not calc log files.
         # We don't actually write 'digichem.log' files anymore either (we use digichem.out instead),
         # but older versions did...
-        log_files = [log_file for log_file in log_files if log_file.name not in ["digichem.log", "digichem.out"]]
+        log_files = [log_file for log_file in log_files if log_file.name not in ["digichem.log", "digichem.out", "silico.log", "silico.out"]]
     else:
         parent = hint.parent
         log_files = [hint]
@@ -66,18 +69,12 @@ def find_log_files_from_hint(hint):
     if hint.suffix not in ["." + custom_format for custom_format in custom_parsing_formats]:
         # Try and find job files.
         # These files have names like 'job.0', 'job.1' etc, ending in 'job.last'.
-        for number in itertools.count():
-            # Get the theoretical file name.
-            job_file_path = Path(parent, "job.{}".format(number))
-            
-            # See if it exists (and isn't the log_file given to us).
-            if job_file_path.exists():
-                # Add to list.
-                log_files.append(job_file_path)
-            else:
-                # We've found all the numbered files.
-                break
-        
+        for maybe_job_file in parent.glob("job.*"):
+            # We only want the numbered files.
+            if is_int(".".join(maybe_job_file.name.split(".")[1:])):
+                # Looks good.
+                log_files.append(maybe_job_file)
+        
         # Look for other files.
         for maybe_file_name in ("basis", "control", "mos", "alpha", "beta", "coord", "gradient", "aoforce", "job.last", "numforce/aoforce.out"):
             maybe_file_path = Path(parent, maybe_file_name)
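Note: the rewritten job-file search fixes a gap problem: the old itertools.count() loop stopped at the first missing number, so a directory holding job.0, job.1 and job.3 silently lost job.3. Globbing finds every numbered file regardless of gaps, while the is_int() check (imported from configurables above) excludes non-numeric matches such as job.last, which is picked up separately by the loop that follows. A standalone sketch of the filter, using str.isdigit() as a stand-in for is_int():

    from pathlib import Path

    def numbered_job_files(parent):
        for maybe_job_file in sorted(parent.glob("job.*")):
            # 'job.last' -> 'last' (rejected); 'job.12' -> '12' (accepted).
            if ".".join(maybe_job_file.name.split(".")[1:]).isdigit():
                yield maybe_job_file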
@@ -144,6 +141,9 @@ def class_from_log_files(*log_files, format_hint = "auto"):
         log_file_type = type(cclib.io.ccopen([str(found_log_file) for found_log_file in log_files]))
     
     except Exception as e:
+        if isinstance(e, FileNotFoundError):
+            raise
+        
         # cclib couldn't figure out the file type, it probably wasn't a .log file.
         raise Digichem_exception("Could not determine file type of file(s): '{}'; are you sure these are computational log files?".format(", ".join((str(log_file) for log_file in log_files)))) from e
 
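Note: the new guard changes the failure mode for missing files. A FileNotFoundError now propagates unchanged instead of being wrapped in the generic "could not determine file type" message, which previously hid simple path typos. The pattern in isolation (equivalent to the isinstance() check above, since FileNotFoundError is caught before the broader Exception clause):

    try:
        probe_file_type(paths)  # hypothetical stand-in for the cclib.io.ccopen() call
    except FileNotFoundError:
        raise                   # a missing file is a path problem, not a format problem
    except Exception as e:
        raise Digichem_exception("Could not determine file type ...") from e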
@@ -166,11 +166,9 @@ def from_log_files(*log_files, format_hint = "auto", parser_options = {}, **auxiliary_files):
     :param format_hint: A hint as to the format of the given log files. Either 'auto' (to guess), 'log' (calc log file), 'sir' (digichem result file) or 'sid' (digichem database file).
     """
     found_log_files = find_log_files(*log_files)
-    
-    #return class_from_log_files(*found_log_files, format_hint = format_hint).from_logs(*found_log_files, **auxiliary_files)
     
     try:
-        return class_from_log_files(*found_log_files, format_hint = format_hint).from_logs(*found_log_files, **parser_options, **auxiliary_files)
+        return class_from_log_files(*found_log_files, format_hint = format_hint).from_logs(*found_log_files, hints = log_files, **parser_options, **auxiliary_files)
     
     except Exception:
         if len(found_log_files) == 0:
@@ -223,16 +221,16 @@ def parse_calculation(*log_files, options, parse_all = False, format_hint = "auto"
     log_files = real_log_files
     
     # Open files for reading (handles archives for us).
-    archive = open_for_parsing(*log_files)
+    archive = open_for_parsing(*log_files, auxiliary_files = auxiliary_files)
     
     try:
-        open_log_files = archive.open()
+        open_log_files, open_aux_files = archive.open()
         
         if parse_all:
-            results = from_log_files(*open_log_files, format_hint = format_hint, parser_options = parser_options, **auxiliary_files).process_all(options)
+            results = from_log_files(*open_log_files, format_hint = format_hint, parser_options = parser_options, **open_aux_files).process_all(options)
         
         else:
-            results = from_log_files(*open_log_files, format_hint = format_hint, parser_options = parser_options, **auxiliary_files).process(options)
+            results = from_log_files(*open_log_files, format_hint = format_hint, parser_options = parser_options, **open_aux_files).process(options)
     
     finally:
         if not keep_archive:
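Note: a hedged sketch of the new calling convention. When auxiliary_files is passed explicitly, open() returns a (log_files, aux_files) pair with any archived aux files already unpacked, so the extracted paths are forwarded instead of the user-supplied archives. The file names and the aux key below are illustrative only, and close() is assumed to be the cleanup method whose body appears near the end of this diff:

    archive = open_for_parsing("benzene.log.zip", auxiliary_files = {"chk_file": "benzene.chk.zip"})
    try:
        open_log_files, open_aux_files = archive.open()
        results = from_log_files(*open_log_files, **open_aux_files).process(options)
    finally:
        archive.close()  # assumed name; removes the temporary extraction dirs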
@@ -245,15 +243,7 @@
     else:
         # The caller isn't interested in the archive.
         return results
-    
-    
-    # with open_for_parsing(*log_files) as open_log_files:
-    #
-    #     if parse_all:
-    #         return from_log_files(*open_log_files, format_hint = format_hint, **auxiliary_files).process_all(options)
-    #
-    #     else:
-    #         return from_log_files(*open_log_files, format_hint = format_hint, **auxiliary_files).process(options)
+    
 
 def multi_parser(log_files, auxiliary_files, *, options, format_hint = "auto", keep_archive = False, parser_options = {},):
     """
@@ -279,7 +269,7 @@ def multi_parser(log_files, auxiliary_files, *, options, format_hint = "auto", keep_archive = False, parser_options = {},):
         return parse_calculation(*logs, options = options, parse_all = True, format_hint = format_hint, keep_archive = keep_archive, parser_options = parser_options, **auxiliary_files)
     
     except Exception:
-        digichem.log.get_logger().warning("Unable to parse calculation result file '{}'; skipping".format(logs[0]), exc_info = True)
+        digichem.log.get_logger().warning("Unable to parse calculation result file '{}'; skipping".format(logs[0] if len(logs) == 1 else logs), exc_info = True)
         return None
 
 def parse_multiple_calculations(*log_files, auxiliary_files = None, options, parser_options = {}, pool = None, init_func = None, init_args = None, format_hint = "auto", processes = 1, keep_archive = False):
@@ -422,7 +412,7 @@ class open_for_parsing():
     Currently, the main purpose of this context manager is to intelligently handle unpacking of archives (.zip, .tar etc) for parsing.
     """
     
-    def __init__(self, *log_files):
+    def __init__(self, *log_files, auxiliary_files = Default(None)):
         log_files = [Path(log_file).resolve() for log_file in log_files]
         
         # Check we haven't been given any duplicate log files.
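Note: the Default(None) default uses the sentinel idiom from the configurables package (semantics assumed from its usage in this diff): it lets __init__ distinguish "the caller never mentioned auxiliary_files" from an explicit auxiliary_files = None, which a plain None default cannot do. Roughly:

    # True for auxiliary_files = {} and auxiliary_files = None alike,
    # False only when the argument was omitted entirely.
    has_aux_files = not isinstance(auxiliary_files, Default)

    # defres() is assumed to unwrap a Default to its wrapped value (None here)
    # and to pass explicitly supplied values through untouched.
    value = defres(auxiliary_files)

This flag is what makes open() return a 2-tuple only for callers (such as parse_calculation above) that actually supplied the keyword, keeping the old single-list return for everyone else.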
@@ -434,22 +424,63 @@ class open_for_parsing():
         
         # Remove duplicates but retain order.
         self.log_files = list(dict.fromkeys(log_files).keys())
+        
+        # Should we worry about duplicate aux files?
+        self.has_aux_files = not isinstance(auxiliary_files, Default)
+        self.auxiliary_files = {aux_type: Path(auxiliary_file).resolve() for aux_type, auxiliary_file in auxiliary_files.items()} if defres(auxiliary_files) is not None else {}
         
         # A list of tempfile.TemporaryDirectory objects that should be closed when we are finished.
-        self.temp_dirs = []
+        #self.temp_dirs = []
+        
+        self.archive_dirs = {}
+        
+        # Keep track of past work to prevent unpacking duplicates.
+        self.done_archives = []
     
     @classmethod
-    def archive_formats(self):
+    def get_archive_formats(self):
         """
         Get a list of supported archive formats.
         """
-        return list(itertools.chain(*[extensions for name, extensions, desc in shutil.get_unpack_formats()]))
+        return archive_formats
+    
+    @property
+    def archive_formats(self):
+        return archive_formats
     
     def __enter__(self):
         """
        'Open' files for reading.
         """
         return self.open()
+    
+    @classmethod
+    def is_archive(self, path):
+        """
+        Determine (based on a given file's extensions) whether a path points to an archive.
+        """
+        # We can't simply join all the extensions together; what if the file is named something like "Benzene.log.zip"?
+        # Also be careful with files like Benzene.tar.gz: this is .tar.gz, not .gz.
+        for pos in range(0, len(path.suffixes)):
+            if "".join(path.suffixes[pos:]) in self.get_archive_formats():
+                # This is an archive format.
+                return True
+        
+        return False
+    
+    def find(self, hint):
+        """
+        """
+        # First, open the file if it is an archive.
+        open_files = []
+        
+        if hint.exists() and self.is_archive(hint):
+            open_files = self.extract(hint)
+        
+        else:
+            open_files = [hint]
+        
+        return open_files
     
     def open(self):
         """
@@ -457,46 +488,96 @@
         'Open' files for reading.
         """
         new_log_files = []
         
-        formats = self.archive_formats()
+        # First, unpack any explicitly specified aux files.
+        # We do this first because we won't extract the same file twice, and we want to make sure
+        # we definitely capture the file here.
+        new_aux_files = {}
+        for aux_type, path in self.auxiliary_files.items():
+            if self.is_archive(path):
+                files = list(self.extract(path))
+                
+                # If the archive contains multiple files, complain (because we don't know which one the user wants).
+                if len(files) == 0:
+                    raise Digichem_exception("Cannot extract auxiliary file archive '{}'; this archive is empty".format(path))
+                
+                elif len(files) > 1:
+                    raise Digichem_exception("Cannot extract auxiliary file archive '{}'; this archive contains multiple files".format(path))
+                
+                new_aux_files[aux_type] = files[0]
+            
+            else:
+                new_aux_files[aux_type] = path
         
+        # Next, build a list of files and folders to check.
         for log_file in self.log_files:
+            found_child_archive = False
             
-            found_child_archive = None
-            
-            # If 'log_file' is a directory, check for an archive inside called 'Output.xxx'.
-            for archive_format in formats:
-                child_archive = Path(log_file, "Output" + archive_format)
-                if child_archive.exists():
-                    if not found_child_archive:
-                        # Found an Output dir archive, use this instead.
-                        new_log_files.extend(self.extract(child_archive))
-                        found_child_archive = child_archive
-                    
-                    else:
-                        # For now, only care about the first.
-                        warnings.warn("Ignoring subsequent Output archive '{}'; already found '{}'".format(child_archive, found_child_archive))
-            
-            # No need to check 'found_child_archive' here; a file cannot simultaneously be a directory containing an archive and also an archive itself.
-            if "".join(log_file.suffixes) in formats:
-                # This is an archive format.
-                # Add any files/directories that were unpacked.
-                new_log_files.extend(self.extract(log_file))
+            if log_file.is_dir():
+                parent_dir = log_file
                 
-            elif not found_child_archive:
-                # Non-archive file, add normally.
-                new_log_files.append(log_file)
-    
-        return new_log_files
+                # If 'log_file' is a directory, check for an archive inside called 'Output.xxx'.
+                for archive_format in self.archive_formats:
+                    child_archive = Path(log_file, "Output" + archive_format)
+                    if child_archive.exists():
+                        if not found_child_archive:
+                            # Found an Output dir archive, use this instead.
+                            new_log_files.extend(self.find(child_archive))
+                            found_child_archive = child_archive
+                        
+                        else:
+                            # For now, only care about the first.
+                            warnings.warn("Ignoring subsequent Output archive '{}'; already found '{}'".format(child_archive, found_child_archive))
+            
+            else:
+                # The hint is not a directory; either it is a normal log file, or an archive.
+                # If it is an archive, it could either be:
+                # - An archive of a single file (eg, Output.log.zip).
+                # - An archive of an Output dir.
+                # - An archive of a calculation dir (inside of which is Output).
+                new_files = list(self.find(log_file))
+                new_log_files.extend(new_files)
+                parent_dir = log_file.parent
+                
+                if not all([file.is_file() for file in new_files]):
+                    
+                    #if "Output" in [file.name for file in new_files] or len(list(itertools.chain(*[file.glob("Output") for file in new_files]))) > 0:
+                    found_child_archive = True
+                
+                elif self.is_archive(log_file):
+                    # If the hint is a single file archive, also add the parent dir (in case not all of the files are archives).
+                    new_log_files.append(log_file.parent)
+            
+            
+            if not found_child_archive:
+                # If there's not an Output.zip type file, also look for individually zipped output files.
+                for sub_file in parent_dir.iterdir():
+                    if self.is_archive(sub_file):
+                        new_log_files.extend([found.parent for found in self.find(sub_file)])
+                
+                new_log_files.extend(self.find(log_file))
        
+        if self.has_aux_files:
+            return new_log_files, new_aux_files
+        
+        else:
+            return new_log_files
     
     def extract(self, file_name):
         """
         Extract an archive and return the contained log files.
         """
-        # Get a temp dir to extact to.
+        file_name = file_name.resolve()
+        if file_name in self.done_archives:
+            digichem.log.get_logger().debug("Skipping duplicate archive '{}'".format(file_name))
+            return []
+        
+        else:
+            self.done_archives.append(file_name)
+        
+        # Get a temp dir to extract to.
         # We can't use TemporaryDirectory here, because these are auto deleted on program exit. This is not compatible with multi-processing.
-        #tempdir = tempfile.TemporaryDirectory()
         tempdir = mkdtemp()
-        self.temp_dirs.append(tempdir)
+        self.archive_dirs[file_name] = tempdir
         
         # Extract to it.
         digichem.log.get_logger().info("Extracting archive '{}'...".format(file_name))
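Note: open() can now reach the same archive by several routes (an explicit hint, the Output.* scan inside a directory, and the sibling scan of the parent dir), so extract() deduplicates on the resolved path, and the temp dirs are keyed by archive so the cleanup loop below removes exactly the directories that were created. An illustrative consequence (paths are examples only):

    opener = open_for_parsing("run", Path("run", "Output.tar.gz"))
    opener.open()
    # 'run/Output.tar.gz' is encountered twice (directory scan + explicit hint),
    # but is unpacked once; the second sighting just logs a debug message.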
@@ -517,7 +598,7 @@
         """
         'Close' any open files.
         """
-        for tempdir in self.temp_dirs:
+        for tempdir in self.archive_dirs.values():
             shutil.rmtree(tempdir, ignore_errors = True)
 
 
@@ -298,7 +298,7 @@ class Excited_state_list(Result_container):
             }
         }
     
-    def dump(self, digichem_options):
+    def dump(self, digichem_options):
         dump_dict = {
             "values": super().dump(digichem_options),
         }
@@ -398,11 +398,20 @@ class Excited_state_transition(Result_object):
             for excited_state_transitions in parser.data.etsecs:
                 
                 # We'll first create an intermediate list of keyword dicts which we'll then sort.
-                data_list = [
-                    {'starting_mo': MOs[starting_mo_AB][starting_mo_index], 'ending_mo': MOs[ending_mo_AB][ending_mo_index], 'coefficient': coefficient}
-                    for (starting_mo_index, starting_mo_AB), (ending_mo_index, ending_mo_AB), coefficient
-                    in excited_state_transitions
-                ]
+                data_list = []
+                
+                for (starting_mo_index, starting_mo_AB), (ending_mo_index, ending_mo_AB), coefficient in excited_state_transitions:
+                    try:
+                        data_list.append({
+                            'starting_mo': MOs[starting_mo_AB][starting_mo_index],
+                            'ending_mo': MOs[ending_mo_AB][ending_mo_index],
+                            'coefficient': coefficient
+                        })
+                    
+                    except IndexError:
+                        # This is fairly common in Orca 6, where only a subset of virtual orbitals are printed by default.
+                        digichem.log.get_logger() \
+                            .warning("Unable to construct excited state transition; transition is to/from an orbital that is not available ({} and {})".format(starting_mo_index, ending_mo_index))
 
                 # Sort by probability/coefficient.
                 data_list.sort(key=lambda keywords: math.fabs(keywords['coefficient']), reverse=True)
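Note: a minimal repro of the condition now being tolerated (values illustrative). cclib's etsecs can reference orbital indices beyond the parsed MO list when the program prints only a truncated set of orbitals, the Orca 6 default, so a single unmatchable transition used to abort the whole list via the outer IndexError handler removed in the next hunk; it is now logged and skipped:

    MOs = {0: ["HOMO", "LUMO"]}          # only two alpha orbitals were parsed
    ending_mo_index, ending_mo_AB = 5, 0
    MOs[ending_mo_AB][ending_mo_index]   # IndexError: list index out of range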
@@ -414,10 +423,7 @@
             
             # All done.
             return transitions_list
-        
-        except IndexError:
-            # Probably because one (or both) of our given mo_lists is empty (or too short).
-            raise TypeError("Unable to construct excited state transition; transition is to/from an orbital that is not available")
+        
         except AttributeError:
             # No data.
             return []
@@ -474,7 +480,7 @@ class Energy_state(Result_object, Floatable_mixin):
         :param energy: The energy of this state in eV. Whether this value is absolute or relative to another state depends on the implementing class.
         """
         self.level = level
-        self.multiplicity = round(multiplicity)
+        self.multiplicity = round(multiplicity) if multiplicity is not None else None
         # 'True' multiplicity is unrounded (do something smarter)
         self.true_multiplicity = multiplicity
         self.multiplicity_level = multiplicity_level
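Note: a small robustness fix, since round(None) raises TypeError ("type NoneType doesn't define __round__ method"). An Energy_state whose multiplicity could not be parsed previously crashed in __init__; the multiplicity is now simply stored as None:

    multiplicity = None
    round(multiplicity) if multiplicity is not None else None   # None, not a TypeError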