qccodec 0.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
qccodec/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ from importlib import metadata
2
+
3
+ __version__ = metadata.version(__name__)
4
+
5
+
6
+ from .codec import decode, encode # noqa: F401
7
+ from .registry import registry # noqa: F401
8
+
9
+ __all__ = ["decode", "encode", "registry"]
qccodec/cli.py ADDED
@@ -0,0 +1,34 @@
1
+ import argparse
2
+ from pathlib import Path
3
+
4
+ from .codec import decode
5
+
6
+
7
def main():
    """Command-line entry point: decode QC program output and print it as JSON.

    Reads the positional arguments ``program`` and ``calctype`` plus the
    optional ``stdout`` and ``directory`` paths, hands them to ``decode`` and
    prints the JSON dump of the result to standard output.
    """
    cli = argparse.ArgumentParser(
        description="Parse Quantum Chemistry output files into structured JSON or "
        "Python objects."
    )
    cli.add_argument("program", help="Name of the program")
    cli.add_argument(
        "calctype", help="Type of calculation. See qcio.CalcType for options"
    )
    # Both trailing positionals may be omitted, in which case they are None.
    optional_paths = (
        ("stdout", "Path to the stdout file (optional)"),
        ("directory", "Path to the directory containing the output files (optional)"),
    )
    for dest, help_text in optional_paths:
        cli.add_argument(dest, help=help_text, nargs="?", default=None)
    ns = cli.parse_args()

    # decode() expects the stdout *contents*, not the path.
    if ns.stdout:
        stdout_text = Path(ns.stdout).read_text()
    else:
        stdout_text = None

    parsed = decode(
        ns.program, ns.calctype, stdout=stdout_text, directory=ns.directory
    )
    print(parsed.model_dump_json(indent=4, exclude_unset=True))
qccodec/codec.py ADDED
@@ -0,0 +1,147 @@
1
+ """Top level functions for the qccodec library"""
2
+
3
+ import logging
4
+ from importlib import import_module
5
+ from pathlib import Path
6
+ from typing import Any, Optional, Union
7
+
8
+ from qcio import (
9
+ CalcType,
10
+ ConformerSearchResults,
11
+ OptimizationResults,
12
+ ProgramInput,
13
+ SinglePointResults,
14
+ StructuredInputs,
15
+ StructuredResults,
16
+ )
17
+
18
+ from .exceptions import DecoderError, EncoderError, MatchNotFoundError
19
+ from .models import (
20
+ DataCollector,
21
+ NativeInput,
22
+ )
23
+ from .parsers import * # noqa: F403 Ensure all parsers get registered
24
+ from .registry import registry
25
+
26
# Public API of this module. Every name listed here must exist in the module,
# otherwise `from qccodec.codec import *` fails with AttributeError.
__all__ = ["decode", "encode", "registry"]
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ RESULTS_TYPE_MAP = {
32
+ CalcType.energy: SinglePointResults,
33
+ CalcType.gradient: SinglePointResults,
34
+ CalcType.hessian: SinglePointResults,
35
+ CalcType.optimization: OptimizationResults,
36
+ CalcType.transition_state: OptimizationResults,
37
+ CalcType.conformer_search: ConformerSearchResults,
38
+ }
39
+
40
+
41
def decode(
    program: str,
    calctype: CalcType,
    *,
    stdout: Optional[str] = None,
    directory: Optional[Union[str, Path]] = None,
    input_data: Optional[StructuredInputs] = None,
    as_dict: bool = False,
) -> Union[StructuredResults, dict[str, Any]]:
    """Decode the output of a quantum chemistry program into a standardized output.

    Args:
        program: The QC program that generated the output file.
        calctype: The type of calculation that was run.
        stdout: The stdout file contents as a string.
        directory: The directory containing the output files.
        input_data: The input data used for the calculation.
            This is used to provide additional context for the parsers.
        as_dict: If True, return the results as a dictionary instead of a
            StructuredResults object. Used mostly for testing purposes to enable
            returning parsed data that isn't a fully valid StructuredResults object.

    Returns:
        A StructuredResults object containing the parsed data, or a plain
        dictionary of the collected data when ``as_dict`` is True.

    Raises:
        ValueError: If neither stdout nor directory is provided.
        DecoderError: If no parser module can be imported for the program, if the
            calctype has no entry in RESULTS_TYPE_MAP, or if a parser returns a
            non-dictionary value but has no target to store it under.
        MatchNotFoundError: If a required parser fails to find a match.
    """
    logger.info("Starting decode for program: %s with calctype: %s", program, calctype)
    if not stdout and not directory:
        raise ValueError("Either stdout, directory, or both must be provided.")

    # Import the program-specific module.
    try:
        mod = import_module(f"qccodec.parsers.{program}")
    except ImportError as e:
        logger.exception("Failed to import module qccodec.parsers.%s", program)
        raise DecoderError(f"No parsers found for program '{program}'.") from e

    # Create a generator for stdout (if provided) and all parsable files in directory
    files = mod.iter_files(stdout, directory)

    # Now iterate over the combined generator of all parsable files
    data_collector = DataCollector()
    for filetype, contents in files:
        # Look up the parsers for the given program, filetype, and calctype
        logger.debug("Processing file with filetype: %s", filetype)
        parser_specs = registry.get_parsers(program, filetype, calctype)
        logger.info(
            "Found %d parser(s) for program '%s', filetype '%s', calctype '%s'",
            len(parser_specs),
            program,
            filetype,
            calctype,
        )

        for spec in parser_specs:
            parser_name = spec.parser.__name__
            logger.debug(
                "Running parser '%s' for target '%s'", parser_name, spec.target
            )
            try:
                # Directory parsers receive the full context; all other parsers
                # receive only the contents of the current file.
                if spec.filetype == "directory":
                    parsed_value: Any = spec.parser(directory, stdout, input_data)
                else:
                    parsed_value = spec.parser(contents)
            except MatchNotFoundError:
                # Raised if the parser can't find its data
                if spec.required:
                    logger.error(
                        "Required parser '%s' failed; raising exception", parser_name
                    )
                    raise
                logger.info(
                    "Parser '%s' did not find a match but is not required.",
                    parser_name,
                )
                continue

            logger.info(
                "Parser '%s' succeeded; returned value: %s", parser_name, parsed_value
            )

            if isinstance(parsed_value, dict):
                # A dictionary result carries its own targets: each key-value
                # pair is stored under its key.
                for key, value in parsed_value.items():
                    data_collector.add_data(key, value)
                    logger.debug(
                        "Assigned parsed value to target '%s' on data_collector", key
                    )
            elif spec.target is None:
                # Explicit check instead of `assert`, which is stripped under -O
                # and would let the value be stored under the key None.
                raise DecoderError(
                    f"Parser '{parser_name}' returned a non-dictionary value but "
                    "has no target. Target must be specified for non-dictionary "
                    "parsed values."
                )
            else:
                data_collector.add_data(spec.target, parsed_value)
                logger.debug(
                    "Assigned parsed value to target '%s' on data_collector",
                    spec.target,
                )

    logger.info(
        "Completed processing files; final data_collector state: %s", data_collector
    )
    # Finally, construct and return the StructuredResults using the collected data.
    if as_dict:
        return dict(data_collector)
    try:
        results_cls = RESULTS_TYPE_MAP[calctype]
    except KeyError as e:
        raise DecoderError(f"Calctype '{calctype}' is not supported.") from e
    return results_cls(**data_collector)
127
+
128
def encode(inp_data: ProgramInput, program: str) -> NativeInput:
    """Encode a ProgramInput object to a NativeInput object.

    Args:
        inp_data: The ProgramInput object to encode.
        program: The program for which to encode the input.

    Returns:
        A NativeInput object with the encoded input.

    Raises:
        EncoderError: If no encoder module can be imported for the program, if the
            calctype is not supported by the program's encoder, or if the input
            is invalid.
    """
    # Import the program-specific encoder; a missing module is reported as an
    # EncoderError, mirroring how decode() reports a missing parser module.
    try:
        encoder = import_module(f"qccodec.encoders.{program}")
    except ImportError as e:
        logger.exception("Failed to import module qccodec.encoders.%s", program)
        raise EncoderError(f"No encoder found for program '{program}'.") from e

    # Check that calctype is supported by the encoder
    if inp_data.calctype not in encoder.SUPPORTED_CALCTYPES:
        raise EncoderError(f"Calctype '{inp_data.calctype}' not supported by encoder.")

    return encoder.encode(inp_data)
File without changes
@@ -0,0 +1,135 @@
1
+ import copy
2
+ import os
3
+ from typing import Any
4
+
5
+ import tomli_w
6
+ from qcio import CalcType, ProgramInput
7
+
8
+ from qccodec.exceptions import EncoderError
9
+ from qccodec.models import NativeInput
10
+
11
+ SUPPORTED_CALCTYPES = {
12
+ CalcType.conformer_search,
13
+ CalcType.optimization,
14
+ CalcType.energy,
15
+ CalcType.gradient,
16
+ CalcType.hessian,
17
+ }
18
+
19
+
20
def encode(inp_obj: ProgramInput) -> NativeInput:
    """Build the CREST native input for a ProgramInput.

    Args:
        inp_obj: The qcio ProgramInput object for a computation.

    Returns:
        NativeInput whose ``input_file`` is the TOML text produced from the
        input, whose ``geometry_file`` is the structure's xyz text, and whose
        ``geometry_filename`` is the name that the TOML refers to.
    """
    validate_input(inp_obj)

    xyz_name = "structure.xyz"
    toml_text = tomli_w.dumps(_to_toml_dict(inp_obj, xyz_name))
    xyz_text = inp_obj.structure.to_xyz()

    return NativeInput(
        input_file=toml_text,
        geometry_file=xyz_text,
        geometry_filename=xyz_name,
    )
38
+
39
+
40
def validate_input(inp_obj: ProgramInput):
    """Check that a ProgramInput can be encoded for CREST.

    Args:
        inp_obj: The qcio ProgramInput object for a computation.

    Raises:
        EncoderError: If a keyword that must not be user-supplied is present, or
            if the supplied runtype does not fit the calctype.
    """
    keywords = inp_obj.keywords

    # These values come from other parts of the ProgramInput and should not be
    # set in the keywords.
    for forbidden in ("charge", "uhf"):
        if forbidden not in keywords:
            continue
        raise EncoderError(
            f"{forbidden} should not be set in keywords for CREST. It is already set "
            "on the Structure or ProgramInput elsewhere.",
        )

    # A runtype is optional; when given it must be compatible with the calctype.
    if "runtype" in keywords:
        _validate_runtype_calctype(keywords["runtype"], inp_obj.calctype)
60
+
61
+
62
def _validate_runtype_calctype(runtype: str, calctype: CalcType):
    """Raise EncoderError when ``runtype`` is not valid for ``calctype``.

    Calctypes without a known set of runtypes are accepted without checking.
    """
    # Pick the set of runtypes accepted for this calctype.
    if calctype == CalcType.conformer_search:
        allowed = {"imtd-gc", "imtd-smtd", "entropy", "nci", "nci-mtd"}
    elif calctype == CalcType.optimization:
        allowed = {"optimize", "ancopt"}
    elif calctype in {CalcType.energy, CalcType.gradient}:
        allowed = {"singlepoint"}
    elif calctype == CalcType.hessian:
        allowed = {"numhess"}
    else:
        return

    if runtype in allowed:
        return

    raise EncoderError(
        f"Unsupported runtype {runtype} for calctype {calctype}. Valid runtypes "
        f"are: {allowed}.",
    )
92
+
93
+
94
def _to_toml_dict(inp_obj: ProgramInput, struct_filename: str) -> dict[str, Any]:
    """Build the dictionary that is serialized to CREST's TOML input.

    Kept separate from the serialization step so the TOML structure can be
    tested directly.
    """
    # Work on a deep copy so the caller's keywords are never mutated.
    config = copy.deepcopy(inp_obj.keywords)

    # Top level keywords
    # Logical cores was 10% faster than physical cores, so not using psutil
    config.setdefault("threads", min(os.cpu_count() or 16, 16))
    config["input"] = struct_filename

    # Fill in a runtype derived from the calctype unless the user chose one.
    if "runtype" not in inp_obj.keywords:
        calctype = inp_obj.calctype
        if calctype == CalcType.conformer_search:
            default_runtype = "imtd-gc"
        elif calctype == CalcType.optimization:
            default_runtype = "optimize"
        elif calctype in {CalcType.energy, CalcType.gradient}:
            default_runtype = "singlepoint"
        elif calctype == CalcType.hessian:
            default_runtype = "numhess"
        else:
            raise EncoderError(
                f"Unsupported calctype {inp_obj.calctype} for CREST encoder.",
            )
        config["runtype"] = default_runtype

    # Calculation level keywords
    calc_section = config.pop("calculation", {})
    levels = calc_section.pop("level", [])
    if len(levels) == 0:
        # Guarantee at least one level to carry method, charge and uhf.
        levels.append({})

    # Every level gets the same method/charge/uhf taken from the input object.
    structure = inp_obj.structure
    enforced = {
        "method": inp_obj.model.method,
        "charge": structure.charge,
        "uhf": structure.multiplicity - 1,
    }
    for level in levels:
        level.update(enforced)

    calc_section["level"] = levels
    config["calculation"] = calc_section

    return config
@@ -0,0 +1,74 @@
1
+ from qcio import CalcType, ProgramInput
2
+
3
+ from qccodec.exceptions import EncoderError
4
+ from qccodec.models import NativeInput
5
+
6
+ SUPPORTED_CALCTYPES = {
7
+ CalcType.energy,
8
+ CalcType.gradient,
9
+ CalcType.hessian,
10
+ CalcType.optimization,
11
+ CalcType.transition_state,
12
+ }
13
+ XYZ_FILENAME = "geometry.xyz"
14
+ PADDING = 20 # padding between keyword and value in tc.in
15
+
16
+
17
def encode(inp_obj: ProgramInput) -> NativeInput:
    """Translate a ProgramInput into TeraChem input files.

    Args:
        inp_obj: The qcio ProgramInput object for a computation.

    Returns:
        NativeInput with ``input_file`` holding the tc.in text and
        ``geometry_file`` holding the structure's xyz text, stored under
        ``geometry_filename`` XYZ_FILENAME.

    Raises:
        EncoderError: If an optimization is requested without
            ``'new_minimizer': 'yes'`` in the keywords, or if a keyword duplicates
            a value that must be set on the structured input instead.
    """
    # Map the calctype onto the value of TeraChem's "run" keyword.
    if inp_obj.calctype == CalcType.hessian:
        run_type = "frequencies"
    elif inp_obj.calctype == CalcType.optimization:
        if inp_obj.keywords.get("new_minimizer", "no") != "yes":
            raise EncoderError(
                "Only the new_minimizer is supported for optimizations. Add "
                "'new_minimizer': 'yes' to the keywords."
            )
        run_type = "minimize"
    elif inp_obj.calctype == CalcType.transition_state:
        run_type = "ts"
    else:
        run_type = inp_obj.calctype.value

    # Collect lines for input file
    inp_lines = [
        f"{'run':<{PADDING}} {run_type}",
        # Structure
        f"{'coordinates':<{PADDING}} {XYZ_FILENAME}",
        f"{'charge':<{PADDING}} {inp_obj.structure.charge}",
        f"{'spinmult':<{PADDING}} {inp_obj.structure.multiplicity}",
        # Model
        f"{'method':<{PADDING}} {inp_obj.model.method}",
        f"{'basis':<{PADDING}} {inp_obj.model.basis}",
    ]

    # Keywords that duplicate structured data, mapped to where they belong.
    non_keywords = {
        "charge": ".structure.charge",
        "spinmult": ".structure.multiplicity",
        "run": ".calctype",
        "basis": ".model.basis",
        "method": ".model.method",
    }
    for key, value in inp_obj.keywords.items():
        # Check for keywords that should be passed as structured data
        if key in non_keywords:
            raise EncoderError(
                f"Keyword '{key}' should not be set as a keyword. It "
                f"should be set at '{non_keywords[key]}'",
            )
        # Lowercase booleans only (True -> "true"). Other values are written
        # verbatim so case-sensitive strings such as file paths are preserved.
        rendered = str(value).lower() if isinstance(value, bool) else str(value)
        inp_lines.append(f"{key:<{PADDING}} {rendered}")

    return NativeInput(
        input_file="\n".join(inp_lines) + "\n",  # End file with newline
        geometry_file=inp_obj.structure.to_xyz(),
        geometry_filename=XYZ_FILENAME,
    )
qccodec/exceptions.py ADDED
@@ -0,0 +1,33 @@
1
class BaseError(Exception):
    """Base class for all qccodec exceptions."""


class DecoderError(BaseError):
    """Exception raised when a decoder error occurs."""


class EncoderError(BaseError):
    """Exception raised when an encoder error occurs."""


class ParserError(BaseError):
    """Base exception for parsers."""


class MatchNotFoundError(ParserError):
    """Exception raised when a parsing match is not found.

    Attributes:
        regex: The regular expression that failed to match.
        contents: The text that was searched.
    """

    def __init__(self, regex: str, contents: str):
        self.regex = regex
        self.contents = contents
        super().__init__(
            f"Could not locate match for regex: '{regex}' in contents: '{contents}'"
        )

    def __reduce__(self):
        """Rebuild from ``(regex, contents)`` so pickling and copying work.

        The default exception reduction re-creates the instance from
        ``self.args``, which holds only the formatted message, and therefore
        fails because ``__init__`` requires two arguments.
        """
        return (self.__class__, (self.regex, self.contents))


class RegistryError(BaseError):
    """Exception raised when a registry error occurs."""


class DataCollectorError(BaseError):
    """Exception raised when a data collector error occurs."""
qccodec/models.py ADDED
@@ -0,0 +1,54 @@
1
+ """Simple data models to support parsing of QM program output files."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Optional, Union
5
+
6
+ from .exceptions import DataCollectorError
7
+
8
+
9
class DataCollector(dict):
    """A dictionary for collecting data from parsers."""

    def add_data(self, target: Union[str, tuple[str, ...]], value: Any) -> None:
        """Assign a value into the DataCollector at the specified target.

        If target is a string, the value is assigned to that key.
        If target is a tuple, the method navigates through nested dictionaries,
        creating them as needed, and assigns the value at the final key.

        Args:
            target: Key, or tuple of keys describing a nested path.
            value: The value to store.

        Raises:
            DataCollectorError: If the target already holds a value, or if a
                key along a nested path already holds a non-dictionary value.
        """
        keys = target if isinstance(target, tuple) else (target,)
        node = self
        for depth, key in enumerate(keys[:-1], start=1):
            child = node.setdefault(key, {})
            # A previously stored scalar (or other non-dict) cannot be nested
            # into; report it clearly instead of failing with a TypeError.
            if not isinstance(child, dict):
                raise DataCollectorError(
                    f"Cannot add target '{keys}': '{keys[:depth]}' already holds "
                    "a non-dictionary value."
                )
            node = child
        if keys[-1] in node:
            raise DataCollectorError(
                f"Target '{keys}' already exists in DataCollector. You cannot add the same target twice."
            )
        node[keys[-1]] = value
29
+
30
+
31
@dataclass
class NativeInput:
    """Native input file data for a quantum chemistry program.

    Writing these files to disk should produce a valid input.

    Attributes:
        input_file: input file for the program
        geometry_file: xyz file or other geometry file required for the calculation
        geometry_filename: filename of the geometry file referenced in the input
    """

    input_file: str
    geometry_file: Optional[str] = None
    geometry_filename: Optional[str] = None

    def __post_init__(self):
        """Reject a geometry file that comes without a filename."""
        # Nothing to check when no geometry is given or when it is named.
        if not self.geometry_file or self.geometry_filename:
            return
        message = (
            "geometry_filename must be set if geometry_file is provided. "
            "Set geometry_filename to the name of the geometry file as referenced "
            "in the input file."
        )
        raise ValueError(message)
@@ -0,0 +1,16 @@
1
+ """All parsers should follow a basic pattern:
2
+
3
+ 1. Extract the parsed data, cast to its appropriate Python type, set it on the
4
+ SinglePointResults object at the appropriate attribute name.
5
+ 2. Raise a MatchNotFoundError if a match was not found
6
+ 3. Register parser with the registry by decorating it with the parser() decorator
7
+
8
+ Use the .utils.regex_search() helper function in place of re.search() to
9
+ ensure that a MatchNotFoundError will be raised in a parser. More sophisticated parsers
10
+ that use re.findall (like terachem.parse_hessian) or rely upon not finding a match may
11
+ implement a different interface, but please strive to follow this basic pattern as much
12
+ as possible.
13
+ """
14
+
15
+ # Required for parsers to register
16
+ from .terachem import * # noqa: F403