PyPI - qccodec - Versions diffs - 0.7.6__py3-none-any.whl - Mend

qccodec 0.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

qccodec/__init__.py +9 -0
qccodec/cli.py +34 -0
qccodec/codec.py +147 -0
qccodec/encoders/__init__.py +0 -0
qccodec/encoders/crest.py +135 -0
qccodec/encoders/terachem.py +74 -0
qccodec/exceptions.py +33 -0
qccodec/models.py +54 -0
qccodec/parsers/__init__.py +16 -0
qccodec/parsers/crest.py +379 -0
qccodec/parsers/terachem.py +348 -0
qccodec/parsers/utils.py +32 -0
qccodec/registry.py +202 -0
qccodec-0.7.6.dist-info/METADATA +148 -0
qccodec-0.7.6.dist-info/RECORD +18 -0
qccodec-0.7.6.dist-info/WHEEL +4 -0
qccodec-0.7.6.dist-info/entry_points.txt +2 -0
qccodec-0.7.6.dist-info/licenses/LICENSE +21 -0

qccodec/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+from importlib import metadata
+# Look up the installed distribution's version using this package's name.
+__version__ = metadata.version(__name__)
+from .codec import decode, encode  # noqa: F401
+from .registry import registry  # noqa: F401
+# Names re-exported at the package root.
+__all__ = ["decode", "encode", "registry"]

qccodec/cli.py ADDED Viewed

@@ -0,0 +1,34 @@
+import argparse
+from pathlib import Path
+from .codec import decode
+def main():
+    """Command-line entry point: decode program output and print it as JSON.
+    Positional arguments are ``program`` and ``calctype`` followed by the
+    optional ``stdout`` file path and ``directory`` path. When a stdout path is
+    given the file is read into memory and passed to ``decode`` together with
+    the directory. The result is printed as indented JSON with unset fields
+    excluded.
+    """
+    cli = argparse.ArgumentParser(
+        description="Parse Quantum Chemistry output files into structured JSON or "
+        "Python objects."
+    )
+    cli.add_argument("program", help="Name of the program")
+    cli.add_argument(
+        "calctype", help="Type of calculation. See qcio.CalcType for options"
+    )
+    # Both trailing positionals may be omitted and then default to None.
+    optional_positionals = (
+        ("stdout", "Path to the stdout file (optional)"),
+        (
+            "directory",
+            "Path to the directory containing the output files (optional)",
+        ),
+    )
+    for arg_name, help_text in optional_positionals:
+        cli.add_argument(arg_name, help=help_text, nargs="?", default=None)
+    namespace = cli.parse_args()
+    if namespace.stdout:
+        stdout_text = Path(namespace.stdout).read_text()
+    else:
+        stdout_text = None
+    decoded = decode(
+        namespace.program,
+        namespace.calctype,
+        stdout=stdout_text,
+        directory=namespace.directory,
+    )
+    print(decoded.model_dump_json(indent=4, exclude_unset=True))

qccodec/codec.py ADDED Viewed

@@ -0,0 +1,147 @@
+"""Top level functions for the qccodec library"""
+import logging
+from importlib import import_module
+from pathlib import Path
+from typing import Any, Optional, Union
+from qcio import (
+    CalcType,
+    ConformerSearchResults,
+    OptimizationResults,
+    ProgramInput,
+    SinglePointResults,
+    StructuredInputs,
+    StructuredResults,
+)
+from .exceptions import DecoderError, EncoderError, MatchNotFoundError
+from .models import (
+    DataCollector,
+    NativeInput,
+)
+from .parsers import *  # noqa: F403 Ensure all parsers get registered
+from .registry import registry
+# Names exported by ``from qccodec.codec import *``. Every entry must exist in
+# this module, otherwise the star-import raises AttributeError.
+__all__ = ["decode", "encode", "registry"]
+logger = logging.getLogger(__name__)
+# Maps each calculation type to the results class that decode() instantiates.
+RESULTS_TYPE_MAP = {
+    CalcType.energy: SinglePointResults,
+    CalcType.gradient: SinglePointResults,
+    CalcType.hessian: SinglePointResults,
+    CalcType.optimization: OptimizationResults,
+    CalcType.transition_state: OptimizationResults,
+    CalcType.conformer_search: ConformerSearchResults,
+}
+def decode(
+    program: str,
+    calctype: CalcType,
+    *,
+    stdout: Optional[str] = None,
+    directory: Optional[Union[str, Path]] = None,
+    input_data: Optional[StructuredInputs] = None,
+    as_dict: bool = False,
+) -> Union[StructuredResults, dict[str, Any]]:
+    """Decode the output of a quantum chemistry program into a standardized output.
+    The program-specific parser module is imported and its iter_files() yields
+    (filetype, contents) pairs. Every parser registered for the program,
+    filetype, and calctype is run and its result is stored in a DataCollector,
+    which is finally turned into the results object for the calctype.
+    Args:
+        program: The QC program that generated the output file.
+        calctype: The type of calculation that was run.
+        stdout: The stdout file contents as a string.
+        directory: The directory containing the output files.
+        input_data: The input data used for the calculation.
+            It is passed only to parsers registered for the "directory" filetype.
+        as_dict: If True, return the results as a dictionary instead of a
+            StructuredResults object. Used mostly for testing purposes to enable
+            returning parsed data that isn't a fully valid StructuredResults object.
+    Returns:
+        The results object looked up in RESULTS_TYPE_MAP for the calctype, built
+        from the collected data, or a plain dict of that data if as_dict is True.
+    Raises:
+        ValueError: If neither stdout nor directory is provided.
+        DecoderError: If the parser module for the program cannot be imported.
+        MatchNotFoundError: If a required parser fails to find a match.
+    """
+    logger.info("Starting decode for program: %s with calctype: %s", program, calctype)
+    # Empty strings count as "not provided" here because the check is on truthiness.
+    if not stdout and not directory:
+        raise ValueError("Either stdout, directory, or both must be provided.")
+    # Import the program-specific module.
+    try:
+        mod = import_module(f"qccodec.parsers.{program}")
+    except ImportError as e:
+        logger.exception("Failed to import module qccodec.parsers.%s", program)
+        raise DecoderError(f"No parsers found for program '{program}'.") from e
+    # Create a generator for stdout (if provided) and all parsable files in directory
+    files = mod.iter_files(stdout, directory)
+    # Now iterate over the combined generator of all parsable files
+    data_collector = DataCollector()
+    for filetype, contents in files:
+        # Look up the parsers for the given program, filetype, and calctype
+        logger.debug("Processing file with filetype: %s", filetype)
+        parser_specs = registry.get_parsers(program, filetype, calctype)
+        logger.info("Found %d parser(s) for program '%s', filetype '%s', calctype '%s'", len(parser_specs), program, filetype, calctype) # noqa: E501
+        for spec in parser_specs:
+            logger.debug("Running parser '%s' for target '%s'", spec.parser.__name__, spec.target) # noqa: E501
+            # Parse the contents using the parser
+            try:
+                # Directory parsers get the directory, stdout, and input data;
+                # all other parsers get only the file contents.
+                if spec.filetype == "directory":
+                    parsed_value: Any = spec.parser(directory, stdout, input_data)
+                else:
+                    parsed_value = spec.parser(contents)
+                logger.info("Parser '%s' succeeded; returned value: %s", spec.parser.__name__, parsed_value) # noqa: E501
+            # Raised if the parser can't find its data
+            except MatchNotFoundError as e:
+                if spec.required:
+                    logger.error("Required parser '%s' failed; raising exception", spec.parser.__name__) # noqa: E501
+                    raise
+                else:
+                    logger.info("Parser '%s' did not find a match but is not required.", spec.parser.__name__) # noqa: E501
+            # Place the parsed value into the data collector (runs only when the parser succeeded)
+            else:
+                # If the parser returns a dictionary, assign each key-value pair to the data collector
+                if isinstance(parsed_value, dict):
+                    for key, value in parsed_value.items():
+                        data_collector.add_data(key, value)
+                        # The (spec.target, key) tuple is the single %s argument of this message.
+                        logger.debug("Assigned parsed value to target '%s' on data_collector", (spec.target, key))
+                # Otherwise, assign the parsed value to the specified target
+                else:
+                    assert spec.target is not None, "Target must be specified for non-dictionary parsed values." # for mypy
+                    data_collector.add_data(spec.target, parsed_value)
+                # Logged after both the dictionary and the single-target branch.
+                logger.debug("Assigned parsed value to target '%s' on data_collector", spec.target) # noqa: E501
+    logger.info("Completed processing files; final data_collector state: %s", data_collector) # noqa: E501
+    # Finally, construct and return the StructuredResults using the collected data.
+    if as_dict:
+        return dict(data_collector)
+    return RESULTS_TYPE_MAP[calctype](**data_collector)
+def encode(inp_data: ProgramInput, program: str) -> NativeInput:
+    """Encode a ProgramInput object to a NativeInput object.
+    Args:
+        inp_data: The ProgramInput object to encode.
+        program: The program for which to encode the input.
+    Returns:
+        A NativeInput object with the encoded input.
+    Raises:
+        EncoderError: If the encoder module for the program cannot be imported,
+            the calctype is not supported by the program's encoder, or the
+            input is invalid.
+    """
+    # Import the program-specific encoder module. A failed import is reported as
+    # EncoderError (mirroring how decode() reports DecoderError) instead of
+    # leaking a raw ImportError; the original error stays chained as the cause.
+    try:
+        encoder = import_module(f"qccodec.encoders.{program}")
+    except ImportError as e:
+        logger.exception("Failed to import module qccodec.encoders.%s", program)
+        raise EncoderError(f"No encoder found for program '{program}'.") from e
+    # Check that calctype is supported by the encoder
+    if inp_data.calctype not in encoder.SUPPORTED_CALCTYPES:
+        raise EncoderError(f"Calctype '{inp_data.calctype}' not supported by encoder.")
+    return encoder.encode(inp_data)

qccodec/encoders/__init__.py ADDED Viewed

File without changes

qccodec/encoders/crest.py ADDED Viewed

@@ -0,0 +1,135 @@
+import copy
+import os
+from typing import Any
+import tomli_w
+from qcio import CalcType, ProgramInput
+from qccodec.exceptions import EncoderError
+from qccodec.models import NativeInput
+# Calculation types for which this CREST encoder can select a runtype.
+SUPPORTED_CALCTYPES = {
+    CalcType.conformer_search,
+    CalcType.optimization,
+    CalcType.energy,
+    CalcType.gradient,
+    CalcType.hessian,
+}
+def encode(inp_obj: ProgramInput) -> NativeInput:
+    """Build the native CREST input files for a ProgramInput.
+    Args:
+        inp_obj: The qcio ProgramInput object for a computation.
+    Returns:
+        NativeInput whose .input_file is the crest.toml text and whose
+            .geometry_file is the Structure's xyz content, referenced in the
+            TOML under the name stored in .geometry_filename.
+    Raises:
+        EncoderError: If validate_input() rejects the input.
+    """
+    validate_input(inp_obj)
+    xyz_name = "structure.xyz"
+    toml_text = tomli_w.dumps(_to_toml_dict(inp_obj, xyz_name))
+    xyz_text = inp_obj.structure.to_xyz()
+    return NativeInput(
+        input_file=toml_text,
+        geometry_file=xyz_text,
+        geometry_filename=xyz_name,
+    )
+def validate_input(inp_obj: ProgramInput):
+    """Check that a ProgramInput's keywords are acceptable for CREST.
+    Args:
+        inp_obj: The qcio ProgramInput object for a computation.
+    Raises:
+        EncoderError: If "charge" or "uhf" appears in the keywords, or if an
+            explicit "runtype" keyword is not valid for the calctype.
+    """
+    keywords = inp_obj.keywords
+    # These values are taken from other parts of the ProgramInput, so they must
+    # not be duplicated in the keywords.
+    for forbidden in ("charge", "uhf"):
+        if forbidden in keywords:
+            raise EncoderError(
+                f"{forbidden} should not be set in keywords for CREST. It is already set "
+                "on the Structure or ProgramInput elsewhere.",
+            )
+    if "runtype" not in keywords:
+        return
+    _validate_runtype_calctype(keywords["runtype"], inp_obj.calctype)
+def _validate_runtype_calctype(runtype: str, calctype: CalcType):
+    """Raise EncoderError when ``runtype`` is not valid for ``calctype``.
+    Calctypes that have no runtype set defined here are accepted unchecked.
+    """
+    if calctype == CalcType.conformer_search:
+        allowed = {"imtd-gc", "imtd-smtd", "entropy", "nci", "nci-mtd"}
+    elif calctype == CalcType.optimization:
+        allowed = {"optimize", "ancopt"}
+    elif calctype in {CalcType.energy, CalcType.gradient}:
+        allowed = {"singlepoint"}
+    elif calctype == CalcType.hessian:
+        allowed = {"numhess"}
+    else:
+        # No runtype restriction is defined for this calctype.
+        return
+    if runtype in allowed:
+        return
+    raise EncoderError(
+        f"Unsupported runtype {runtype} for calctype {calctype}. Valid runtypes "
+        f"are: {allowed}.",
+    )
+def _to_toml_dict(inp_obj: ProgramInput, struct_filename: str) -> dict[str, Any]:
+    """Build the dictionary that is serialized to CREST's TOML input.
+    Kept separate from encode() so the TOML structure can be tested directly.
+    Args:
+        inp_obj: The qcio ProgramInput object for a computation.
+        struct_filename: Name of the structure file, written to the "input" key.
+    Returns:
+        A deep copy of the keywords extended with "threads" (if absent), "input",
+        a default "runtype" (if absent), and a "calculation" table in which every
+        level has method, charge, and uhf set from the ProgramInput.
+    Raises:
+        EncoderError: If no runtype keyword is given and the calctype has no
+            default runtype.
+    """
+    user_keywords = inp_obj.keywords
+    # Work on a deep copy so the ProgramInput's keywords are left untouched.
+    config = copy.deepcopy(user_keywords)
+    # Logical cores was 10% faster than physical cores, so not using psutil
+    default_threads = min(os.cpu_count() or 16, 16)
+    config.setdefault("threads", default_threads)
+    config["input"] = struct_filename
+    # Pick a default runtype only when the user did not supply one.
+    if "runtype" not in user_keywords:
+        calctype = inp_obj.calctype
+        if calctype == CalcType.conformer_search:
+            default_runtype = "imtd-gc"
+        elif calctype == CalcType.optimization:
+            default_runtype = "optimize"
+        elif calctype in {CalcType.energy, CalcType.gradient}:
+            default_runtype = "singlepoint"
+        elif calctype == CalcType.hessian:
+            default_runtype = "numhess"
+        else:
+            raise EncoderError(
+                f"Unsupported calctype {inp_obj.calctype} for CREST encoder.",
+            )
+        config["runtype"] = default_runtype
+    # Pop and re-insert "calculation" so it ends up as the last top-level key.
+    calc_section = config.pop("calculation", {})
+    levels = calc_section.pop("level", [])
+    if len(levels) == 0:
+        levels.append({})
+    # Every level takes method, charge, and uhf from the ProgramInput.
+    for level in levels:
+        level.update(
+            method=inp_obj.model.method,
+            charge=inp_obj.structure.charge,
+            uhf=inp_obj.structure.multiplicity - 1,
+        )
+    calc_section["level"] = levels
+    config["calculation"] = calc_section
+    return config

qccodec/encoders/terachem.py ADDED Viewed

@@ -0,0 +1,74 @@
+from qcio import CalcType, ProgramInput
+from qccodec.exceptions import EncoderError
+from qccodec.models import NativeInput
+# Calculation types this TeraChem encoder declares as supported.
+SUPPORTED_CALCTYPES = {
+    CalcType.energy,
+    CalcType.gradient,
+    CalcType.hessian,
+    CalcType.optimization,
+    CalcType.transition_state,
+}
+# Name of the geometry file; written as the "coordinates" value in tc.in.
+XYZ_FILENAME = "geometry.xyz"
+PADDING = 20  # padding between keyword and value in tc.in
+def encode(inp_obj: ProgramInput) -> NativeInput:
+    """Translate a ProgramInput into TeraChem input files.
+    Args:
+        inp_obj: The qcio ProgramInput object for a computation.
+    Returns:
+        NativeInput whose .input_file is the tc.in text and whose .geometry_file
+            is the xyz geometry, named XYZ_FILENAME in .geometry_filename.
+    Raises:
+        EncoderError: If an optimization is requested without the keyword
+            'new_minimizer': 'yes', or if a keyword repeats a value that belongs
+            on the structure, model, or calctype.
+    """
+    # Map the calctype onto TeraChem's "run" value.
+    calctype_value = inp_obj.calctype.value
+    if calctype_value == CalcType.hessian:
+        run = "frequencies"
+    elif calctype_value == CalcType.optimization:
+        if inp_obj.keywords.get("new_minimizer", "no") != "yes":
+            raise EncoderError(
+                "Only the new_minimizer is supported for optimizations. Add "
+                "'new_minimizer': 'yes' to the keywords."
+            )
+        run = "minimize"
+    elif calctype_value == CalcType.transition_state:
+        run = "ts"
+    else:
+        run = calctype_value
+    # Settings taken from structured fields, in the order they appear in tc.in.
+    structure = inp_obj.structure
+    model = inp_obj.model
+    fixed_settings = [
+        ("run", run),
+        ("coordinates", XYZ_FILENAME),
+        ("charge", structure.charge),
+        ("spinmult", structure.multiplicity),
+        ("method", model.method),
+        ("basis", model.basis),
+    ]
+    inp_lines = [f"{name:<{PADDING}} {setting}" for name, setting in fixed_settings]
+    # Keywords that must be supplied as structured data, with where they belong.
+    structured_locations = {
+        "charge": ".structure.charge",
+        "spinmult": ".structure.multiplicity",
+        "run": ".calctype",
+        "basis": ".model.basis",
+        "method": ".model.method",
+    }
+    for key, value in inp_obj.keywords.items():
+        if key in structured_locations:
+            raise EncoderError(
+                f"Keyword '{key}' should not be set as a keyword. It "
+                f"should be set at '{structured_locations[key]}'",
+            )
+        # Every value is written as its lowercased string form (e.g. True -> true).
+        inp_lines.append(f"{key:<{PADDING}} {str(value).lower()}")
+    tc_in = "\n".join(inp_lines) + "\n"  # End file with newline
+    return NativeInput(
+        input_file=tc_in,
+        geometry_file=structure.to_xyz(),
+        geometry_filename=XYZ_FILENAME,
+    )

qccodec/exceptions.py ADDED Viewed

@@ -0,0 +1,33 @@
+class BaseError(Exception):
+    """Root of the qccodec exception hierarchy."""
+class DecoderError(BaseError):
+    """Raised when decoding program output fails."""
+class EncoderError(BaseError):
+    """Raised when an encoder error occurs."""
+class ParserError(BaseError):
+    """Common base class for parser exceptions."""
+class MatchNotFoundError(ParserError):
+    """Raised when a parser's regex finds no match in the given contents.
+    Attributes:
+        regex: The pattern that failed to match.
+        contents: The text that was searched.
+    """
+    def __init__(self, regex: str, contents: str):
+        message = (
+            f"Could not locate match for regex: '{regex}' in contents: '{contents}'"
+        )
+        super().__init__(message)
+        self.regex = regex
+        self.contents = contents
+class RegistryError(BaseError):
+    """Raised when a registry error occurs."""
+class DataCollectorError(BaseError):
+    """Raised when a data collector error occurs."""

qccodec/models.py ADDED Viewed

@@ -0,0 +1,54 @@
+"""Simple data models to support parsing of QM program output files."""
+from dataclasses import dataclass
+from typing import Any, Optional, Union
+from .exceptions import DataCollectorError
+class DataCollector(dict):
+    """Dictionary that accumulates values produced by parsers."""
+    def add_data(self, target: Union[str, tuple[str, ...]], value: Any) -> None:
+        """Store ``value`` at ``target``, refusing to overwrite existing data.
+        A string target is used directly as the key. A tuple target is treated
+        as a path: nested dictionaries are created as needed for every element
+        but the last, and the value is stored under the last element.
+        Raises:
+            DataCollectorError: If the final key already exists at that location.
+        """
+        if isinstance(target, tuple):
+            keys = target
+        else:
+            keys = (target,)
+        node = self
+        # Descend through (or create) the intermediate dictionaries.
+        for parent_key in keys[:-1]:
+            node = node.setdefault(parent_key, {})
+        leaf_key = keys[-1]
+        if leaf_key in node:
+            raise DataCollectorError(
+                f"Target '{keys}' already exists in DataCollector. You cannot add the same target twice."
+            )
+        node[leaf_key] = value
+@dataclass
+class NativeInput:
+    """Native input file data for a quantum chemistry program.
+    Writing these files to disk should produce a valid input.
+    Attributes:
+        input_file: input file for the program
+        geometry_file: xyz file or other geometry file required for the calculation
+        geometry_filename: filename of the geometry file referenced in the input
+    """
+    input_file: str
+    geometry_file: Optional[str] = None
+    geometry_filename: Optional[str] = None
+    def __post_init__(self):
+        """Require geometry_filename whenever geometry_file is provided."""
+        # Nothing to check without a geometry, or when the name is already set.
+        if not self.geometry_file or self.geometry_filename:
+            return
+        raise ValueError(
+            "geometry_filename must be set if geometry_file is provided. "
+            "Set geometry_filename to the name of the geometry file as referenced "
+            "in the input file."
+        )

qccodec/parsers/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""All parsers should follow a basic pattern:
+1. Extract the parsed data, cast to its appropriate Python type, set it on the
+    SinglePointResults object at the appropriate attribute name.
+2. Raise a MatchNotFoundError if a match was not found
+3. Register parser with the registry by decorating it with the parser() decorator
+Use the .utils.regex_search() helper function in place of re.search() to
+ensure that a MatchNotFoundError will be raised in a parser. More sophisticated parsers
+that use re.findall (like terachem.parse_hessian) or rely upon not finding a match may
+implement a different interface, but please strive to follow this basic pattern as much
+as possible.
+"""
+# Required for parsers to register
+from .terachem import *  # noqa:  F403