PyPI - biosim_extractor - Versions diffs - 0.0.4__py3-none-any.whl - Mend

biosim_extractor 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

biosim_extractor/__init__.py +8 -0
biosim_extractor/amber/__init__.py +0 -0
biosim_extractor/amber/amberlog.py +310 -0
biosim_extractor/gromacs/__init__.py +0 -0
biosim_extractor/gromacs/gromacslog.py +311 -0
biosim_extractor/helpers/__init__.py +0 -0
biosim_extractor/helpers/log_utils.py +82 -0
biosim_extractor/helpers/metadata_utils.py +24 -0
biosim_extractor/mdanalysis/__init__.py +0 -0
biosim_extractor/mdanalysis/toptraj.py +380 -0
biosim_extractor/metadata/__init__.py +0 -0
biosim_extractor/metadata/convertpopulated.py +52 -0
biosim_extractor/metadata/fetchschema.py +168 -0
biosim_extractor/metadata/populatemetadata.py +592 -0
biosim_extractor/metadata/validatemetadata.py +170 -0
biosim_extractor/units/__init__.py +0 -0
biosim_extractor/units/unitconversion.py +294 -0
biosim_extractor-0.0.4.dist-info/METADATA +58 -0
biosim_extractor-0.0.4.dist-info/RECORD +22 -0
biosim_extractor-0.0.4.dist-info/WHEEL +4 -0
biosim_extractor-0.0.4.dist-info/entry_points.txt +7 -0
biosim_extractor-0.0.4.dist-info/licenses/LICENSE +21 -0

biosim_extractor/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""
+biosim_extractor
+Extract structured metadata from biomolecular simulation files, e.g. AMBER and GROMACS logs.
+"""
+__all__ = []  # Populate with public exports when needed
+__version__ = "0.0.4"

biosim_extractor/amber/__init__.py ADDED Viewed

File without changes

biosim_extractor/amber/amberlog.py ADDED Viewed

@@ -0,0 +1,310 @@
+#!/usr/bin/env python
+"""
+Extract AMBER log file metadata into a structured dictionary.
+This script parses AMBER log files and outputs structured metadata as JSON.
+It can be used as a standalone CLI tool or imported as a module.
+"""
+import argparse
+import json
+import re
+from biosim_extractor.helpers.log_utils import add_value, normalize_name, parse_value
+# -------------------------
+# PARSER
+# -------------------------
+class AmberLogParser:
+    """
+    Parser for AMBER log files.
+    ``parse()`` reads the whole file into ``self.lines`` and stores the
+    extracted values in ``self.data`` under ``"SimulationSettings"`` and
+    ``"Results"``.
+    """
+    # "NAME = number" pairs as they appear in the results blocks.
+    _RESULT_PAIR = re.compile(r"([A-Za-z\(\)\-]+)\s*=\s*([-\d\.E+]+)")
+    # Same idea for the settings section, where names may also contain
+    # digits, underscores and spaces.
+    _SETTING_PAIR = re.compile(r"([A-Za-z0-9_\-\s]+?)\s*=\s*([-\d\.E+]+)")
+    # "| NAME: value" lines of the "File Assignments:" block.
+    _FILE_ASSIGNMENT = re.compile(r"\|\s*([A-Z0-9_]+):\s*(.+)")
+    # "| <label>: <number> seconds" timing lines.
+    _TIMING = re.compile(r"\|\s*(.*?)\s*:\s*([-\d\.E+]+)\s*seconds")
+    def __init__(self, filepath):
+        """
+        Args:
+            filepath (str): Path to the AMBER log file.
+        """
+        self.filepath = filepath
+        self.lines = []
+        self.data = self._empty_data()
+    @staticmethod
+    def _empty_data():
+        """Return a fresh, empty result structure."""
+        # NOTE: header parsing is disabled, so no "Header" group is produced.
+        return {
+            "SimulationSettings": {},
+            "Results": {
+                "TimeSeries": [],
+                "Averages": {},
+                "RMSFluctuations": {},
+                "Timings": {},
+            },
+        }
+    # -------------------------
+    # PUBLIC API
+    # -------------------------
+    def parse(self):
+        """
+        Parse the AMBER log file.
+        Returns:
+            dict: Parsed metadata.
+        Raises:
+            OSError: If the log file cannot be opened.
+        """
+        # Explicit encoding keeps the result independent of the platform
+        # default; undecodable bytes are replaced instead of aborting the parse.
+        with open(self.filepath, encoding="utf-8", errors="replace") as f:
+            self.lines = f.readlines()
+        # Start from an empty structure so a second parse() call does not feed
+        # the same values into the previous result again.
+        self.data = self._empty_data()
+        self._parse_simulation_settings()
+        self._parse_results()
+        return self.data
+    # -------------------------
+    # SIMULATION SETTINGS
+    # -------------------------
+    @staticmethod
+    def _open_section(settings, name):
+        """Make sure ``settings[name]`` is a dict and return it.
+        An existing dict is kept so that a repeated section header adds to the
+        section instead of discarding what was collected before.
+        """
+        if not isinstance(settings.get(name), dict):
+            settings[name] = {}
+        return settings[name]
+    @staticmethod
+    def _split_namelist_end(text):
+        """Split a ``&cntrl`` line at its terminator, if it has one.
+        Args:
+            text (str): One stripped line from inside the ``&cntrl`` block.
+        Returns:
+            tuple: ``(body, ended)`` where ``body`` is the part of the line that
+            may still hold assignments and ``ended`` tells whether the block
+            terminator (``/`` outside quotes, or a leading ``&end``) was found.
+        """
+        if text.lower().startswith("&end"):
+            return "", True
+        quote = None
+        for pos, ch in enumerate(text):
+            if quote:
+                if ch == quote:
+                    quote = None
+            elif ch in "'\"":
+                quote = ch
+            elif ch == "/":
+                # A slash inside a quoted value (e.g. a path) is data, not
+                # the terminator; only an unquoted one ends the block.
+                return text[:pos], True
+        return text, False
+    def _parse_simulation_settings(self):
+        """
+        Parse simulation settings from the log file.
+        Scans from the top of the file up to the first line containing both
+        ``NSTEP`` and ``TIME`` and collects the ``&cntrl`` block, sections
+        introduced by a line ending in ``:``, and ``key = number`` pairs.
+        """
+        settings = self.data["SimulationSettings"]
+        current_section = None
+        in_cntrl = False
+        for line in self.lines:
+            stripped = line.strip()
+            # Stop at time series
+            if "NSTEP" in line and "TIME" in line:
+                break
+            # -------------------------
+            # &cntrl block
+            # -------------------------
+            if "&cntrl" in stripped:
+                in_cntrl = True
+                current_section = "cntrl"
+                self._open_section(settings, "cntrl")
+                continue
+            if in_cntrl:
+                body, ended = self._split_namelist_end(stripped)
+                for part in body.split(","):
+                    # partition() tolerates values that contain "=" themselves.
+                    key, sep, raw = part.partition("=")
+                    if sep and key.strip():
+                        add_value(settings["cntrl"], key.strip(), parse_value(raw))
+                if ended:
+                    in_cntrl = False
+                    current_section = None
+                continue
+            # -------------------------
+            # Colon sections
+            # -------------------------
+            if stripped.endswith(":") and "=" not in stripped:
+                current_section = normalize_name(stripped[:-1])
+                self._open_section(settings, current_section)
+                continue
+            # -------------------------
+            # Key-value pairs
+            # -------------------------
+            if "=" in line:
+                target = settings[current_section] if current_section else settings
+                for raw_key, raw_val in self._SETTING_PAIR.findall(line):
+                    add_value(target, normalize_name(raw_key), parse_value(raw_val))
+            # Reset section on blank line
+            if not stripped:
+                current_section = None
+        self._parse_file_assignments(settings)
+    # -------------------------
+    # SETTINGS: FILE ASSIGNMENTS
+    # -------------------------
+    def _parse_file_assignments(self, settings):
+        """
+        Parse file assignments from the log file.
+        Reads the ``| NAME: value`` lines that follow a ``File Assignments:``
+        line and stores them under ``settings["File_Assignments"]``. Nothing
+        is stored when no such line is found.
+        Args:
+            settings (dict): Simulation settings dictionary to update.
+        """
+        files = {}
+        in_block = False
+        for line in self.lines:
+            # Start block
+            if "File Assignments:" in line:
+                in_block = True
+                continue
+            if not in_block:
+                continue
+            # The block ends at the first blank line or line not starting with "|".
+            if not line.strip().startswith("|"):
+                break
+            match = self._FILE_ASSIGNMENT.search(line)
+            if match:
+                files[match.group(1).strip()] = match.group(2).strip()
+        if files:
+            settings["File_Assignments"] = files
+    # -------------------------
+    # RESULTS (ALL OUTPUT DATA)
+    # -------------------------
+    def _parse_results(self):
+        """
+        Parse results blocks from the log file.
+        Only the averages block and the timings are collected; the time series
+        and RMS fluctuation parsers are not run, so ``"TimeSeries"`` and
+        ``"RMSFluctuations"`` stay empty.
+        """
+        self._parse_block(
+            "A V E R A G E S", "R M S  F L U C T U A T I O N S", "Averages"
+        )
+        self._parse_timings()
+    # -------------------------
+    # TIME SERIES
+    # -------------------------
+    def _parse_time_series(self):
+        """
+        Parse time series data from the log file.
+        Every line containing both ``NSTEP`` and ``TIME`` starts a new step
+        record; scanning stops at the ``A V E R A G E S`` marker. Not called by
+        ``parse()``.
+        """
+        steps = []
+        current = {}
+        in_series = False
+        for line in self.lines:
+            if "NSTEP" in line and "TIME" in line:
+                in_series = True
+                if current:
+                    steps.append(current)
+                    current = {}
+                for k, v in self._RESULT_PAIR.findall(line):
+                    current[k] = parse_value(v)
+                continue
+            if in_series and "=" in line:
+                for k, v in self._RESULT_PAIR.findall(line):
+                    current[k] = parse_value(v)
+            if "A V E R A G E S" in line:
+                break
+        if current:
+            steps.append(current)
+        self.data["Results"]["TimeSeries"] = steps
+    # -------------------------
+    # GENERIC BLOCK PARSER
+    # -------------------------
+    def _parse_block(self, start_marker, end_marker, target_key):
+        """
+        Parse a generic results block.
+        Args:
+            start_marker (str): Line indicating the start of the block.
+            end_marker (str): Line indicating the end of the block.
+            target_key (str): Key in the results dictionary to populate.
+        Raises:
+            KeyError: If ``target_key`` is not a key of ``self.data["Results"]``.
+        """
+        capture = False
+        target = self.data["Results"][target_key]
+        for line in self.lines:
+            if start_marker in line:
+                capture = True
+                continue
+            # The end marker only counts once the block has started; otherwise
+            # an earlier occurrence would stop the scan before any data is read.
+            if not capture:
+                continue
+            if "=" in line:
+                for k, v in self._RESULT_PAIR.findall(line):
+                    add_value(target, k, parse_value(v))
+            if end_marker in line:
+                break
+    # -------------------------
+    # TIMINGS
+    # -------------------------
+    def _parse_timings(self):
+        """
+        Parse timing information from the log file.
+        Looks at lines mentioning ``CPU time`` or ``wall time`` and stores the
+        number that precedes ``seconds`` under the normalized label.
+        """
+        timings = self.data["Results"]["Timings"]
+        for line in self.lines:
+            if "CPU time" in line or "wall time" in line:
+                match = self._TIMING.search(line)
+                if match:
+                    add_value(
+                        timings,
+                        normalize_name(match.group(1)),
+                        parse_value(match.group(2)),
+                    )
+# =========================
+# ENTRY POINT
+# =========================
+def parse_args(argv=None):
+    """Parse command-line arguments.
+    Args:
+        argv: Optional list of argument strings. ``None`` (the default) makes
+            argparse read ``sys.argv[1:]``, which is the original behaviour.
+    Returns:
+        Parsed ``argparse.Namespace`` object.
+    """
+    parser = argparse.ArgumentParser(
+        description="Extract Amber log file metadata to JSON"
+    )
+    parser.add_argument("logfile", help="Path to Amber log file")
+    parser.add_argument("--output", "-o", help="Output file path (default: stdout)")
+    return parser.parse_args(argv)
+def main():
+    """Run the command-line tool: parse the log named on the command line and emit JSON.
+    The JSON goes to the ``--output`` file when one is given, otherwise to stdout.
+    """
+    cli = parse_args()
+    metadata = AmberLogParser(cli.logfile).parse()
+    if not cli.output:
+        print(json.dumps(metadata, indent=2))
+        return
+    with open(cli.output, "w") as handle:
+        json.dump(metadata, handle, indent=2)
+if __name__ == "__main__":
+    main()

biosim_extractor/gromacs/__init__.py ADDED Viewed

File without changes

biosim_extractor/gromacs/gromacslog.py ADDED Viewed

@@ -0,0 +1,311 @@
+#!/usr/bin/env python
+"""
+Extract gmx log file metadata into a dictionary.
+"""
+import argparse
+import json
+import re
+from biosim_extractor.helpers.log_utils import parse_value
+class GromacsLogParser:
+    """Parser for GROMACS ``.log`` files, extracting header, input parameters, summary, and averages."""
+    # Header fields copied as plain strings by ``_parse_header``.
+    _HEADER_KEYS = frozenset(
+        (
+            "Executable",
+            "Data prefix",
+            "Working dir",
+            "Process ID",
+            "Command line",
+            "GROMACS version",
+            "Precision",
+            "Memory model",
+            "MPI library",
+            "OpenMP support",
+            "GPU support",
+            "SIMD instructions",
+            "CPU FFT library",
+            "GPU FFT library",
+            "RDTSCP usage",
+            "TNG support",
+            "Hwloc support",
+            "Tracing support",
+            "C compiler",
+            "C compiler flags",
+            "C++ compiler",
+            "C++ compiler flags",
+        )
+    )
+    # Energy tables are sliced into fixed-width columns of this many characters.
+    _ENERGY_COLUMN_WIDTH = 15
+    def __init__(self, filepath):
+        """
+        Args:
+            filepath: Path to the GROMACS log file.
+        """
+        self.filepath = filepath
+        self.lines = []
+        self.data = {}
+        self.energy_timeseries = []
+    # =========================
+    # MAIN ENTRY
+    # =========================
+    def parse(self):
+        """Parse the log file and return all extracted data.
+        Returns:
+            Dictionary containing header fields, input parameters, summary,
+            and averages.
+        Raises:
+            OSError: If the log file cannot be opened.
+        """
+        # Explicit encoding keeps the result independent of the platform
+        # default; undecodable bytes are replaced instead of aborting the parse.
+        with open(self.filepath, encoding="utf-8", errors="replace") as f:
+            self.lines = f.readlines()
+        # Reset state so a second parse() call does not mix in earlier results.
+        self.data = {}
+        self.energy_timeseries = []
+        self._parse_header()
+        self._parse_indented_blocks()
+        self._parse_summary()
+        # The per-step energy time series is not collected by default.
+        self._parse_averages()
+        return self.data
+    # =========================
+    # HEADER
+    # =========================
+    def _parse_header(self):
+        """Extract top-level key-value fields from the file header (e.g. GROMACS version, compiler).
+        A line is used only when the text before its first ``:`` is exactly one
+        of the known field names, so ``C compiler flags`` can no longer be
+        stored under ``C compiler``. Lines without a colon are ignored.
+        """
+        for line in self.lines:
+            name, sep, value = line.partition(":")
+            name = name.rstrip()
+            if sep and name in self._HEADER_KEYS:
+                self.data[name] = value.strip()
+    # =========================
+    # INDENTED BLOCKS
+    # =========================
+    def _parse_indented_blocks(self):
+        """Parse indented ``key: value`` and ``key = value`` blocks into nested dicts.
+        Also collapses ``(3x3)`` matrix entries into lists of rows.
+        """
+        # Each stack entry is (indent of the line that opened the dict, dict).
+        # The root uses -1 so it is never popped.
+        stack = [(-1, self.data)]
+        for line in self.lines:
+            stripped = line.strip()
+            if not stripped:
+                continue
+            indent = len(line) - len(line.lstrip(" "))
+            # Close every block that is not less indented than this line.
+            while stack and stack[-1][0] >= indent:
+                stack.pop()
+            parent = stack[-1][1]
+            if ":" in stripped and not stripped.endswith(":"):
+                key, val = map(str.strip, stripped.split(":", 1))
+                parent[key] = parse_value(val)
+                continue
+            if "=" in stripped:
+                key, val = map(str.strip, stripped.split("=", 1))
+                parent[key] = parse_value(val)
+                continue
+            if stripped.endswith(":"):
+                # A bare "name:" line opens a nested block.
+                new_dict = {}
+                parent[stripped[:-1].strip()] = new_dict
+                stack.append((indent, new_dict))
+        # deal with 3x3 arrays here
+        for section in ("Input Parameters", "qm-opts"):
+            sub_dict = self.data.get(section)
+            if not isinstance(sub_dict, dict):
+                continue
+            for k, v in list(sub_dict.items()):
+                # Only nested blocks can be collapsed; anything else is left as is.
+                if "(3x3)" in k and isinstance(v, dict):
+                    sub_dict.pop(k)
+                    sub_dict[k.split(" (")[0]] = list(v.values())
+    # =========================
+    # SUMMARY (PERFORMANCE, TIME)
+    # =========================
+    def _parse_summary(self):
+        """Extract performance and wall-time summary from the end of the log file.
+        The numbers are taken from the ``Time:`` / ``Performance:`` line itself.
+        For ``Time:`` the following line is used as a fallback when the label
+        line carries fewer than two values. Entries whose values cannot be
+        found are left out instead of raising.
+        """
+        summary = {}
+        lines = self.lines
+        for idx, line in enumerate(lines):
+            if "Performance:" in line:
+                vals = line.split("Performance:", 1)[1].split()
+                if len(vals) >= 2:
+                    summary["Performance"] = {
+                        "(ns/day)": parse_value(vals[-2]),
+                        "(hour/ns)": parse_value(vals[-1]),
+                    }
+            elif line.strip().startswith("Time:"):
+                vals = line.split("Time:", 1)[1].split()
+                if len(vals) < 2 and idx + 1 < len(lines):
+                    vals = lines[idx + 1].split()
+                if len(vals) >= 2:
+                    summary["Time"] = {
+                        "Core t (s)": parse_value(vals[0]),
+                        "Wall t (s)": parse_value(vals[1]),
+                    }
+        self.data["Summary"] = summary
+    # =========================
+    # ENERGY TABLES (SHARED)
+    # =========================
+    def _read_energy_table(self, start):
+        """Read the header/value line pairs of one fixed-width energy table.
+        Args:
+            start: Index in ``self.lines`` of the first header line, i.e. the
+                line after ``Energies (kJ/mol)``.
+        Returns:
+            Tuple ``(table, next_index)``: ``table`` maps column names to parsed
+            values and ``next_index`` is the first line after the table.
+        """
+        lines = self.lines
+        width = self._ENERGY_COLUMN_WIDTH
+        table = {}
+        pos = start
+        while pos + 1 < len(lines):
+            header = lines[pos].rstrip("\r\n")
+            numbers = lines[pos + 1].split()
+            # The table ends at the first blank header line or empty value line,
+            # so any number of rows is supported.
+            if not header.strip() or not numbers:
+                break
+            # Names may contain spaces ("Proper Dih."), hence fixed-width slices
+            # instead of a whitespace split.
+            names = [
+                " ".join(header[col : col + width].split())
+                for col in range(0, len(header), width)
+            ]
+            for name, number in zip(names, numbers):
+                table[name] = parse_value(number)
+            pos += 2
+        return table, pos
+    # =========================
+    # ENERGY TIME SERIES
+    # =========================
+    # unused — retained for future use
+    def _parse_energy_timeseries(self):
+        """Extract per-step energy blocks into ``self.energy_timeseries``.
+        Each entry holds ``Step``, ``Time`` and the columns of the energy table
+        that follows, read with the same fixed-width slicing as the averages.
+        """
+        lines = self.lines
+        n = len(lines)
+        i = 0
+        while i < n:
+            if not re.match(r"\s*Step\s+Time", lines[i]):
+                i += 1
+                continue
+            step_line = lines[i + 1].split() if i + 1 < n else []
+            if len(step_line) < 2:
+                # Header without a usable value line: skip it.
+                i += 1
+                continue
+            entry = {
+                "Step": parse_value(step_line[0]),
+                "Time": parse_value(step_line[1]),
+            }
+            # locate "Energies" block
+            j = i + 2
+            while j < n and "Energies (kJ/mol)" not in lines[j]:
+                j += 1
+            if j >= n:
+                break
+            table, i = self._read_energy_table(j + 1)
+            entry.update(table)
+            self.energy_timeseries.append(entry)
+    # =========================
+    # AVERAGES
+    # =========================
+    def _parse_averages(self):
+        """Extract the ``A V E R A G E S`` block, including energies, box dimensions, and tensors.
+        Everything after the ``A V E R A G E S`` marker is scanned; the result
+        is stored in ``self.data["Averages"]`` (an empty dict when the marker
+        is absent).
+        """
+        lines = self.lines
+        n = len(lines)
+        i = 0
+        averages = {}
+        capture = False
+        while i < n:
+            line = lines[i]
+            # detect start of averages block
+            if "A V E R A G E S" in line:
+                capture = True
+                i += 1
+                continue
+            if not capture:
+                i += 1
+                continue
+            stripped = line.strip()
+            # Statistics header
+            if "Statistics over" in line:
+                parts = line.split()
+                if len(parts) >= 3:
+                    averages["total-steps"] = parse_value(parts[2])
+                    averages["total-frames"] = parse_value(parts[-2])
+            # Energies block (same as timeseries)
+            elif "Energies (kJ/mol)" in line:
+                table, i = self._read_energy_table(i + 1)
+                averages.update(table)
+                continue
+            # Box dimensions and per-group temperatures: one header line
+            # followed by one line of values.
+            elif stripped.startswith(("Box-", "T-")):
+                if i + 1 < n:
+                    for h, v in zip(line.split(), lines[i + 1].split()):
+                        averages[h] = parse_value(v)
+                i += 2
+                continue
+            # Total Virial and Pressure tensors
+            elif "Total Virial" in line or "Pressure (bar)" in line:
+                # Slicing (instead of indexing) keeps a truncated file from raising.
+                averages[stripped + " tensor"] = [
+                    [parse_value(x) for x in row.split()]
+                    for row in lines[i + 1 : i + 4]
+                ]
+                i += 4
+                continue
+            i += 1
+        self.data["Averages"] = averages
+# =========================
+# ENTRY POINT
+# =========================
+def parse_args(argv=None):
+    """Parse command-line arguments.
+    Args:
+        argv: Optional list of argument strings. ``None`` (the default) makes
+            argparse read ``sys.argv[1:]``, which is the original behaviour.
+    Returns:
+        Parsed ``argparse.Namespace`` object.
+    """
+    parser = argparse.ArgumentParser(
+        description="Extract GROMACS log file metadata to JSON"
+    )
+    parser.add_argument("logfile", help="Path to GROMACS log file")
+    parser.add_argument("--output", "-o", help="Output file path (default: stdout)")
+    return parser.parse_args(argv)
+def main():
+    """Run the command-line tool: parse the log named on the command line and emit JSON.
+    The JSON goes to the ``--output`` file when one is given, otherwise to stdout.
+    """
+    cli = parse_args()
+    metadata = GromacsLogParser(cli.logfile).parse()
+    if not cli.output:
+        print(json.dumps(metadata, indent=2))
+        return
+    with open(cli.output, "w") as handle:
+        json.dump(metadata, handle, indent=2)
+if __name__ == "__main__":
+    main()

biosim_extractor/helpers/__init__.py ADDED Viewed

File without changes