biosim-extractor 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025-2026 CCPBioSim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,58 @@
1
+ Metadata-Version: 2.4
2
+ Name: biosim_extractor
3
+ Version: 0.0.1
4
+ Summary: Extract simulation metadata from MD output files
5
+ Keywords: molecular dynamics,parser,schema
6
+ Author-email: Jas Kalayan <jas.kalayan@stfc.ac.uk>
7
+ Maintainer-email: Jas Kalayan <jas.kalayan@stfc.ac.uk>
8
+ Requires-Python: >=3.12
9
+ Description-Content-Type: text/markdown
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Natural Language :: English
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: Operating System :: OS Independent
19
+ Classifier: Topic :: Software Development :: Libraries
20
+ License-File: LICENSE
21
+ Requires-Dist: MDAnalysis==2.10.0
22
+ Requires-Dist: biopython==1.87
23
+ Requires-Dist: rdkit>=2025.9.5
24
+ Requires-Dist: linkml==1.11.1
25
+ Requires-Dist: sphinx>=9,<10 ; extra == "docs"
26
+ Requires-Dist: nbsphinx>=0.9,<1.0 ; extra == "docs"
27
+ Requires-Dist: sphinx_rtd_theme>=3,<3.2 ; extra == "docs"
28
+ Requires-Dist: sphinx_copybutton>=0.5,<1.0 ; extra == "docs"
29
+ Requires-Dist: sphinxcontrib-contentui>=0.2,<1.0 ; extra == "docs"
30
+ Requires-Dist: sphinxcontrib-details-directive>=0.1,<1.0 ; extra == "docs"
31
+ Requires-Dist: furo>=2025.0,<2026.0 ; extra == "docs"
32
+ Requires-Dist: markupsafe>=3.0,<4.0 ; extra == "docs"
33
+ Requires-Dist: ruff>=0.14.8 ; extra == "pre-commit"
34
+ Requires-Dist: pre-commit>=4.5,<5.0 ; extra == "pre-commit"
35
+ Requires-Dist: pylint>=4.0,<5.0 ; extra == "pre-commit"
36
+ Requires-Dist: pytest>=9.0,<10.0 ; extra == "testing"
37
+ Requires-Dist: pytest-cov>=7.0,<8.0 ; extra == "testing"
38
+ Requires-Dist: pytest-sugar>=1.1,<2.0 ; extra == "testing"
39
+ Project-URL: Documentation, https://ccpbiosim-biosim-extractor.github.io/
40
+ Project-URL: Homepage, https://github.com/CCPBioSim/biosim-extractor/
41
+ Project-URL: Repository, https://github.com/CCPBioSim/biosim-extractor.git
42
+ Provides-Extra: docs
43
+ Provides-Extra: pre-commit
44
+ Provides-Extra: testing
45
+
46
+ biosim-extractor
47
+ ================
48
+ A repository for extracting simulation data from output files produced by molecular dynamics (MD) simulations of biomolecules and validating it against biosim-schema.
49
+
50
+ ## Project Status
51
+
52
+ | Category | Badges |
53
+ |----------------|--------|
54
+ | **Build** | [![CI](https://github.com/CCPBioSim/biosim-extractor/actions/workflows/biosim_extractor_project-ci.yaml/badge.svg)](https://github.com/CCPBioSim/biosim-extractor/actions/workflows/biosim_extractor_project-ci.yaml) |
55
+ | **Documentation** | [![Docs – Status](https://app.readthedocs.org/projects/biosim-extractor/badge/?version=latest)](https://biosim-extractor.readthedocs.io/en/latest/) |
56
+ | **PyPI** | [![PyPI - Version](https://img.shields.io/pypi/v/biosim-extractor.svg)](https://pypi.org/project/biosim-extractor/) [![PyPI - Status](https://img.shields.io/pypi/status/biosim-extractor.svg)](https://pypi.org/project/biosim-extractor/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/biosim-extractor.svg)](https://pypi.org/project/biosim-extractor/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/biosim-extractor.svg)](https://pypi.org/project/biosim-extractor/) |
57
+ | **Quality** | [![Coverage Status](https://coveralls.io/repos/github/CCPBioSim/biosim-extractor/badge.svg?branch=main)](https://coveralls.io/github/CCPBioSim/biosim-extractor?branch=main) |
58
+
@@ -0,0 +1,12 @@
1
+ biosim-extractor
2
+ ================
3
+ A repository for extracting simulation data from output files produced by molecular dynamics (MD) simulations of biomolecules and validating it against biosim-schema.
4
+
5
+ ## Project Status
6
+
7
+ | Category | Badges |
8
+ |----------------|--------|
9
+ | **Build** | [![CI](https://github.com/CCPBioSim/biosim-extractor/actions/workflows/biosim_extractor_project-ci.yaml/badge.svg)](https://github.com/CCPBioSim/biosim-extractor/actions/workflows/biosim_extractor_project-ci.yaml) |
10
+ | **Documentation** | [![Docs – Status](https://app.readthedocs.org/projects/biosim-extractor/badge/?version=latest)](https://biosim-extractor.readthedocs.io/en/latest/) |
11
+ | **PyPI** | [![PyPI - Version](https://img.shields.io/pypi/v/biosim-extractor.svg)](https://pypi.org/project/biosim-extractor/) [![PyPI - Status](https://img.shields.io/pypi/status/biosim-extractor.svg)](https://pypi.org/project/biosim-extractor/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/biosim-extractor.svg)](https://pypi.org/project/biosim-extractor/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/biosim-extractor.svg)](https://pypi.org/project/biosim-extractor/) |
12
+ | **Quality** | [![Coverage Status](https://coveralls.io/repos/github/CCPBioSim/biosim-extractor/badge.svg?branch=main)](https://coveralls.io/github/CCPBioSim/biosim-extractor?branch=main) |
@@ -0,0 +1,8 @@
1
+ """
2
+ biosim_extractor
3
+
4
+ Extract simulation metadata from MD output files.
5
+ """
6
+
7
+ __all__ = [] # Populate with public exports when needed
8
+ __version__ = "0.0.1"
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Extract AMBER log file metadata into a structured dictionary.
4
+
5
+ This script parses AMBER log files and outputs structured metadata as JSON.
6
+ It can be used as a standalone CLI tool or imported as a module.
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import re
12
+
13
+ from biosim_extractor.helpers.log_utils import add_value, normalize_name, parse_value
14
+
15
+
16
+ # -------------------------
17
+ # PARSER
18
+ # -------------------------
19
class AmberLogParser:
    """
    Parser for AMBER log files.

    Parsed metadata is accumulated in ``self.data`` under two top-level keys:
    ``"SimulationSettings"`` (the ``&cntrl`` namelist, colon-terminated
    sections, free ``key = value`` pairs and the file assignments) and
    ``"Results"`` (``"TimeSeries"``, ``"Averages"``, ``"RMSFluctuations"``
    and ``"Timings"``).
    """

    # Patterns are compiled once here rather than rebuilt for every log line.
    _SETTING_RE = re.compile(r"([A-Za-z0-9_\-\s]+?)\s*=\s*([-\d\.E+]+)")
    _RESULT_RE = re.compile(r"([A-Za-z\(\)\-]+)\s*=\s*([-\d\.E+]+)")
    _FILE_RE = re.compile(r"\|\s*([A-Z0-9_]+):\s*(.+)")
    _TIMING_RE = re.compile(r"\|\s*(.*?)\s*:\s*([-\d\.E+]+)\s*seconds")

    def __init__(self, filepath):
        """
        Args:
            filepath (str): Path to the AMBER log file.
        """
        self.filepath = filepath
        self.lines = []
        # Header parsing is currently disabled, so there is no "Header" key.
        self.data = {
            "SimulationSettings": {},
            "Results": {
                "TimeSeries": [],
                "Averages": {},
                "RMSFluctuations": {},
                "Timings": {},
            },
        }

    # -------------------------
    # PUBLIC API
    # -------------------------
    def parse(self):
        """
        Parse the AMBER log file.

        Returns:
            dict: Parsed metadata (the same object as ``self.data``).

        Raises:
            OSError: If the log file cannot be opened.
        """
        # An explicit encoding keeps the result independent of the locale;
        # undecodable bytes are replaced so a stray byte cannot abort parsing.
        with open(self.filepath, encoding="utf-8", errors="replace") as f:
            self.lines = f.readlines()

        self._parse_simulation_settings()
        self._parse_results()
        return self.data

    # -------------------------
    # HELPERS
    # -------------------------
    @staticmethod
    def _has_marker(line, marker):
        """
        Tell whether ``marker`` occurs in ``line``, ignoring spacing differences.

        Both strings have every run of whitespace collapsed to a single space
        before the substring test, so a letter-spaced banner matches no matter
        how many blanks separate its words.

        Args:
            line (str): Line from the log file.
            marker (str): Banner text to look for.

        Returns:
            bool: True if the marker is present.
        """
        return " ".join(marker.split()) in " ".join(line.split())

    @staticmethod
    def _consume_namelist(text, target):
        """
        Store the ``key=value`` pairs of one namelist line in ``target``.

        Everything from the first ``/`` (or, failing that, the first ``&end``)
        onwards is ignored; pairs before the terminator are still recorded.

        Args:
            text (str): Namelist text, without the leading ``&cntrl`` token.
            target (dict): Dictionary receiving the parsed pairs.

        Returns:
            bool: True if the line contained the namelist terminator.
        """
        body, slash, _ = text.partition("/")
        terminated = bool(slash)
        if not terminated:
            # Legacy namelist terminator.
            end_at = body.lower().find("&end")
            if end_at != -1:
                body = body[:end_at]
                terminated = True

        for part in body.split(","):
            # partition() splits on the first "=" only, so a value that itself
            # contains "=" no longer raises ValueError on unpacking.
            key, sep, value = part.partition("=")
            key = key.strip()
            if sep and key:
                add_value(target, key, parse_value(value))

        return terminated

    # -------------------------
    # SIMULATION SETTINGS
    # -------------------------
    def _parse_simulation_settings(self):
        """
        Parse simulation settings from the log file.

        Scanning stops at the first line containing both ``NSTEP`` and
        ``TIME``. Before that point three kinds of input are recognised: the
        ``&cntrl`` namelist, section headers ending in ``:`` and
        ``key = value`` pairs, which are stored in the current section if one
        is open and at the top level otherwise. A blank line closes the
        current section. File assignments are parsed afterwards.
        """
        settings = self.data["SimulationSettings"]
        current_section = None
        capture_cntrl = False

        for line in self.lines:
            stripped = line.strip()

            # Stop at time series
            if "NSTEP" in line and "TIME" in line:
                break

            # -------------------------
            # &cntrl block
            # -------------------------
            if "&cntrl" in stripped:
                current_section = "cntrl"
                settings[current_section] = {}
                # Settings (and even the terminator) may share the line with
                # the "&cntrl" token itself.
                remainder = stripped.split("&cntrl", 1)[1]
                capture_cntrl = not self._consume_namelist(
                    remainder, settings["cntrl"]
                )
                if not capture_cntrl:
                    current_section = None
                continue

            if capture_cntrl:
                if self._consume_namelist(stripped, settings["cntrl"]):
                    capture_cntrl = False
                    current_section = None
                continue

            # -------------------------
            # Colon sections
            # -------------------------
            if stripped.endswith(":") and "=" not in stripped:
                current_section = normalize_name(stripped[:-1])
                settings[current_section] = {}
                continue

            # -------------------------
            # Key-value pairs
            # -------------------------
            if "=" in line:
                for k, v in self._SETTING_RE.findall(line):
                    key = normalize_name(k)
                    val = parse_value(v)

                    if current_section:
                        add_value(settings[current_section], key, val)
                    else:
                        add_value(settings, key, val)

            # Reset section on blank line
            if not stripped:
                current_section = None

        self._parse_file_assignments(settings)

    # -------------------------
    # SETTINGS: FILE ASSIGNMENTS
    # -------------------------
    def _parse_file_assignments(self, settings):
        """
        Parse file assignments from the log file.

        The block starts after the first line containing
        ``File Assignments:`` and ends at the first following line that is
        blank or does not start with ``|``. ``settings["File_Assignments"]``
        is only set when at least one assignment was found.

        Args:
            settings (dict): Simulation settings dictionary to update.
        """
        capture = False
        files = {}

        for line in self.lines:
            stripped = line.strip()

            # Start block
            if "File Assignments:" in line:
                capture = True
                continue

            if capture:
                # Stop if block ends
                if not stripped or not stripped.startswith("|"):
                    break

                match = self._FILE_RE.search(line)
                if match:
                    files[match.group(1).strip()] = match.group(2).strip()

        if files:
            settings["File_Assignments"] = files

    # -------------------------
    # RESULTS (ALL OUTPUT DATA)
    # -------------------------
    def _parse_results(self):
        """
        Parse results blocks from the log file.

        Only the averages block and the timings are collected; time-series
        and RMS-fluctuation parsing are currently disabled.
        """
        self._parse_block(
            "A V E R A G E S", "R M S F L U C T U A T I O N S", "Averages"
        )
        self._parse_timings()

    # -------------------------
    # TIME SERIES
    # -------------------------
    def _parse_time_series(self):
        """
        Parse time series data from the log file.

        Each line containing both ``NSTEP`` and ``TIME`` starts a new step
        record; later ``key = value`` lines are added to that record until
        the averages banner is reached. The list of records replaces
        ``self.data["Results"]["TimeSeries"]``.
        """
        steps = []
        current = {}
        in_series = False

        for line in self.lines:
            if "NSTEP" in line and "TIME" in line:
                in_series = True
                if current:
                    steps.append(current)
                    current = {}

                for k, v in self._RESULT_RE.findall(line):
                    current[k] = parse_value(v)
                continue

            if in_series and "=" in line:
                for k, v in self._RESULT_RE.findall(line):
                    current[k] = parse_value(v)

            if self._has_marker(line, "A V E R A G E S"):
                break

        if current:
            steps.append(current)

        self.data["Results"]["TimeSeries"] = steps

    # -------------------------
    # GENERIC BLOCK PARSER
    # -------------------------
    def _parse_block(self, start_marker, end_marker, target_key):
        """
        Parse a generic results block.

        Lines between the start and end markers are scanned for
        ``key = value`` pairs. Markers are matched with whitespace collapsed
        (see ``_has_marker``). The end marker is only honoured once the start
        marker has been seen, and the end-marker line itself is not parsed.

        Args:
            start_marker (str): Line indicating the start of the block.
            end_marker (str): Line indicating the end of the block.
            target_key (str): Key in the results dictionary to populate.
        """
        capture = False
        target = self.data["Results"].setdefault(target_key, {})

        for line in self.lines:
            if self._has_marker(line, start_marker):
                capture = True
                continue

            if not capture:
                continue

            if self._has_marker(line, end_marker):
                break

            if "=" in line:
                for k, v in self._RESULT_RE.findall(line):
                    add_value(target, k, parse_value(v))

    # -------------------------
    # TIMINGS
    # -------------------------
    def _parse_timings(self):
        """
        Parse timing information from the log file.

        Only lines mentioning ``CPU time`` or ``wall time`` are examined; a
        line must also have the form ``| <label> : <number> seconds`` to be
        recorded.
        """
        timings = self.data["Results"]["Timings"]

        for line in self.lines:
            if "CPU time" in line or "wall time" in line:
                match = self._TIMING_RE.search(line)
                if match:
                    key = normalize_name(match.group(1))
                    val = parse_value(match.group(2))
                    add_value(timings, key, val)
274
+
275
+
276
+ # =========================
277
+ # ENTRY POINT
278
+ # =========================
279
+
280
+
281
def parse_args(argv=None):
    """Parse command-line arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            the arguments are taken from ``sys.argv[1:]``, as before.

    Returns:
        Parsed ``argparse.Namespace`` object with ``logfile`` and ``output``
        attributes.
    """
    parser = argparse.ArgumentParser(
        description="Extract Amber log file metadata to JSON"
    )
    parser.add_argument("logfile", help="Path to Amber log file")
    parser.add_argument("--output", "-o", help="Output file path (default: stdout)")
    return parser.parse_args(argv)
293
+
294
+
295
def main():
    """Entry point: read the CLI arguments, parse the log and emit JSON.

    The parsed metadata is written to the ``--output`` file when one is
    given and printed to stdout otherwise.
    """
    cli = parse_args()
    metadata = AmberLogParser(cli.logfile).parse()

    # No output path requested: dump to stdout and finish.
    if not cli.output:
        print(json.dumps(metadata, indent=2))
        return

    with open(cli.output, "w") as handle:
        json.dump(metadata, handle, indent=2)


if __name__ == "__main__":
    main()