biosim_extractor 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
"""
biosim_extractor

Package metadata module: declares the public export list and the package
version string.

NOTE(review): the previous docstring was unedited template text
("example_package"); the package name used here is taken from the
``biosim_extractor`` imports in the sibling modules -- confirm.
"""

# No names are exported via ``from <package> import *`` yet.
__all__ = [] # Populate with public exports when needed
# Version string of this release.
__version__ = "0.0.4"
File without changes
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Extract AMBER log file metadata into a structured dictionary.
4
+
5
+ This script parses AMBER log files and outputs structured metadata as JSON.
6
+ It can be used as a standalone CLI tool or imported as a module.
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import re
12
+
13
+ from biosim_extractor.helpers.log_utils import add_value, normalize_name, parse_value
14
+
15
+
16
+ # -------------------------
17
+ # PARSER
18
+ # -------------------------
19
class AmberLogParser:
    """
    Parser for AMBER log files.

    The whole file is read into memory by :meth:`parse`; the extracted
    metadata is stored in ``self.data`` under two top-level groups,
    ``"SimulationSettings"`` and ``"Results"``.
    """

    # ``name = number`` pairs found in the settings part of the log.
    _SETTING_RE = re.compile(r"([A-Za-z0-9_\-\s]+?)\s*=\s*([-\d\.E+]+)")
    # ``NAME = number`` pairs found in energy / result records.
    _RESULT_RE = re.compile(r"([A-Za-z\(\)\-]+)\s*=\s*([-\d\.E+]+)")
    # ``| KEY: value`` rows of the "File Assignments" table.
    _FILE_RE = re.compile(r"\|\s*([A-Z0-9_]+):\s*(.+)")
    # ``| label: <number> seconds`` timing rows.
    _TIMING_RE = re.compile(r"\|\s*(.*?)\s*:\s*([-\d\.E+]+)\s*seconds")

    def __init__(self, filepath):
        """
        Args:
            filepath (str): Path to the AMBER log file.
        """
        self.filepath = filepath
        self.lines = []
        self.data = {
            "SimulationSettings": {},
            "Results": {
                "TimeSeries": [],
                "Averages": {},
                "RMSFluctuations": {},
                "Timings": {},
            },
        }

    # -------------------------
    # PUBLIC API
    # -------------------------
    def parse(self):
        """
        Parse the AMBER log file.

        Returns:
            dict: Parsed metadata (the same object as ``self.data``).
        """
        with open(self.filepath) as f:
            self.lines = f.readlines()

        self._parse_simulation_settings()
        self._parse_results()
        return self.data

    # -------------------------
    # HELPERS
    # -------------------------
    @staticmethod
    def _squeeze(text):
        """Collapse every run of whitespace in *text* to a single space."""
        return " ".join(text.split())

    # -------------------------
    # SIMULATION SETTINGS
    # -------------------------
    def _parse_simulation_settings(self):
        """
        Parse simulation settings from the log file.

        Scans lines until the first time-series record (a line containing
        both ``NSTEP`` and ``TIME``) and collects:

        * the ``&cntrl`` block into ``settings["cntrl"]``;
        * ``name:`` header lines as named sub-sections;
        * ``name = number`` pairs into the current sub-section, or into
          the top level when no sub-section is open.

        Finally delegates to :meth:`_parse_file_assignments`.
        """
        settings = self.data["SimulationSettings"]
        current_section = None
        capture_cntrl = False

        for line in self.lines:
            stripped = line.strip()

            # Stop at time series
            if "NSTEP" in line and "TIME" in line:
                break

            # -------------------------
            # &cntrl block
            # -------------------------
            if "&cntrl" in stripped:
                capture_cntrl = True
                current_section = "cntrl"
                settings[current_section] = {}
                continue

            if capture_cntrl:
                lowered = stripped.lower()
                if stripped.endswith("/"):
                    # Keep assignments that share the line with the
                    # closing "/" instead of discarding them.
                    terminated, payload = True, stripped[:-1]
                elif "&end" in lowered:
                    # "&end" is treated as a block terminator as well.
                    terminated, payload = True, stripped[: lowered.index("&end")]
                elif "/" in stripped:
                    # A "/" elsewhere on the line still closes the block
                    # and the line is skipped, as before.
                    terminated, payload = True, ""
                else:
                    terminated, payload = False, stripped

                for part in payload.split(","):
                    if "=" in part:
                        # Split on the first "=" only: the value itself
                        # may contain "=" and must not break unpacking.
                        k, v = part.split("=", 1)
                        add_value(settings["cntrl"], k.strip(), parse_value(v))

                if terminated:
                    capture_cntrl = False
                    current_section = None
                continue

            # -------------------------
            # Colon sections
            # -------------------------
            if stripped.endswith(":") and "=" not in stripped:
                current_section = normalize_name(stripped[:-1])
                settings[current_section] = {}
                continue

            # -------------------------
            # Key-value pairs
            # -------------------------
            if "=" in line:
                for k, v in self._SETTING_RE.findall(line):
                    key = normalize_name(k)
                    val = parse_value(v)

                    if current_section:
                        add_value(settings[current_section], key, val)
                    else:
                        add_value(settings, key, val)

            # Reset section on blank line
            if not stripped:
                current_section = None

        self._parse_file_assignments(settings)

    # -------------------------
    # SETTINGS: FILE ASSIGNMENTS
    # -------------------------
    def _parse_file_assignments(self, settings):
        """
        Parse file assignments from the log file.

        Reads the ``| KEY: value`` rows that directly follow the first
        line containing ``File Assignments:`` and stops at the first
        blank line or line that does not start with ``|``.

        Args:
            settings (dict): Simulation settings dictionary to update.
                ``settings["File_Assignments"]`` is only set when at
                least one row was found.
        """
        capture = False
        files = {}

        for line in self.lines:
            stripped = line.strip()

            # Start block
            if "File Assignments:" in line:
                capture = True
                continue

            if capture:
                # Stop if block ends
                if not stripped or not stripped.startswith("|"):
                    break

                match = self._FILE_RE.search(line)
                if match:
                    files[match.group(1).strip()] = match.group(2).strip()

        if files:
            settings["File_Assignments"] = files

    # -------------------------
    # RESULTS (ALL OUTPUT DATA)
    # -------------------------
    def _parse_results(self):
        """
        Parse results blocks from the log file.

        Only the averages block and the timings are collected; the
        ``"TimeSeries"`` and ``"RMSFluctuations"`` entries stay empty.
        """
        self._parse_block(
            "A V E R A G E S", "R M S F L U C T U A T I O N S", "Averages"
        )
        self._parse_timings()

    # -------------------------
    # TIME SERIES
    # -------------------------
    def _parse_time_series(self):
        """
        Parse time series data from the log file.

        Each line containing both ``NSTEP`` and ``TIME`` starts a new
        record; ``NAME = number`` pairs on that line and on following
        lines are added to it until the averages banner is reached.
        Not called by :meth:`_parse_results`.
        """
        steps = []
        current = {}
        in_series = False

        for line in self.lines:
            if "NSTEP" in line and "TIME" in line:
                in_series = True
                if current:
                    steps.append(current)
                    current = {}

                for k, v in self._RESULT_RE.findall(line):
                    current[k] = parse_value(v)
                continue

            if in_series and "=" in line:
                for k, v in self._RESULT_RE.findall(line):
                    current[k] = parse_value(v)

            if "A V E R A G E S" in line:
                break

        if current:
            steps.append(current)

        self.data["Results"]["TimeSeries"] = steps

    # -------------------------
    # GENERIC BLOCK PARSER
    # -------------------------
    def _parse_block(self, start_marker, end_marker, target_key):
        """
        Parse a generic results block.

        Markers are compared with whitespace runs collapsed, so a banner
        printed with irregular spacing still matches.
        NOTE(review): AMBER appears to print the fluctuations banner as
        ``R M S  F L U C T U A T I O N S`` (double space) -- confirm
        against a real log.

        Args:
            start_marker (str): Line indicating the start of the block.
            end_marker (str): Line indicating the end of the block.
            target_key (str): Key in the results dictionary to populate.
        """
        capture = False
        target = self.data["Results"][target_key]
        start = self._squeeze(start_marker)
        end = self._squeeze(end_marker)

        for line in self.lines:
            squeezed = self._squeeze(line)

            if start in squeezed:
                capture = True
                continue

            # An end marker seen before the block has started must not
            # abort the scan, so nothing is inspected until capturing.
            if not capture:
                continue

            if "=" in line:
                for k, v in self._RESULT_RE.findall(line):
                    add_value(target, k, parse_value(v))

            if end in squeezed:
                break

    # -------------------------
    # TIMINGS
    # -------------------------
    def _parse_timings(self):
        """
        Parse timing information from the log file.

        Collects ``| label: <number> seconds`` rows from lines that
        mention ``CPU time`` or ``wall time``.
        """
        timings = self.data["Results"]["Timings"]

        for line in self.lines:
            if "CPU time" in line or "wall time" in line:
                match = self._TIMING_RE.search(line)
                if match:
                    key = normalize_name(match.group(1))
                    val = parse_value(match.group(2))
                    add_value(timings, key, val)
274
+
275
+
276
+ # =========================
277
+ # ENTRY POINT
278
+ # =========================
279
+
280
+
281
def parse_args(argv=None):
    """Parse command-line arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) ``argparse`` reads ``sys.argv[1:]``, which keeps
            the original call ``parse_args()`` working unchanged.

    Returns:
        Parsed ``argparse.Namespace`` object with ``logfile`` and
        ``output`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Extract Amber log file metadata to JSON"
    )
    parser.add_argument("logfile", help="Path to Amber log file")
    parser.add_argument("--output", "-o", help="Output file path (default: stdout)")
    return parser.parse_args(argv)
293
+
294
+
295
def main():
    """Command-line entry point.

    Reads the CLI options, parses the requested AMBER log, and emits the
    resulting metadata as indented JSON, either to the file given with
    ``--output`` or to standard output when no file was requested.
    """
    options = parse_args()
    metadata = AmberLogParser(options.logfile).parse()

    # No output path given: dump to stdout and finish.
    if not options.output:
        print(json.dumps(metadata, indent=2))
        return

    with open(options.output, "w") as handle:
        json.dump(metadata, handle, indent=2)
307
+
308
+
309
# Run the command-line interface only when this file is executed as a
# script; importing the module has no side effects.
if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,311 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Extract gmx log file metadata into a dictionary.
4
+ """
5
+
6
+ import argparse
7
+ import json
8
+ import re
9
+
10
+ from biosim_extractor.helpers.log_utils import parse_value
11
+
12
+
13
class GromacsLogParser:
    """Parser for GROMACS ``.log`` files, extracting header, input parameters, summary, and averages."""

    # Labels of the ``label: value`` header lines that are recorded.
    _HEADER_KEYS = frozenset(
        {
            "Executable",
            "Data prefix",
            "Working dir",
            "Process ID",
            "Command line",
            "GROMACS version",
            "Precision",
            "Memory model",
            "MPI library",
            "OpenMP support",
            "GPU support",
            "SIMD instructions",
            "CPU FFT library",
            "GPU FFT library",
            "RDTSCP usage",
            "TNG support",
            "Hwloc support",
            "Tracing support",
            "C compiler",
            "C compiler flags",
            "C++ compiler",
            "C++ compiler flags",
        }
    )

    # Width of one column in the energy header rows; the averages
    # parser slices header lines into chunks of this many characters.
    _ENERGY_COLUMN_WIDTH = 15

    def __init__(self, filepath):
        """
        Args:
            filepath: Path to the GROMACS log file.
        """
        self.filepath = filepath
        self.lines = []
        self.data = {}
        self.energy_timeseries = []

    # =========================
    # MAIN ENTRY
    # =========================
    def parse(self):
        """Parse the log file and return all extracted data.

        Returns:
            Dictionary containing header fields, input parameters, summary,
            and averages (the same object as ``self.data``).
        """
        with open(self.filepath) as f:
            self.lines = f.readlines()

        self._parse_header()
        self._parse_indented_blocks()
        self._parse_summary()
        self._parse_averages()

        return self.data

    # =========================
    # HEADER
    # =========================
    def _parse_header(self):
        """Extract top-level key-value fields from the file header (e.g. GROMACS version, compiler).

        A line is recorded only when the text before its first ``:`` is
        exactly one of the known labels and starts in column 0. Matching
        the whole label keeps ``C compiler flags`` from overwriting
        ``C compiler``; lines without a ``:`` are ignored.
        """
        for line in self.lines:
            label, sep, value = line.partition(":")
            label = label.rstrip()
            if sep and label in self._HEADER_KEYS:
                self.data[label] = value.strip()

    # =========================
    # INDENTED BLOCKS
    # =========================
    def _parse_indented_blocks(self):
        """Parse indented ``key: value`` and ``key = value`` blocks into nested dicts.

        Nesting follows the number of leading spaces: a line ending in
        ``:`` opens a dict that receives every following, more deeply
        indented line. Also collapses ``(3x3)`` matrix entries into lists
        of rows.
        """
        # Stack of (indent, dict); the sentinel indent -1 keeps the root
        # dict on the stack for every possible line indent.
        stack = [(-1, self.data)]
        for line in self.lines:
            if not line.strip():
                continue
            indent = len(line) - len(line.lstrip(" "))
            stripped = line.strip()
            while stack and stack[-1][0] >= indent:
                stack.pop()
            parent = stack[-1][1]

            if ":" in stripped and not stripped.endswith(":"):
                key, val = map(str.strip, stripped.split(":", 1))
                parent[key] = parse_value(val)
                continue
            if "=" in stripped:
                key, val = map(str.strip, stripped.split("=", 1))
                parent[key] = parse_value(val)
                continue
            if stripped.endswith(":"):
                key = stripped[:-1].strip()
                new_dict = {}
                parent[key] = new_dict
                stack.append((indent, new_dict))

        # deal with 3x3 arrays here
        for key in ("Input Parameters", "qm-opts"):
            sub_dict = self.data.get(key)
            # The entry may be missing, empty, or a plain parsed value.
            if not sub_dict or not isinstance(sub_dict, dict):
                continue
            for k, v in list(sub_dict.items()):
                if "(3x3)" in k and isinstance(v, dict):
                    new_k = k.split(" (")[0]
                    sub_dict.pop(k)
                    sub_dict[new_k] = list(v.values())

    # =========================
    # SUMMARY (PERFORMANCE, TIME)
    # =========================
    def _parse_summary(self):
        """Extract performance and wall-time summary from the end of the log file.

        ``Performance:`` uses the last two numbers on its line. ``Time:``
        uses the first two numbers following the label on the same line.
        NOTE(review): GROMACS appears to print the timing numbers on the
        ``Time:`` line itself -- confirm against a real log. When that
        line carries no numbers, the next line is used instead, which is
        where the values were previously always read from.
        """
        summary = {}
        lines = self.lines
        n = len(lines)

        for idx, line in enumerate(lines):
            if "Performance:" in line:
                values = line.split("Performance:", 1)[1].split()
                if len(values) >= 2:
                    summary["Performance"] = {
                        "(ns/day)": parse_value(values[-2]),
                        "(hour/ns)": parse_value(values[-1]),
                    }
            elif line.strip().startswith("Time:"):
                values = line.split("Time:", 1)[1].split()
                if len(values) < 2 and idx + 1 < n:
                    values = lines[idx + 1].split()
                if len(values) >= 2:
                    summary["Time"] = {
                        "Core t (s)": parse_value(values[0]),
                        "Wall t (s)": parse_value(values[1]),
                    }

        self.data["Summary"] = summary

    # =========================
    # ENERGY TIME SERIES
    # =========================
    # unused — retained for future use
    def _parse_energy_timeseries(self):
        """Extract per-step energy blocks into ``self.energy_timeseries``.

        Not called by :meth:`parse`. Header names are split on
        whitespace here, so multi-word column names are not kept intact.
        """
        lines = self.lines
        n = len(lines)
        i = 0

        while i < n:
            line = lines[i]
            if not re.match(r"\s*Step\s+Time", line):
                i += 1
                continue

            # The step/time values sit on the line after the header.
            if i + 1 >= n:
                break
            step_line = lines[i + 1].split()
            if len(step_line) < 2:
                i += 1
                continue
            entry = {
                "Step": parse_value(step_line[0]),
                "Time": parse_value(step_line[1]),
            }

            # locate "Energies" block
            j = i + 2
            while j < n and "Energies (kJ/mol)" not in lines[j]:
                j += 1
            if j >= n:
                break

            # parse 4 energy blocks
            for block in range(4):
                row = j + 1 + block * 2
                if row + 1 >= n:
                    break
                headers = lines[row].split()
                values = lines[row + 1].split()
                for h, v in zip(headers, values):
                    entry[h] = parse_value(v)

            # parse last line with Pres., DC, bar
            k = j + 9
            if k < n:
                extra_line = lines[k].split()
                if len(extra_line) >= 3:
                    entry["Pres."] = parse_value(extra_line[0])
                    entry["DC"] = parse_value(extra_line[1])
                    entry["(bar)"] = parse_value(extra_line[2])

            self.energy_timeseries.append(entry)
            i = j + 10

    # =========================
    # AVERAGES
    # =========================
    def _parse_averages(self):
        """Extract the ``A V E R A G E S`` block, including energies, box dimensions, and tensors.

        Everything after the first ``A V E R A G E S`` banner is scanned
        for: the ``Statistics over`` line, the ``Energies (kJ/mol)``
        table (header/value row pairs up to the next blank line), ``Box-``
        and ``T-`` header/value pairs, and the ``Total Virial`` /
        ``Pressure (bar)`` tensors (up to three rows each). Rows missing
        at the end of the file are skipped rather than raising.
        """
        lines = self.lines
        n = len(lines)
        width = self._ENERGY_COLUMN_WIDTH
        averages = {}
        capture = False
        i = 0

        while i < n:
            line = lines[i]
            stripped = line.strip()

            # detect start of averages block
            if "A V E R A G E S" in line:
                capture = True
                i += 1
                continue

            if not capture:
                i += 1
                continue

            # Statistics header
            if "Statistics over" in line:
                parts = line.split()
                if len(parts) > 2:
                    averages["total-steps"] = parse_value(parts[2])
                    averages["total-frames"] = parse_value(parts[-2])

            # Energies block: fixed-width header row + value row pairs
            elif "Energies (kJ/mol)" in line:
                i += 1
                while i + 1 < n and lines[i].strip():
                    header_line = lines[i].rstrip("\r\n")
                    # Slice by column width so multi-word names such as
                    # "Pres. DC (bar)" stay one key.
                    names = [
                        " ".join(header_line[col : col + width].split())
                        for col in range(0, len(header_line), width)
                    ]
                    for name, value in zip(names, lines[i + 1].split()):
                        averages[name] = parse_value(value)
                    i += 2
                continue

            # Box dimensions and per-group temperatures
            elif stripped.startswith(("Box-", "T-")):
                if i + 1 < n:
                    for h, v in zip(line.split(), lines[i + 1].split()):
                        averages[h] = parse_value(v)
                i += 2
                continue

            # Total Virial and Pressure tensors
            elif "Total Virial" in line or "Pressure (bar)" in line:
                # Slicing tolerates a tensor cut short by end of file.
                matrix = [
                    [parse_value(x) for x in row.split()]
                    for row in lines[i + 1 : i + 4]
                ]
                averages[stripped + " tensor"] = matrix
                i += 4
                continue

            i += 1

        self.data["Averages"] = averages
275
+
276
+
277
+ # =========================
278
+ # ENTRY POINT
279
+ # =========================
280
+
281
+
282
def parse_args(argv=None):
    """Parse command-line arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) ``argparse`` reads ``sys.argv[1:]``, which keeps
            the original call ``parse_args()`` working unchanged.

    Returns:
        Parsed ``argparse.Namespace`` object with ``logfile`` and
        ``output`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Extract GROMACS log file metadata to JSON"
    )
    parser.add_argument("logfile", help="Path to GROMACS log file")
    parser.add_argument("--output", "-o", help="Output file path (default: stdout)")
    return parser.parse_args(argv)
294
+
295
+
296
def main():
    """Command-line entry point.

    Reads the CLI options, parses the requested GROMACS log, and emits
    the resulting metadata as indented JSON, either to the file given
    with ``--output`` or to standard output when no file was requested.
    """
    options = parse_args()
    metadata = GromacsLogParser(options.logfile).parse()

    # No output path given: dump to stdout and finish.
    if not options.output:
        print(json.dumps(metadata, indent=2))
        return

    with open(options.output, "w") as handle:
        json.dump(metadata, handle, indent=2)
308
+
309
+
310
# Run the command-line interface only when this file is executed as a
# script; importing the module has no side effects.
if __name__ == "__main__":
    main()
File without changes