pytme 0.2.9__cp311-cp311-macosx_15_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. pytme-0.2.9.data/scripts/estimate_ram_usage.py +97 -0
  2. pytme-0.2.9.data/scripts/match_template.py +1135 -0
  3. pytme-0.2.9.data/scripts/postprocess.py +622 -0
  4. pytme-0.2.9.data/scripts/preprocess.py +209 -0
  5. pytme-0.2.9.data/scripts/preprocessor_gui.py +1227 -0
  6. pytme-0.2.9.dist-info/METADATA +95 -0
  7. pytme-0.2.9.dist-info/RECORD +119 -0
  8. pytme-0.2.9.dist-info/WHEEL +5 -0
  9. pytme-0.2.9.dist-info/entry_points.txt +6 -0
  10. pytme-0.2.9.dist-info/licenses/LICENSE +153 -0
  11. pytme-0.2.9.dist-info/top_level.txt +3 -0
  12. scripts/__init__.py +0 -0
  13. scripts/estimate_ram_usage.py +97 -0
  14. scripts/match_template.py +1135 -0
  15. scripts/postprocess.py +622 -0
  16. scripts/preprocess.py +209 -0
  17. scripts/preprocessor_gui.py +1227 -0
  18. tests/__init__.py +0 -0
  19. tests/data/Blurring/blob_width18.npy +0 -0
  20. tests/data/Blurring/edgegaussian_sigma3.npy +0 -0
  21. tests/data/Blurring/gaussian_sigma2.npy +0 -0
  22. tests/data/Blurring/hamming_width6.npy +0 -0
  23. tests/data/Blurring/kaiserb_width18.npy +0 -0
  24. tests/data/Blurring/localgaussian_sigma0510.npy +0 -0
  25. tests/data/Blurring/mean_size5.npy +0 -0
  26. tests/data/Blurring/ntree_sigma0510.npy +0 -0
  27. tests/data/Blurring/rank_rank3.npy +0 -0
  28. tests/data/Maps/.DS_Store +0 -0
  29. tests/data/Maps/emd_8621.mrc.gz +0 -0
  30. tests/data/README.md +2 -0
  31. tests/data/Raw/em_map.map +0 -0
  32. tests/data/Structures/.DS_Store +0 -0
  33. tests/data/Structures/1pdj.cif +3339 -0
  34. tests/data/Structures/1pdj.pdb +1429 -0
  35. tests/data/Structures/5khe.cif +3685 -0
  36. tests/data/Structures/5khe.ent +2210 -0
  37. tests/data/Structures/5khe.pdb +2210 -0
  38. tests/data/Structures/5uz4.cif +70548 -0
  39. tests/preprocessing/__init__.py +0 -0
  40. tests/preprocessing/test_compose.py +76 -0
  41. tests/preprocessing/test_frequency_filters.py +178 -0
  42. tests/preprocessing/test_preprocessor.py +136 -0
  43. tests/preprocessing/test_utils.py +79 -0
  44. tests/test_analyzer.py +216 -0
  45. tests/test_backends.py +446 -0
  46. tests/test_density.py +503 -0
  47. tests/test_extensions.py +130 -0
  48. tests/test_matching_cli.py +283 -0
  49. tests/test_matching_data.py +162 -0
  50. tests/test_matching_exhaustive.py +124 -0
  51. tests/test_matching_memory.py +30 -0
  52. tests/test_matching_optimization.py +226 -0
  53. tests/test_matching_utils.py +189 -0
  54. tests/test_orientations.py +175 -0
  55. tests/test_parser.py +33 -0
  56. tests/test_rotations.py +153 -0
  57. tests/test_structure.py +247 -0
  58. tme/__init__.py +6 -0
  59. tme/__version__.py +1 -0
  60. tme/analyzer/__init__.py +2 -0
  61. tme/analyzer/_utils.py +186 -0
  62. tme/analyzer/aggregation.py +577 -0
  63. tme/analyzer/peaks.py +953 -0
  64. tme/backends/__init__.py +171 -0
  65. tme/backends/_cupy_utils.py +734 -0
  66. tme/backends/_jax_utils.py +188 -0
  67. tme/backends/cupy_backend.py +294 -0
  68. tme/backends/jax_backend.py +314 -0
  69. tme/backends/matching_backend.py +1270 -0
  70. tme/backends/mlx_backend.py +241 -0
  71. tme/backends/npfftw_backend.py +583 -0
  72. tme/backends/pytorch_backend.py +430 -0
  73. tme/data/__init__.py +0 -0
  74. tme/data/c48n309.npy +0 -0
  75. tme/data/c48n527.npy +0 -0
  76. tme/data/c48n9.npy +0 -0
  77. tme/data/c48u1.npy +0 -0
  78. tme/data/c48u1153.npy +0 -0
  79. tme/data/c48u1201.npy +0 -0
  80. tme/data/c48u1641.npy +0 -0
  81. tme/data/c48u181.npy +0 -0
  82. tme/data/c48u2219.npy +0 -0
  83. tme/data/c48u27.npy +0 -0
  84. tme/data/c48u2947.npy +0 -0
  85. tme/data/c48u3733.npy +0 -0
  86. tme/data/c48u4749.npy +0 -0
  87. tme/data/c48u5879.npy +0 -0
  88. tme/data/c48u7111.npy +0 -0
  89. tme/data/c48u815.npy +0 -0
  90. tme/data/c48u83.npy +0 -0
  91. tme/data/c48u8649.npy +0 -0
  92. tme/data/c600v.npy +0 -0
  93. tme/data/c600vc.npy +0 -0
  94. tme/data/metadata.yaml +80 -0
  95. tme/data/quat_to_numpy.py +42 -0
  96. tme/data/scattering_factors.pickle +0 -0
  97. tme/density.py +2263 -0
  98. tme/extensions.cpython-311-darwin.so +0 -0
  99. tme/external/bindings.cpp +332 -0
  100. tme/filters/__init__.py +6 -0
  101. tme/filters/_utils.py +311 -0
  102. tme/filters/bandpass.py +230 -0
  103. tme/filters/compose.py +81 -0
  104. tme/filters/ctf.py +393 -0
  105. tme/filters/reconstruction.py +160 -0
  106. tme/filters/wedge.py +542 -0
  107. tme/filters/whitening.py +191 -0
  108. tme/matching_data.py +863 -0
  109. tme/matching_exhaustive.py +497 -0
  110. tme/matching_optimization.py +1311 -0
  111. tme/matching_scores.py +1183 -0
  112. tme/matching_utils.py +1188 -0
  113. tme/memory.py +337 -0
  114. tme/orientations.py +598 -0
  115. tme/parser.py +685 -0
  116. tme/preprocessor.py +1329 -0
  117. tme/rotations.py +350 -0
  118. tme/structure.py +1864 -0
  119. tme/types.py +13 -0
tme/parser.py ADDED
@@ -0,0 +1,685 @@
+"""Implements parsers for atomic structure file formats.
+
+Copyright (c) 2023 European Molecular Biology Laboratory
+
+Author: Valentin Maurer <valentin.maurer@embl-hamburg.de>
+"""
+
+import re
+import xml.etree.ElementTree as ET
+
+from collections import deque
+from abc import ABC, abstractmethod
+from typing import List, Dict, Union
+
+import numpy as np
+
+__all__ = ["PDBParser", "MMCIFParser", "GROParser", "StarParser", "XMLParser"]
+
+
+class Parser(ABC):
+    """
+    Base class for structure file parsers.
+
+    Classes inheriting from :py:class:`Parser` need to define a function
+    :py:meth:`Parser.parse_input` that creates a dictionary representation
+    of the given file. The input is a deque of all lines in the file.
+    """
+
+    def __init__(self, filename: str, mode: str = "r", **kwargs) -> None:
+        """
+        Initialize a Parser object.
+
+        Parameters
+        ----------
+        filename : str
+            File name to parse data from.
+        mode : str, optional
+            Mode to open the file. Default is 'r' for read.
+        kwargs : Dict, optional
+            Optional keyword arguments passed to the child's parse_input method.
+        """
+        try:
+            with open(filename, mode) as infile:
+                data = infile.read()
+        except UnicodeDecodeError:
+            with open(filename, mode, encoding="utf-16") as infile:
+                data = infile.read()
+
+        # Drop empty lines and comment lines before handing off to parse_input
+        data = deque(filter(lambda line: line and line[0] != "#", data.split("\n")))
+        self._data = self.parse_input(data, **kwargs)
+
+    def __getitem__(self, key: str):
+        """
+        Retrieve a value from the internal data using a given key.
+
+        Parameters
+        ----------
+        key : str
+            The key to use for retrieving the corresponding value from
+            the internal data.
+
+        Returns
+        -------
+        value
+            The value associated with the provided key in the internal data.
+        """
+        return self._data[key]
+
+    def __contains__(self, key) -> bool:
+        """
+        Check if a given key exists in the internal data.
+
+        Parameters
+        ----------
+        key : str
+            The key to check for in the internal data.
+
+        Returns
+        -------
+        bool
+            True if the key exists in the internal data, False otherwise.
+        """
+        return key in self._data
+
+    def get(self, key, default=None):
+        """
+        Retrieve a value from the internal data using a given key. If the
+        key does not exist, return a default value.
+
+        Parameters
+        ----------
+        key : str
+            The key to use for retrieving the corresponding value from
+            the internal data.
+        default : Any
+            The value to return if the key does not exist in the
+            internal data, defaults to None.
+
+        Returns
+        -------
+        value
+            The value associated with the provided key in the internal data,
+            or the default value if the key does not exist.
+        """
+        return self._data.get(key, default)
+
+    def keys(self):
+        """
+        List keys available in the internal dictionary.
+        """
+        return self._data.keys()
+
+    def values(self):
+        """
+        List values available in the internal dictionary.
+        """
+        return self._data.values()
+
+    def items(self):
+        """
+        List items available in the internal dictionary.
+        """
+        return self._data.items()
+
+    @abstractmethod
+    def parse_input(self, lines: deque) -> Dict:
+        """
+        Parse a deque of lines from a file and convert the data into a dictionary.
+
+        This function is not intended to be called directly, but should rather be
+        defined by classes inheriting from :py:class:`Parser` to parse a given
+        file format.
+
+        Parameters
+        ----------
+        lines : deque of str
+            The lines of a structure file to parse.
+
+        Returns
+        -------
+        dict
+            A dictionary containing the parsed data.
+        """
+
+
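The contract above is all a new file format needs: a subclass receives the comment-stripped deque of lines and returns a plain dict. A minimal sketch of a hypothetical subclass (KVParser is illustrative only, not part of the package):

    from collections import deque
    from typing import Dict

    from tme.parser import Parser

    class KVParser(Parser):
        """Parse whitespace-separated "key value" lines into a dict."""

        def parse_input(self, lines: deque, **kwargs) -> Dict:
            result = {}
            for line in lines:
                parts = line.split(maxsplit=1)
                # Lines without a value are skipped rather than raising
                if len(parts) == 2:
                    result[parts[0]] = parts[1]
            return result

    # KVParser("settings.txt")["pixel_size"] would return the raw string value.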
+class PDBParser(Parser):
+    """
+    Convert PDB file data into a dictionary representation [1]_.
+
+    References
+    ----------
+    .. [1] https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html
+    """
+
+    def parse_input(self, lines: deque) -> Dict:
+        """
+        Parse a deque of lines from a PDB file and convert the data into a dictionary.
+
+        Parameters
+        ----------
+        lines : deque of str
+            The lines of a PDB file to parse.
+
+        Returns
+        -------
+        dict
+            A dictionary containing the parsed data from the PDB file.
+        """
+        metadata = {
+            "resolution": re.compile(
+                r"(.)+?(EFFECTIVE RESOLUTION\s+\(ANGSTROMS\)){1}(.)+?(\d+\.\d+)(\s)*$"
+            ),
+            "reconstruction_method": re.compile(
+                r"(.)+?(RECONSTRUCTION METHOD)+(.)+?(\w+\s*\w+)(\s)*$"
+            ),
+            "electron_source": re.compile(r"(.)+?(SOURCE)+(.)+?(\w+\s*\w+)(\s)*$"),
+            "illumination_mode": re.compile(
+                r"(.)+?(ILLUMINATION MODE)+(.)+?(\w+\s*\w+)(\s)*$"
+            ),
+            "microscope_mode": re.compile(
+                r"(.)+?(IMAGING MODE)+(.)+?(\w+\s*\w+)(\s)*$"
+            ),
+            "microscope_model": re.compile(
+                r"(.)+?(MICROSCOPE MODEL)+(.+?:\s+)+?(.+)(\s)*$"
+            ),
+        }
+
+        data = {
+            "record_type": [],
+            "atom_serial_number": [],
+            "atom_name": [],
+            "alternate_location_indicator": [],
+            "residue_name": [],
+            "chain_identifier": [],
+            "residue_sequence_number": [],
+            "code_for_residue_insertion": [],
+            "atom_coordinate": [],
+            "occupancy": [],
+            "temperature_factor": [],
+            "segment_identifier": [],
+            "element_symbol": [],
+            "charge": [],
+            "details": {},
+        }
+        data["details"]["resolution"] = np.nan
+
+        for line in lines:
+            if line.startswith("REMARK"):
+                matches = [(key, metadata[key].match(line)) for key in metadata]
+                matches = [match for match in matches if match[1]]
+                for key, match in matches:
+                    data["details"][key] = match.group(4)
+                    _ = metadata.pop(key)
+            elif line.startswith("ATOM") or line.startswith("HETATM"):
+                # Fixed-width columns as defined by the PDB format specification
+                data["record_type"].append(line[0:6])
+                data["atom_serial_number"].append(line[6:11])
+                data["atom_name"].append(line[12:16])
+                data["alternate_location_indicator"].append(line[16])
+                data["residue_name"].append(line[17:20])
+
+                data["chain_identifier"].append(line[21])
+                data["residue_sequence_number"].append(line[22:26])
+                data["code_for_residue_insertion"].append(line[26])
+                data["atom_coordinate"].append((line[30:38], line[38:46], line[46:54]))
+                data["occupancy"].append(line[54:60])
+                data["temperature_factor"].append(line[60:66])
+                # segID spans columns 73-76 of the record
+                data["segment_identifier"].append(line[72:76])
+                data["element_symbol"].append(line[76:78])
+                data["charge"].append(line[78:80])
+
+        data["details"]["resolution"] = float(data["details"]["resolution"])
+
+        return data
+
+
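A usage sketch for the fixed-column parser above; the path is hypothetical (the test suite ships tests/data/Structures/5khe.pdb). Column slices come back as raw strings, so numeric fields need an explicit cast:

    from tme.parser import PDBParser

    parser = PDBParser("tests/data/Structures/5khe.pdb")
    coordinates = [
        tuple(float(value) for value in xyz)
        for xyz in parser["atom_coordinate"]
    ]
    print(len(coordinates), parser["details"].get("resolution"))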
+class MMCIFParser(Parser):
+    """
+    Convert MMCIF file data into a dictionary representation. This implementation
+    heavily relies on the atomium library [1]_.
+
+    References
+    ----------
+    .. [1] Ireland, S. M., & Martin, A. C. R. (2020). atomium (Version 1.0.0)
+       [Computer software]. https://doi.org/10.1093/bioinformatics/btaa072
+    """
+
+    def parse_input(self, lines: deque) -> Dict:
+        """
+        Parse a deque of lines from an MMCIF file and convert the data into
+        a dictionary.
+
+        Parameters
+        ----------
+        lines : deque of str
+            The lines of an MMCIF file to parse.
+
+        Returns
+        -------
+        dict
+            A dictionary containing the parsed data from the MMCIF file.
+        """
+        lines = self._consolidate_strings(lines)
+        blocks = self._split_in_blocks(lines)
+        mmcif_dict = {}
+        for block in blocks:
+            if block["lines"][0] == "loop_":
+                mmcif_dict[block["category"]] = self._loop_block_to_dict(block)
+            else:
+                mmcif_dict[block["category"]] = self._non_loop_block_to_dict(block)
+        return mmcif_dict
+
+    @staticmethod
+    def _consolidate_strings(lines: deque) -> deque:
+        """
+        Consolidate multi-line strings that have been separated by semicolons in a
+        deque of strings.
+
+        Parameters
+        ----------
+        lines : deque of str
+            Deque of strings where each string is a line from an MMCIF file.
+
+        Returns
+        -------
+        deque of str
+            A deque of consolidated strings from the given input.
+        """
+        new_lines = deque()
+        while lines:
+            line = lines.popleft()
+            if line.startswith(";"):
+                # Semicolon-delimited multi-line value: fold it back onto the
+                # preceding line as a single quoted string.
+                string = [line[1:].strip()]
+                while not lines[0].startswith(";"):
+                    string.append(lines.popleft())
+                lines.popleft()
+                new_lines[-1] += ' "{}"'.format(" ".join(string).replace('"', ""))
+            else:
+                new_lines.append(line.replace('"', ""))
+        return new_lines
+
+    @staticmethod
+    def _split_in_blocks(lines: deque) -> List[Dict]:
+        """
+        Split a deque of consolidated strings into a list of dictionaries,
+        each representing a block of data.
+
+        Parameters
+        ----------
+        lines : deque of str
+            Deque of consolidated strings where each string is a line from
+            an MMCIF file.
+
+        Returns
+        -------
+        list of dict
+            A list of dictionaries where each dictionary represents a block
+            of data from the MMCIF file.
+        """
+        category = None
+        block, blocks = [], []
+        while lines:
+            line = lines.popleft()
+            if line.startswith("data_"):
+                continue
+            if line.startswith("_"):
+                line_category = line.split(".")[0]
+                if line_category != category:
+                    if category:
+                        blocks.append({"category": category[1:], "lines": block})
+                    category = line_category
+                    block = []
+            if line.startswith("loop_"):
+                if category:
+                    blocks.append({"category": category[1:], "lines": block})
+                category = lines[0].split(".")[0]
+                block = []
+            block.append(line)
+        if block:
+            blocks.append({"category": category[1:], "lines": block})
+        return blocks
+
+    @staticmethod
+    def _non_loop_block_to_dict(block: Dict) -> Dict:
+        """
+        Convert a non-loop block of data into a dictionary.
+
+        Parameters
+        ----------
+        block : dict
+            A dictionary representing a non-loop block of data from an MMCIF file.
+
+        Returns
+        -------
+        dict
+            A dictionary representing the parsed data from the given non-loop block.
+        """
+        d = {}
+        # Reunite values that wrapped onto continuation lines
+        for index in range(len(block["lines"]) - 1):
+            if block["lines"][index + 1][0] != "_":
+                block["lines"][index] += " " + block["lines"][index + 1]
+        block["lines"] = [line for line in block["lines"] if line[0] == "_"]
+        for line in block["lines"]:
+            name = line.split(".")[1].split()[0]
+            value = " ".join(line.split()[1:])
+            d[name] = value
+        return d
+
+    def _loop_block_to_dict(self, block: Dict) -> Dict:
+        """
+        Convert a loop block of data into a dictionary.
+
+        Parameters
+        ----------
+        block : dict
+            A dictionary representing a loop block of data from an MMCIF file.
+
+        Returns
+        -------
+        dict
+            A dictionary representing the parsed data from the given loop block.
+        """
+        names, lines = [], []
+        body_start = 0
+        for index, line in enumerate(block["lines"][1:], start=1):
+            if not line.startswith("_" + block["category"]):
+                body_start = index
+                break
+        names = [line.split(".")[1].rstrip() for line in block["lines"][1:body_start]]
+        lines = [self._split_line(line) for line in block["lines"][body_start:]]
+        # Reunite rows that were broken across multiple lines
+        for n in range(len(lines) - 1):
+            while n < len(lines) - 1 and len(lines[n]) + len(lines[n + 1]) <= len(
+                names
+            ):
+                lines[n] += lines.pop(n + 1)
+        res = {name: [] for name in names}
+        for line in lines:
+            for name, value in zip(names, line):
+                res[name].append(value)
+        return res
+
+    @staticmethod
+    def _split_line(line: str) -> List[str]:
+        """
+        Split a string into substrings, ignoring quotation marks within the string.
+
+        Parameters
+        ----------
+        line : str
+            The string to be split.
+
+        Returns
+        -------
+        list of str
+            A list of substrings resulting from the split operation on the
+            given string.
+        """
+        if not re.search("['\"]", line):
+            return line.split()
+
+        chars = deque(line.strip())
+        values, value, in_string = [], [], False
+        while chars:
+            char = chars.popleft()
+            if char == " " and not in_string:
+                values.append("".join(value))
+                value = []
+            elif char == '"':
+                in_string = not in_string
+                value.append(char)
+            else:
+                value.append(char)
+
+        values.append("".join(value))
+        return [v for v in values if v]
+
+
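A usage sketch for the mmCIF parser above (path hypothetical; the test suite ships tests/data/Structures/5khe.cif). Each category maps to a dict keyed by item name; loop_ blocks yield one value list per column:

    from tme.parser import MMCIFParser

    parser = MMCIFParser("tests/data/Structures/5khe.cif")
    atom_site = parser["atom_site"]  # loop_ block -> dict of parallel columns
    x = [float(value) for value in atom_site["Cartn_x"]]
    print(len(x))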
+class GROParser(Parser):
+    """
+    Convert a GRO file in Gromos87 format into a dictionary representation [1]_.
+
+    References
+    ----------
+    .. [1] https://manual.gromacs.org/archive/5.0.4/online/gro.html
+    """
+
+    def parse_input(self, lines: deque, **kwargs) -> Dict:
+        """
+        Parse a deque of lines from a GRO file and convert the data into
+        a dictionary.
+
+        Parameters
+        ----------
+        lines : deque of str
+            The lines of a GRO file to parse.
+        kwargs : Dict, optional
+            Optional keyword arguments.
+
+        Returns
+        -------
+        dict
+            A dictionary containing the parsed data from the GRO file.
+        """
+        data = {
+            "title": [],
+            "num_atoms": [],
+            "record_type": [],
+            "residue_number": [],
+            "residue_name": [],
+            "atom_name": [],
+            "atom_number": [],
+            "atom_coordinate": [],
+            "segment_identifier": [],
+            "velocity": [],
+            "box_vectors": [],
+            "time": [],
+        }
+
+        if not lines:
+            return data
+
+        time_pattern = re.compile(r"t=\s*(\d+\.?\d*)")
+        file_index = -1
+
+        # GRO files can be concatenated; parse every frame, tagging atoms with
+        # the index of the frame they belong to.
+        while lines:
+            file_index += 1
+            str_file_index = str(file_index)
+
+            title = lines.popleft()
+            data["title"].append(title)
+
+            time_match = time_pattern.search(title)
+            if time_match:
+                data["time"].append(float(time_match.group(1)))
+
+            try:
+                num_atoms = int(lines.popleft())
+                data["num_atoms"].append(num_atoms)
+            except (ValueError, IndexError):
+                return data
+
+            if num_atoms <= 0:
+                continue
+
+            valid_atoms = 0
+            for _ in range(num_atoms):
+                if not lines:
+                    break
+
+                line = lines.popleft()
+
+                try:
+                    res_num = int(line[:5])
+                    res_name = line[5:10].strip()
+                    atom_name = line[10:15].strip()
+                    atom_num = int(line[15:20])
+
+                    coord = (float(line[20:28]), float(line[28:36]), float(line[36:44]))
+
+                    # Velocities are optional trailing columns
+                    vel = None
+                    if len(line) >= 68:
+                        vel = (
+                            float(line[44:52]),
+                            float(line[52:60]),
+                            float(line[60:68]),
+                        )
+                except (ValueError, IndexError):
+                    continue
+
+                valid_atoms += 1
+                data["residue_number"].append(res_num)
+                data["residue_name"].append(res_name)
+                data["atom_name"].append(atom_name)
+                data["atom_number"].append(atom_num)
+                data["atom_coordinate"].append(coord)
+                data["velocity"].append(vel)
+
+            data["segment_identifier"].extend([str_file_index] * valid_atoms)
+            data["record_type"].extend(["ATOM"] * valid_atoms)
+
+            if lines:
+                box_line = lines.popleft()
+                try:
+                    data["box_vectors"].append([float(val) for val in box_line.split()])
+                except ValueError:
+                    pass
+
+        return data
+
+
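A usage sketch for the Gromos87 parser above (trajectory name hypothetical). Frames of a concatenated file share one flat atom table; segment_identifier holds the frame index each atom came from:

    from tme.parser import GROParser

    parser = GROParser("trajectory.gro")
    frame_0 = [
        coord
        for coord, segment in zip(
            parser["atom_coordinate"], parser["segment_identifier"]
        )
        if segment == "0"
    ]
    print(len(parser["num_atoms"]), len(frame_0))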
+class StarParser(MMCIFParser):
+    """
+    Convert STAR file data into a dictionary representation [1]_.
+
+    References
+    ----------
+    .. [1] https://www.iucr.org/__data/assets/file/0013/11416/star.5.html
+    """
+
+    def parse_input(self, lines: deque, delimiter: str = "\t") -> Dict:
+        """
+        Parse a deque of lines from a STAR file into a dictionary keyed by
+        data block, with one value list per column label.
+        """
+        # Strips trailing "#N" column comments from header lines
+        pattern = re.compile(r"\s*#.*")
+
+        ret, category, block = {}, None, []
+        while lines:
+            line = lines.popleft()
+
+            if line.startswith("data") and not line.startswith("_"):
+                # A new data block closes out the previous one
+                if category != line and category is not None:
+                    headers = list(ret[category].keys())
+                    headers = [pattern.sub("", x) for x in headers]
+                    ret[category] = {
+                        header: list(column)
+                        for header, column in zip(headers, zip(*block))
+                    }
+                    block.clear()
+                category = line
+                if category not in ret:
+                    ret[category] = {}
+                continue
+
+            if line.startswith("_"):
+                ret[category][line] = []
+                continue
+
+            if line.startswith("loop"):
+                continue
+
+            line_split = line.split(delimiter)
+            if line.strip():
+                block.append(line_split)
+
+        if category is not None:
+            headers = list(ret[category].keys())
+            headers = [pattern.sub("", x) for x in headers]
+            ret[category] = {
+                header: list(column) for header, column in zip(headers, zip(*block))
+            }
+        return ret
+
+
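A usage sketch for the STAR parser above. The file name, block name, and column label follow RELION conventions and are assumptions; note that values are split on the delimiter argument, a tab by default:

    from tme.parser import StarParser

    parser = StarParser("particles.star")
    particles = parser["data_particles"]  # assumed block name
    x = [float(value) for value in particles["_rlnCoordinateX"]]
    print(len(x))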
+class XMLParser(Parser):
+    """
+    Parser for XML files.
+    """
+
+    def parse_input(self, lines: deque, **kwargs) -> Dict:
+        root = ET.fromstring("\n".join(lines))
+        return self._element_to_dict(root)
+
+    def _element_to_dict(self, element) -> Dict:
+        """
+        Convert an XML element and its children to a dictionary.
+
+        Parameters
+        ----------
+        element : xml.etree.ElementTree.Element
+            The XML element to convert.
+
+        Returns
+        -------
+        Dict
+            Dictionary representation of the element.
+        """
+        result = {}
+
+        if element.attrib:
+            result["@attributes"] = {
+                k: self._convert_value(v) for k, v in element.attrib.items()
+            }
+
+        children = list(element)
+        if not children:
+            # Leaf node: coerce the text, splitting multi-line payloads
+            if element.text and element.text.strip():
+                text = element.text.strip()
+                if "\n" in text:
+                    result = [
+                        self._convert_value(line.strip())
+                        for line in text.split("\n")
+                        if line.strip()
+                    ]
+                else:
+                    result = self._convert_value(text)
+        else:
+            for child in children:
+                child_dict = self._element_to_dict(child)
+
+                # Repeated tags are collected into a list
+                if child.tag not in result:
+                    result[child.tag] = child_dict
+                else:
+                    if not isinstance(result[child.tag], list):
+                        result[child.tag] = [result[child.tag]]
+                    result[child.tag].append(child_dict)
+
+        return result
+
+    def _convert_value(self, value_str: str) -> Union[int, float, bool, str]:
+        """
+        Convert a string value to an appropriate data type.
+
+        Parameters
+        ----------
+        value_str : str
+            String value to convert.
+
+        Returns
+        -------
+        Union[int, float, bool, str]
+            Converted value in appropriate data type.
+        """
+        if value_str.lower() in ("true", "false"):
+            return value_str.lower() == "true"
+
+        try:
+            return int(value_str)
+        except ValueError:
+            pass
+
+        try:
+            return float(value_str)
+        except ValueError:
+            pass
+
+        return value_str
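A usage sketch for the XML parser above, against a hypothetical metadata file. Attributes land under "@attributes", repeated tags collapse into lists, and leaf text is type-coerced by _convert_value:

    from tme.parser import XMLParser

    # metadata.xml: <scan rate="1.5"><frame>1</frame><frame>2</frame></scan>
    parser = XMLParser("metadata.xml")
    print(parser["@attributes"]["rate"])  # 1.5 (float)
    print(parser["frame"])                # [1, 2] (ints)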