reciprocalspaceship 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of reciprocalspaceship might be problematic.
- reciprocalspaceship/VERSION +1 -1
- reciprocalspaceship/algorithms/scale_merged_intensities.py +8 -7
- reciprocalspaceship/dataset.py +5 -0
- reciprocalspaceship/decorators.py +2 -2
- reciprocalspaceship/dtypes/floating.py +24 -28
- reciprocalspaceship/dtypes/integer.py +38 -37
- reciprocalspaceship/dtypes/internals.py +243 -49
- reciprocalspaceship/io/crystfel.py +568 -234
- reciprocalspaceship/utils/__init__.py +6 -1
- reciprocalspaceship/utils/cell.py +5 -0
- reciprocalspaceship/utils/stats.py +5 -7
- reciprocalspaceship/utils/structurefactors.py +5 -0
- reciprocalspaceship/utils/units.py +14 -4
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.2.dist-info}/METADATA +26 -28
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.2.dist-info}/RECORD +20 -20
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.2.dist-info}/WHEEL +1 -1
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.2.dist-info}/entry_points.txt +0 -1
- tests/test_dataseries.py +1 -1
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.2.dist-info}/LICENSE +0 -0
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.2.dist-info}/top_level.txt +0 -0
reciprocalspaceship/io/crystfel.py
@@ -1,270 +1,604 @@

```python
import mmap
import re
from contextlib import contextmanager
from importlib.util import find_spec
from typing import Union

import gemmi
import numpy as np

from reciprocalspaceship import DataSet, concat
from reciprocalspaceship.utils import angle_between, eV2Angstroms

# See Rupp Table 5-2
_cell_constraints = {
    "triclinic": lambda x: x,
    "orthorhombic": lambda x: [x[0], x[1], x[2], 90.0, 90.0, 90.0],
    "monoclinic": lambda x: [x[0], x[1], x[2], 90.0, x[4], 90.0],
    "hexagonal": lambda x: [
        0.5 * (x[0] + x[1]),
        0.5 * (x[0] + x[1]),
        x[2],
        90.0,
        90.0,
        120.0,
    ],
    "rhombohedral": lambda x: [
        0.5 * (x[0] + x[1]),
        0.5 * (x[0] + x[1]),
        x[2],
        90.0,
        90.0,
        120.0,
    ],
    "cubic": lambda x: [
        np.mean(x[:3]),
        np.mean(x[:3]),
        np.mean(x[:3]),
        90.0,
        90.0,
        90.0,
    ],
    "tetragonal": lambda x: [
        0.5 * (x[0] + x[1]),
        0.5 * (x[0] + x[1]),
        x[2],
        90.0,
        90.0,
        90.0,
    ],
}

# See crystFEL API reference here: https://www.desy.de/~twhite/crystfel/reference/stream_8h.html
_block_markers = {
    "geometry": (r"----- Begin geometry file -----", r"----- End geometry file -----"),
    "chunk": (r"----- Begin chunk -----", r"----- End chunk -----"),
    "cell": (r"----- Begin unit cell -----", r"----- End unit cell -----"),
    "peaks": (r"Peaks from peak search", r"End of peak list"),
    "crystal": (r"--- Begin crystal", r"--- End crystal"),
    "reflections": (r"Reflections measured after indexing", r"End of reflections"),
}


@contextmanager
def ray_context(**ray_kwargs):
    import ray

    ray.init(**ray_kwargs)
    try:
        yield ray
    finally:
        ray.shutdown()
```
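The `_cell_constraints` table above maps a CrystFEL `lattice_type` string to a function that imposes the corresponding metric constraints on a refined cell. A minimal sketch of how such a lookup is applied; the constraint lambdas mirror the table above, and the numeric cell values are made up for illustration:

```python
# Constraint functions keyed by lattice type, copied from the _cell_constraints table above.
constraints = {
    "monoclinic": lambda x: [x[0], x[1], x[2], 90.0, x[4], 90.0],
    "tetragonal": lambda x: [
        0.5 * (x[0] + x[1]),
        0.5 * (x[0] + x[1]),
        x[2],
        90.0,
        90.0,
        90.0,
    ],
}

# Hypothetical refined cell (a, b, c, alpha, beta, gamma) with small numerical noise.
noisy_cell = [52.01, 51.98, 71.95, 90.02, 89.97, 90.01]

# The tetragonal constraint averages a and b and pins all angles to 90 degrees.
print(constraints["tetragonal"](noisy_cell))
# [51.995, 51.995, 71.95, 90.0, 90.0, 90.0]
```

The remainder of the new module follows: the `StreamLoader` class and the top-level `read_crystfel` entry point.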
```python
class StreamLoader(object):
    """
    An object that loads stream files into rs.DataSet objects in parallel.
    Attributes
    ----------
    block_regex_bytes : dict
        A dictionary of compiled regular expressions that operate on strings
    block_regex : dict
        A dictionary of compiled regular expressions that operate on byte strings
    """

    peak_list_columns = {
        "H": 0,
        "K": 1,
        "L": 2,
        "I": 3,
        "SigI": 4,
        "peak": 5,
        "background": 6,
        "XDET": 7,
        "YDET": 8,
        "s1x": 9,
        "s1y": 10,
        "s1z": 11,
        "ewald_offset": 12,
        "angular_ewald_offset": 13,
        "ewald_offset_x": 14,
        "ewald_offset_y": 15,
        "ewald_offset_z": 16,
    }

    def __init__(self, filename: str, encoding="utf-8"):
        self.filename = filename
        self.encoding = encoding
        self.block_regex = {}
        self.block_regex_bytes = {}

        # Set up all the regular expressions for finding block boundaries
        for k, (beginning, ending) in _block_markers.items():
            self.block_regex[k + "_begin"] = re.compile(beginning)
            self.block_regex[k + "_end"] = re.compile(ending)
            self.block_regex[k] = re.compile(
                f"(?s){beginning}\n(?P<CRYSTAL_BLOCK>.*?)\n{ending}"
            )

            self.block_regex_bytes[k + "_begin"] = re.compile(
                beginning.encode(self.encoding)
            )
            self.block_regex_bytes[k + "_end"] = re.compile(
                ending.encode(self.encoding)
            )
            self.block_regex_bytes[k] = re.compile(
                f"(?s){beginning}\n(?P<CRYSTAL_BLOCK>.*?)\n{ending}".encode(
                    self.encoding
                )
            )

        self.re_abcstar = re.compile("[abc]star =.+\n")
        self.re_photon_energy = re.compile("photon_energy_eV =.+\n")

        self.re_chunk_metadata = {
            "Image filename": re.compile(r"(?<=Image filename: ).+(?=\n)"),
            "Event": re.compile(r"(?<=Event: ).+(?=\n)"),
            "Image serial number:": re.compile(r"(?<=Image serial number: ).+(?=\n)"),
            "indexed_by": re.compile(r"(?<=indexed_by \= ).+(?=\n)"),
            "photon_energy_eV": re.compile(r"(?<=photon_energy_eV \= ).+(?=\n)"),
            "beam_divergence": re.compile(r"(?<=beam_divergence \= ).+(?=\n)"),
            "beam_bandwidth": re.compile(r"(?<=beam_bandwidth \= ).+(?=\n)"),
        }

        self.re_crystal_metadata = {
            "Cell parameters": re.compile(r"(?<=Cell parameters).+(?=\n)"),
            "astar": re.compile(r"(?<=astar = ).+(?=\n)"),
            "bstar": re.compile(r"(?<=bstar = ).+(?=\n)"),
            "cstar": re.compile(r"(?<=cstar = ).+(?=\n)"),
            "lattice_type": re.compile(r"(?<=lattice_type = ).+(?=\n)"),
            "centering": re.compile(r"(?<=centering = ).+(?=\n)"),
            "unique_axis": re.compile(r"(?<=unique_axis = ).+(?=\n)"),
            "profile_radius": re.compile(r"(?<=profile_radius = ).+(?=\n)"),
            "predict_refine/det_shift": re.compile(
                r"(?<=predict_refine/det_shift ).+(?=\n)"
            ),
            "predict_refine/R": re.compile(r"(?<=predict_refine/R ).+(?=\n)"),
            "diffraction_resolution_limit": re.compile(
                r"(?<=diffraction_resolution_limit = ).+(?=\n)"
            ),
            "num_reflections": re.compile(r"(?<=num_reflections = ).+(?=\n)"),
        }

        # TODO: replace these with the faster, non variabled length equivalents
        self.re_crystal = re.compile(
            r"(?s)--- Begin crystal\n(?P<CRYSTAL_BLOCK>.*?)\n--- End crystal"
        )
        self.re_refls = re.compile(
            r"(?s)Reflections measured after indexing\n(?P<REFL_BLOCK>.*?)\nEnd of reflections"
        )

    def extract_target_unit_cell(self) -> Union[list, None]:
        """
        Search the file header for target unit cell parameters.
        """
        header = self.extract_file_header()
        cell = None
        lattice_type = None

        for line in header.split("\n"):
            if line.startswith("a = "):
                idx = 0
            elif line.startswith("b = "):
                idx = 1
            elif line.startswith("c = "):
                idx = 2
            elif line.startswith("al = "):
                idx = 3
            elif line.startswith("be = "):
                idx = 4
            elif line.startswith("ga = "):
                idx = 5
            else:
                idx = None
            if idx is not None:
                if cell is None:
                    cell = [None] * 6
                value = float(line.split()[2])
                cell[idx] = value
            if line.startswith("lattice_type ="):
                lattice_type = line.split()[-1]

        if lattice_type is not None:
            cell = _cell_constraints[lattice_type](cell)
        return cell

    def calculate_average_unit_cell(self) -> gemmi.UnitCell:
        """
        Compute the average of all cell parameters across the file.
        """
        regex = re.compile(rb"Cell parameters .+\n")
        with open(self.filename, "r") as f:
            memfile = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            lines = regex.findall(memfile)
        if len(lines) == 0:
            raise ValueError(
                f"No unit cell parameters were found in the header of {self.filename}"
            )

        cell = np.loadtxt(lines, usecols=[2, 3, 4, 6, 7, 8], dtype="float32").mean(0)
        cell[:3] *= 10.0

        header = self.extract_file_header()
        lattice_type = None

        for line in header.split("\n"):
            if line.startswith("lattice_type ="):
                lattice_type = line.split()[-1]

        if lattice_type is not None:
            cell = _cell_constraints[lattice_type](cell)
        return cell

    def extract_file_header(self) -> str:
        """
        Extract all the data prior to first chunk and return it as a string.
        """
        with open(self.filename, "r") as f:
            memfile = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            match = self.block_regex_bytes["chunk_begin"].search(memfile)
            header = memfile.read(match.start()).decode()
        return header

    @property
    def available_column_names(self) -> list:
        """Keys which can be passed to parallel_read_crystfel to customize the peak list output"""
        return list(self.peak_list_columns.keys())

    @property
    def available_chunk_metadata_keys(self) -> list:
        """Keys which can be passed to parallel_read_crystfel to customize the chunk level metadata"""
        return list(self.re_chunk_metadata.keys())

    @property
    def available_crystal_metadata_keys(self) -> list:
        """Keys which can be passed to parallel_read_crystfel to customize the crystal level metadata"""
        return list(self.re_crystal_metadata.keys())

    def read_crystfel(
        self,
        wavelength=None,
        chunk_metadata_keys=None,
        crystal_metadata_keys=None,
        peak_list_columns=None,
        use_ray=True,
        num_cpus=None,
        address="local",
        **ray_kwargs,
    ) -> list:
        """
        Parse a CrystFEL stream file using multiple processors. Parallelization depends on the ray library (https://www.ray.io/).
        If ray is unavailable, this method falls back to serial processing on one CPU. Ray is not a dependency of reciprocalspaceship
        and will not be installed automatically. Users must manually install it prior to calling this method.

        PARAMETERS
        ----------
        wavelength : float
            Override the wavelength with this value. Wavelength is used to compute Ewald offsets.
        chunk_metadata_keys : list
            A list of metadata_keys which will be returned in the resulting dictionaries under the 'chunk_metadata' entry.
            A list of possible keys is stored as stream_loader.available_chunk_metadata_keys
        crytal_metadata_keys : list
            A list of metadata_keys which will be returned in the resulting dictionaries under the 'crystal_metadata' entry.
            A list of possible keys is stored as stream_loader.available_crystal_metadata_keys
        peak_list_columns : list
            A list of columns to include in the peak list numpy arrays.
            A list of possible column names is stored as stream_loader.available_column_names.
        use_ray : bool(optional)
            Whether or not to use ray for parallelization.
        num_cpus : int (optional)
            The number of cpus for ray to use.
        ray_kwargs : optional
            Additional keyword arguments to pass to [ray.init](https://docs.ray.io/en/latest/ray-core/api/doc/ray.init.html#ray.init).

        RETURNS
        -------
        chunks : list
            A list of dictionaries containing the per-chunk data. The 'peak_lists' item contains a
            numpy array with shape n x 14 with the following information.
            h, k, l, I, SIGI, peak, background, fs/px, ss/px, s1x, s1y, s1z,
            ewald_offset, angular_ewald_offset
        """
        if peak_list_columns is not None:
            peak_list_columns = [self.peak_list_columns[s] for s in peak_list_columns]

        # Check whether ray is available
        if use_ray:
            if find_spec("ray") is None:
                use_ray = False
                import warnings

                message = (
                    "ray (https://www.ray.io/) is not available..."
                    "Falling back to serial stream file parser."
                )
                warnings.warn(message, ImportWarning)

        with open(self.filename, "r") as f:
            memfile = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            beginnings_and_ends = zip(
                self.block_regex_bytes["chunk_begin"].finditer(memfile),
                self.block_regex_bytes["chunk_end"].finditer(memfile),
            )
            if use_ray:
                with ray_context(num_cpus=num_cpus, **ray_kwargs) as ray:

                    @ray.remote
                    def parse_chunk(loader: StreamLoader, *args):
                        return loader._parse_chunk(*args)

                    result_ids = []
                    for begin, end in beginnings_and_ends:
                        result_ids.append(
                            parse_chunk.remote(
                                self,
                                begin.start(),
                                end.end(),
                                wavelength,
                                chunk_metadata_keys,
                                crystal_metadata_keys,
                                peak_list_columns,
                            )
                        )

                    results = ray.get(result_ids)

                    return results

            else:
                results = []
                for begin, end in beginnings_and_ends:
                    results.append(
                        self._parse_chunk(
                            begin.start(),
                            end.end(),
                            wavelength,
                            chunk_metadata_keys,
                            crystal_metadata_keys,
                            peak_list_columns,
                        )
                    )
                return results

    def _extract_chunk_metadata(self, chunk_text, metadata_keys=None):
        if metadata_keys is None:
            return None
        result = {}
        for k in metadata_keys:
            re = self.re_chunk_metadata[k]
            for v in re.findall(chunk_text):
                result[k] = v
        return result

    def _extract_crystal_metadata(self, xtal_text, metadata_keys=None):
        if metadata_keys is None:
            return None
        result = {}
        for k in metadata_keys:
            re = self.re_crystal_metadata[k]
            for v in re.findall(xtal_text):
                result[k] = v
        return result

    def _parse_chunk(
        self,
        start,
        end,
        wavelength,
        chunk_metadata_keys,
        crystal_metadata_keys,
        peak_list_columns,
    ):
        with open(self.filename, "r") as f:
            f.seek(start)
            data = f.read(end - start)

        if wavelength is None:
            ev_match = self.re_photon_energy.search(data)
            ev_line = data[ev_match.start() : ev_match.end()]
            photon_energy = np.float32(ev_line.split()[2])
            wavelength = eV2Angstroms(photon_energy)
            lambda_inv = np.reciprocal(wavelength)
        else:
            lambda_inv = np.reciprocal(wavelength)

        peak_lists = []
        a_matrices = []
        chunk_metadata = None
        crystal_metadata = []
        header = None
        for xmatch in self.re_crystal.finditer(data):
            xdata = data[xmatch.start() : xmatch.end()]
            if header is None:
                header = data[: xmatch.start()]

            # crystal_metadata.append(self._extract_crystal_metadata(xdata))
            A = (
                np.loadtxt(
                    self.re_abcstar.findall(xdata),
                    usecols=[2, 3, 4],
                    dtype="float32",
                ).T
                / 10.0
            )
            a_matrices.append(A)

            for pmatch in self.re_refls.finditer(xdata):
                pdata = xdata[pmatch.start() : pmatch.end()]
                crystal_metadata.append(
                    self._extract_crystal_metadata(xdata, crystal_metadata_keys)
                )
                peak_array = np.loadtxt(
                    pdata.split("\n")[2:-1],
                    usecols=(0, 1, 2, 3, 4, 5, 6, 7, 8),
                    dtype="float32",
                )
                s0 = np.array([0, 0, lambda_inv], dtype="float32").T
                q = (A @ peak_array[:, :3].T).T
                s1 = q + s0

                # This is way faster than np.linalg.norm for small dimensions
                x, y, z = s1.T
                s1_norm = np.sqrt(x * x + y * y + z * z)
                ewald_offset = s1_norm - lambda_inv

                # project calculated s1 onto the ewald sphere
                s1_obs = lambda_inv * s1 / s1_norm[:, None]

                # Compute the angular ewald offset
                q_obs = s1_obs - s0
                qangle = np.sign(ewald_offset) * angle_between(q, q_obs)

                peak_array = np.concatenate(
                    (
                        peak_array,
                        s1,
                        ewald_offset[:, None],
                        qangle[:, None],
                        s1_obs - s1,  # Ewald offset vector
                    ),
                    axis=-1,
                )
                if peak_list_columns is not None:
                    peak_array = peak_array[:, peak_list_columns]
                peak_lists.append(peak_array)

        if header is None:
            header = data
        chunk_metadata = self._extract_chunk_metadata(header, chunk_metadata_keys)

        result = {
            "wavelength": wavelength,
            "A_matrices": a_matrices,
            "peak_lists": peak_lists,
        }
        if chunk_metadata_keys is not None:
            result[chunk_metadata_keys] = chunk_metadata
        if crystal_metadata_keys is not None:
            result[crystal_metadata_keys] = crystal_metadata
        return result


def read_crystfel(
    streamfile: str,
    spacegroup=None,
    encoding="utf-8",
    columns=None,
    parallel=True,
    num_cpus=None,
    address="local",
    **ray_kwargs,
) -> DataSet:
    """
    Initialize attributes and populate the DataSet object with data from a CrystFEL stream with indexed reflections.
    This is the output format used by CrystFEL software when processing still diffraction data.

    This method is parallelized across CPUs speed up parsing. Parallelization depends on the ray library (https://www.ray.io/).
    If ray is unavailable, this method falls back to serial processing on one CPU. Ray is not a dependency of reciprocalspaceship
    and will not be installed automatically. Users must manually install it prior to calling this method.

    Parameters
    ----------
    streamfile : str
        name of a .stream file
    spacegroup : gemmi.SpaceGroup or int or string (optional)
        optionally set the spacegroup of the returned DataSet.
    encoding : str
        The type of byte-encoding (optional, 'utf-8').
    columns : list (optional)
        Optionally specify the columns of the output by a list of strings.
        The default list is:
        [ "H", "K", "L", "I", "SigI", "BATCH", "s1x", "s1y", "s1z", "ewald_offset",
        "angular_ewald_offset", "XDET", "YDET" ]
        See `rs.io.crystfel.StreamLoader().available_column_names` for a list of available column names.
    parallel : bool (optional)
        Read the stream file in parallel using [ray.io](https://docs.ray.io) if it is available.
    num_cpus : int (optional)
        By default, the model will use all available cores. For very large cpu counts, this may consume
        too much memory. Decreasing num_cpus may help. If ray is not installed, a single core will be used.
    address : str (optional)
        Optionally specify the ray instance to connect to. By default, start a new local instance.
    ray_kwargs : optional
        Additional keyword arguments to pass to [ray.init](https://docs.ray.io/en/latest/ray-core/api/doc/ray.init.html#ray.init).

    Returns
    --------
    rs.DataSet
    """
    if not streamfile.endswith(".stream"):
        raise ValueError("Stream file should end with .stream")

    # read data from stream file
    if columns is None:
        columns = [
            "H",
            "K",
            "L",
            "I",
            "SigI",
            "BATCH",
            "s1x",
            "s1y",
            "s1z",
            "ewald_offset",
            "angular_ewald_offset",
            "XDET",
            "YDET",
        ]
    peak_list_columns = [
        i for i in columns if i != "BATCH"
    ]  # BATCH is computed afterward

    mtz_dtypes = {
        "H": "H",
        "K": "H",
        "L": "H",
        "I": "J",
        "SigI": "Q",
        "BATCH": "B",
    }
    for k in columns:
        mtz_dtypes[k] = mtz_dtypes.get(k, "R")

    loader = StreamLoader(streamfile, encoding=encoding)
    cell = loader.extract_target_unit_cell()

    batch = 0
    ds = []

    for chunk in loader.read_crystfel(
        peak_list_columns=peak_list_columns,
        use_ray=parallel,
        num_cpus=num_cpus,
        address=address,
        **ray_kwargs,
    ):
        for peak_list in chunk["peak_lists"]:
            _ds = DataSet(
                peak_list,
                columns=peak_list_columns,
                cell=cell,
                spacegroup=spacegroup,
                merged=False,
            )
            _ds["BATCH"] = batch
            ds.append(_ds)
            batch += 1

    ds = concat(ds, axis=0, check_isomorphous=False, copy=False, ignore_index=True)

    mtz_dtypes = {
        "H": "H",
        "K": "H",
        "L": "H",
        "I": "J",
        "SigI": "Q",
        "BATCH": "B",
    }
    for k in ds:
        mtz_dtypes[k] = mtz_dtypes.get(k, "R")

    ds = ds.astype(mtz_dtypes, copy=False)
    ds.set_index(["H", "K", "L"], inplace=True)

    return ds
```
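A minimal usage sketch of the new reader, assuming the function is exposed as `rs.read_crystfel` at the package top level; the stream file path and space group below are hypothetical, and the column names come from the docstring above:

```python
import reciprocalspaceship as rs

# Hypothetical CrystFEL stream file containing indexed reflections.
stream = "indexed.stream"

# Parse in parallel if ray is installed (otherwise the reader falls back to a single CPU),
# keeping a subset of the available columns plus the computed BATCH column.
ds = rs.read_crystfel(
    stream,
    spacegroup="P 21 21 21",  # optional; a gemmi.SpaceGroup or int also works
    columns=["H", "K", "L", "I", "SigI", "BATCH", "ewald_offset"],
    parallel=True,
    num_cpus=4,
)

print(ds.dtypes)  # MTZ dtypes assigned by the reader (H, J, Q, B, R)
print(ds.head())  # unmerged intensities indexed by (H, K, L)
```

Because ray is optional, the only change needed to run serially is `parallel=False` (or simply leaving ray uninstalled).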