reciprocalspaceship 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of reciprocalspaceship might be problematic. Click here for more details.
- reciprocalspaceship/VERSION +1 -1
- reciprocalspaceship/__init__.py +9 -2
- reciprocalspaceship/algorithms/scale_merged_intensities.py +8 -7
- reciprocalspaceship/dataset.py +28 -3
- reciprocalspaceship/decorators.py +8 -4
- reciprocalspaceship/dtypes/floating.py +24 -28
- reciprocalspaceship/dtypes/integer.py +38 -37
- reciprocalspaceship/dtypes/internals.py +243 -49
- reciprocalspaceship/io/__init__.py +1 -0
- reciprocalspaceship/io/common.py +48 -0
- reciprocalspaceship/io/crystfel.py +559 -234
- reciprocalspaceship/io/dials.py +330 -0
- reciprocalspaceship/io/dials_mpi.py +44 -0
- reciprocalspaceship/io/mtz.py +4 -5
- reciprocalspaceship/utils/__init__.py +6 -1
- reciprocalspaceship/utils/cell.py +5 -0
- reciprocalspaceship/utils/stats.py +5 -7
- reciprocalspaceship/utils/structurefactors.py +5 -0
- reciprocalspaceship/utils/units.py +14 -4
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/METADATA +27 -28
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/RECORD +28 -24
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/WHEEL +1 -1
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/entry_points.txt +0 -1
- tests/test_dataseries.py +1 -1
- tests/test_dataset.py +42 -0
- tests/test_dataset_signatures.py +53 -0
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/LICENSE +0 -0
- {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/top_level.txt +0 -0
|
@@ -1,270 +1,595 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
|
|
4
|
-
from reciprocalspaceship import DataSet
|
|
5
|
-
from reciprocalspaceship.utils import angle_between
|
|
1
|
+
import mmap
|
|
2
|
+
import re
|
|
3
|
+
from typing import Union
|
|
6
4
|
|
|
5
|
+
import gemmi
|
|
6
|
+
import numpy as np
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
from reciprocalspaceship import DataSet, concat
|
|
9
|
+
from reciprocalspaceship.io.common import check_for_ray, ray_context
|
|
10
|
+
from reciprocalspaceship.utils import angle_between, eV2Angstroms
|
|
11
|
+
|
|
12
|
+
# See Rupp Table 5-2
# Each entry maps a lattice type name to a callable that takes the six cell
# parameters (indexable as x[0]..x[5]) and returns them with the constraints
# of that lattice type imposed.
def _constrain_equal_ab(x, gamma):
    """Replace a and b by their average, keep c, and set the angles to (90, 90, gamma)."""
    ab = 0.5 * (x[0] + x[1])
    return [ab, ab, x[2], 90.0, 90.0, gamma]


def _constrain_equal_abc(x):
    """Replace a, b and c by their mean and set all three angles to 90."""
    edge = np.mean(x[:3])
    return [edge, edge, edge, 90.0, 90.0, 90.0]


_cell_constraints = {
    # No constraints: the input is returned unchanged (same object).
    "triclinic": lambda x: x,
    "orthorhombic": lambda x: [x[0], x[1], x[2], 90.0, 90.0, 90.0],
    # Only the fifth parameter (x[4]) is kept free among the angles.
    "monoclinic": lambda x: [x[0], x[1], x[2], 90.0, x[4], 90.0],
    "hexagonal": lambda x: _constrain_equal_ab(x, 120.0),
    "rhombohedral": lambda x: _constrain_equal_ab(x, 120.0),
    "cubic": _constrain_equal_abc,
    "tetragonal": lambda x: _constrain_equal_ab(x, 90.0),
}
|
|
50
|
+
|
|
51
|
+
# See crystFEL API reference here: https://www.desy.de/~twhite/crystfel/reference/stream_8h.html
# Maps a block name to the (begin, end) marker text that delimits that block
# in a stream file. The markers are used as regular expression sources.
_block_markers = dict(
    geometry=(
        r"----- Begin geometry file -----",
        r"----- End geometry file -----",
    ),
    chunk=(
        r"----- Begin chunk -----",
        r"----- End chunk -----",
    ),
    cell=(
        r"----- Begin unit cell -----",
        r"----- End unit cell -----",
    ),
    peaks=(
        r"Peaks from peak search",
        r"End of peak list",
    ),
    crystal=(
        r"--- Begin crystal",
        r"--- End crystal",
    ),
    reflections=(
        r"Reflections measured after indexing",
        r"End of reflections",
    ),
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class StreamLoader(object):
    """
    An object that loads stream files into rs.DataSet objects in parallel.

    Attributes
    ----------
    filename : str
        Path of the stream file to parse.
    encoding : str
        Encoding used to build the byte-string patterns and to decode file contents.
    block_regex : dict
        A dictionary of compiled regular expressions that operate on strings
    block_regex_bytes : dict
        A dictionary of compiled regular expressions that operate on byte strings
    """

    # Column index of every quantity in the peak arrays built by `_parse_chunk`.
    peak_list_columns = {
        "H": 0,
        "K": 1,
        "L": 2,
        "I": 3,
        "SigI": 4,
        "peak": 5,
        "background": 6,
        "XDET": 7,
        "YDET": 8,
        "s1x": 9,
        "s1y": 10,
        "s1z": 11,
        "ewald_offset": 12,
        "angular_ewald_offset": 13,
        "ewald_offset_x": 14,
        "ewald_offset_y": 15,
        "ewald_offset_z": 16,
    }

    def __init__(self, filename: str, encoding="utf-8"):
        """
        Store the file name and compile every regular expression used for parsing.

        Parameters
        ----------
        filename : str
            Path of the stream file.
        encoding : str
            Encoding of the stream file (optional, 'utf-8').
        """
        self.filename = filename
        self.encoding = encoding
        self.block_regex = {}
        self.block_regex_bytes = {}

        # Set up all the regular expressions for finding block boundaries
        for k, (beginning, ending) in _block_markers.items():
            # Non-greedy match of everything between a begin and an end marker
            block_pattern = f"(?s){beginning}\n(?P<CRYSTAL_BLOCK>.*?)\n{ending}"

            self.block_regex[k + "_begin"] = re.compile(beginning)
            self.block_regex[k + "_end"] = re.compile(ending)
            self.block_regex[k] = re.compile(block_pattern)

            self.block_regex_bytes[k + "_begin"] = re.compile(
                beginning.encode(self.encoding)
            )
            self.block_regex_bytes[k + "_end"] = re.compile(
                ending.encode(self.encoding)
            )
            self.block_regex_bytes[k] = re.compile(block_pattern.encode(self.encoding))

        self.re_abcstar = re.compile("[abc]star =.+\n")
        self.re_photon_energy = re.compile("photon_energy_eV =.+\n")

        self.re_chunk_metadata = {
            "Image filename": re.compile(r"(?<=Image filename: ).+(?=\n)"),
            "Event": re.compile(r"(?<=Event: ).+(?=\n)"),
            # The key with a trailing colon is kept for backward compatibility
            "Image serial number:": re.compile(r"(?<=Image serial number: ).+(?=\n)"),
            "Image serial number": re.compile(r"(?<=Image serial number: ).+(?=\n)"),
            "indexed_by": re.compile(r"(?<=indexed_by \= ).+(?=\n)"),
            "photon_energy_eV": re.compile(r"(?<=photon_energy_eV \= ).+(?=\n)"),
            "beam_divergence": re.compile(r"(?<=beam_divergence \= ).+(?=\n)"),
            "beam_bandwidth": re.compile(r"(?<=beam_bandwidth \= ).+(?=\n)"),
        }

        self.re_crystal_metadata = {
            "Cell parameters": re.compile(r"(?<=Cell parameters).+(?=\n)"),
            "astar": re.compile(r"(?<=astar = ).+(?=\n)"),
            "bstar": re.compile(r"(?<=bstar = ).+(?=\n)"),
            "cstar": re.compile(r"(?<=cstar = ).+(?=\n)"),
            "lattice_type": re.compile(r"(?<=lattice_type = ).+(?=\n)"),
            "centering": re.compile(r"(?<=centering = ).+(?=\n)"),
            "unique_axis": re.compile(r"(?<=unique_axis = ).+(?=\n)"),
            "profile_radius": re.compile(r"(?<=profile_radius = ).+(?=\n)"),
            "predict_refine/det_shift": re.compile(
                r"(?<=predict_refine/det_shift ).+(?=\n)"
            ),
            "predict_refine/R": re.compile(r"(?<=predict_refine/R ).+(?=\n)"),
            "diffraction_resolution_limit": re.compile(
                r"(?<=diffraction_resolution_limit = ).+(?=\n)"
            ),
            "num_reflections": re.compile(r"(?<=num_reflections = ).+(?=\n)"),
        }

        # TODO: replace these with the faster, non-variable-length equivalents
        self.re_crystal = re.compile(
            r"(?s)--- Begin crystal\n(?P<CRYSTAL_BLOCK>.*?)\n--- End crystal"
        )
        self.re_refls = re.compile(
            r"(?s)Reflections measured after indexing\n(?P<REFL_BLOCK>.*?)\nEnd of reflections"
        )

    @staticmethod
    def _find_lattice_type(header):
        """Return the value of the last 'lattice_type =' line in header, or None."""
        lattice_type = None
        for line in header.split("\n"):
            if line.startswith("lattice_type ="):
                lattice_type = line.split()[-1]
        return lattice_type

    @staticmethod
    def _collect_metadata(patterns, text, metadata_keys):
        """Apply patterns[k] to text for each requested key, keeping the last match."""
        if metadata_keys is None:
            return None
        result = {}
        for k in metadata_keys:
            # Keys without any match are left out of the result
            for v in patterns[k].findall(text):
                result[k] = v
        return result

    def extract_target_unit_cell(self) -> Union[list, None]:
        """
        Search the file header for target unit cell parameters.

        Returns
        -------
        cell : list or None
            The six cell parameters read from the 'a = ', 'b = ', 'c = ', 'al = ',
            'be = ' and 'ga = ' header lines, with the constraints of the header's
            lattice_type applied when one is declared. Parameters missing from the
            header are None. Returns None if the header has no cell parameters.
        """
        header = self.extract_file_header()
        prefixes = ("a = ", "b = ", "c = ", "al = ", "be = ", "ga = ")
        cell = None

        for line in header.split("\n"):
            for idx, prefix in enumerate(prefixes):
                if line.startswith(prefix):
                    if cell is None:
                        cell = [None] * 6
                    cell[idx] = float(line.split()[2])
                    break

        lattice_type = self._find_lattice_type(header)
        # Constraints can only be imposed on a cell that was actually found
        if cell is not None and lattice_type is not None:
            cell = _cell_constraints[lattice_type](cell)
        return cell

    def calculate_average_unit_cell(self) -> Union[list, np.ndarray]:
        """
        Compute the average of all cell parameters across the file.

        Returns
        -------
        cell : np.ndarray or list
            The mean of every 'Cell parameters' line in the file, with the three
            lengths multiplied by 10. If the header declares a lattice_type, the
            corresponding constraints are applied to the averaged values.

        Raises
        ------
        ValueError
            If the file contains no 'Cell parameters' line.
        """
        regex = re.compile(rb"Cell parameters .+\n")
        with open(self.filename, "rb") as f:
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as memfile:
                lines = regex.findall(memfile)
        if len(lines) == 0:
            raise ValueError(
                f"No unit cell parameters were found in the header of {self.filename}"
            )

        # ndmin=2 keeps a single matching line two-dimensional so that the mean
        # over axis 0 always yields six values
        cell = np.loadtxt(
            [line.decode(self.encoding) for line in lines],
            usecols=[2, 3, 4, 6, 7, 8],
            dtype="float32",
            ndmin=2,
        ).mean(0)
        # NOTE(review): presumably converts nm to angstroms -- confirm against the stream format
        cell[:3] *= 10.0

        lattice_type = self._find_lattice_type(self.extract_file_header())
        if lattice_type is not None:
            cell = _cell_constraints[lattice_type](cell)
        return cell

    def extract_file_header(self) -> str:
        """
        Extract all the data prior to first chunk and return it as a string.

        If the file contains no chunk, the whole file is returned.
        """
        with open(self.filename, "rb") as f:
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as memfile:
                match = self.block_regex_bytes["chunk_begin"].search(memfile)
                stop = len(memfile) if match is None else match.start()
                header = memfile[:stop].decode(self.encoding)
        return header

    @property
    def available_column_names(self) -> list:
        """Keys which can be passed to parallel_read_crystfel to customize the peak list output"""
        return list(self.peak_list_columns.keys())

    @property
    def available_chunk_metadata_keys(self) -> list:
        """Keys which can be passed to parallel_read_crystfel to customize the chunk level metadata"""
        return list(self.re_chunk_metadata.keys())

    @property
    def available_crystal_metadata_keys(self) -> list:
        """Keys which can be passed to parallel_read_crystfel to customize the crystal level metadata"""
        return list(self.re_crystal_metadata.keys())

    def read_crystfel(
        self,
        wavelength=None,
        chunk_metadata_keys=None,
        crystal_metadata_keys=None,
        peak_list_columns=None,
        use_ray=True,
        num_cpus=None,
        address="local",
        **ray_kwargs,
    ) -> list:
        """
        Parse a CrystFEL stream file using multiple processors. Parallelization depends on the ray library (https://www.ray.io/).
        If ray is unavailable, this method falls back to serial processing on one CPU. Ray is not a dependency of reciprocalspaceship
        and will not be installed automatically. Users must manually install it prior to calling this method.

        PARAMETERS
        ----------
        wavelength : float
            Override the wavelength with this value. Wavelength is used to compute Ewald offsets.
        chunk_metadata_keys : list
            A list of metadata_keys which will be returned in the resulting dictionaries under the 'chunk_metadata' entry.
            A list of possible keys is stored as stream_loader.available_chunk_metadata_keys
        crystal_metadata_keys : list
            A list of metadata_keys which will be returned in the resulting dictionaries under the 'crystal_metadata' entry.
            A list of possible keys is stored as stream_loader.available_crystal_metadata_keys
        peak_list_columns : list
            A list of columns to include in the peak list numpy arrays.
            A list of possible column names is stored as stream_loader.available_column_names.
        use_ray : bool (optional)
            Whether or not to use ray for parallelization.
        num_cpus : int (optional)
            The number of cpus for ray to use.
        address : str (optional)
            NOTE(review): accepted but currently not used by this method -- confirm
            whether it should be forwarded to the ray context.
        ray_kwargs : optional
            Additional keyword arguments to pass to [ray.init](https://docs.ray.io/en/latest/ray-core/api/doc/ray.init.html#ray.init).

        RETURNS
        -------
        chunks : list
            A list of dictionaries containing the per-chunk data. The 'peak_lists' item contains
            one numpy array per crystal. Unless peak_list_columns is given, each array has
            shape n x 17 with the following information.
                h, k, l, I, SIGI, peak, background, fs/px, ss/px, s1x, s1y, s1z,
                ewald_offset, angular_ewald_offset, ewald_offset_x, ewald_offset_y, ewald_offset_z
        """
        if peak_list_columns is not None:
            # Translate column names into column indices of the peak arrays
            peak_list_columns = [self.peak_list_columns[s] for s in peak_list_columns]

        # Check whether ray is available
        if use_ray:
            use_ray = check_for_ray()

        # Locate the byte range of every chunk, then release the memory map
        with open(self.filename, "rb") as f:
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as memfile:
                spans = [
                    (begin.start(), end.end())
                    for begin, end in zip(
                        self.block_regex_bytes["chunk_begin"].finditer(memfile),
                        self.block_regex_bytes["chunk_end"].finditer(memfile),
                    )
                ]

        parse_args = (
            wavelength,
            chunk_metadata_keys,
            crystal_metadata_keys,
            peak_list_columns,
        )

        if not use_ray:
            return [self._parse_chunk(start, end, *parse_args) for start, end in spans]

        with ray_context(num_cpus=num_cpus, **ray_kwargs) as ray:

            @ray.remote
            def parse_chunk(loader: StreamLoader, *args):
                return loader._parse_chunk(*args)

            result_ids = [
                parse_chunk.remote(self, start, end, *parse_args)
                for start, end in spans
            ]
            results = ray.get(result_ids)

        return results

    def _extract_chunk_metadata(self, chunk_text, metadata_keys=None):
        """Return {key: last match} for the requested chunk metadata keys, or None if no keys are given."""
        return self._collect_metadata(self.re_chunk_metadata, chunk_text, metadata_keys)

    def _extract_crystal_metadata(self, xtal_text, metadata_keys=None):
        """Return {key: last match} for the requested crystal metadata keys, or None if no keys are given."""
        return self._collect_metadata(self.re_crystal_metadata, xtal_text, metadata_keys)

    def _parse_chunk(
        self,
        start,
        end,
        wavelength,
        chunk_metadata_keys,
        crystal_metadata_keys,
        peak_list_columns,
    ):
        """
        Parse the chunk stored between byte offsets start and end of the file.

        Parameters
        ----------
        start, end : int
            Byte offsets delimiting the chunk within the stream file.
        wavelength : float or None
            Wavelength to use. If None, it is computed from the chunk's
            photon_energy_eV entry.
        chunk_metadata_keys : list or None
            Keys of `re_chunk_metadata` to extract from the text preceding the first crystal.
        crystal_metadata_keys : list or None
            Keys of `re_crystal_metadata` to extract for each reflection list.
        peak_list_columns : list or None
            Column indices to keep in each peak array; None keeps every column.

        Returns
        -------
        result : dict
            Contains 'wavelength', 'A_matrices' (one 3x3 array per crystal) and
            'peak_lists' (one array per reflection list). 'chunk_metadata' and
            'crystal_metadata' are included when the corresponding keys are given.

        Raises
        ------
        ValueError
            If wavelength is None and the chunk has no photon_energy_eV entry.
        """
        # The offsets come from a byte-level search, so the file has to be read as
        # bytes. Newlines are normalized because the patterns only match "\n".
        with open(self.filename, "rb") as f:
            f.seek(start)
            raw = f.read(end - start)
        data = raw.decode(self.encoding).replace("\r\n", "\n")

        if wavelength is None:
            ev_match = self.re_photon_energy.search(data)
            if ev_match is None:
                raise ValueError(
                    f"No photon_energy_eV entry was found in a chunk of {self.filename}; "
                    "please supply the wavelength explicitly"
                )
            photon_energy = np.float32(ev_match.group().split()[2])
            wavelength = eV2Angstroms(photon_energy)
        elif isinstance(wavelength, (int, np.integer)):
            # np.reciprocal performs integer arithmetic on integer input
            wavelength = float(wavelength)
        lambda_inv = np.reciprocal(wavelength)
        s0 = np.array([0, 0, lambda_inv], dtype="float32")

        peak_lists = []
        a_matrices = []
        crystal_metadata = []
        header = None
        for xmatch in self.re_crystal.finditer(data):
            xdata = xmatch.group()
            if header is None:
                # Everything before the first crystal is the chunk header
                header = data[: xmatch.start()]

            # NOTE(review): the division by 10 presumably converts 1/nm to 1/angstrom -- confirm
            A = (
                np.loadtxt(
                    self.re_abcstar.findall(xdata),
                    usecols=[2, 3, 4],
                    dtype="float32",
                ).T
                / 10.0
            )
            a_matrices.append(A)

            for pmatch in self.re_refls.finditer(xdata):
                pdata = pmatch.group()
                crystal_metadata.append(
                    self._extract_crystal_metadata(xdata, crystal_metadata_keys)
                )

                # Drop the marker line, the column header and the end marker
                rows = pdata.split("\n")[2:-1]
                if rows:
                    # ndmin=2 keeps a single reflection two-dimensional
                    peak_array = np.loadtxt(
                        rows,
                        usecols=(0, 1, 2, 3, 4, 5, 6, 7, 8),
                        dtype="float32",
                        ndmin=2,
                    )
                else:
                    peak_array = np.empty((0, 9), dtype="float32")

                q = (A @ peak_array[:, :3].T).T
                s1 = q + s0

                # This is way faster than np.linalg.norm for small dimensions
                x, y, z = s1.T
                s1_norm = np.sqrt(x * x + y * y + z * z)
                ewald_offset = s1_norm - lambda_inv

                # project calculated s1 onto the ewald sphere
                s1_obs = lambda_inv * s1 / s1_norm[:, None]

                # Compute the angular ewald offset
                q_obs = s1_obs - s0
                qangle = np.sign(ewald_offset) * angle_between(q, q_obs)

                peak_array = np.concatenate(
                    (
                        peak_array,
                        s1,
                        ewald_offset[:, None],
                        qangle[:, None],
                        s1_obs - s1,  # Ewald offset vector
                    ),
                    axis=-1,
                )
                if peak_list_columns is not None:
                    peak_array = peak_array[:, peak_list_columns]
                peak_lists.append(peak_array)

        if header is None:
            # No crystal in this chunk: the whole chunk is header
            header = data
        chunk_metadata = self._extract_chunk_metadata(header, chunk_metadata_keys)

        result = {
            "wavelength": wavelength,
            "A_matrices": a_matrices,
            "peak_lists": peak_lists,
        }
        if chunk_metadata_keys is not None:
            result["chunk_metadata"] = chunk_metadata
        if crystal_metadata_keys is not None:
            result["crystal_metadata"] = crystal_metadata
        return result
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def read_crystfel(
    streamfile: str,
    spacegroup=None,
    encoding="utf-8",
    columns=None,
    parallel=True,
    num_cpus=None,
    address="local",
    **ray_kwargs,
) -> DataSet:
    """
    Initialize attributes and populate the DataSet object with data from a CrystFEL stream with indexed reflections.
    This is the output format used by CrystFEL software when processing still diffraction data.

    This method is parallelized across CPUs to speed up parsing. Parallelization depends on the ray library (https://www.ray.io/).
    If ray is unavailable, this method falls back to serial processing on one CPU. Ray is not a dependency of reciprocalspaceship
    and will not be installed automatically. Users must manually install it prior to calling this method.

    Parameters
    ----------
    streamfile : str
        name of a .stream file
    spacegroup : gemmi.SpaceGroup or int or string (optional)
        optionally set the spacegroup of the returned DataSet.
    encoding : str
        The type of byte-encoding (optional, 'utf-8').
    columns : list (optional)
        Optionally specify the columns of the output by a list of strings.
        The default list is:  [ "H", "K", "L", "I", "SigI", "BATCH", "s1x", "s1y", "s1z", "ewald_offset", "angular_ewald_offset", "XDET", "YDET" ]
        See `rs.io.crystfel.StreamLoader().available_column_names` for a list of available
        column names and *Notes* for a description of the returned columns.
        "H", "K" and "L" must be included because they form the index of the result.
        A "BATCH" column is always added to the result.
    parallel : bool (optional)
        Read the stream file in parallel using [ray.io](https://docs.ray.io) if it is available.
    num_cpus : int (optional)
        By default, the model will use all available cores. For very large cpu counts, this may consume
        too much memory. Decreasing num_cpus may help. If ray is not installed, a single core will be used.
    address : str (optional)
        Optionally specify the ray instance to connect to. By default, start a new local instance.
    ray_kwargs : optional
        Additional keyword arguments to pass to [ray.init](https://docs.ray.io/en/latest/ray-core/api/doc/ray.init.html#ray.init).

    Returns
    --------
    rs.DataSet

    Raises
    ------
    ValueError
        If streamfile does not end with ".stream", or if the file contains no reflection lists.

    Notes
    -----
    The following columns are included in the returned DataSet object:

    - H, K, L: Miller indices of each reflection
    - I, SigI: Intensity and associated uncertainty
    - BATCH: Image number
    - s1x, s1y, s1z: scattered beam wavevector which points from the sample to the bragg peak
    - ewald_offset: the distance in cartesian space (1/angstroms) between the observed reflection and the ewald sphere
    - angular_ewald_offset: the distance in polar coordinates (degrees) between the observed reflection and the ewald sphere
    - XDET, YDET: Internal detector panel coordinates
    """
    if not streamfile.endswith(".stream"):
        raise ValueError("Stream file should end with .stream")

    # read data from stream file
    if columns is None:
        columns = [
            "H",
            "K",
            "L",
            "I",
            "SigI",
            "BATCH",
            "s1x",
            "s1y",
            "s1z",
            "ewald_offset",
            "angular_ewald_offset",
            "XDET",
            "YDET",
        ]
    peak_list_columns = [
        i for i in columns if i != "BATCH"
    ]  # BATCH is computed afterward

    loader = StreamLoader(streamfile, encoding=encoding)
    cell = loader.extract_target_unit_cell()

    # Every reflection list gets its own consecutive BATCH number
    batch = 0
    ds = []

    for chunk in loader.read_crystfel(
        peak_list_columns=peak_list_columns,
        use_ray=parallel,
        num_cpus=num_cpus,
        address=address,
        **ray_kwargs,
    ):
        for peak_list in chunk["peak_lists"]:
            _ds = DataSet(
                peak_list,
                columns=peak_list_columns,
                cell=cell,
                spacegroup=spacegroup,
                merged=False,
            )
            _ds["BATCH"] = batch
            ds.append(_ds)
            batch += 1

    if not ds:
        raise ValueError(f"No reflection lists were found in {streamfile}")

    ds = concat(ds, axis=0, check_isomorphous=False, copy=False, ignore_index=True)

    known_dtypes = {
        "H": "H",
        "K": "H",
        "L": "H",
        "I": "J",
        "SigI": "Q",
        "BATCH": "B",
    }
    # Only map columns that are actually present; everything without a
    # dedicated dtype is stored as "R"
    mtz_dtypes = {k: known_dtypes.get(k, "R") for k in ds}

    ds = ds.astype(mtz_dtypes, copy=False)
    ds.set_index(["H", "K", "L"], inplace=True)

    return ds
|