reciprocalspaceship 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of reciprocalspaceship might be problematic. Click here for more details.

@@ -1,270 +1,604 @@
1
- import numpy as np
2
- import pandas as pd
3
-
4
- from reciprocalspaceship import DataSet
5
- from reciprocalspaceship.utils import angle_between
1
+ import mmap
2
+ import re
3
+ from contextlib import contextmanager
4
+ from importlib.util import find_spec
5
+ from typing import Union
6
6
 
7
+ import gemmi
8
+ import numpy as np
7
9
 
8
- def _parse_stream(filename: str) -> dict:
10
+ from reciprocalspaceship import DataSet, concat
11
+ from reciprocalspaceship.utils import angle_between, eV2Angstroms
12
+
13
# See Rupp Table 5-2
def _unconstrained(x):
    """Return the cell parameters untouched (same object)."""
    return x


def _right_angled(x):
    """Keep a, b, c and force all three angles to 90 degrees."""
    return [x[0], x[1], x[2], 90.0, 90.0, 90.0]


def _beta_free(x):
    """Keep a, b, c and beta; force alpha and gamma to 90 degrees."""
    return [x[0], x[1], x[2], 90.0, x[4], 90.0]


def _a_equals_b(gamma):
    """Build a constraint that averages a and b and fixes the angles to (90, 90, gamma)."""

    def constrain(x):
        return [
            0.5 * (x[0] + x[1]),
            0.5 * (x[0] + x[1]),
            x[2],
            90.0,
            90.0,
            gamma,
        ]

    return constrain


def _all_edges_equal(x):
    """Replace a, b and c by their mean and force all angles to 90 degrees."""
    edge = np.mean(x[:3])
    return [edge, edge, edge, 90.0, 90.0, 90.0]


# Maps a lattice type name to a callable that takes the six cell parameters
# (a, b, c, alpha, beta, gamma) and returns them with the constraints applied.
_cell_constraints = {
    "triclinic": _unconstrained,
    "orthorhombic": _right_angled,
    "monoclinic": _beta_free,
    "hexagonal": _a_equals_b(120.0),
    "rhombohedral": _a_equals_b(120.0),
    "cubic": _all_edges_equal,
    "tetragonal": _a_equals_b(90.0),
}
51
+
52
# See crystFEL API reference here: https://www.desy.de/~twhite/crystfel/reference/stream_8h.html
# Each entry maps a block name to the (begin, end) marker text delimiting that
# block inside a stream file. The markers are compiled as regular expressions
# by StreamLoader.
_block_markers = dict(
    geometry=(r"----- Begin geometry file -----", r"----- End geometry file -----"),
    chunk=(r"----- Begin chunk -----", r"----- End chunk -----"),
    cell=(r"----- Begin unit cell -----", r"----- End unit cell -----"),
    peaks=(r"Peaks from peak search", r"End of peak list"),
    crystal=(r"--- Begin crystal", r"--- End crystal"),
    reflections=(r"Reflections measured after indexing", r"End of reflections"),
)
61
+
62
+
63
@contextmanager
def ray_context(**ray_kwargs):
    """
    Context manager that initializes ray on entry and shuts it down on exit.

    ray is imported lazily so that it is only required when this context is
    actually entered.

    Parameters
    ----------
    ray_kwargs : optional
        Keyword arguments forwarded unchanged to ``ray.init``.

    Yields
    ------
    module
        The imported ``ray`` module.
    """
    import ray as ray_module

    ray_module.init(**ray_kwargs)
    try:
        yield ray_module
    finally:
        # Always shut down, even when the body of the ``with`` block raised.
        ray_module.shutdown()
72
+
73
+
74
+ class StreamLoader(object):
9
75
  """
10
- Parses stream and returns all indexed peak positions
11
-
12
- Parameters
76
+ An object that loads stream files into rs.DataSet objects in parallel.
77
+ Attributes
13
78
  ----------
14
- filename : stream filename
15
- name of a .stream file
16
-
17
- Returns
18
- --------
19
- (dict, np.ndarray)
79
+ block_regex_bytes : dict
80
+ A dictionary of compiled regular expressions that operate on byte strings
81
+ block_regex : dict
82
+ A dictionary of compiled regular expressions that operate on strings
20
83
  """
21
84
 
22
- answ_crystals = {}
23
-
24
- def contains_filename(s):
25
- return s.startswith("Image filename")
26
-
27
- def contains_event(s):
28
- return s.startswith("Event")
29
-
30
- def contains_serial_number(s):
31
- return s.startswith("Image serial number")
32
-
33
- def starts_chunk_peaks(s):
34
- return s.startswith(" fs/px ss/px (1/d)/nm^-1 Intensity Panel")
35
-
36
- def ends_chunk_peaks(s):
37
- return s.startswith("End of peak list")
85
# Maps each available peak list column name to its column index in the
# arrays produced by _parse_chunk.
peak_list_columns = {
    column_name: column_index
    for column_index, column_name in enumerate(
        (
            "H",
            "K",
            "L",
            "I",
            "SigI",
            "peak",
            "background",
            "XDET",
            "YDET",
            "s1x",
            "s1y",
            "s1z",
            "ewald_offset",
            "angular_ewald_offset",
            "ewald_offset_x",
            "ewald_offset_y",
            "ewald_offset_z",
        )
    )
}
38
104
 
39
- def starts_crystal_peaks(s):
40
- return s.startswith(
41
- " h k l I sigma(I) peak background fs/px ss/px panel"
105
def __init__(self, filename: str, encoding="utf-8"):
    """
    Remember the stream file location and pre-compile all parsing regexes.

    Parameters
    ----------
    filename : str
        Path of the stream file to load.
    encoding : str
        Encoding used to build the byte-string variants of the block patterns.
    """
    self.filename = filename
    self.encoding = encoding

    # Set up all the regular expressions for finding block boundaries.
    # For every block name three patterns are stored: "<name>_begin",
    # "<name>_end" and "<name>" (the whole block), once for str input and
    # once for bytes input.
    text_patterns = {}
    byte_patterns = {}
    for name, (begin_marker, end_marker) in _block_markers.items():
        whole_block = f"(?s){begin_marker}\n(?P<CRYSTAL_BLOCK>.*?)\n{end_marker}"
        for suffix, pattern in (
            ("_begin", begin_marker),
            ("_end", end_marker),
            ("", whole_block),
        ):
            text_patterns[name + suffix] = re.compile(pattern)
            byte_patterns[name + suffix] = re.compile(pattern.encode(self.encoding))
    self.block_regex = text_patterns
    self.block_regex_bytes = byte_patterns

    self.re_abcstar = re.compile("[abc]star =.+\n")
    self.re_photon_energy = re.compile("photon_energy_eV =.+\n")

    # Chunk-level metadata: each pattern captures the rest of the line that
    # follows the named field.
    chunk_patterns = (
        ("Image filename", r"(?<=Image filename: ).+(?=\n)"),
        ("Event", r"(?<=Event: ).+(?=\n)"),
        ("Image serial number:", r"(?<=Image serial number: ).+(?=\n)"),
        ("indexed_by", r"(?<=indexed_by \= ).+(?=\n)"),
        ("photon_energy_eV", r"(?<=photon_energy_eV \= ).+(?=\n)"),
        ("beam_divergence", r"(?<=beam_divergence \= ).+(?=\n)"),
        ("beam_bandwidth", r"(?<=beam_bandwidth \= ).+(?=\n)"),
    )
    self.re_chunk_metadata = {
        key: re.compile(pattern) for key, pattern in chunk_patterns
    }

    # Crystal-level metadata, same capture convention as above.
    crystal_patterns = (
        ("Cell parameters", r"(?<=Cell parameters).+(?=\n)"),
        ("astar", r"(?<=astar = ).+(?=\n)"),
        ("bstar", r"(?<=bstar = ).+(?=\n)"),
        ("cstar", r"(?<=cstar = ).+(?=\n)"),
        ("lattice_type", r"(?<=lattice_type = ).+(?=\n)"),
        ("centering", r"(?<=centering = ).+(?=\n)"),
        ("unique_axis", r"(?<=unique_axis = ).+(?=\n)"),
        ("profile_radius", r"(?<=profile_radius = ).+(?=\n)"),
        ("predict_refine/det_shift", r"(?<=predict_refine/det_shift ).+(?=\n)"),
        ("predict_refine/R", r"(?<=predict_refine/R ).+(?=\n)"),
        (
            "diffraction_resolution_limit",
            r"(?<=diffraction_resolution_limit = ).+(?=\n)",
        ),
        ("num_reflections", r"(?<=num_reflections = ).+(?=\n)"),
    )
    self.re_crystal_metadata = {
        key: re.compile(pattern) for key, pattern in crystal_patterns
    }

    # TODO: replace these with the faster, non-variable-length equivalents
    self.re_crystal = re.compile(
        r"(?s)--- Begin crystal\n(?P<CRYSTAL_BLOCK>.*?)\n--- End crystal"
    )
    self.re_refls = re.compile(
        r"(?s)Reflections measured after indexing\n(?P<REFL_BLOCK>.*?)\nEnd of reflections"
    )
43
170
 
44
- def is_photon_energy(s):
45
- return s.startswith("photon_energy_eV")
46
-
47
- def is_astar(s):
48
- return s.startswith("astar")
49
-
50
- def is_bstar(s):
51
- return s.startswith("bstar")
52
-
53
- def is_cstar(s):
54
- return s.startswith("cstar")
55
-
56
- def ends_crystal_peaks(s):
57
- return s.startswith("End of reflections")
58
-
59
- def eV2Angstrom(e_eV):
60
- return 12398.0 / e_eV
61
-
62
- # add unit cell parameters parsing
63
- with open(filename, "r") as stream:
64
- is_unit_cell = False
65
- get_cellparam = lambda s: float(s.split()[2])
66
- rv_cell_param = None
67
- a, b, c, al, be, ga = [
68
- None
69
- ] * 6 # None's are needed since stream not always has all 6 parameters
70
- for line in stream:
71
- if "Begin unit cell" in line:
72
- is_unit_cell = True
73
- continue
74
- elif is_unit_cell:
75
- if line.startswith("a ="):
76
- a = get_cellparam(line)
77
- if line.startswith("b ="):
78
- b = get_cellparam(line)
79
- if line.startswith("c ="):
80
- c = get_cellparam(line)
81
- if line.startswith("al ="):
82
- al = get_cellparam(line)
83
- if line.startswith("be ="):
84
- be = get_cellparam(line)
85
- if line.startswith("ga ="):
86
- ga = get_cellparam(line)
87
- is_unit_cell = False # gamma is the last parameters
88
- elif "End unit cell" in line:
89
- rv_cell_param = np.array([a, b, c, al, be, ga])
90
- break
91
-
92
- with open(filename, "r") as stream:
93
- is_chunk = False
94
- is_crystal = False
95
- current_filename = None
96
- current_event = None # to handle non-event streams
97
- current_serial_number = None
98
- corrupted_chunk = False
99
- crystal_peak_number = 0
100
- crystal_idx = 0
101
-
102
- for line in stream:
103
- # analyzing what we have
104
- if ends_chunk_peaks(line):
105
- is_chunk = False
106
- chunk_peak_number = 0
107
- elif ends_crystal_peaks(line):
108
- is_crystal = False
109
- crystal_peak_number = 0
110
-
111
- elif is_photon_energy(line):
112
- photon_energy = float(line.split()[2])
113
- elif is_astar(line):
114
- astar = (
115
- np.array(line.split()[2:5], dtype="float32") / 10.0
116
- ) # crystfel's notation uses nm-1
117
- elif is_bstar(line):
118
- bstar = (
119
- np.array(line.split()[2:5], dtype="float32") / 10.0
120
- ) # crystfel's notation uses nm-1
121
- elif is_cstar(line):
122
- cstar = (
123
- np.array(line.split()[2:5], dtype="float32") / 10.0
124
- ) # crystfel's notation uses nm-1
125
-
126
- # since it's the last line needed to construct Ewald offset,
127
- # we'll pre-compute the matrices here
128
- A = np.array([astar, bstar, cstar]).T
129
- lambda_inv = 1 / eV2Angstrom(photon_energy)
130
- s0 = np.array([0, 0, lambda_inv]).T
131
-
132
- elif is_crystal:
133
- # example line:
134
- # h k l I sigma(I) peak background fs/px ss/px panel
135
- # -63 41 9 -41.31 57.45 195.00 170.86 731.0 1350.4 p0
136
- crystal_peak_number += 1
137
- h, k, l, I, sigmaI, peak, background, xdet, ydet, panel = [
138
- i for i in line.split()
139
- ]
140
- h, k, l = map(int, [h, k, l])
141
-
142
- # calculate ewald offset and s1
143
- hkl = np.array([h, k, l])
144
- q = A @ hkl
145
- s1 = q + s0
146
- s1x, s1y, s1z = s1
147
- s1_norm = np.linalg.norm(s1)
148
- ewald_offset = s1_norm - lambda_inv
149
-
150
- # project calculated s1 onto the ewald sphere
151
- s1_obs = lambda_inv * s1 / s1_norm
152
-
153
- # Compute the angular ewald offset
154
- q_obs = s1_obs - s0
155
- qangle = np.sign(ewald_offset) * angle_between(q, q_obs)
156
-
157
- record = {
158
- "H": h,
159
- "K": k,
160
- "L": l,
161
- "I": float(I),
162
- "SigI": float(sigmaI),
163
- "BATCH": crystal_idx,
164
- "s1x": s1x,
165
- "s1y": s1y,
166
- "s1z": s1z,
167
- "ewald_offset": ewald_offset,
168
- "angular_ewald_offset": qangle,
169
- "XDET": float(xdet),
170
- "YDET": float(ydet),
171
- }
172
- if current_event is not None:
173
- name = (
174
- current_filename,
175
- current_event,
176
- current_serial_number,
177
- crystal_idx,
178
- crystal_peak_number,
171
def extract_target_unit_cell(self) -> Union[list, None]:
    """
    Search the file header for target unit cell parameters.

    Lines starting with ``a = ``, ``b = ``, ``c = ``, ``al = ``, ``be = `` and
    ``ga = `` fill positions 0-5 of the cell. If the header also has a
    ``lattice_type =`` line, the matching entry of ``_cell_constraints`` is
    applied to the cell.

    Returns
    -------
    list or None
        The six cell parameters, or None when the header has no cell
        parameter lines. Parameters missing from the header are left as None.

    Raises
    ------
    KeyError
        If the header names a lattice type that has no known constraint.
    """
    prefixes = ("a = ", "b = ", "c = ", "al = ", "be = ", "ga = ")
    cell = None
    lattice_type = None

    for line in self.extract_file_header().split("\n"):
        if line.startswith("lattice_type ="):
            lattice_type = line.split()[-1]
            continue
        idx = next(
            (i for i, prefix in enumerate(prefixes) if line.startswith(prefix)),
            None,
        )
        if idx is None:
            continue
        if cell is None:
            cell = [None] * 6
        # Lines look like "a = <value> <unit>", so the value is the third token.
        cell[idx] = float(line.split()[2])

    # Without any cell parameters there is nothing to constrain. Passing None
    # on to the constraint functions would raise a TypeError.
    if cell is None:
        return None
    if lattice_type is not None:
        cell = _cell_constraints[lattice_type](cell)
    return cell
205
+
206
def calculate_average_unit_cell(self) -> Union[np.ndarray, list]:
    """
    Compute the average of all cell parameters across the file.

    Every ``Cell parameters ...`` line in the file is collected and the six
    values are averaged. The three edge lengths are multiplied by 10 (the
    stream lines carry an ``nm`` unit; presumably this converts to Angstroms).
    If the file header has a ``lattice_type =`` line, the matching entry of
    ``_cell_constraints`` is applied to the average.

    Returns
    -------
    np.ndarray or list
        The averaged cell as a float32 array, or whatever the lattice
        constraint function returns when a lattice type is present.

    Raises
    ------
    ValueError
        If the file contains no ``Cell parameters`` lines.
    """
    regex = re.compile(rb"Cell parameters .+\n")
    with open(self.filename, "rb") as f:
        # findall returns independent bytes objects, so the map can be closed
        # as soon as the search is done.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as memfile:
            lines = regex.findall(memfile)
    if not lines:
        raise ValueError(
            f"No unit cell parameters were found in the header of {self.filename}"
        )

    # ndmin=2 keeps the (n_lines, 6) shape when only one line matched;
    # otherwise mean(0) would collapse a single line to a scalar.
    cell = np.loadtxt(
        [line.decode(self.encoding) for line in lines],
        usecols=[2, 3, 4, 6, 7, 8],
        dtype="float32",
        ndmin=2,
    ).mean(0)
    cell[:3] *= 10.0

    lattice_type = None
    for line in self.extract_file_header().split("\n"):
        if line.startswith("lattice_type ="):
            lattice_type = line.split()[-1]

    if lattice_type is not None:
        cell = _cell_constraints[lattice_type](cell)
    return cell
232
+
233
def extract_file_header(self) -> str:
    """
    Extract all the data prior to first chunk and return it as a string.

    Returns
    -------
    str
        The bytes preceding the first chunk-begin marker, decoded with
        ``self.encoding``. If the file contains no chunk at all, the whole
        file is returned.
    """
    with open(self.filename, "rb") as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as memfile:
            match = self.block_regex_bytes["chunk_begin"].search(memfile)
            # With no chunk marker everything in the file counts as header.
            stop = len(memfile) if match is None else match.start()
            header = memfile[:stop]
    return header.decode(self.encoding)
242
+
243
@property
def available_column_names(self) -> list:
    """Names which can be passed to read_crystfel as peak_list_columns to customize the peak list output"""
    return [*self.peak_list_columns]
247
+
248
@property
def available_chunk_metadata_keys(self) -> list:
    """Keys which can be passed to read_crystfel as chunk_metadata_keys to customize the chunk level metadata"""
    return [*self.re_chunk_metadata]
252
+
253
@property
def available_crystal_metadata_keys(self) -> list:
    """Keys which can be passed to read_crystfel as crystal_metadata_keys to customize the crystal level metadata"""
    return [*self.re_crystal_metadata]
257
+
258
def read_crystfel(
    self,
    wavelength=None,
    chunk_metadata_keys=None,
    crystal_metadata_keys=None,
    peak_list_columns=None,
    use_ray=True,
    num_cpus=None,
    address="local",
    **ray_kwargs,
) -> list:
    """
    Parse a CrystFEL stream file using multiple processors. Parallelization depends on the ray library (https://www.ray.io/).
    If ray is unavailable, this method falls back to serial processing on one CPU. Ray is not a dependency of reciprocalspaceship
    and will not be installed automatically. Users must manually install it prior to calling this method.

    PARAMETERS
    ----------
    wavelength : float
        Override the wavelength with this value. Wavelength is used to compute Ewald offsets.
    chunk_metadata_keys : list
        A list of metadata_keys which will be returned in the resulting dictionaries under the 'chunk_metadata' entry.
        A list of possible keys is stored as stream_loader.available_chunk_metadata_keys
    crystal_metadata_keys : list
        A list of metadata_keys which will be returned in the resulting dictionaries under the 'crystal_metadata' entry.
        A list of possible keys is stored as stream_loader.available_crystal_metadata_keys
    peak_list_columns : list
        A list of columns to include in the peak list numpy arrays.
        A list of possible column names is stored as stream_loader.available_column_names.
    use_ray : bool (optional)
        Whether or not to use ray for parallelization.
    num_cpus : int (optional)
        The number of cpus for ray to use.
    address : str (optional)
        The ray instance to connect to; forwarded to ray.init. Only used when ray is used.
    ray_kwargs : optional
        Additional keyword arguments to pass to [ray.init](https://docs.ray.io/en/latest/ray-core/api/doc/ray.init.html#ray.init).

    RETURNS
    -------
    chunks : list
        A list of dictionaries containing the per-chunk data. The 'peak_lists' item contains one
        numpy array per crystal. Unless peak_list_columns selects a subset, each array has shape
        n x 17 with the following information.
            h, k, l, I, SIGI, peak, background, fs/px, ss/px, s1x, s1y, s1z,
            ewald_offset, angular_ewald_offset, ewald_offset_x, ewald_offset_y, ewald_offset_z
    """
    if peak_list_columns is not None:
        # Translate column names into column indices once, up front.
        peak_list_columns = [self.peak_list_columns[s] for s in peak_list_columns]

    # Check whether ray is available
    if use_ray and find_spec("ray") is None:
        use_ray = False
        import warnings

        message = (
            "ray (https://www.ray.io/) is not available..."
            "Falling back to serial stream file parser."
        )
        warnings.warn(message, ImportWarning)

    # Locate every chunk as a (start, end) pair of byte offsets. Only plain
    # integers leave this block, so the map can be closed before parsing.
    with open(self.filename, "rb") as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as memfile:
            spans = [
                (begin.start(), end.end())
                for begin, end in zip(
                    self.block_regex_bytes["chunk_begin"].finditer(memfile),
                    self.block_regex_bytes["chunk_end"].finditer(memfile),
                )
            ]

    chunk_args = (
        wavelength,
        chunk_metadata_keys,
        crystal_metadata_keys,
        peak_list_columns,
    )

    if not use_ray:
        return [self._parse_chunk(start, end, *chunk_args) for start, end in spans]

    # address was previously accepted but never forwarded to ray.init
    with ray_context(num_cpus=num_cpus, address=address, **ray_kwargs) as ray:

        @ray.remote
        def parse_chunk(loader: StreamLoader, *args):
            return loader._parse_chunk(*args)

        result_ids = [
            parse_chunk.remote(self, start, end, *chunk_args)
            for start, end in spans
        ]
        # Collect results before the context exits and shuts ray down.
        return ray.get(result_ids)
362
+
363
def _extract_chunk_metadata(self, chunk_text, metadata_keys=None):
    """
    Look up the requested chunk-level metadata fields in a piece of chunk text.

    Parameters
    ----------
    chunk_text : str
        Text to search.
    metadata_keys : list (optional)
        Keys of ``self.re_chunk_metadata`` to look for.

    Returns
    -------
    dict or None
        None when no keys were requested. Otherwise a dictionary with the last
        match found for each key; keys without any match are left out.
    """
    if metadata_keys is None:
        return None
    found = {}
    for key in metadata_keys:
        matches = self.re_chunk_metadata[key].findall(chunk_text)
        if matches:
            # When a field occurs several times the last occurrence wins.
            found[key] = matches[-1]
    return found
372
+
373
def _extract_crystal_metadata(self, xtal_text, metadata_keys=None):
    """
    Look up the requested crystal-level metadata fields in a piece of crystal text.

    Parameters
    ----------
    xtal_text : str
        Text to search.
    metadata_keys : list (optional)
        Keys of ``self.re_crystal_metadata`` to look for.

    Returns
    -------
    dict or None
        None when no keys were requested. Otherwise a dictionary with the last
        match found for each key; keys without any match are left out.
    """
    if metadata_keys is None:
        return None
    found = {}
    for key in metadata_keys:
        matches = self.re_crystal_metadata[key].findall(xtal_text)
        if matches:
            # When a field occurs several times the last occurrence wins.
            found[key] = matches[-1]
    return found
382
+
383
def _parse_chunk(
    self,
    start,
    end,
    wavelength,
    chunk_metadata_keys,
    crystal_metadata_keys,
    peak_list_columns,
):
    """
    Parse one chunk of the stream file into peak lists, A matrices and metadata.

    Parameters
    ----------
    start : int
        Byte offset in the file at which the chunk begins.
    end : int
        Byte offset in the file at which the chunk ends (exclusive).
    wavelength : float or None
        Wavelength used for the Ewald offsets. If None it is derived from the
        chunk's ``photon_energy_eV`` line.
    chunk_metadata_keys : list or None
        Chunk-level metadata fields to extract from the text preceding the
        first crystal.
    crystal_metadata_keys : list or None
        Crystal-level metadata fields to extract for every reflection list.
    peak_list_columns : list of int or None
        Column indices to keep in each peak list; None keeps all columns.

    Returns
    -------
    dict
        Always has 'wavelength', 'A_matrices' (one per crystal) and
        'peak_lists' (one array per reflection list). 'chunk_metadata' and
        'crystal_metadata' are present only when the matching keys argument
        is not None.

    Raises
    ------
    ValueError
        If wavelength is None and the chunk has no ``photon_energy_eV`` line.
    """
    # start/end are byte offsets, so read bytes and decode afterwards. A
    # text-mode read would count characters and run past the chunk whenever
    # it contains multi-byte characters.
    with open(self.filename, "rb") as f:
        f.seek(start)
        data = f.read(end - start).decode(self.encoding)

    if wavelength is None:
        ev_match = self.re_photon_energy.search(data)
        if ev_match is None:
            raise ValueError(
                f"No photon_energy_eV entry found in chunk at bytes {start}-{end} "
                f"of {self.filename}; pass wavelength explicitly"
            )
        photon_energy = np.float32(ev_match.group(0).split()[2])
        wavelength = eV2Angstroms(photon_energy)
    # np.reciprocal performs integer division on integer input, so make sure
    # an integer wavelength is treated as a float.
    if isinstance(wavelength, (int, np.integer)):
        lambda_inv = np.reciprocal(float(wavelength))
    else:
        lambda_inv = np.reciprocal(wavelength)
    s0 = np.array([0, 0, lambda_inv], dtype="float32")

    peak_lists = []
    a_matrices = []
    crystal_metadata = []
    header = None
    for xmatch in self.re_crystal.finditer(data):
        xdata = xmatch.group(0)
        if header is None:
            # Everything before the first crystal is the chunk header.
            header = data[: xmatch.start()]

        # Columns of A are astar, bstar, cstar. The values are divided by 10
        # (presumably nm^-1 to inverse Angstroms -- the stream reports the
        # real-space cell in nm).
        A = (
            np.loadtxt(
                self.re_abcstar.findall(xdata),
                usecols=[2, 3, 4],
                dtype="float32",
            ).T
            / 10.0
        )
        a_matrices.append(A)

        for pmatch in self.re_refls.finditer(xdata):
            crystal_metadata.append(
                self._extract_crystal_metadata(xdata, crystal_metadata_keys)
            )
            # Drop the marker line, the column header line and the end marker.
            rows = pmatch.group(0).split("\n")[2:-1]
            if not rows:
                # An empty reflection list yields an empty peak list with the
                # usual number of columns.
                peak_array = np.empty(
                    (0, len(self.peak_list_columns)), dtype="float32"
                )
            else:
                # ndmin=2 keeps a single reflection as a 1 x 9 array.
                peak_array = np.loadtxt(
                    rows,
                    usecols=(0, 1, 2, 3, 4, 5, 6, 7, 8),
                    dtype="float32",
                    ndmin=2,
                )
                q = (A @ peak_array[:, :3].T).T
                s1 = q + s0

                # This is way faster than np.linalg.norm for small dimensions
                x, y, z = s1.T
                s1_norm = np.sqrt(x * x + y * y + z * z)
                ewald_offset = s1_norm - lambda_inv

                # project calculated s1 onto the ewald sphere
                s1_obs = lambda_inv * s1 / s1_norm[:, None]

                # Compute the angular ewald offset
                q_obs = s1_obs - s0
                qangle = np.sign(ewald_offset) * angle_between(q, q_obs)

                peak_array = np.concatenate(
                    (
                        peak_array,
                        s1,
                        ewald_offset[:, None],
                        qangle[:, None],
                        s1_obs - s1,  # Ewald offset vector
                    ),
                    axis=-1,
                )
            if peak_list_columns is not None:
                peak_array = peak_array[:, peak_list_columns]
            peak_lists.append(peak_array)

    if header is None:
        # No crystal in this chunk: the whole chunk is header.
        header = data

    result = {
        "wavelength": wavelength,
        "A_matrices": a_matrices,
        "peak_lists": peak_lists,
    }
    # The entries are stored under fixed string keys. The key lists
    # themselves are unhashable and cannot be dictionary keys.
    if chunk_metadata_keys is not None:
        result["chunk_metadata"] = self._extract_chunk_metadata(
            header, chunk_metadata_keys
        )
    if crystal_metadata_keys is not None:
        result["crystal_metadata"] = crystal_metadata
    return result
480
+
481
+
482
def read_crystfel(
    streamfile: str,
    spacegroup=None,
    encoding="utf-8",
    columns=None,
    parallel=True,
    num_cpus=None,
    address="local",
    **ray_kwargs,
) -> DataSet:
    """
    Initialize attributes and populate the DataSet object with data from a CrystFEL stream with indexed reflections.
    This is the output format used by CrystFEL software when processing still diffraction data.

    This method is parallelized across CPUs to speed up parsing. Parallelization depends on the ray library (https://www.ray.io/).
    If ray is unavailable, this method falls back to serial processing on one CPU. Ray is not a dependency of reciprocalspaceship
    and will not be installed automatically. Users must manually install it prior to calling this method.

    Parameters
    ----------
    streamfile : str
        name of a .stream file
    spacegroup : gemmi.SpaceGroup or int or string (optional)
        optionally set the spacegroup of the returned DataSet.
    encoding : str
        The type of byte-encoding (optional, 'utf-8').
    columns : list (optional)
        Optionally specify the columns of the output by a list of strings.
        The default list is:
           [ "H", "K", "L", "I", "SigI", "BATCH", "s1x", "s1y", "s1z", "ewald_offset",
           "angular_ewald_offset", "XDET", "YDET" ]
        See `rs.io.crystfel.StreamLoader().available_column_names` for a list of available column names.
        "H", "K" and "L" must be included because they become the index of the result.
    parallel : bool (optional)
        Read the stream file in parallel using [ray.io](https://docs.ray.io) if it is available.
    num_cpus : int (optional)
        By default, the model will use all available cores. For very large cpu counts, this may consume
        too much memory. Decreasing num_cpus may help. If ray is not installed, a single core will be used.
    address : str (optional)
        Optionally specify the ray instance to connect to. By default, start a new local instance.
    ray_kwargs : optional
        Additional keyword arguments to pass to [ray.init](https://docs.ray.io/en/latest/ray-core/api/doc/ray.init.html#ray.init).

    Returns
    --------
    rs.DataSet

    Raises
    ------
    ValueError
        If streamfile does not end with ".stream", or if the stream contains no reflection lists.
    """
    if not streamfile.endswith(".stream"):
        raise ValueError("Stream file should end with .stream")

    if columns is None:
        columns = [
            "H",
            "K",
            "L",
            "I",
            "SigI",
            "BATCH",
            "s1x",
            "s1y",
            "s1z",
            "ewald_offset",
            "angular_ewald_offset",
            "XDET",
            "YDET",
        ]
    # BATCH is not stored in the stream; it is computed afterward
    peak_list_columns = [i for i in columns if i != "BATCH"]

    # read data from stream file
    loader = StreamLoader(streamfile, encoding=encoding)
    cell = loader.extract_target_unit_cell()

    # One DataSet per peak list; BATCH numbers the peak lists in file order.
    batch = 0
    ds = []
    for chunk in loader.read_crystfel(
        peak_list_columns=peak_list_columns,
        use_ray=parallel,
        num_cpus=num_cpus,
        address=address,
        **ray_kwargs,
    ):
        for peak_list in chunk["peak_lists"]:
            _ds = DataSet(
                peak_list,
                columns=peak_list_columns,
                cell=cell,
                spacegroup=spacegroup,
                merged=False,  # CrystFEL stream is always unmerged
            )
            _ds["BATCH"] = batch
            ds.append(_ds)
            batch += 1

    if not ds:
        # Fail with a message naming the file rather than from inside concat.
        raise ValueError(f"No indexed reflections were found in {streamfile}")

    ds = concat(ds, axis=0, check_isomorphous=False, copy=False, ignore_index=True)

    # MTZ dtypes: hkl -- H, I -- J, SigI -- Q, BATCH -- B, everything else -- R
    mtz_dtypes = {
        "H": "H",
        "K": "H",
        "L": "H",
        "I": "J",
        "SigI": "Q",
        "BATCH": "B",
    }
    for k in ds:
        mtz_dtypes.setdefault(k, "R")

    ds = ds.astype(mtz_dtypes, copy=False)
    ds.set_index(["H", "K", "L"], inplace=True)

    return ds