reciprocalspaceship 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of reciprocalspaceship might be problematic. Click here for more details.

Files changed (28)
  1. reciprocalspaceship/VERSION +1 -1
  2. reciprocalspaceship/__init__.py +9 -2
  3. reciprocalspaceship/algorithms/scale_merged_intensities.py +8 -7
  4. reciprocalspaceship/dataset.py +28 -3
  5. reciprocalspaceship/decorators.py +8 -4
  6. reciprocalspaceship/dtypes/floating.py +24 -28
  7. reciprocalspaceship/dtypes/integer.py +38 -37
  8. reciprocalspaceship/dtypes/internals.py +243 -49
  9. reciprocalspaceship/io/__init__.py +1 -0
  10. reciprocalspaceship/io/common.py +48 -0
  11. reciprocalspaceship/io/crystfel.py +559 -234
  12. reciprocalspaceship/io/dials.py +330 -0
  13. reciprocalspaceship/io/dials_mpi.py +44 -0
  14. reciprocalspaceship/io/mtz.py +4 -5
  15. reciprocalspaceship/utils/__init__.py +6 -1
  16. reciprocalspaceship/utils/cell.py +5 -0
  17. reciprocalspaceship/utils/stats.py +5 -7
  18. reciprocalspaceship/utils/structurefactors.py +5 -0
  19. reciprocalspaceship/utils/units.py +14 -4
  20. {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/METADATA +27 -28
  21. {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/RECORD +28 -24
  22. {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/WHEEL +1 -1
  23. {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/entry_points.txt +0 -1
  24. tests/test_dataseries.py +1 -1
  25. tests/test_dataset.py +42 -0
  26. tests/test_dataset_signatures.py +53 -0
  27. {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/LICENSE +0 -0
  28. {reciprocalspaceship-1.0.1.dist-info → reciprocalspaceship-1.0.3.dist-info}/top_level.txt +0 -0
@@ -1,270 +1,595 @@
1
- import numpy as np
2
- import pandas as pd
3
-
4
- from reciprocalspaceship import DataSet
5
- from reciprocalspaceship.utils import angle_between
1
+ import mmap
2
+ import re
3
+ from typing import Union
6
4
 
5
+ import gemmi
6
+ import numpy as np
7
7
 
8
- def _parse_stream(filename: str) -> dict:
8
+ from reciprocalspaceship import DataSet, concat
9
+ from reciprocalspaceship.io.common import check_for_ray, ray_context
10
+ from reciprocalspaceship.utils import angle_between, eV2Angstroms
11
+
12
+ # See Rupp Table 5-2
13
+ _cell_constraints = {
14
+ "triclinic": lambda x: x,
15
+ "orthorhombic": lambda x: [x[0], x[1], x[2], 90.0, 90.0, 90.0],
16
+ "monoclinic": lambda x: [x[0], x[1], x[2], 90.0, x[4], 90.0],
17
+ "hexagonal": lambda x: [
18
+ 0.5 * (x[0] + x[1]),
19
+ 0.5 * (x[0] + x[1]),
20
+ x[2],
21
+ 90.0,
22
+ 90.0,
23
+ 120.0,
24
+ ],
25
+ "rhombohedral": lambda x: [
26
+ 0.5 * (x[0] + x[1]),
27
+ 0.5 * (x[0] + x[1]),
28
+ x[2],
29
+ 90.0,
30
+ 90.0,
31
+ 120.0,
32
+ ],
33
+ "cubic": lambda x: [
34
+ np.mean(x[:3]),
35
+ np.mean(x[:3]),
36
+ np.mean(x[:3]),
37
+ 90.0,
38
+ 90.0,
39
+ 90.0,
40
+ ],
41
+ "tetragonal": lambda x: [
42
+ 0.5 * (x[0] + x[1]),
43
+ 0.5 * (x[0] + x[1]),
44
+ x[2],
45
+ 90.0,
46
+ 90.0,
47
+ 90.0,
48
+ ],
49
+ }
50
+
51
+ # See crystFEL API reference here: https://www.desy.de/~twhite/crystfel/reference/stream_8h.html
52
+ _block_markers = {
53
+ "geometry": (r"----- Begin geometry file -----", r"----- End geometry file -----"),
54
+ "chunk": (r"----- Begin chunk -----", r"----- End chunk -----"),
55
+ "cell": (r"----- Begin unit cell -----", r"----- End unit cell -----"),
56
+ "peaks": (r"Peaks from peak search", r"End of peak list"),
57
+ "crystal": (r"--- Begin crystal", r"--- End crystal"),
58
+ "reflections": (r"Reflections measured after indexing", r"End of reflections"),
59
+ }
60
+
61
+
62
+ class StreamLoader(object):
9
63
  """
10
- Parses stream and returns all indexed peak positions
11
-
12
- Parameters
64
+ An object that loads stream files into rs.DataSet objects in parallel.
65
+ Attributes
13
66
  ----------
14
- filename : stream filename
15
- name of a .stream file
16
-
17
- Returns
18
- --------
19
- (dict, np.ndarray)
67
+ block_regex_bytes : dict
68
+ A dictionary of compiled regular expressions that operate on strings
69
+ block_regex : dict
70
+ A dictionary of compiled regular expressions that operate on byte strings
20
71
  """
21
72
 
22
- answ_crystals = {}
23
-
24
- def contains_filename(s):
25
- return s.startswith("Image filename")
26
-
27
- def contains_event(s):
28
- return s.startswith("Event")
29
-
30
- def contains_serial_number(s):
31
- return s.startswith("Image serial number")
32
-
33
- def starts_chunk_peaks(s):
34
- return s.startswith(" fs/px ss/px (1/d)/nm^-1 Intensity Panel")
35
-
36
- def ends_chunk_peaks(s):
37
- return s.startswith("End of peak list")
73
+ peak_list_columns = {
74
+ "H": 0,
75
+ "K": 1,
76
+ "L": 2,
77
+ "I": 3,
78
+ "SigI": 4,
79
+ "peak": 5,
80
+ "background": 6,
81
+ "XDET": 7,
82
+ "YDET": 8,
83
+ "s1x": 9,
84
+ "s1y": 10,
85
+ "s1z": 11,
86
+ "ewald_offset": 12,
87
+ "angular_ewald_offset": 13,
88
+ "ewald_offset_x": 14,
89
+ "ewald_offset_y": 15,
90
+ "ewald_offset_z": 16,
91
+ }
38
92
 
39
- def starts_crystal_peaks(s):
40
- return s.startswith(
41
- " h k l I sigma(I) peak background fs/px ss/px panel"
93
+ def __init__(self, filename: str, encoding="utf-8"):
94
+ self.filename = filename
95
+ self.encoding = encoding
96
+ self.block_regex = {}
97
+ self.block_regex_bytes = {}
98
+
99
+ # Set up all the regular expressions for finding block boundaries
100
+ for k, (beginning, ending) in _block_markers.items():
101
+ self.block_regex[k + "_begin"] = re.compile(beginning)
102
+ self.block_regex[k + "_end"] = re.compile(ending)
103
+ self.block_regex[k] = re.compile(
104
+ f"(?s){beginning}\n(?P<CRYSTAL_BLOCK>.*?)\n{ending}"
105
+ )
106
+
107
+ self.block_regex_bytes[k + "_begin"] = re.compile(
108
+ beginning.encode(self.encoding)
109
+ )
110
+ self.block_regex_bytes[k + "_end"] = re.compile(
111
+ ending.encode(self.encoding)
112
+ )
113
+ self.block_regex_bytes[k] = re.compile(
114
+ f"(?s){beginning}\n(?P<CRYSTAL_BLOCK>.*?)\n{ending}".encode(
115
+ self.encoding
116
+ )
117
+ )
118
+
119
+ self.re_abcstar = re.compile("[abc]star =.+\n")
120
+ self.re_photon_energy = re.compile("photon_energy_eV =.+\n")
121
+
122
+ self.re_chunk_metadata = {
123
+ "Image filename": re.compile(r"(?<=Image filename: ).+(?=\n)"),
124
+ "Event": re.compile(r"(?<=Event: ).+(?=\n)"),
125
+ "Image serial number:": re.compile(r"(?<=Image serial number: ).+(?=\n)"),
126
+ "indexed_by": re.compile(r"(?<=indexed_by \= ).+(?=\n)"),
127
+ "photon_energy_eV": re.compile(r"(?<=photon_energy_eV \= ).+(?=\n)"),
128
+ "beam_divergence": re.compile(r"(?<=beam_divergence \= ).+(?=\n)"),
129
+ "beam_bandwidth": re.compile(r"(?<=beam_bandwidth \= ).+(?=\n)"),
130
+ }
131
+
132
+ self.re_crystal_metadata = {
133
+ "Cell parameters": re.compile(r"(?<=Cell parameters).+(?=\n)"),
134
+ "astar": re.compile(r"(?<=astar = ).+(?=\n)"),
135
+ "bstar": re.compile(r"(?<=bstar = ).+(?=\n)"),
136
+ "cstar": re.compile(r"(?<=cstar = ).+(?=\n)"),
137
+ "lattice_type": re.compile(r"(?<=lattice_type = ).+(?=\n)"),
138
+ "centering": re.compile(r"(?<=centering = ).+(?=\n)"),
139
+ "unique_axis": re.compile(r"(?<=unique_axis = ).+(?=\n)"),
140
+ "profile_radius": re.compile(r"(?<=profile_radius = ).+(?=\n)"),
141
+ "predict_refine/det_shift": re.compile(
142
+ r"(?<=predict_refine/det_shift ).+(?=\n)"
143
+ ),
144
+ "predict_refine/R": re.compile(r"(?<=predict_refine/R ).+(?=\n)"),
145
+ "diffraction_resolution_limit": re.compile(
146
+ r"(?<=diffraction_resolution_limit = ).+(?=\n)"
147
+ ),
148
+ "num_reflections": re.compile(r"(?<=num_reflections = ).+(?=\n)"),
149
+ }
150
+
151
+ # TODO: replace these with the faster, non variabled length equivalents
152
+ self.re_crystal = re.compile(
153
+ r"(?s)--- Begin crystal\n(?P<CRYSTAL_BLOCK>.*?)\n--- End crystal"
154
+ )
155
+ self.re_refls = re.compile(
156
+ r"(?s)Reflections measured after indexing\n(?P<REFL_BLOCK>.*?)\nEnd of reflections"
42
157
  )
43
158
 
44
- def is_photon_energy(s):
45
- return s.startswith("photon_energy_eV")
46
-
47
- def is_astar(s):
48
- return s.startswith("astar")
49
-
50
- def is_bstar(s):
51
- return s.startswith("bstar")
52
-
53
- def is_cstar(s):
54
- return s.startswith("cstar")
55
-
56
- def ends_crystal_peaks(s):
57
- return s.startswith("End of reflections")
58
-
59
- def eV2Angstrom(e_eV):
60
- return 12398.0 / e_eV
61
-
62
- # add unit cell parameters parsing
63
- with open(filename, "r") as stream:
64
- is_unit_cell = False
65
- get_cellparam = lambda s: float(s.split()[2])
66
- rv_cell_param = None
67
- a, b, c, al, be, ga = [
68
- None
69
- ] * 6 # None's are needed since stream not always has all 6 parameters
70
- for line in stream:
71
- if "Begin unit cell" in line:
72
- is_unit_cell = True
73
- continue
74
- elif is_unit_cell:
75
- if line.startswith("a ="):
76
- a = get_cellparam(line)
77
- if line.startswith("b ="):
78
- b = get_cellparam(line)
79
- if line.startswith("c ="):
80
- c = get_cellparam(line)
81
- if line.startswith("al ="):
82
- al = get_cellparam(line)
83
- if line.startswith("be ="):
84
- be = get_cellparam(line)
85
- if line.startswith("ga ="):
86
- ga = get_cellparam(line)
87
- is_unit_cell = False # gamma is the last parameters
88
- elif "End unit cell" in line:
89
- rv_cell_param = np.array([a, b, c, al, be, ga])
90
- break
91
-
92
- with open(filename, "r") as stream:
93
- is_chunk = False
94
- is_crystal = False
95
- current_filename = None
96
- current_event = None # to handle non-event streams
97
- current_serial_number = None
98
- corrupted_chunk = False
99
- crystal_peak_number = 0
100
- crystal_idx = 0
101
-
102
- for line in stream:
103
- # analyzing what we have
104
- if ends_chunk_peaks(line):
105
- is_chunk = False
106
- chunk_peak_number = 0
107
- elif ends_crystal_peaks(line):
108
- is_crystal = False
109
- crystal_peak_number = 0
110
-
111
- elif is_photon_energy(line):
112
- photon_energy = float(line.split()[2])
113
- elif is_astar(line):
114
- astar = (
115
- np.array(line.split()[2:5], dtype="float32") / 10.0
116
- ) # crystfel's notation uses nm-1
117
- elif is_bstar(line):
118
- bstar = (
119
- np.array(line.split()[2:5], dtype="float32") / 10.0
120
- ) # crystfel's notation uses nm-1
121
- elif is_cstar(line):
122
- cstar = (
123
- np.array(line.split()[2:5], dtype="float32") / 10.0
124
- ) # crystfel's notation uses nm-1
125
-
126
- # since it's the last line needed to construct Ewald offset,
127
- # we'll pre-compute the matrices here
128
- A = np.array([astar, bstar, cstar]).T
129
- lambda_inv = 1 / eV2Angstrom(photon_energy)
130
- s0 = np.array([0, 0, lambda_inv]).T
131
-
132
- elif is_crystal:
133
- # example line:
134
- # h k l I sigma(I) peak background fs/px ss/px panel
135
- # -63 41 9 -41.31 57.45 195.00 170.86 731.0 1350.4 p0
136
- crystal_peak_number += 1
137
- h, k, l, I, sigmaI, peak, background, xdet, ydet, panel = [
138
- i for i in line.split()
139
- ]
140
- h, k, l = map(int, [h, k, l])
141
-
142
- # calculate ewald offset and s1
143
- hkl = np.array([h, k, l])
144
- q = A @ hkl
145
- s1 = q + s0
146
- s1x, s1y, s1z = s1
147
- s1_norm = np.linalg.norm(s1)
148
- ewald_offset = s1_norm - lambda_inv
149
-
150
- # project calculated s1 onto the ewald sphere
151
- s1_obs = lambda_inv * s1 / s1_norm
152
-
153
- # Compute the angular ewald offset
154
- q_obs = s1_obs - s0
155
- qangle = np.sign(ewald_offset) * angle_between(q, q_obs)
156
-
157
- record = {
158
- "H": h,
159
- "K": k,
160
- "L": l,
161
- "I": float(I),
162
- "SigI": float(sigmaI),
163
- "BATCH": crystal_idx,
164
- "s1x": s1x,
165
- "s1y": s1y,
166
- "s1z": s1z,
167
- "ewald_offset": ewald_offset,
168
- "angular_ewald_offset": qangle,
169
- "XDET": float(xdet),
170
- "YDET": float(ydet),
171
- }
172
- if current_event is not None:
173
- name = (
174
- current_filename,
175
- current_event,
176
- current_serial_number,
177
- crystal_idx,
178
- crystal_peak_number,
159
+ def extract_target_unit_cell(self) -> Union[list, None]:
160
+ """
161
+ Search the file header for target unit cell parameters.
162
+ """
163
+ header = self.extract_file_header()
164
+ cell = None
165
+ lattice_type = None
166
+
167
+ for line in header.split("\n"):
168
+ if line.startswith("a = "):
169
+ idx = 0
170
+ elif line.startswith("b = "):
171
+ idx = 1
172
+ elif line.startswith("c = "):
173
+ idx = 2
174
+ elif line.startswith("al = "):
175
+ idx = 3
176
+ elif line.startswith("be = "):
177
+ idx = 4
178
+ elif line.startswith("ga = "):
179
+ idx = 5
180
+ else:
181
+ idx = None
182
+ if idx is not None:
183
+ if cell is None:
184
+ cell = [None] * 6
185
+ value = float(line.split()[2])
186
+ cell[idx] = value
187
+ if line.startswith("lattice_type ="):
188
+ lattice_type = line.split()[-1]
189
+
190
+ if lattice_type is not None:
191
+ cell = _cell_constraints[lattice_type](cell)
192
+ return cell
193
+
194
+ def calculate_average_unit_cell(self) -> gemmi.UnitCell:
195
+ """
196
+ Compute the average of all cell parameters across the file.
197
+ """
198
+ regex = re.compile(rb"Cell parameters .+\n")
199
+ with open(self.filename, "r") as f:
200
+ memfile = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
201
+ lines = regex.findall(memfile)
202
+ if len(lines) == 0:
203
+ raise ValueError(
204
+ f"No unit cell parameters were found in the header of {self.filename}"
205
+ )
206
+
207
+ cell = np.loadtxt(lines, usecols=[2, 3, 4, 6, 7, 8], dtype="float32").mean(0)
208
+ cell[:3] *= 10.0
209
+
210
+ header = self.extract_file_header()
211
+ lattice_type = None
212
+
213
+ for line in header.split("\n"):
214
+ if line.startswith("lattice_type ="):
215
+ lattice_type = line.split()[-1]
216
+
217
+ if lattice_type is not None:
218
+ cell = _cell_constraints[lattice_type](cell)
219
+ return cell
220
+
221
+ def extract_file_header(self) -> str:
222
+ """
223
+ Extract all the data prior to first chunk and return it as a string.
224
+ """
225
+ with open(self.filename, "r") as f:
226
+ memfile = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
227
+ match = self.block_regex_bytes["chunk_begin"].search(memfile)
228
+ header = memfile.read(match.start()).decode()
229
+ return header
230
+
231
+ @property
232
+ def available_column_names(self) -> list:
233
+ """Keys which can be passed to parallel_read_crystfel to customize the peak list output"""
234
+ return list(self.peak_list_columns.keys())
235
+
236
+ @property
237
+ def available_chunk_metadata_keys(self) -> list:
238
+ """Keys which can be passed to parallel_read_crystfel to customize the chunk level metadata"""
239
+ return list(self.re_chunk_metadata.keys())
240
+
241
+ @property
242
+ def available_crystal_metadata_keys(self) -> list:
243
+ """Keys which can be passed to parallel_read_crystfel to customize the crystal level metadata"""
244
+ return list(self.re_crystal_metadata.keys())
245
+
246
+ def read_crystfel(
247
+ self,
248
+ wavelength=None,
249
+ chunk_metadata_keys=None,
250
+ crystal_metadata_keys=None,
251
+ peak_list_columns=None,
252
+ use_ray=True,
253
+ num_cpus=None,
254
+ address="local",
255
+ **ray_kwargs,
256
+ ) -> list:
257
+ """
258
+ Parse a CrystFEL stream file using multiple processors. Parallelization depends on the ray library (https://www.ray.io/).
259
+ If ray is unavailable, this method falls back to serial processing on one CPU. Ray is not a dependency of reciprocalspaceship
260
+ and will not be installed automatically. Users must manually install it prior to calling this method.
261
+
262
+ PARAMETERS
263
+ ----------
264
+ wavelength : float
265
+ Override the wavelength with this value. Wavelength is used to compute Ewald offsets.
266
+ chunk_metadata_keys : list
267
+ A list of metadata_keys which will be returned in the resulting dictionaries under the 'chunk_metadata' entry.
268
+ A list of possible keys is stored as stream_loader.available_chunk_metadata_keys
269
+ crytal_metadata_keys : list
270
+ A list of metadata_keys which will be returned in the resulting dictionaries under the 'crystal_metadata' entry.
271
+ A list of possible keys is stored as stream_loader.available_crystal_metadata_keys
272
+ peak_list_columns : list
273
+ A list of columns to include in the peak list numpy arrays.
274
+ A list of possible column names is stored as stream_loader.available_column_names.
275
+ use_ray : bool(optional)
276
+ Whether or not to use ray for parallelization.
277
+ num_cpus : int (optional)
278
+ The number of cpus for ray to use.
279
+ ray_kwargs : optional
280
+ Additional keyword arguments to pass to [ray.init](https://docs.ray.io/en/latest/ray-core/api/doc/ray.init.html#ray.init).
281
+
282
+ RETURNS
283
+ -------
284
+ chunks : list
285
+ A list of dictionaries containing the per-chunk data. The 'peak_lists' item contains a
286
+ numpy array with shape n x 14 with the following information.
287
+ h, k, l, I, SIGI, peak, background, fs/px, ss/px, s1x, s1y, s1z,
288
+ ewald_offset, angular_ewald_offset
289
+ """
290
+ if peak_list_columns is not None:
291
+ peak_list_columns = [self.peak_list_columns[s] for s in peak_list_columns]
292
+
293
+ # Check whether ray is available
294
+ if use_ray:
295
+ use_ray = check_for_ray()
296
+
297
+ with open(self.filename, "r") as f:
298
+ memfile = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
299
+ beginnings_and_ends = zip(
300
+ self.block_regex_bytes["chunk_begin"].finditer(memfile),
301
+ self.block_regex_bytes["chunk_end"].finditer(memfile),
302
+ )
303
+ if use_ray:
304
+ with ray_context(num_cpus=num_cpus, **ray_kwargs) as ray:
305
+
306
+ @ray.remote
307
+ def parse_chunk(loader: StreamLoader, *args):
308
+ return loader._parse_chunk(*args)
309
+
310
+ result_ids = []
311
+ for begin, end in beginnings_and_ends:
312
+ result_ids.append(
313
+ parse_chunk.remote(
314
+ self,
315
+ begin.start(),
316
+ end.end(),
317
+ wavelength,
318
+ chunk_metadata_keys,
319
+ crystal_metadata_keys,
320
+ peak_list_columns,
321
+ )
322
+ )
323
+
324
+ results = ray.get(result_ids)
325
+
326
+ return results
327
+
328
+ else:
329
+ results = []
330
+ for begin, end in beginnings_and_ends:
331
+ results.append(
332
+ self._parse_chunk(
333
+ begin.start(),
334
+ end.end(),
335
+ wavelength,
336
+ chunk_metadata_keys,
337
+ crystal_metadata_keys,
338
+ peak_list_columns,
339
+ )
179
340
  )
180
- else:
181
- name = (
182
- current_filename,
183
- current_serial_number,
184
- crystal_idx,
185
- crystal_peak_number,
341
+ return results
342
+
343
+ def _extract_chunk_metadata(self, chunk_text, metadata_keys=None):
344
+ if metadata_keys is None:
345
+ return None
346
+ result = {}
347
+ for k in metadata_keys:
348
+ re = self.re_chunk_metadata[k]
349
+ for v in re.findall(chunk_text):
350
+ result[k] = v
351
+ return result
352
+
353
+ def _extract_crystal_metadata(self, xtal_text, metadata_keys=None):
354
+ if metadata_keys is None:
355
+ return None
356
+ result = {}
357
+ for k in metadata_keys:
358
+ re = self.re_crystal_metadata[k]
359
+ for v in re.findall(xtal_text):
360
+ result[k] = v
361
+ return result
362
+
363
+ def _parse_chunk(
364
+ self,
365
+ start,
366
+ end,
367
+ wavelength,
368
+ chunk_metadata_keys,
369
+ crystal_metadata_keys,
370
+ peak_list_columns,
371
+ ):
372
+ with open(self.filename, "r") as f:
373
+ f.seek(start)
374
+ data = f.read(end - start)
375
+
376
+ if wavelength is None:
377
+ ev_match = self.re_photon_energy.search(data)
378
+ ev_line = data[ev_match.start() : ev_match.end()]
379
+ photon_energy = np.float32(ev_line.split()[2])
380
+ wavelength = eV2Angstroms(photon_energy)
381
+ lambda_inv = np.reciprocal(wavelength)
382
+ else:
383
+ lambda_inv = np.reciprocal(wavelength)
384
+
385
+ peak_lists = []
386
+ a_matrices = []
387
+ chunk_metadata = None
388
+ crystal_metadata = []
389
+ header = None
390
+ for xmatch in self.re_crystal.finditer(data):
391
+ xdata = data[xmatch.start() : xmatch.end()]
392
+ if header is None:
393
+ header = data[: xmatch.start()]
394
+
395
+ # crystal_metadata.append(self._extract_crystal_metadata(xdata))
396
+ A = (
397
+ np.loadtxt(
398
+ self.re_abcstar.findall(xdata),
399
+ usecols=[2, 3, 4],
400
+ dtype="float32",
401
+ ).T
402
+ / 10.0
403
+ )
404
+ a_matrices.append(A)
405
+
406
+ for pmatch in self.re_refls.finditer(xdata):
407
+ pdata = xdata[pmatch.start() : pmatch.end()]
408
+ crystal_metadata.append(
409
+ self._extract_crystal_metadata(xdata, crystal_metadata_keys)
186
410
  )
187
- answ_crystals[name] = record
188
-
189
- # start analyzing where we are now
190
- if corrupted_chunk:
191
- if "Begin chunk" not in line:
192
- continue
193
- else:
194
- is_crystal, is_chunk = False, False
195
- corrupted_chunk = False
196
- continue
197
-
198
- if contains_filename(line):
199
- current_filename = line.split()[-1]
200
- elif contains_event(line):
201
- current_event = line.split()[-1][2:]
202
- elif contains_serial_number(line):
203
- current_serial_number = line.split()[-1]
204
-
205
- elif starts_chunk_peaks(line):
206
- is_chunk = True
207
- continue
208
-
209
- elif starts_crystal_peaks(line):
210
- crystal_idx += 1
211
- is_crystal = True
212
- continue
213
-
214
- return answ_crystals, rv_cell_param
215
-
216
-
217
- def read_crystfel(streamfile: str, spacegroup=None) -> DataSet:
411
+ peak_array = np.loadtxt(
412
+ pdata.split("\n")[2:-1],
413
+ usecols=(0, 1, 2, 3, 4, 5, 6, 7, 8),
414
+ dtype="float32",
415
+ )
416
+ s0 = np.array([0, 0, lambda_inv], dtype="float32").T
417
+ q = (A @ peak_array[:, :3].T).T
418
+ s1 = q + s0
419
+
420
+ # This is way faster than np.linalg.norm for small dimensions
421
+ x, y, z = s1.T
422
+ s1_norm = np.sqrt(x * x + y * y + z * z)
423
+ ewald_offset = s1_norm - lambda_inv
424
+
425
+ # project calculated s1 onto the ewald sphere
426
+ s1_obs = lambda_inv * s1 / s1_norm[:, None]
427
+
428
+ # Compute the angular ewald offset
429
+ q_obs = s1_obs - s0
430
+ qangle = np.sign(ewald_offset) * angle_between(q, q_obs)
431
+
432
+ peak_array = np.concatenate(
433
+ (
434
+ peak_array,
435
+ s1,
436
+ ewald_offset[:, None],
437
+ qangle[:, None],
438
+ s1_obs - s1, # Ewald offset vector
439
+ ),
440
+ axis=-1,
441
+ )
442
+ if peak_list_columns is not None:
443
+ peak_array = peak_array[:, peak_list_columns]
444
+ peak_lists.append(peak_array)
445
+
446
+ if header is None:
447
+ header = data
448
+ chunk_metadata = self._extract_chunk_metadata(header, chunk_metadata_keys)
449
+
450
+ result = {
451
+ "wavelength": wavelength,
452
+ "A_matrices": a_matrices,
453
+ "peak_lists": peak_lists,
454
+ }
455
+ if chunk_metadata_keys is not None:
456
+ result[chunk_metadata_keys] = chunk_metadata
457
+ if crystal_metadata_keys is not None:
458
+ result[crystal_metadata_keys] = crystal_metadata
459
+ return result
460
+
461
+
462
+ def read_crystfel(
463
+ streamfile: str,
464
+ spacegroup=None,
465
+ encoding="utf-8",
466
+ columns=None,
467
+ parallel=True,
468
+ num_cpus=None,
469
+ address="local",
470
+ **ray_kwargs,
471
+ ) -> DataSet:
218
472
  """
219
473
  Initialize attributes and populate the DataSet object with data from a CrystFEL stream with indexed reflections.
220
474
  This is the output format used by CrystFEL software when processing still diffraction data.
221
475
 
476
+ This method is parallelized across CPUs speed up parsing. Parallelization depends on the ray library (https://www.ray.io/).
477
+ If ray is unavailable, this method falls back to serial processing on one CPU. Ray is not a dependency of reciprocalspaceship
478
+ and will not be installed automatically. Users must manually install it prior to calling this method.
479
+
222
480
  Parameters
223
481
  ----------
224
482
  streamfile : str
225
483
  name of a .stream file
226
484
  spacegroup : gemmi.SpaceGroup or int or string (optional)
227
485
  optionally set the spacegroup of the returned DataSet.
486
+ encoding : str
487
+ The type of byte-encoding (optional, 'utf-8').
488
+ columns : list (optional)
489
+ Optionally specify the columns of the output by a list of strings.
490
+ The default list is: [ "H", "K", "L", "I", "SigI", "BATCH", "s1x", "s1y", "s1z", "ewald_offset", "angular_ewald_offset", "XDET", "YDET" ]
491
+ See `rs.io.crystfel.StreamLoader().available_column_names` for a list of available
492
+ column names and *Notes* for a description of the returned columns
493
+ parallel : bool (optional)
494
+ Read the stream file in parallel using [ray.io](https://docs.ray.io) if it is available.
495
+ num_cpus : int (optional)
496
+ By default, the model will use all available cores. For very large cpu counts, this may consume
497
+ too much memory. Decreasing num_cpus may help. If ray is not installed, a single core will be used.
498
+ address : str (optional)
499
+ Optionally specify the ray instance to connect to. By default, start a new local instance.
500
+ ray_kwargs : optional
501
+ Additional keyword arguments to pass to [ray.init](https://docs.ray.io/en/latest/ray-core/api/doc/ray.init.html#ray.init).
228
502
 
229
503
  Returns
230
504
  --------
231
505
  rs.DataSet
232
- """
233
506
 
507
+ Notes
508
+ -----
509
+ The following columns are included in the returned DataSet object:
510
+
511
+ - H, K, L: Miller indices of each reflection
512
+ - I, SigI: Intensity and associated uncertainty
513
+ - BATCH: Image number
514
+ - s1x, s1y, s1z: scattered beam wavevector which points from the sample to the bragg peak
515
+ - ewald_offset: the distance in cartesian space (1/angstroms) between the observed reflection and the ewald sphere
516
+ - angular_ewald_offset: the distance in polar coordinates (degrees) between the observed reflection and the ewald sphere
517
+ - XDET, YDET: Internal detector panel coordinates
518
+ """
234
519
  if not streamfile.endswith(".stream"):
235
520
  raise ValueError("Stream file should end with .stream")
521
+
236
522
  # read data from stream file
237
- d, cell = _parse_stream(streamfile)
238
- df = pd.DataFrame.from_records(list(d.values()))
239
-
240
- # set mtztypes as in precognition.py
241
- # hkl -- H
242
- # I, sigmaI -- J, Q
243
- # BATCH -- B
244
- # s1{x,y,z} -- R
245
- # ewald_offset -- R
246
- mtzdtypes = {
523
+ if columns is None:
524
+ columns = [
525
+ "H",
526
+ "K",
527
+ "L",
528
+ "I",
529
+ "SigI",
530
+ "BATCH",
531
+ "s1x",
532
+ "s1y",
533
+ "s1z",
534
+ "ewald_offset",
535
+ "angular_ewald_offset",
536
+ "XDET",
537
+ "YDET",
538
+ ]
539
+ peak_list_columns = [
540
+ i for i in columns if i != "BATCH"
541
+ ] # BATCH is computed afterward
542
+
543
+ mtz_dtypes = {
247
544
  "H": "H",
248
545
  "K": "H",
249
546
  "L": "H",
250
547
  "I": "J",
251
548
  "SigI": "Q",
252
549
  "BATCH": "B",
253
- "s1x": "R",
254
- "s1y": "R",
255
- "s1z": "R",
256
- "ewald_offset": "R",
257
- "angular_ewald_offset": "R",
258
- "XDET": "R",
259
- "YDET": "R",
260
550
  }
261
- dataset = DataSet(
262
- spacegroup=spacegroup,
263
- cell=cell,
264
- merged=False, # CrystFEL stream is always unmerged
265
- )
266
- for k, v in df.items():
267
- dataset[k] = v.astype(mtzdtypes[k])
268
- dataset.set_index(["H", "K", "L"], inplace=True)
269
-
270
- return dataset
551
+ for k in columns:
552
+ mtz_dtypes[k] = mtz_dtypes.get(k, "R")
553
+
554
+ loader = StreamLoader(streamfile, encoding=encoding)
555
+ cell = loader.extract_target_unit_cell()
556
+
557
+ batch = 0
558
+ ds = []
559
+
560
+ for chunk in loader.read_crystfel(
561
+ peak_list_columns=peak_list_columns,
562
+ use_ray=parallel,
563
+ num_cpus=num_cpus,
564
+ address=address,
565
+ **ray_kwargs,
566
+ ):
567
+ for peak_list in chunk["peak_lists"]:
568
+ _ds = DataSet(
569
+ peak_list,
570
+ columns=peak_list_columns,
571
+ cell=cell,
572
+ spacegroup=spacegroup,
573
+ merged=False,
574
+ )
575
+ _ds["BATCH"] = batch
576
+ ds.append(_ds)
577
+ batch += 1
578
+
579
+ ds = concat(ds, axis=0, check_isomorphous=False, copy=False, ignore_index=True)
580
+
581
+ mtz_dtypes = {
582
+ "H": "H",
583
+ "K": "H",
584
+ "L": "H",
585
+ "I": "J",
586
+ "SigI": "Q",
587
+ "BATCH": "B",
588
+ }
589
+ for k in ds:
590
+ mtz_dtypes[k] = mtz_dtypes.get(k, "R")
591
+
592
+ ds = ds.astype(mtz_dtypes, copy=False)
593
+ ds.set_index(["H", "K", "L"], inplace=True)
594
+
595
+ return ds