servalcat 0.4.88__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of servalcat might be problematic. Click here for more details.

Files changed (45) hide show
  1. servalcat/__init__.py +10 -0
  2. servalcat/__main__.py +120 -0
  3. servalcat/ext.cp313-win_amd64.pyd +0 -0
  4. servalcat/refine/__init__.py +0 -0
  5. servalcat/refine/cgsolve.py +100 -0
  6. servalcat/refine/refine.py +823 -0
  7. servalcat/refine/refine_geom.py +220 -0
  8. servalcat/refine/refine_spa.py +345 -0
  9. servalcat/refine/refine_xtal.py +268 -0
  10. servalcat/refine/spa.py +136 -0
  11. servalcat/refine/xtal.py +273 -0
  12. servalcat/refmac/__init__.py +0 -0
  13. servalcat/refmac/exte.py +182 -0
  14. servalcat/refmac/refmac_keywords.py +639 -0
  15. servalcat/refmac/refmac_wrapper.py +403 -0
  16. servalcat/spa/__init__.py +0 -0
  17. servalcat/spa/fofc.py +473 -0
  18. servalcat/spa/fsc.py +387 -0
  19. servalcat/spa/localcc.py +188 -0
  20. servalcat/spa/realspcc_from_var.py +128 -0
  21. servalcat/spa/run_refmac.py +972 -0
  22. servalcat/spa/shift_maps.py +293 -0
  23. servalcat/spa/shiftback.py +137 -0
  24. servalcat/spa/translate.py +129 -0
  25. servalcat/utils/__init__.py +35 -0
  26. servalcat/utils/commands.py +1397 -0
  27. servalcat/utils/fileio.py +737 -0
  28. servalcat/utils/generate_operators.py +296 -0
  29. servalcat/utils/hkl.py +712 -0
  30. servalcat/utils/logger.py +116 -0
  31. servalcat/utils/maps.py +345 -0
  32. servalcat/utils/model.py +782 -0
  33. servalcat/utils/refmac.py +760 -0
  34. servalcat/utils/restraints.py +782 -0
  35. servalcat/utils/symmetry.py +295 -0
  36. servalcat/xtal/__init__.py +0 -0
  37. servalcat/xtal/french_wilson.py +256 -0
  38. servalcat/xtal/run_refmac_small.py +240 -0
  39. servalcat/xtal/sigmaa.py +1622 -0
  40. servalcat/xtal/twin.py +115 -0
  41. servalcat-0.4.88.dist-info/METADATA +55 -0
  42. servalcat-0.4.88.dist-info/RECORD +45 -0
  43. servalcat-0.4.88.dist-info/WHEEL +5 -0
  44. servalcat-0.4.88.dist-info/entry_points.txt +4 -0
  45. servalcat-0.4.88.dist-info/licenses/LICENSE +373 -0
servalcat/utils/hkl.py ADDED
@@ -0,0 +1,712 @@
1
+ """
2
+ Author: "Keitaro Yamashita, Garib N. Murshudov"
3
+ MRC Laboratory of Molecular Biology
4
+
5
+ This software is released under the
6
+ Mozilla Public License, version 2.0; see LICENSE.
7
+ """
8
+ from __future__ import absolute_import, division, print_function, generators
9
+ import numpy
10
+ import numpy.lib.recfunctions
11
+ import scipy.optimize
12
+ import pandas
13
+ import gemmi
14
+ from servalcat.utils import logger
15
+
16
+ dtypes64 = dict(i=numpy.int64, u=numpy.uint64, f=numpy.float64, c=numpy.complex128)
17
+ to64 = lambda x: x.astype(dtypes64.get(x.dtype.kind, x.dtype))
18
+
19
def r_factor(fo, fc):
    """Return the R factor sum(|Fo-Fc|)/sum(Fo); NaN for empty input."""
    if fo.size == 0:
        return numpy.nan
    diff_sum = numpy.nansum(numpy.abs(fo - fc))
    return diff_sum / numpy.nansum(fo)
23
def correlation(obs, calc):
    """Pearson correlation between obs and calc, ignoring non-finite entries.

    Returns NaN for empty input or when no pair is finite.
    """
    if obs.size == 0:
        return numpy.nan
    # mask non-finite values in BOTH arrays; the original only checked obs,
    # so a NaN/inf in calc silently propagated into the coefficient
    sel = numpy.isfinite(obs) & numpy.isfinite(calc)
    if not numpy.any(sel):
        return numpy.nan
    return numpy.corrcoef(obs[sel], calc[sel])[0, 1]
28
+
29
def df_from_asu_data(asu_data, label):
    """Convert a gemmi AsuData into a DataFrame with H,K,L plus value (and sigma) columns."""
    df = pandas.DataFrame(data=asu_data.miller_array, columns=["H", "K", "L"])
    values = asu_data.value_array
    if values.dtype.names == ('value', 'sigma'):
        # structured value/sigma array becomes two separate 64-bit columns
        df[label] = to64(values["value"])
        df["SIG" + label] = to64(values["sigma"])
    else:
        df[label] = to64(values)
    return df
38
+
39
def df_from_raw(miller_array, value_array, label):
    """Build a DataFrame of Miller indices plus one data column from raw arrays."""
    ret = pandas.DataFrame(data=miller_array, columns=["H", "K", "L"])
    ret[label] = to64(value_array)
    return ret
44
+
45
def hkldata_from_asu_data(asu_data, label):
    """Wrap a gemmi AsuData into a new HklData with a single data column."""
    return HklData(asu_data.unit_cell, asu_data.spacegroup,
                   df_from_asu_data(asu_data, label))
# hkldata_from_asu_data()
49
+
50
def mtz_find_data_columns(mtz, require_sigma=True):
    """Locate measured-data columns in an MTZ object.

    Returns a dict keyed by MTZ column type ("J", "F", "K", "G"); each value is
    a list of label groups: [data] or [data, sigma] for mean data, and
    [plus, sig_plus, minus, sig_minus] for anomalous pairs. Anomalous pairs
    (types K/M and G/L) always require matching sigma columns.
    """
    col_types = {c.label: c.type for c in mtz.columns}
    found = {"J": [], "F": [], "K": [], "G": []}
    for label, typ in col_types.items():
        if typ in ("J", "F"):
            group = [label]
            sig = "SIG" + label
            if col_types.get(sig) == "Q":
                group.append(sig)
            elif require_sigma:
                continue  # no usable sigma column; skip this label
            found[typ].append(group)
        elif typ in ("K", "G") and label.endswith(("(+)", "plus")):
            # we always need sigma - right?
            if label.endswith("(+)"):
                minus = label.replace("(+)", "(-)")
            else:
                minus = label.replace("plus", "minus")
            sig_type = {"K": "M", "G": "L"}[typ]
            if (col_types.get(minus) == typ and
                    col_types.get("SIG" + label) == sig_type and
                    col_types.get("SIG" + minus) == sig_type):
                found[typ].append([label, "SIG" + label, minus, "SIG" + minus])
    return found
# mtz_find_data_columns()
73
+
74
def mtz_find_free_columns(mtz):
    """Return labels of integer MTZ columns whose name is a known free-flag name."""
    free_names = ("FREE", "RFREE", "FREER", "FreeR_flag", "R-free-flags", "FreeRflag")
    col_types = {c.label: c.type for c in mtz.columns}
    return [lab for lab, typ in col_types.items()
            if typ == "I" and lab in free_names]
# mtz_find_free_columns()
84
+
85
def hkldata_from_mtz(mtz, labels, newlabels=None, require_types=None):
    """Build an HklData from selected columns of a gemmi.Mtz.

    labels: MTZ column labels to keep.
    newlabels: optional new names, same length as labels. An empty string ""
        marks a phase column (type P) to be folded into the preceding
        amplitude column (type F) as a complex value.
    require_types: optional list of expected MTZ column types; None entries
        are not checked.
    Raises RuntimeError on missing columns or type mismatches.
    """
    assert type(mtz) == gemmi.Mtz
    notfound = set(labels) - set(mtz.column_labels())
    if notfound:
        raise RuntimeError("MTZ columns not found: {}".format(" ".join(notfound)))
    col_types = {x.label:x.type for x in mtz.columns}
    if require_types:
        mismatches = [l for l,r in zip(labels, require_types) if r is not None and r != col_types[l]]
        if mismatches:
            raise RuntimeError("MTZ column types mismatch: {}".format(" ".join(mismatches)))

    df = pandas.DataFrame(data=numpy.array(mtz, copy=False), columns=mtz.column_labels())
    # Miller indices as int32; batch/flag-like columns as pandas nullable Int64
    df = df.astype({col: 'int32' for col in col_types if col_types[col] == "H"})
    df = df.astype({col: 'Int64' for col in col_types if col_types[col] in ("B", "Y", "I")}) # pandas's nullable int
    # drop everything that was not requested (keeping H,K,L)
    for lab in set(mtz.column_labels()).difference(labels+["H","K","L"]):
        del df[lab]

    if newlabels is not None:
        assert len(newlabels) == len(labels)
        for i in range(1, len(newlabels)):
            if newlabels[i] == "": # means this is phase and should be transferred to previous column
                assert col_types.get(labels[i]) == "P"
                assert col_types.get(labels[i-1]) == "F"
                # combine amplitude and phase (degrees) into one complex column
                ph = numpy.deg2rad(df[labels[i]])
                df[labels[i-1]] = df[labels[i-1]] * (numpy.cos(ph) + 1j * numpy.sin(ph))
                del df[labels[i]]

        df.rename(columns={x:y for x,y in zip(labels, newlabels) if y != ""}, inplace=True)

    return HklData(mtz.cell, mtz.spacegroup, df)
# hkldata_from_mtz()
116
+
117
def df_from_twin_data(twin_data, fc_labs):
    """Build a DataFrame of the twin-data ASU indices plus calculated-F columns."""
    ret = pandas.DataFrame(data=twin_data.asu, columns=list("HKL"))
    ret[fc_labs] = twin_data.f_calc
    return ret
122
+
123
def blur_mtz(mtz, B):
    """Apply a B-factor blur to all intensity/amplitude columns of *mtz* in place.

    Intensity columns (types J/K) and their sigmas are multiplied by
    exp(-B*s^2/2); amplitude columns (types F/D/G) and their sigmas by
    exp(-B*s^2/4).
    """
    s2 = mtz.make_1_d2_array()
    k2 = numpy.exp(-B*s2/2)
    k = numpy.exp(-B*s2/4)
    i_labs = [c.label for c in mtz.columns if c.type in "JK"]
    f_labs = [c.label for c in mtz.columns if c.type in "FDG"]
    all_labels = mtz.column_labels()
    # collect the matching sigma columns from a snapshot; the original code
    # appended to the list while iterating over it, which only worked because
    # "SIGSIG..." labels never exist
    for labs in (i_labs, f_labs):
        labs.extend("SIG" + l for l in tuple(labs) if "SIG" + l in all_labels)

    if i_labs:
        logger.writeln("Intensities: {}".format(" ".join(i_labs)))
        logger.writeln(" exp(-B*s^2/2) will be multiplied (B= {:.2f})".format(B))
    if f_labs:
        logger.writeln("Amplitudes: {}".format(" ".join(f_labs)))
        logger.writeln(" exp(-B*s^2/4) will be multiplied (B= {:.2f})".format(B))

    for l in i_labs:
        c = mtz.column_with_label(l)
        c.array[:] *= k2
    for l in f_labs:
        c = mtz.column_with_label(l)
        c.array[:] *= k
# blur_mtz()
150
+
151
def mtz_selected(mtz, columns):
    """
    creates a new mtz object having specified `columns` of `mtz`
    """
    wanted = ["H", "K", "L"] + columns # TODO make sure no duplicates
    col_dict = {}
    col_idxes = {}
    for i, c in enumerate(mtz.columns):
        col_dict[c.label] = c
        col_idxes[c.label] = i

    notfound = list(set(wanted) - set(col_idxes))
    if notfound:
        raise RuntimeError("specified columns not found: {}".format(str(notfound)))

    # copy metadata
    mtz2 = gemmi.Mtz()
    for attr in ("spacegroup", "cell", "history", "title"):
        setattr(mtz2, attr, getattr(mtz, attr))

    for ds in mtz.datasets:
        ds2 = mtz2.add_dataset("")
        for attr in ("cell", "id", "crystal_name", "dataset_name", "project_name", "wavelength"):
            setattr(ds2, attr, getattr(ds, attr))

    # copy selected columns
    for lab in wanted:
        src = col_dict[lab]
        mtz2.add_column(lab, src.type, dataset_id=src.dataset_id, expand_data=False)

    sel = [col_idxes[lab] for lab in wanted]
    mtz2.set_data(numpy.array(mtz, copy=False)[:, sel])
    return mtz2
# mtz_selected()
183
+
184
def decide_n_bins(n_per_bin, s_array, power=2, min_bins=1, max_bins=50):
    """Decide the number of resolution bins so the innermost bin holds ~n_per_bin values.

    s_array: array of 1/d values; bins are equal-width in s**power.
    The result is clamped to [min_bins, max_bins]; pass None to disable a clamp.
    """
    sp = numpy.sort(s_array)**power
    spmin, spmax = sp[0], sp[-1]
    n_bins = 1
    if n_per_bin <= len(sp):
        # Decide n_bins so that inner-shell has requested number
        width = sp[n_per_bin - 1] - spmin
        if width > 0:
            n_bins = int((spmax - spmin) / width)
        else:
            # degenerate case (first n_per_bin values identical): the original
            # divided by zero here; request the finest binning and let the
            # max_bins clamp below take over
            n_bins = max_bins if max_bins is not None else 1
    if min_bins is not None:
        n_bins = max(n_bins, min_bins)
    if max_bins is not None:
        n_bins = min(n_bins, max_bins)
    return n_bins
# decide_n_bins()
198
+
199
def fft_map(cell, sg, miller_array, data, grid_size=None, sample_rate=3):
    """FFT complex structure factors into a real-space map grid."""
    if data is not None:
        # we may want to keep complex128?
        data = data.astype(numpy.complex64)
    asu = gemmi.ComplexAsuData(cell, sg, miller_array, data)
    if grid_size is None:
        return asu.transform_f_phi_to_map(sample_rate=sample_rate, exact_size=(0, 0, 0)) # half_l=True
    return gemmi.transform_f_phi_grid_to_map(asu.get_f_phi_on_grid(grid_size)) # half_l=False
# fft_map()
209
+
210
class HklData:
    """Reflection-data container: unit cell, space group, and a per-reflection DataFrame."""
    def __init__(self, cell, sg, df=None, binned_df=None):
        # cell/sg: unit cell and space group objects
        # df: DataFrame whose columns include H, K, L plus data columns
        # binned_df: optional per-bin statistics (d_max/d_min etc.)
        self.cell = cell
        self.sg = sg
        self.df = df
        self.binned_df = binned_df
        # (bin number, row indices) pairs, filled by setup_binning()/setup_relion_binning()
        self._bin_and_indices = []
    # __init__()
218
+
219
    def update_cell(self, cell):
        """Replace the unit cell (not implemented).

        NOTE(review): this is a stub — it neither stores *cell* nor updates
        the "d" column; confirm whether any caller relies on it.
        """
        # update d
        pass
222
+
223
    def switch_to_asu(self):
        """Map all Miller indices into the ASU of self.sg, in place (real data only)."""
        # Need to care phases
        assert not any(numpy.iscomplexobj(self.df[x]) for x in self.df)
        hkl = self.miller_array().to_numpy()
        self.sg.switch_to_asu(hkl)
        self.df[["H","K","L"]] = hkl
        # in some environment type changes to int64 even though hkl's dtype is int32
        # it causes a problem in self.debye_waller_factors()
        self.df = self.df.astype({x: numpy.int32 for x in "HKL"})
232
+
233
+ def copy(self, d_min=None, d_max=None):
234
+ # FIXME we should reset_index here? after resolution truncation, max(df.index) will be larger than size.
235
+ if (d_min, d_max).count(None) == 2:
236
+ df = self.df.copy()
237
+ binned_df = self.binned_df.copy() if self.binned_df is not None else None
238
+ else:
239
+ if d_min is None: d_min = 0
240
+ if d_max is None: d_max = float("inf")
241
+ d = self.d_spacings()
242
+ sel = (d >= d_min) & (d <= d_max)
243
+ df = self.df[sel].copy()
244
+ binned_df = None # no way to keep it
245
+
246
+ return HklData(self.cell, self.sg, df, binned_df)
247
+ # copy()
248
+
249
+ def selected(self, sel):
250
+ df = self.df[sel].copy()
251
+ return HklData(self.cell, self.sg, df)
252
+
253
+ def merge_asu_data(self, asu_data, label, common_only=True):
254
+ if self.df is not None and label in self.df:
255
+ raise Exception("Duplicated label")
256
+
257
+ df_tmp = df_from_asu_data(asu_data, label)
258
+
259
+ if self.df is None:
260
+ self.df = df_tmp
261
+ elif common_only:
262
+ self.df = self.df.merge(df_tmp)
263
+ else:
264
+ self.df = self.df.merge(df_tmp, how="outer")
265
+ # merge_asu_data()
266
+
267
+ def miller_array(self):
268
+ return self.df[["H","K","L"]]
269
+
270
    def s_array(self):
        """Return reciprocal-space vectors s = hkl * F (fractionalization matrix), shape (N, 3)."""
        hkl = self.miller_array()
        return numpy.dot(hkl, self.cell.fractionalization_matrix)
273
+
274
    def ssq_mat(self):
        """Return the (6, N) matrix R so that s^T B s / 4 = R · (B11,B22,B33,B12,B13,B23)."""
        # k_aniso = exp(-s^T B_aniso s / 4)
        # s^T B s / 4 can be reformulated as R b where R = 1x6 matrix and b = 6x1 matrix
        # here R for all indices is returned with shape of (6, N)
        # x[None,:].T <= (N, 6, 1)
        # x.T[:,None] <= (N, 1, 6) they can be matmul'ed.
        svecs = self.s_array()
        tmp = (0.25 * svecs[:,0]**2, 0.25 * svecs[:,1]**2, 0.25 * svecs[:,2]**2,
               0.5 * svecs[:,0] * svecs[:,1], 0.5 * svecs[:,0] * svecs[:,2], 0.5 * svecs[:,1] * svecs[:,2])
        return numpy.array(tmp)
    # aniso_s_u_s_as_left_mat()
285
+
286
    def debye_waller_factors(self, b_cart=None, b_iso=None):
        """Return Debye-Waller factors exp(-B s^2 / 4) per reflection.

        b_iso: isotropic B value; b_cart: anisotropic B tensor in Cartesian
        basis (object providing transformed_by / r_u_r, e.g. gemmi SMat33).
        NOTE(review): returns None when both arguments are None — confirm
        callers always pass exactly one of them.
        """
        if b_iso is not None:
            s2 = 1 / self.d_spacings()**2
            return numpy.exp(-b_iso / 4 * s2)
        if b_cart is not None:
            # transform the Cartesian tensor into the reciprocal basis first
            b_star = b_cart.transformed_by(self.cell.fractionalization_matrix)
            return numpy.exp(-b_star.r_u_r(self.miller_array().to_numpy()) / 4)
293
+
294
    def calc_d(self):
        """Compute d-spacings from the unit cell and store them as column "d"."""
        self.df["d"] = self.cell.calculate_d_array(self.miller_array())
    # calc_d()
297
+
298
    def calc_epsilon(self):
        """Store the epsilon factor (without centering) of each reflection as column "epsilon"."""
        self.df["epsilon"] = self.sg.operations().epsilon_factor_without_centering_array(self.miller_array())
    # calc_epsilon()
301
+
302
    def calc_centric(self):
        """Store centric flags (1 = centric, 0 = acentric) as column "centric"."""
        self.df["centric"] = self.sg.operations().centric_flag_array(self.miller_array()).astype(int)
    # calc_centric()
305
+
306
+ def d_spacings(self):
307
+ if "d" not in self.df or self.df.d.isnull().values.any():
308
+ self.calc_d()
309
+ return self.df.d
310
+ # calc_d()
311
+
312
    def sort_by_resolution(self, ascending=False):
        """Sort the reflection table by d-spacing in place (default: largest d first)."""
        self.d_spacings()  # ensure the "d" column exists
        self.df.sort_values("d", ascending=ascending, inplace=True)
    # sort_by_resolution()
316
+
317
+ def d_min_max(self):
318
+ d = self.d_spacings()
319
+ return numpy.min(d), numpy.max(d)
320
+ # d_min_max()
321
+
322
    def complete(self):
        """Add rows for reflections missing within the current resolution range.

        Added rows get H,K,L and a recomputed "d"; data columns are left NaN.
        """
        # make complete set
        d_min, d_max = self.d_min_max()
        all_hkl = gemmi.make_miller_array(self.cell, self.sg, d_min, d_max)
        match = gemmi.HklMatch(self.miller_array(), all_hkl)
        # rows of all_hkl with no match in self (pos < 0) are the missing ones
        missing_hkl_df = pandas.DataFrame(all_hkl[numpy.asarray(match.pos) < 0], columns=["H","K","L"])
        self.df = pandas.concat([self.df, missing_hkl_df])
        logger.writeln("Completing hkldata: {} reflections were missing".format(len(missing_hkl_df.index)))
        self.calc_d()
    # complete()
332
+
333
+ def completeness(self, label=None):
334
+ if label is None:
335
+ n_missing = numpy.sum(self.df.isna().any(axis=1))
336
+ else:
337
+ n_missing = numpy.sum(self.df[label].isna())
338
+ n_all = len(self.df.index)
339
+ return (n_all-n_missing)/n_all
340
+ # completeness()
341
+
342
    def setup_binning(self, n_bins, method=gemmi.Binner.Method.Dstar2):
        """Assign each reflection to one of n_bins resolution bins via gemmi.Binner.

        Fills df["bin"], self._bin_and_indices and self.binned_df (with per-bin
        d_max/d_min). Resets the DataFrame index so positional indexing works.
        """
        self.df.reset_index(drop=True, inplace=True)
        s2 = 1/self.d_spacings().to_numpy()**2
        binner = gemmi.Binner()
        binner.setup_from_1_d2(n_bins, method, s2, self.cell)
        self._bin_and_indices = []
        d_limits = 1 / numpy.sqrt(binner.limits)
        bin_number = binner.get_bins_from_1_d2(s2)
        d_max_all = []
        d_min_all = []
        for i in range(binner.size):
            # bin edges: overall d extremes at the two ends, binner limits between
            left = numpy.max(self.d_spacings()) if i == 0 else d_limits[i-1]
            right = numpy.min(self.d_spacings()) if i == binner.size -1 else d_limits[i]
            sel = numpy.where(bin_number==i)[0] # slow?
            d_max_all.append(left)
            d_min_all.append(right)
            self._bin_and_indices.append((i, sel))

        self.df["bin"] = bin_number
        self.binned_df = pandas.DataFrame(dict(d_max=d_max_all, d_min=d_min_all), index=list(range(binner.size)))
    # setup_binning()
363
+
364
    def setup_relion_binning(self, sort=False):
        """Assign RELION-style resolution bins (Fourier shell index = max_edge/d + 0.5).

        Sparse shells at either end are merged with their neighbours. Fills
        df["bin"], self._bin_and_indices and self.binned_df (d_max/d_min per bin).
        """
        max_edge = max(self.cell.parameters[:3])
        if sort:
            self.sort_by_resolution()
        self.df.reset_index(drop=True, inplace=True) # to allow numpy.array indexing

        self.df["bin"] = (max_edge/self.d_spacings()+0.5).astype(int)
        # Merge inner/outer shells if too few # TODO smarter way
        bin_counts = []
        bin_ranges = {}
        modify_table = {}
        for i_bin, g in self.df.groupby("bin", sort=True):
            if i_bin == 0: continue # ignore DC component
            bin_counts.append([i_bin, g.index])
            bin_ranges[i_bin] = (numpy.max(g.d), numpy.min(g.d))

        # merge consecutive sparse low-resolution (inner) shells into the next shell
        for i in range(len(bin_counts)):
            if len(bin_counts[i][1]) < 10 and i < len(bin_counts)-1:
                bin_counts[i+1][1] = bin_counts[i+1][1].union(bin_counts[i][1])
                modify_table[bin_counts[i][0]] = bin_counts[i+1][0]
                logger.writeln("Bin {} only has {} data. Merging with next bin.".format(bin_counts[i][0],
                                                                                       len(bin_counts[i][1])))
            else: break

        # merge an underpopulated outermost shell into the previous one
        for i in reversed(range(len(bin_counts))):
            if i > 0 and len(bin_counts[i][1])/len(bin_counts[i-1][1]) < 0.5:
                bin_counts[i-1][1] = bin_counts[i-1][1].union(bin_counts[i][1])
                modify_table[bin_counts[i][0]] = bin_counts[i-1][0]
                logger.writeln("Bin {} only has {} data. Merging with previous bin.".format(bin_counts[i][0],
                                                                                           len(bin_counts[i][1])))
            else: break

        # collapse chains of redirections (a->b, b->c becomes a->c)
        while True:
            flag = True
            for i_bin in modify_table:
                if modify_table[i_bin] in modify_table:
                    modify_table[i_bin] = modify_table[modify_table[i_bin]]
                    flag = False
            if flag: break

        # apply the redirections to the bin column and widen the merged ranges
        for i_bin in modify_table:
            new_bin = modify_table[i_bin]
            self.df["bin"] = numpy.where(self.df["bin"].to_numpy() == i_bin, new_bin, self.df["bin"].to_numpy())
            bin_ranges[new_bin] = (max(bin_ranges[i_bin][0], bin_ranges[new_bin][0]),
                                   min(bin_ranges[i_bin][1], bin_ranges[new_bin][1]))

        self._bin_and_indices = []
        bin_all = []
        d_max_all = []
        d_min_all = []
        for i_bin, indices in bin_counts:
            if i_bin in modify_table: continue
            #if sort: # want this, but we cannot take len() for slice. we can add ncoeffs to binned_df
            #    self._bin_and_indices.append((i_bin, slice(numpy.min(indices), numpy.max(indices))))
            #else:
            self._bin_and_indices.append((i_bin, indices))

            bin_all.append(i_bin)
            d_max_all.append(bin_ranges[i_bin][0])
            d_min_all.append(bin_ranges[i_bin][1])
        self.binned_df = pandas.DataFrame(dict(d_max=d_max_all, d_min=d_min_all), index=bin_all)
    # setup_relion_binning()
426
+
427
+ def binned_data_as_array(self, lab):
428
+ vals = numpy.zeros(len(self.df.index), dtype=self.binned_df[lab].dtype)
429
+ for i_bin, idxes in self.binned():
430
+ vals[idxes] = self.binned_df[lab][i_bin]
431
+ return vals
432
+ # binned_data_as_array()
433
+
434
    def binned(self):
        """Return the (bin number, row indices) pairs set up by setup_binning()/setup_relion_binning()."""
        return self._bin_and_indices
436
+
437
+ def columns(self):
438
+ return [x for x in self.df.columns if x not in "HKL"]
439
+
440
    def merge(self, other, common_only=True):
        """Merge another reflection table into this one (thin wrapper over merge_df)."""
        self.merge_df(other, common_only)
    # merge()
443
+
444
    def merge_df(self, other, common_only=True):
        """Merge another DataFrame of reflections into self.df on the shared columns.

        NOTE(review): the common_only=False branch computes the outer join and
        splits it into left/right/both parts, but never assigns anything back
        to self.df — it is effectively a no-op. Confirm intended behaviour
        before relying on it.
        """
        # TODO check space group, cell
        # TODO transform to asu with phase shifts
        # TODO check column labels. same names other than HKL?
        #
        if common_only:
            self.df = self.df.merge(other)
        else:
            df = self.df.merge(other, indicator=True, how="outer")
            df_left = df[df._merge=="left_only"]
            df_right = df[df._merge=="right_only"]
            df_both = df[df._merge=="both"]
    # merge()
457
+
458
    def guess_free_number(self, obs):
        """Guess which value in the hard-coded "FREE" column marks the test set.

        obs: label of an observation column; only rows where it is present
        are counted. Returns the smallest flag value whose count is below half
        of the data (else the smallest flag overall).
        Raises RuntimeError on missing flags or a constant flag column.
        """
        logger.writeln("Guessing test flag number")
        sel = ~self.df[obs].isna()
        free = self.df.loc[sel, "FREE"]
        threshold = len(free.index) / 2
        if free.isna().any():
            raise RuntimeError("missing or invalid test flag")
        counts = self.df.loc[sel, "FREE"].value_counts().sort_index()
        logger.writeln(counts.to_string(header=False))
        if len(counts.index) < 2:
            raise RuntimeError("this does not appear to be test flag")
        # the free set should be a minority class
        good_flags = [n for n, c in counts.items() if c < threshold]
        if len(good_flags) > 0:
            flag_num = min(good_flags)
        else:
            flag_num = min(counts.index)
        logger.writeln(" best guess: free = {}".format(flag_num))
        return flag_num
    # guess_free_number()
477
+
478
+ def as_numpy_arrays(self, labels, omit_nan=True):
479
+ tmp = self.df[labels]
480
+ if omit_nan: tmp = tmp[~tmp.isna().any(axis=1)]
481
+ return [tmp[lab].to_numpy() for lab in labels]
482
+ # as_numpy_arrays()
483
+
484
+ def remove_nonpositive(self, label):
485
+ sel = self.df[label] <= 0
486
+ n_bad = sel.sum()
487
+ if n_bad > 0:
488
+ logger.writeln("Removing {} reflections with {}<=0".format(n_bad, label))
489
+ self.df = self.df[~sel]
490
+ # remove_nonpositive()
491
+
492
    def remove_systematic_absences(self):
        """Drop reflections that are systematically absent in self.sg."""
        is_absent = self.sg.operations().systematic_absences(self.miller_array())
        n_absent = numpy.sum(is_absent)
        if n_absent > 0:
            logger.writeln("Removing {} systematic absences".format(n_absent))
            self.df = self.df[~is_absent]
    # remove_systematic_absences()
499
+
500
+ def merge_anomalous(self, labs, newlabs):
501
+ assert len(labs) == 4 # i+,sigi+,i-,sigi- for example
502
+ assert len(newlabs) == 2
503
+ # skipna=True is default, so missing value is handled nicely.
504
+ self.df[newlabs[0]] = self.df[[labs[0], labs[2]]].mean(axis=1)
505
+ self.df[newlabs[1]] = self.df[[labs[1], labs[3]]].pow(2).mean(axis=1).pow(0.5)
506
+
507
+ def as_asu_data(self, label=None, data=None, label_sigma=None):
508
+ if label is None: assert data is not None
509
+ else: assert data is None
510
+
511
+ if label_sigma is not None:
512
+ assert data is None
513
+ assert not numpy.iscomplexobj(self.df[label])
514
+ sigma = self.df[label_sigma]
515
+ data = numpy.lib.recfunctions.unstructured_to_structured(self.df[[label,label_sigma]].to_numpy(),
516
+ numpy.dtype([("value", numpy.float32), ("sigma", numpy.float32)]))
517
+ elif data is None:
518
+ data = self.df[label]
519
+
520
+ if numpy.iscomplexobj(data):
521
+ asutype = gemmi.ComplexAsuData
522
+ elif issubclass(data.dtype.type, numpy.integer):
523
+ asutype = gemmi.IntAsuData
524
+ elif label_sigma is not None:
525
+ asutype = gemmi.ValueSigmaAsuData
526
+ else:
527
+ asutype = gemmi.FloatAsuData
528
+
529
+ return asutype(self.cell, self.sg,
530
+ self.miller_array(), data)
531
+ # as_asu_data()
532
+
533
    def fft_map(self, label=None, data=None, grid_size=None, sample_rate=3):
        """FFT column *label* (or the given complex array) into a real-space map; see module-level fft_map."""
        if data is None:
            data = self.df[label]
        return fft_map(self.cell, self.sg, self.miller_array(), data, grid_size, sample_rate)
    # fft_map()
538
+
539
+ def d_eff(self, label):
540
+ # Effective resolution defined using FSC
541
+ fsc = self.binned_df[label]
542
+ a = 0.
543
+ for i_bin, idxes in self.binned():
544
+ a += len(idxes) * fsc[i_bin]
545
+
546
+ fac = (a/len(self.df.index))**(1/3.)
547
+ d_min = self.d_min_max()[0]
548
+ ret = d_min/fac
549
+ return ret
550
+ # d_eff()
551
+
552
    def hard_sphere_kernel(self, r_ang, grid_size):
        """Real-space map of a normalized hard-sphere kernel of radius r_ang (Å).

        Uses the analytic Fourier transform of a solid sphere,
        3(sin t - t cos t)/t^3 with t = 2*pi*s*r, then adds the F000 term and
        normalizes the map to unit sum.
        NOTE(review): t=0 (a 000 reflection) would produce NaN here —
        presumably 000 is never present in df; confirm.
        """
        s = 1. / self.d_spacings()
        t = 2 * numpy.pi * s * r_ang
        F_kernel = 3. * (-t * numpy.cos(t) + numpy.sin(t)) / t**3
        knl = self.fft_map(data=F_kernel, grid_size=grid_size)
        knl.array[:] += 1. / knl.unit_cell.volume # F000
        knl.array[:] /= numpy.sum(knl.array)
        return knl
    # hard_sphere_kernel()
561
+
562
    def scale_k_and_b(self, lab_ref, lab_scaled, debug=False):
        """Estimate scale k and B-factor relating two amplitude columns.

        Minimizes (|f_ref| - k * exp(-B s^2/4) * |f_scaled|)^2: first a
        closed-form estimate in log space, then Newton-CG refinement; returns
        the (k, B) of whichever step yields the lower R factor.
        """
        logger.writeln("Determining k, B scales between {} and {}".format(lab_ref, lab_scaled))
        s2 = 1/self.d_spacings().to_numpy()**2
        # determine scales that minimize (|f1|-|f2|*k*e^(-b*s2/4))^2
        f1 = self.df[lab_ref].to_numpy()
        f2 = self.df[lab_scaled].to_numpy()
        if numpy.iscomplexobj(f1): f1 = numpy.abs(f1)
        if numpy.iscomplexobj(f2): f2 = numpy.abs(f2)

        sel_pos = numpy.logical_and(f1 > 0, f2 > 0) # this filters nan as well
        f1p, f2p, s2p = f1[sel_pos], f2[sel_pos], s2[sel_pos]

        # 1st step: minimize (log(|f1|)-log(|f2|*e^k*e^(-b*s2/4)))^2 starting with k=1, b=0.
        tmp = numpy.log(f2p) - numpy.log(f1p)
        # g = [dT/dk, dT/db]
        g = numpy.array([2 * numpy.sum(tmp), -numpy.sum(tmp*s2p)/2])
        H = numpy.zeros((2,2))
        H[0,0] = 2*len(f1p)
        H[1,1] = numpy.sum(s2p**2/8)
        H[0,1] = H[1,0] = -numpy.sum(s2p)/2
        # one Newton step of the quadratic log-space problem
        x = -numpy.dot(numpy.linalg.inv(H), g)
        k1 = numpy.exp(x[0])
        B1 = x[1]
        logger.writeln(" initial estimate using log: k= {:.2e} B= {:.2e}".format(k1, B1))
        f2tmp = f2 * k1 * numpy.exp(-B1*s2/4)
        r_step0 = r_factor(f1, f2)
        r_step1 = r_factor(f1, f2tmp)
        logger.writeln(" R= {:.4f} (was: {:.4f})".format(r_step1, r_step0))

        # 2nd step: - minimize (|f1|-|f2|*k*e^(-b*s2/4))^2 iteratively (TODO with regularisation)

        def grad2(x):
            # gradient of the least-squares target w.r.t. (k, B)
            t = numpy.exp(-x[1]*s2/4)
            tmp = (f1-f2*x[0]*t)*f2*t
            return numpy.array([-2.*numpy.nansum(tmp),
                                0.5*x[0]*numpy.nansum(tmp*s2)])

        def hess2(x):
            # Hessian of the least-squares target w.r.t. (k, B)
            h = numpy.zeros((2, 2))
            t = numpy.exp(-x[1]*s2/4)
            t2 = t**2
            h[0,0] = numpy.nansum(f2**2 * t2) * 2
            h[1,1] = numpy.nansum(f2 * s2**2/4 * (-f1/2*t + f2*x[0]*t2)) * x[0]
            h[1,0] = numpy.nansum(f2 * s2 * (f1/2*t - f2*x[0]*t2))
            h[0,1] = h[1,0]
            return h

        res = scipy.optimize.minimize(fun=lambda x: numpy.nansum((f1-f2*x[0]*numpy.exp(-x[1]*s2/4))**2),
                                      jac=grad2,
                                      hess=hess2,
                                      method="Newton-CG",
                                      x0=numpy.array([k1, B1]),
                                      )
        if debug:
            logger.writeln(str(res))
        k2, B2 = res.x
        f2tmp2 = f2 * k2 * numpy.exp(-B2*s2/4)
        r_step2 = r_factor(f1, f2tmp2)
        logger.writeln(" Least-square estimate: k= {:.2e} B= {:.2e}".format(k2, B2))
        logger.writeln(" R= {:.4f}".format(r_step2))

        if 0:
            # NOTE(review): dead debug-plot branch; it references an undefined
            # name `hkldata` (should probably be `self`) — fix before enabling.
            self.setup_binning(40)
            x = []
            y0,y1,y2,y3=[],[],[],[]
            for i_bin, idxes in self.binned():
                bin_d_min = hkldata.binned_df.d_min[i_bin]
                bin_d_max = hkldata.binned_df.d_max[i_bin]
                x.append(1/bin_d_min**2)
                y0.append(numpy.average(f1[idxes]))
                y1.append(numpy.average(f2[idxes]))
                y2.append(numpy.average(f2tmp[idxes]))
                y3.append(numpy.average(f2tmp2[idxes]))

            import matplotlib.pyplot as plt
            plt.plot(x, y0, label="FC")
            plt.plot(x, y1, label="FP")
            plt.plot(x, y2, label="FP,scaled")
            plt.plot(x, y3, label="FP,scaled2")
            plt.legend()
            plt.show()

        # return whichever estimate achieved the lower R factor
        if r_step2 < r_step1:
            return k2, B2
        else:
            return k1, B1
    # scale_k_and_b()
649
+
650
+ def translation_factor(self, shift):
651
+ if type(shift) != gemmi.Position:
652
+ shift = gemmi.Position(*shift)
653
+ return numpy.exp(2.j*numpy.pi*numpy.dot(self.miller_array(),
654
+ self.cell.fractionalize(shift).tolist()))
655
+ # translation_factor()
656
    def translate(self, lab, shift):
        """Apply the phase shift for real-space translation *shift* to complex column *lab*, in place."""
        # apply phase shift
        assert numpy.iscomplexobj(self.df[lab])
        self.df[lab] *= self.translation_factor(shift)
    # translate()
661
+
662
    def write_mtz(self, mtz_out, labs, types=None, phase_label_decorator=None,
                  exclude_000=True):
        """Write the columns *labs* to the MTZ file *mtz_out*.

        Complex columns are written as amplitude+phase pairs (MTZ types F/P);
        integer columns default to type "I" and everything else to "R" unless
        overridden via *types* (dict label->type). phase_label_decorator maps
        an amplitude label to its phase label; exclude_000 drops (0,0,0).
        """
        logger.writeln("Writing MTZ file: {}".format(mtz_out))
        if self.sg.ccp4 < 1:
            logger.writeln("WARNING: CCP4-unsupported space group ({})".format(self.sg.xhm()))
        if types is None: types = {}
        if exclude_000:
            df = self.df.query("H!=0 | K!=0 | L!=0")
        else:
            df = self.df

        # complex columns occupy two MTZ columns (amplitude + phase)
        ndata = sum(2 if numpy.iscomplexobj(df[lab]) else 1 for lab in labs)

        data = numpy.empty((len(df.index), ndata + 3), dtype=numpy.float32)
        data[:,:3] = df[["H","K","L"]]
        idx = 3
        for lab in labs:
            if numpy.iscomplexobj(df[lab]):
                data[:,idx] = numpy.abs(df[lab])
                data[:,idx+1] = numpy.angle(df[lab], deg=True)
                idx += 2
            else:
                data[:,idx] = df[lab].to_numpy(numpy.float32, na_value=numpy.nan) # for nullable integers
                idx += 1

        mtz = gemmi.Mtz()
        mtz.spacegroup = self.sg
        mtz.cell = self.cell
        mtz.add_dataset('HKL_base')
        for label in ['H', 'K', 'L']: mtz.add_column(label, 'H')

        for lab in labs:
            if numpy.iscomplexobj(df[lab]):
                mtz.add_column(lab, "F")
                if phase_label_decorator is None:
                    # conventional phase names for well-known map coefficients
                    plab = {"FWT": "PHWT", "DELFWT": "PHDELWT", "FAN":"PHAN"}.get(lab, "PH"+lab)
                else:
                    plab = phase_label_decorator(lab)
                mtz.add_column(plab, "P")
            else:
                typ = types.get(lab)
                if typ is None:
                    if issubclass(df[lab].dtype.type, numpy.integer):
                        typ = "I"
                    else:
                        typ = "R"
                mtz.add_column(lab, typ)

        mtz.set_data(data)
        mtz.write_to_file(mtz_out)
    # write_mtz()