servalcat 0.4.131__cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. servalcat/__init__.py +10 -0
  2. servalcat/__main__.py +120 -0
  3. servalcat/ext.cpython-314t-x86_64-linux-gnu.so +0 -0
  4. servalcat/refine/__init__.py +0 -0
  5. servalcat/refine/cgsolve.py +100 -0
  6. servalcat/refine/refine.py +1162 -0
  7. servalcat/refine/refine_geom.py +245 -0
  8. servalcat/refine/refine_spa.py +400 -0
  9. servalcat/refine/refine_xtal.py +339 -0
  10. servalcat/refine/spa.py +151 -0
  11. servalcat/refine/xtal.py +312 -0
  12. servalcat/refmac/__init__.py +0 -0
  13. servalcat/refmac/exte.py +191 -0
  14. servalcat/refmac/refmac_keywords.py +660 -0
  15. servalcat/refmac/refmac_wrapper.py +423 -0
  16. servalcat/spa/__init__.py +0 -0
  17. servalcat/spa/fofc.py +488 -0
  18. servalcat/spa/fsc.py +391 -0
  19. servalcat/spa/localcc.py +197 -0
  20. servalcat/spa/realspcc_from_var.py +128 -0
  21. servalcat/spa/run_refmac.py +979 -0
  22. servalcat/spa/shift_maps.py +293 -0
  23. servalcat/spa/shiftback.py +137 -0
  24. servalcat/spa/translate.py +129 -0
  25. servalcat/utils/__init__.py +35 -0
  26. servalcat/utils/commands.py +1629 -0
  27. servalcat/utils/fileio.py +836 -0
  28. servalcat/utils/generate_operators.py +296 -0
  29. servalcat/utils/hkl.py +811 -0
  30. servalcat/utils/logger.py +140 -0
  31. servalcat/utils/maps.py +345 -0
  32. servalcat/utils/model.py +933 -0
  33. servalcat/utils/refmac.py +759 -0
  34. servalcat/utils/restraints.py +888 -0
  35. servalcat/utils/symmetry.py +298 -0
  36. servalcat/xtal/__init__.py +0 -0
  37. servalcat/xtal/french_wilson.py +262 -0
  38. servalcat/xtal/run_refmac_small.py +240 -0
  39. servalcat/xtal/sigmaa.py +1954 -0
  40. servalcat/xtal/twin.py +316 -0
  41. servalcat-0.4.131.dist-info/METADATA +60 -0
  42. servalcat-0.4.131.dist-info/RECORD +45 -0
  43. servalcat-0.4.131.dist-info/WHEEL +6 -0
  44. servalcat-0.4.131.dist-info/entry_points.txt +4 -0
  45. servalcat-0.4.131.dist-info/licenses/LICENSE +373 -0
servalcat/utils/hkl.py ADDED
@@ -0,0 +1,811 @@
1
+ """
2
+ Author: "Keitaro Yamashita, Garib N. Murshudov"
3
+ MRC Laboratory of Molecular Biology
4
+
5
+ This software is released under the
6
+ Mozilla Public License, version 2.0; see LICENSE.
7
+ """
8
+ from __future__ import absolute_import, division, print_function, generators
9
+ import numpy
10
+ import scipy.optimize
11
+ import pandas
12
+ import gemmi
13
+ from servalcat.utils import logger
14
+
15
# Mapping from numpy dtype "kind" codes to the corresponding 64-bit dtype.
dtypes64 = dict(i=numpy.int64, u=numpy.uint64, f=numpy.float64, c=numpy.complex128)

def to64(x):
    """Return *x* cast to the 64-bit dtype of the same kind.

    Arrays whose dtype kind is not in ``dtypes64`` (e.g. bool) are returned
    with their dtype unchanged.  Was a lambda; a ``def`` is the idiomatic
    form (PEP 8 E731) and keeps the same call interface.
    """
    return x.astype(dtypes64.get(x.dtype.kind, x.dtype))
17
+
18
def r_factor(fo, fc):
    """Classic R-factor: sum(|fo - fc|) / sum(fo), ignoring NaN entries.

    Returns NaN when the (NaN-ignoring) sum of fo is zero, so the caller
    never sees a division-by-zero.
    """
    total_obs = numpy.nansum(fo)
    if total_obs == 0:
        return numpy.nan
    return numpy.nansum(numpy.abs(fo - fc)) / total_obs
23
def correlation(obs, calc):
    """Pearson correlation of obs vs calc over entries where obs is finite.

    Both inputs are flattened first (anomalous data may arrive as 2-column
    arrays).  Returns NaN for empty input or when no obs value is finite.
    """
    obs = obs.flatten()
    calc = calc.flatten()
    finite = numpy.isfinite(obs)
    if obs.size == 0 or not finite.any():
        return numpy.nan
    return numpy.corrcoef(obs[finite], calc[finite])[0, 1]
29
+
30
def df_from_asu_data(asu_data, label):
    """Convert a gemmi AsuData object into a DataFrame with H,K,L and data column(s).

    For ValueSigmaAsuData two columns are produced: *label* and "SIG"+*label*.
    Values are promoted to 64-bit via to64().
    """
    frame = pandas.DataFrame(data=asu_data.miller_array.astype(numpy.int32),
                             columns=["H", "K", "L"])
    if type(asu_data) is gemmi.ValueSigmaAsuData:
        # value/sigma pairs come as an (N, 2) array
        frame[label] = to64(asu_data.value_array[:, 0])
        frame["SIG" + label] = to64(asu_data.value_array[:, 1])
    else:
        frame[label] = to64(asu_data.value_array)
    return frame
39
+
40
def df_from_raw(miller_array, value_array, label):
    """Build a DataFrame from a raw (N,3) Miller-index array and matching values."""
    frame = pandas.DataFrame(data=miller_array.astype(numpy.int32),
                             columns=["H", "K", "L"])
    frame[label] = to64(value_array)
    return frame
45
+
46
def hkldata_from_asu_data(asu_data, label):
    """Wrap a gemmi AsuData object in an HklData instance."""
    return HklData(asu_data.unit_cell, asu_data.spacegroup,
                   df_from_asu_data(asu_data, label))
# hkldata_from_asu_data()
50
+
51
def mtz_find_data_columns(mtz, require_sigma=True):
    """Group MTZ data columns into mean (J/F) and anomalous (K/G) candidates.

    Returns a dict {mtz_type: list_of_groups}.  Mean-data groups are
    [col] or [col, sigcol]; when require_sigma is True a column without a
    matching Q-type "SIG"+label partner is dropped.  Anomalous groups are
    [plus, sigplus, minus, sigminus] and always require both sigmas
    (sigma types: M for K-data, L for G-data).
    """
    types_by_label = {c.label: c.type for c in mtz.columns}
    found = {"J": [], "F": [], "K": [], "G": []}
    for label, ctype in types_by_label.items():
        if ctype in ("J", "F"):
            sigma = "SIG" + label
            if types_by_label.get(sigma) == "Q":
                found[ctype].append([label, sigma])
            elif not require_sigma:
                found[ctype].append([label])
        elif ctype in ("K", "G") and label.endswith(("(+)", "plus")):
            # anomalous pair: find the matching (-) column plus both sigmas
            if label.endswith("(+)"):
                minus = label.replace("(+)", "(-)")
            else:
                minus = label.replace("plus", "minus")
            sigma_type = {"K": "M", "G": "L"}[ctype]
            if (types_by_label.get(minus) == ctype and
                    types_by_label.get("SIG" + label) == sigma_type and
                    types_by_label.get("SIG" + minus) == sigma_type):
                found[ctype].append([label, "SIG" + label, minus, "SIG" + minus])
    return found
# mtz_find_data_columns()
74
+
75
def mtz_find_free_columns(mtz):
    """Return labels of integer MTZ columns that look like free-R flag sets.

    A column qualifies when its label is one of the conventional free-flag
    names, its MTZ type is "I", and it contains more than one distinct
    finite value (a constant column cannot partition work/test sets).

    Fix: the original built a ``col_types`` dict that was never used;
    the dead local has been removed.
    """
    free_names = ("FREE", "RFREE", "FREER", "FreeR_flag", "R-free-flags", "FreeRflag",
                  "R_FREE_FLAGS")
    ret = []
    for col in mtz.columns:
        if col.type == "I" and col.label in free_names:
            # drop NaN placeholders before counting distinct flag values
            vals = col.array[numpy.isfinite(col.array)].astype(int)
            if len(numpy.unique(vals)) > 1:
                ret.append(col.label)
            else:
                logger.writeln(f"INFO: {col.label} is not a test flag because all its values are identical.")
    return ret
# mtz_find_free_columns()
89
+
90
def hkldata_from_mtz(mtz, labels, newlabels=None, require_types=None):
    """Build an HklData from selected columns of a gemmi.Mtz.

    labels: MTZ column labels to keep (besides H,K,L).
    newlabels: optional renamed labels, same length as labels.  An empty
        string marks a phase (type P) column to be merged into the preceding
        amplitude (type F) column as a complex value.
    require_types: optional per-label MTZ type codes to enforce; None
        entries are not checked.
    Raises RuntimeError when columns are missing or a type check fails.
    """
    assert type(mtz) == gemmi.Mtz
    notfound = set(labels) - set(mtz.column_labels())
    if notfound:
        raise RuntimeError("MTZ columns not found: {}".format(" ".join(notfound)))
    col_types = {x.label:x.type for x in mtz.columns}
    if require_types:
        mismatches = [l for l,r in zip(labels, require_types) if r is not None and r != col_types[l]]
        if mismatches:
            raise RuntimeError("MTZ column types mismatch: {}".format(" ".join(mismatches)))

    df = pandas.DataFrame(data=mtz.array, columns=mtz.column_labels())
    # indices (type H) become plain int32; batch/counts/flags become nullable ints
    df = df.astype({col: 'int32' for col in col_types if col_types[col] == "H"})
    df = df.astype({col: 'Int64' for col in col_types if col_types[col] in ("B", "Y", "I")}) # pandas's nullable int
    # drop everything that was not requested
    for lab in set(mtz.column_labels()).difference(labels+["H","K","L"]):
        del df[lab]

    if newlabels is not None:
        assert len(newlabels) == len(labels)
        for i in range(1, len(newlabels)):
            if newlabels[i] == "": # means this is phase and should be transferred to previous column
                assert col_types.get(labels[i]) == "P"
                assert col_types.get(labels[i-1]) == "F"
                # fold amplitude + phase (degrees) into one complex column
                ph = numpy.deg2rad(df[labels[i]])
                df[labels[i-1]] = df[labels[i-1]] * (numpy.cos(ph) + 1j * numpy.sin(ph))
                del df[labels[i]]

        df.rename(columns={x:y for x,y in zip(labels, newlabels) if y != ""}, inplace=True)

    return HklData(mtz.cell, mtz.spacegroup, df)
# hkldata_from_mtz()
121
+
122
def df_from_twin_data(twin_data, fc_labs):
    """Build a DataFrame with Miller indices and calculated-F columns from twin data."""
    frame = pandas.DataFrame(data=twin_data.asu, columns=["H", "K", "L"])
    frame[fc_labs] = twin_data.f_calc
    return frame
127
+
128
def blur_mtz(mtz, B):
    """Apply a B-factor blur to the given mtz object in place.

    Intensity-type columns (J/K) are scaled by exp(-B*s^2/2) and
    amplitude-type columns (F/D/G) by exp(-B*s^2/4).  A matching
    "SIG"+label column, when present, receives the same factor as its
    parent column.
    """
    s2 = mtz.make_1_d2_array()
    k2 = numpy.exp(-B*s2/2)
    k = numpy.exp(-B*s2/4)
    i_labs = [c.label for c in mtz.columns if c.type in "JK"]
    f_labs = [c.label for c in mtz.columns if c.type in "FDG"]
    for labs in i_labs, f_labs:
        # Fix: iterate over a snapshot.  The original appended to `labs`
        # while iterating it, so the loop also visited the newly added SIG
        # labels and probed bogus "SIGSIG..." column names.
        for l in tuple(labs):
            sl = "SIG"+l
            if sl in mtz.column_labels(): labs.append(sl)

    if i_labs:
        logger.writeln("Intensities: {}".format(" ".join(i_labs)))
        logger.writeln(" exp(-B*s^2/2) will be multiplied (B= {:.2f})".format(B))
    if f_labs:
        logger.writeln("Amplitudes: {}".format(" ".join(f_labs)))
        logger.writeln(" exp(-B*s^2/4) will be multiplied (B= {:.2f})".format(B))

    for l in i_labs:
        c = mtz.column_with_label(l)
        c.array[:] *= k2
    for l in f_labs:
        c = mtz.column_with_label(l)
        c.array[:] *= k
# blur_mtz()
155
+
156
def mtz_selected(mtz, columns):
    """Return a new gemmi.Mtz containing only H,K,L plus the requested `columns`."""
    wanted = ["H", "K", "L"] + columns # TODO make sure no duplicates
    col_dict = {c.label: c for c in mtz.columns}
    col_idxes = {c.label: i for i, c in enumerate(mtz.columns)}

    notfound = list(set(wanted) - set(col_idxes))
    if notfound:
        raise RuntimeError("specified columns not found: {}".format(str(notfound)))

    # copy file-level metadata
    mtz2 = gemmi.Mtz()
    for attr in ("spacegroup", "cell", "history", "title"):
        setattr(mtz2, attr, getattr(mtz, attr))

    # copy dataset metadata
    for ds in mtz.datasets:
        ds2 = mtz2.add_dataset("")
        for attr in ("cell", "id", "crystal_name", "dataset_name", "project_name", "wavelength"):
            setattr(ds2, attr, getattr(ds, attr))

    # declare the selected columns, then copy their data in one slice
    for label in wanted:
        mtz2.add_column(label, col_dict[label].type,
                        dataset_id=col_dict[label].dataset_id, expand_data=False)
    mtz2.set_data(mtz.array[:, [col_idxes[label] for label in wanted]])
    return mtz2
# mtz_selected()
188
+
189
def decide_ml_binning(hkldata, data_label, free_label, free, use, n_per_bin, max_bins):
    """Choose the number of resolution bins for ML parameter estimation.

    use: which reflections to count -- "all", "work", or "test".
    When n_per_bin is None a default is picked (100 for all/work, 50 for
    test); if the free-flag column is absent, `use` falls back to "all".
    Returns (n_bins, use); the possibly-updated `use` is handed back to the
    caller.  Raises RuntimeError when the chosen subset is empty.
    """
    assert use in ("all", "work", "test")
    if n_per_bin is None:
        # default reflections-per-bin depends on which subset is used
        if use == "all" or free_label not in hkldata.df:
            n_per_bin, use = 100, "all"
        elif use == "work":
            n_per_bin = 100
        elif use == "test":
            n_per_bin = 50
        else:
            raise RuntimeError(f"should not happen: {use=}")

    sel = hkldata.df[data_label].notna()
    if use == "work":
        sel &= hkldata.df[free_label] != free
    elif use == "test":
        sel &= hkldata.df[free_label] == free
    s_array = 1 / hkldata.d_spacings()[sel]
    if len(s_array) == 0:
        raise RuntimeError(f"no reflections in {use} set")

    n_bins = decide_n_bins(n_per_bin, s_array, max_bins=max_bins)
    logger.writeln(f"{n_per_bin=} requested for {use}. n_bins set to {n_bins}")
    return n_bins, use
# decide_ml_binning()
215
+
216
+ def decide_n_bins(n_per_bin, s_array, power=2, min_bins=1, max_bins=50):
217
+ sp = numpy.sort(s_array)**power
218
+ spmin, spmax = numpy.min(sp), numpy.max(sp)
219
+ n_bins = 1
220
+ if n_per_bin <= len(sp):
221
+ # Decide n_bins so that inner-shell has requested number
222
+ width = sp[n_per_bin - 1] - spmin
223
+ n_bins = int((spmax - spmin) / width)
224
+ if min_bins is not None:
225
+ n_bins = max(n_bins, min_bins)
226
+ if max_bins is not None:
227
+ n_bins = min(n_bins, max_bins)
228
+ return n_bins
229
+ # decide_n_bins()
230
+
231
def fft_map(cell, sg, miller_array, data, grid_size=None, sample_rate=3):
    """FFT complex structure factors into a real-space gemmi map grid.

    cell/sg: gemmi UnitCell and SpaceGroup.
    miller_array: (N,3) Miller indices matching `data` (complex F values).
    grid_size: explicit grid dimensions; when None, gemmi chooses the size
        from `sample_rate`.
    Returns the gemmi map grid.
    """
    if data is not None:
        data = data.astype(numpy.complex64) # we may want to keep complex128?
        if type(data) is pandas.core.series.Series:
            data = data.to_numpy()
        # TODO remove this with gemmi 0.7.5
        # gemmi needs writable buffers; pandas3 may hand out read-only views
        if not miller_array.flags.writeable:
            miller_array = miller_array.copy()
        if not data.flags.writeable:
            data = data.copy()
    asu = gemmi.ComplexAsuData(cell, sg, miller_array, data)
    if grid_size is None:
        ma = asu.transform_f_phi_to_map(sample_rate=sample_rate, exact_size=(0, 0, 0)) # half_l=True
    else:
        ma = gemmi.transform_f_phi_grid_to_map(asu.get_f_phi_on_grid(grid_size)) # half_l=False
    return ma
# fft_map()
248
+
249
class HklData:
    """Container pairing a reflection DataFrame (H,K,L + data columns) with
    its unit cell and space group, plus named resolution-binning schemes.
    """
    def __init__(self, cell, sg, df=None, binned_df=None):
        self.cell = cell            # gemmi.UnitCell
        self.sg = sg                # gemmi.SpaceGroup
        self.df = df                # pandas.DataFrame with H,K,L columns
        self.binned_df = {} if binned_df is None else binned_df  # per-scheme bin stats
        self._bin_and_indices = {}  # scheme name -> [(bin_number, row indices), ...]
        self.centric_and_selections = {}  # scheme name -> {bin: [(centric, work_idx, test_idx)]}
    # __init__()

    def update_cell(self, cell):
        # update d
        # NOTE(review): not implemented -- callers currently get a no-op.
        pass

    def switch_to_asu(self):
        """Map all Miller indices into the ASU in place (data columns untouched)."""
        # Need to care phases
        assert not any(numpy.iscomplexobj(self.df[x]) for x in self.df)
        hkl = self.miller_array()
        self.sg.switch_to_asu(hkl)
        self.df[["H","K","L"]] = hkl
        # in some environment type changes to int64 even though hkl's dtype is int32
        # it causes a problem in self.debye_waller_factors()
        self.df = self.df.astype({x: numpy.int32 for x in "HKL"})

    def copy(self, d_min=None, d_max=None):
        """Return a deep copy, optionally truncated to the [d_min, d_max] range.

        When a resolution cut is applied the binning cannot be carried over
        and binned_df is dropped.
        """
        # FIXME we should reset_index here? after resolution truncation, max(df.index) will be larger than size.
        if (d_min, d_max).count(None) == 2:
            df = self.df.copy()
            binned_df = {k: self.binned_df[k].copy() for k in self.binned_df}
        else:
            if d_min is None: d_min = 0
            if d_max is None: d_max = float("inf")
            d = self.d_spacings()
            sel = (d >= d_min) & (d <= d_max)
            df = self.df[sel].copy()
            binned_df = None # no way to keep it

        return HklData(self.cell, self.sg, df, binned_df)
    # copy()

    def selected(self, sel):
        """Return a new HklData containing only rows selected by boolean mask `sel`."""
        df = self.df[sel].copy()
        return HklData(self.cell, self.sg, df)

    def merge_asu_data(self, asu_data, label, common_only=True):
        """Merge a gemmi AsuData column into self.df (inner or outer join on H,K,L)."""
        if self.df is not None and label in self.df:
            raise Exception("Duplicated label")

        df_tmp = df_from_asu_data(asu_data, label)

        if self.df is None:
            self.df = df_tmp
        elif common_only:
            self.df = self.df.merge(df_tmp)
        else:
            self.df = self.df.merge(df_tmp, how="outer")
    # merge_asu_data()

    def miller_array(self, copy_if_not_writable=True): # workaround for pandas3. remove this option with gemm 0.7.5
        # Returns an (N,3) writable int array of H,K,L.
        ret = self.df[["H","K","L"]].to_numpy()
        if not ret.flags.writeable: ret = ret.copy()
        return ret
    def s_array(self):
        """Return reciprocal-space vectors s = hkl * frac matrix, shape (N,3)."""
        hkl = self.miller_array()
        return numpy.dot(hkl, self.cell.frac.mat.array)

    def ssq_mat(self):
        """Return the (6, N) coefficient matrix R such that s^T B s / 4 = R . b."""
        # k_aniso = exp(-s^T B_aniso s / 4)
        # s^T B s / 4 can be reformulated as R b where R = 1x6 matrix and b = 6x1 matrix
        # here R for all indices is returned with shape of (6, N)
        # x[None,:].T <= (N, 6, 1)
        # x.T[:,None] <= (N, 1, 6) they can be matmul'ed.
        svecs = self.s_array()
        tmp = (0.25 * svecs[:,0]**2, 0.25 * svecs[:,1]**2, 0.25 * svecs[:,2]**2,
               0.5 * svecs[:,0] * svecs[:,1], 0.5 * svecs[:,0] * svecs[:,2], 0.5 * svecs[:,1] * svecs[:,2])
        return numpy.array(tmp)
    # ssq_mat()

    def debye_waller_factors(self, b_cart=None, b_iso=None):
        """Return exp(-B s^2/4) factors for isotropic b_iso or anisotropic b_cart.

        NOTE(review): returns None when both arguments are None.
        """
        if b_iso is not None:
            s2 = 1 / self.d_spacings()**2
            return numpy.exp(-b_iso / 4 * s2)
        if b_cart is not None:
            b_star = b_cart.transformed_by(self.cell.frac.mat)
            return numpy.exp(-b_star.r_u_r(self.miller_array()) / 4)

    def calc_d(self):
        """(Re)compute the "d" column of d-spacings from the cell."""
        self.df["d"] = self.cell.calculate_d_array(self.miller_array())
    # calc_d()

    def calc_epsilon(self):
        """Add "epsilon" column: epsilon factors without centering contribution."""
        self.df["epsilon"] = self.sg.operations().epsilon_factor_without_centering_array(self.miller_array())
    # calc_epsilon()

    def calc_centric(self):
        """Add "centric" column: 1 for centric reflections, 0 for acentric."""
        self.df["centric"] = self.sg.operations().centric_flag_array(self.miller_array()).astype(int)
    # calc_centric()

    def d_spacings(self):
        """Return the "d" column, computing it first if missing or partially NaN."""
        if "d" not in self.df or self.df.d.isnull().values.any():
            self.calc_d()
        return self.df.d
    # d_spacings()

    def sort_by_resolution(self, ascending=False):
        """Sort rows in place by d-spacing (default: low resolution first)."""
        self.d_spacings()
        self.df.sort_values("d", ascending=ascending, inplace=True)
    # sort_by_resolution()

    def d_min_max(self, labs=None):
        """Return (d_min, d_max); when `labs` given, only over rows where those columns are all present."""
        d = self.d_spacings()
        if labs:
            d = d[~self.df[labs].isna().any(axis=1)]
        return numpy.min(d), numpy.max(d)
    # d_min_max()

    def complete(self):
        """Append missing reflections (NaN data) so the set covers the full d range."""
        # make complete set
        d_min, d_max = self.d_min_max()
        all_hkl = gemmi.make_miller_array(self.cell, self.sg, d_min, d_max)
        match = gemmi.HklMatch(self.miller_array(), all_hkl)
        missing_hkl_df = pandas.DataFrame(all_hkl[numpy.asarray(match.pos) < 0], columns=["H","K","L"])
        self.df = pandas.concat([self.df, missing_hkl_df])
        logger.writeln("Completing hkldata: {} reflections were missing".format(len(missing_hkl_df.index)))
        self.calc_d()
    # complete()

    def completeness(self, label=None):
        """Fraction of rows with data present (in `label`, or in every column when None)."""
        if label is None:
            n_missing = numpy.sum(self.df.isna().any(axis=1))
        else:
            n_missing = numpy.sum(self.df[label].isna())
        n_all = len(self.df.index)
        return (n_all-n_missing)/n_all
    # completeness()

    def setup_binning(self, n_bins, name, method=gemmi.Binner.Method.Dstar2):
        """Create a named binning scheme with gemmi.Binner and record per-bin d ranges."""
        self.df.reset_index(drop=True, inplace=True)
        s2 = 1/self.d_spacings().to_numpy()**2
        binner = gemmi.Binner()
        binner.setup_from_1_d2(n_bins, method, s2, self.cell)
        self._bin_and_indices[name] = []
        d_limits = 1 / numpy.sqrt(binner.limits)
        bin_number = binner.get_bins_from_1_d2(s2)
        d_max_all = []
        d_min_all = []
        for i in range(binner.size):
            # bin edges: outermost edges come from the data, inner from binner limits
            left = numpy.max(self.d_spacings()) if i == 0 else d_limits[i-1]
            right = numpy.min(self.d_spacings()) if i == binner.size -1 else d_limits[i]
            sel = numpy.where(bin_number==i)[0] # slow?
            d_max_all.append(left)
            d_min_all.append(right)
            self._bin_and_indices[name].append((i, sel))

        self.df[f"bin_{name}"] = bin_number
        self.binned_df[name] = pandas.DataFrame(dict(d_max=d_max_all, d_min=d_min_all), index=list(range(binner.size)))
    # setup_binning()

    def setup_relion_binning(self, name, sort=False):
        """Create a RELION-style binning (one Fourier shell per integer frequency).

        Shells with too few reflections are merged into neighbours: inner
        shells with <10 members merge outward, and outer shells holding
        less than half of the previous shell merge inward.
        """
        max_edge = max(self.cell.parameters[:3])
        if sort:
            self.sort_by_resolution()
            self.df.reset_index(drop=True, inplace=True) # to allow numpy.array indexing

        self.df[f"bin_{name}"] = (max_edge/self.d_spacings()+0.5).astype(int)
        # Merge inner/outer shells if too few # TODO smarter way
        bin_counts = []
        bin_ranges = {}
        modify_table = {}  # old bin number -> bin it was merged into
        for i_bin, g in self.df.groupby(f"bin_{name}", sort=True):
            if i_bin == 0: continue # ignore DC component
            bin_counts.append([i_bin, g.index])
            bin_ranges[i_bin] = (numpy.max(g.d), numpy.min(g.d))

        # merge sparse inner shells into the next (higher-frequency) shell
        for i in range(len(bin_counts)):
            if len(bin_counts[i][1]) < 10 and i < len(bin_counts)-1:
                bin_counts[i+1][1] = bin_counts[i+1][1].union(bin_counts[i][1])
                modify_table[bin_counts[i][0]] = bin_counts[i+1][0]
                logger.writeln("Bin {} only has {} data. Merging with next bin.".format(bin_counts[i][0],
                                                                                       len(bin_counts[i][1])))
            else: break

        # merge truncated outer shells into the previous shell
        for i in reversed(range(len(bin_counts))):
            if i > 0 and len(bin_counts[i][1])/len(bin_counts[i-1][1]) < 0.5:
                bin_counts[i-1][1] = bin_counts[i-1][1].union(bin_counts[i][1])
                modify_table[bin_counts[i][0]] = bin_counts[i-1][0]
                logger.writeln("Bin {} only has {} data. Merging with previous bin.".format(bin_counts[i][0],
                                                                                           len(bin_counts[i][1])))
            else: break

        # collapse chains of merges so every entry points at a surviving bin
        while True:
            flag = True
            for i_bin in modify_table:
                if modify_table[i_bin] in modify_table:
                    modify_table[i_bin] = modify_table[modify_table[i_bin]]
                    flag = False
            if flag: break

        # rewrite merged bin numbers in the dataframe and widen the d ranges
        for i_bin in modify_table:
            new_bin = modify_table[i_bin]
            self.df[f"bin_{name}"] = numpy.where(self.df[f"bin_{name}"].to_numpy() == i_bin, new_bin, self.df[f"bin_{name}"].to_numpy())
            bin_ranges[new_bin] = (max(bin_ranges[i_bin][0], bin_ranges[new_bin][0]),
                                   min(bin_ranges[i_bin][1], bin_ranges[new_bin][1]))

        self._bin_and_indices[name] = []
        bin_all = []
        d_max_all = []
        d_min_all = []
        for i_bin, indices in bin_counts:
            if i_bin in modify_table: continue
            #if sort: # want this, but we cannot take len() for slice. we can add ncoeffs to binned_df
            #    self._bin_and_indices.append((i_bin, slice(numpy.min(indices), numpy.max(indices))))
            #else:
            self._bin_and_indices[name].append((i_bin, indices))

            bin_all.append(i_bin)
            d_max_all.append(bin_ranges[i_bin][0])
            d_min_all.append(bin_ranges[i_bin][1])
        self.binned_df[name] = pandas.DataFrame(dict(d_max=d_max_all, d_min=d_min_all), index=bin_all)
    # setup_relion_binning()

    def copy_binning(self, src, dst):
        """Duplicate binning scheme `src` under the new name `dst`."""
        self.binned_df[dst] = self.binned_df[src].copy()
        self._bin_and_indices[dst] = [x for x in self._bin_and_indices[src]]
        self.df[f"bin_{dst}"] = self.df[f"bin_{src}"]

    def setup_centric_and_selections(self, name, data_lab, free):
        """Record, per bin and per centric flag, the work/test row index sets."""
        self.centric_and_selections[name] = {}
        centric_and_selections = self.centric_and_selections[name]
        for i_bin, idxes in self.binned(name):
            centric_and_selections[i_bin] = []
            for c, g2 in self.df.loc[idxes].groupby("centric", sort=False):
                # NOTE(review): valid_sel is computed but never used -- confirm intent
                valid_sel = numpy.isfinite(g2[data_lab])
                if "FREE" in g2:
                    test_sel = (g2.FREE == free).fillna(False)
                    test = g2.index[test_sel]
                    work = g2.index[~test_sel]
                else:
                    work = g2.index
                    test = type(work)([], dtype=work.dtype)
                centric_and_selections[i_bin].append((c, work, test))
    # setup_centric_and_selections()

    def binned_data_as_array(self, name, lab):
        """Expand a per-bin value from binned_df into a per-reflection array."""
        vals = numpy.zeros(len(self.df.index), dtype=self.binned_df[name][lab].dtype)
        for i_bin, idxes in self.binned(name):
            vals[idxes] = self.binned_df[name][lab][i_bin]
        return vals
    # binned_data_as_array()

    def binned(self, name):
        """Return the [(bin_number, row indices), ...] list for scheme `name`."""
        return self._bin_and_indices[name]

    def columns(self):
        """Return data column labels (everything except H, K, L)."""
        return [x for x in self.df.columns if x not in "HKL"]

    def merge(self, other, common_only=True):
        self.merge_df(other, common_only)
    # merge()

    def merge_df(self, other, common_only=True):
        """Merge another DataFrame on H,K,L.

        NOTE(review): the outer-join branch computes left/right/both splits
        but never assigns self.df -- looks unfinished; confirm before use.
        """
        # TODO check space group, cell
        # TODO transform to asu with phase shifts
        # TODO check column labels. same names other than HKL?
        #
        if common_only:
            self.df = self.df.merge(other)
        else:
            df = self.df.merge(other, indicator=True, how="outer")
            df_left = df[df._merge=="left_only"]
            df_right = df[df._merge=="right_only"]
            df_both = df[df._merge=="both"]
    # merge_df()

    def guess_free_number(self, obs):
        """Guess which FREE flag value marks the test set.

        Prefers the smallest flag whose count is below half the reflections
        (i.e. a minority class); falls back to the smallest flag value.
        """
        logger.writeln("Guessing test flag number")
        sel = ~self.df[obs].isna()
        free = self.df.loc[sel, "FREE"]
        threshold = len(free.index) / 2
        free_na = free.isna()
        if free_na.any():
            raise RuntimeError(f"{free_na.sum()} missing test flags")
        counts = self.df.loc[sel, "FREE"].value_counts().sort_index()
        logger.writeln(counts.to_string(header=False))
        if len(counts.index) < 2:
            raise RuntimeError("this does not appear to be test flag")
        good_flags = [n for n, c in counts.items() if c < threshold]
        if len(good_flags) > 0:
            flag_num = min(good_flags)
        else:
            flag_num = min(counts.index)
        logger.writeln(" best guess: free = {}".format(flag_num))
        return flag_num
    # guess_free_number()

    def as_numpy_arrays(self, labels, omit_nan=True):
        """Return the given columns as numpy arrays, optionally dropping rows with any NaN."""
        tmp = self.df[labels]
        if omit_nan: tmp = tmp[~tmp.isna().any(axis=1)]
        return [tmp[lab].to_numpy() for lab in labels]
    # as_numpy_arrays()

    def remove_nonpositive(self, label):
        """Drop rows where `label` <= 0 (logs how many were removed)."""
        sel = self.df[label] <= 0
        n_bad = sel.sum()
        if n_bad > 0:
            logger.writeln("Removing {} reflections with {}<=0".format(n_bad, label))
            self.df = self.df[~sel]
    # remove_nonpositive()

    def mask_invalid_obs_values(self, labels):
        """Set invalid observations to NaN in place.

        labels: [obs, sig] or [obs+, sig+, obs-, sig-].  Amplitudes (labels
        starting with "F") must be positive; sigmas must always be positive;
        any non-finite member of a value/sigma pair masks the whole pair.
        """
        assert 1 < len(labels) < 6
        assert labels[1].startswith("SIG")
        def do_mask(label, target_labels):
            # mask target_labels wherever `label` is nonpositive
            sel = self.df[label] <= 0
            n_bad = sel.sum()
            if n_bad > 0:
                logger.writeln("Removing {} reflections with {}<=0".format(n_bad, label))
                self.df.loc[sel, target_labels] = numpy.nan
            # If any element within target_labels is non-finite, mask all elements
            self.df.loc[(~numpy.isfinite(self.df[target_labels])).any(axis=1), target_labels] = numpy.nan

        if len(labels) < 4: # F/SIGF or I/SIGI
            if labels[0].startswith("F"):
                do_mask(labels[0], labels[:2]) # bad F
            do_mask(labels[1], labels[:2]) # bad sigma
        else: # I(+)/SIGI(+)/I(-)/SIGI(-) or F...
            assert labels[3].startswith("SIG")
            if labels[0].startswith("F"):
                do_mask(labels[0], labels[:2]) # bad F+
                do_mask(labels[2], labels[2:4]) # bad F-
            do_mask(labels[1], labels[:2]) # bad sigma+
            do_mask(labels[3], labels[2:4]) # bad sigma-
    # mask_invalid_obs_values()

    def remove_systematic_absences(self):
        """Drop systematically absent reflections (logs how many were removed)."""
        is_absent = self.sg.operations().systematic_absences(self.miller_array())
        n_absent = numpy.sum(is_absent)
        if n_absent > 0:
            logger.writeln("Removing {} systematic absences".format(n_absent))
            self.df = self.df[~is_absent]
    # remove_systematic_absences()

    def merge_anomalous(self, labs, newlabs, method="weighted"):
        """Merge (+)/(-) observations into mean value/sigma columns.

        labs: [v+, sig+, v-, sig-]; newlabs: [mean_value, mean_sigma].
        "weighted": inverse-variance weighted mean; "simple": plain mean with
        sigmas combined in quadrature.  Missing halves are handled gracefully.
        """
        assert method in ("weighted", "simple")
        assert len(labs) == 4 # i+,sigi+,i-,sigi- for example
        assert len(newlabs) == 2
        if method == "simple":
            # skipna=True is default, so missing value is handled nicely.
            self.df[newlabs[0]] = self.df[[labs[0], labs[2]]].mean(axis=1)
            self.df[newlabs[1]] = self.df[[labs[1], labs[3]]].pow(2).mean(axis=1).pow(0.5)
        else:
            obs = self.df[[labs[0], labs[2]]].to_numpy()
            weights = 1. / self.df[[labs[1], labs[3]]].to_numpy()**2
            sum_w = numpy.nansum(weights, axis=1)
            sum_w[sum_w == 0] = numpy.nan # mask when both are nan
            self.df[newlabs[0]] = numpy.nansum(obs * weights, axis=1) / sum_w
            self.df[newlabs[1]] = numpy.sqrt(1. / sum_w)
    # merge_anomalous()

    def as_asu_data(self, label=None, data=None, label_sigma=None):
        """Wrap a column (or a raw array) in the matching gemmi AsuData type.

        Exactly one of `label` / `data` must be given; `label_sigma` selects
        a ValueSigmaAsuData built from the label/sigma column pair.
        """
        if label is None: assert data is not None
        else: assert data is None

        if label_sigma is not None:
            assert data is None
            assert not numpy.iscomplexobj(self.df[label])
            data = self.df[[label,label_sigma]].to_numpy()
        elif data is None:
            data = self.df[label]

        # pick the AsuData flavour from the value dtype
        if numpy.iscomplexobj(data):
            asutype = gemmi.ComplexAsuData
        elif issubclass(data.dtype.type, numpy.integer):
            asutype = gemmi.IntAsuData
        elif label_sigma is not None:
            asutype = gemmi.ValueSigmaAsuData
        else:
            asutype = gemmi.FloatAsuData

        return asutype(self.cell, self.sg,
                       self.miller_array(), data)
    # as_asu_data()

    def fft_map(self, label=None, data=None, grid_size=None, sample_rate=3):
        """FFT a complex column (or raw array) into a real-space map grid."""
        if data is None:
            data = self.df[label].to_numpy()
        return fft_map(self.cell, self.sg, self.miller_array(), data, grid_size, sample_rate)
    # fft_map()

    def d_eff(self, name, label):
        """Effective resolution from per-bin FSC values: d_min / mean(FSC)^(1/3)."""
        # Effective resolution defined using FSC
        fsc = self.binned_df[name][label]
        a = 0.
        for i_bin, idxes in self.binned(name):
            a += len(idxes) * fsc[i_bin]

        fac = (a/len(self.df.index))**(1/3.)
        d_min = self.d_min_max()[0]
        ret = d_min/fac
        return ret
    # d_eff()

    def hard_sphere_kernel(self, r_ang, grid_size):
        """Real-space hard-sphere kernel of radius r_ang (Angstrom), normalised to sum 1."""
        s = 1. / self.d_spacings()
        t = 2 * numpy.pi * s * r_ang
        # Fourier transform of a solid sphere
        F_kernel = 3. * (-t * numpy.cos(t) + numpy.sin(t)) / t**3
        knl = self.fft_map(data=F_kernel, grid_size=grid_size)
        knl.array[:] += 1. / knl.unit_cell.volume # F000
        knl.array[:] /= numpy.sum(knl.array)
        return knl
    # hard_sphere_kernel()

    def scale_k_and_b(self, lab_ref, lab_scaled, debug=False):
        """Determine scale k and B-factor putting lab_scaled on the scale of lab_ref.

        Minimises (|f1| - |f2| * k * exp(-B*s2/4))^2 in two stages: a
        closed-form fit in log space, then Newton-CG refinement.  Returns
        whichever (k, B) pair gives the lower R-factor.
        """
        logger.writeln("Determining k, B scales between {} and {}".format(lab_ref, lab_scaled))
        s2 = 1/self.d_spacings().to_numpy()**2
        # determine scales that minimize (|f1|-|f2|*k*e^(-b*s2/4))^2
        f1 = self.df[lab_ref].to_numpy()
        f2 = self.df[lab_scaled].to_numpy()
        if numpy.iscomplexobj(f1): f1 = numpy.abs(f1)
        if numpy.iscomplexobj(f2): f2 = numpy.abs(f2)

        sel_pos = numpy.logical_and(f1 > 0, f2 > 0) # this filters nan as well
        f1p, f2p, s2p = f1[sel_pos], f2[sel_pos], s2[sel_pos]

        # 1st step: minimize (log(|f1|)-log(|f2|*e^k*e^(-b*s2/4)))^2 starting with k=1, b=0.
        tmp = numpy.log(f2p) - numpy.log(f1p)
        # g = [dT/dk, dT/db]
        g = numpy.array([2 * numpy.sum(tmp), -numpy.sum(tmp*s2p)/2])
        H = numpy.zeros((2,2))
        H[0,0] = 2*len(f1p)
        H[1,1] = numpy.sum(s2p**2/8)
        H[0,1] = H[1,0] = -numpy.sum(s2p)/2
        x = -numpy.dot(numpy.linalg.inv(H), g)
        k1 = numpy.exp(x[0])
        B1 = x[1]
        logger.writeln(" initial estimate using log: k= {:.2e} B= {:.2e}".format(k1, B1))
        f2tmp = f2 * k1 * numpy.exp(-B1*s2/4)
        r_step0 = r_factor(f1, f2)
        r_step1 = r_factor(f1, f2tmp)
        logger.writeln(" R= {:.4f} (was: {:.4f})".format(r_step1, r_step0))

        # 2nd step: - minimize (|f1|-|f2|*k*e^(-b*s2/4))^2 iteratively (TODO with regularisation)

        def grad2(x):
            # gradient of the least-squares target w.r.t. (k, B)
            t = numpy.exp(-x[1]*s2/4)
            tmp = (f1-f2*x[0]*t)*f2*t
            return numpy.array([-2.*numpy.nansum(tmp),
                                0.5*x[0]*numpy.nansum(tmp*s2)])

        def hess2(x):
            # Hessian of the least-squares target w.r.t. (k, B)
            h = numpy.zeros((2, 2))
            t = numpy.exp(-x[1]*s2/4)
            t2 = t**2
            h[0,0] = numpy.nansum(f2**2 * t2) * 2
            h[1,1] = numpy.nansum(f2 * s2**2/4 * (-f1/2*t + f2*x[0]*t2)) * x[0]
            h[1,0] = numpy.nansum(f2 * s2 * (f1/2*t - f2*x[0]*t2))
            h[0,1] = h[1,0]
            return h

        res = scipy.optimize.minimize(fun=lambda x: numpy.nansum((f1-f2*x[0]*numpy.exp(-x[1]*s2/4))**2),
                                      jac=grad2,
                                      hess=hess2,
                                      method="Newton-CG",
                                      x0=numpy.array([k1, B1]),
                                      )
        if debug:
            logger.writeln(str(res))
        k2, B2 = res.x
        f2tmp2 = f2 * k2 * numpy.exp(-B2*s2/4)
        r_step2 = r_factor(f1, f2tmp2)
        logger.writeln(" Least-square estimate: k= {:.2e} B= {:.2e}".format(k2, B2))
        logger.writeln(" R= {:.4f}".format(r_step2))

        if 0:
            # NOTE(review): dead debug-plot code; it references an undefined
            # name `hkldata` (should probably be `self`) -- fix before enabling.
            self.setup_binning(40, "tst")
            x = []
            y0,y1,y2,y3=[],[],[],[]
            for i_bin, idxes in self.binned("tst"):
                bin_d_min = hkldata.binned_df["tst"].d_min[i_bin]
                bin_d_max = hkldata.binned_df["tst"].d_max[i_bin]
                x.append(1/bin_d_min**2)
                y0.append(numpy.average(f1[idxes]))
                y1.append(numpy.average(f2[idxes]))
                y2.append(numpy.average(f2tmp[idxes]))
                y3.append(numpy.average(f2tmp2[idxes]))

            import matplotlib.pyplot as plt
            plt.plot(x, y0, label="FC")
            plt.plot(x, y1, label="FP")
            plt.plot(x, y2, label="FP,scaled")
            plt.plot(x, y3, label="FP,scaled2")
            plt.legend()
            plt.show()

        # keep whichever stage gave the better fit
        if r_step2 < r_step1:
            return k2, B2
        else:
            return k1, B1
    # scale_k_and_b()

    def translation_factor(self, shift):
        """Return per-reflection phase factors exp(2*pi*i * hkl . frac(shift))."""
        if type(shift) != gemmi.Position:
            shift = gemmi.Position(*shift)
        return numpy.exp(2.j*numpy.pi*numpy.dot(self.miller_array(),
                                                self.cell.fractionalize(shift).tolist()))
    # translation_factor()
    def translate(self, lab, shift):
        """Apply the phase shift of a real-space translation to complex column `lab`."""
        # apply phase shift
        assert numpy.iscomplexobj(self.df[lab])
        self.df[lab] *= self.translation_factor(shift)
    # translate()

    def write_mtz(self, mtz_out, labs, types=None, phase_label_decorator=None,
                  exclude_000=True):
        """Write selected columns to an MTZ file.

        Complex columns are split into amplitude (F) + phase (P) pairs; the
        phase label comes from `phase_label_decorator` or a built-in map.
        Column types may be overridden via `types`; integer columns default
        to type "I", others to "R".
        """
        logger.writeln("Writing MTZ file: {}".format(mtz_out))
        if self.sg.ccp4 < 1:
            logger.writeln("WARNING: CCP4-unsupported space group ({})".format(self.sg.xhm()))
        if types is None: types = {}
        if exclude_000:
            df = self.df.query("H!=0 | K!=0 | L!=0")
        else:
            df = self.df

        # complex columns occupy two MTZ columns (amplitude + phase)
        ndata = sum(2 if numpy.iscomplexobj(df[lab]) else 1 for lab in labs)

        data = numpy.empty((len(df.index), ndata + 3), dtype=numpy.float32)
        data[:,:3] = df[["H","K","L"]]
        idx = 3
        for lab in labs:
            if numpy.iscomplexobj(df[lab]):
                data[:,idx] = numpy.abs(df[lab])
                data[:,idx+1] = numpy.angle(df[lab], deg=True)
                idx += 2
            else:
                data[:,idx] = df[lab].to_numpy(numpy.float32, na_value=numpy.nan) # for nullable integers
                idx += 1

        mtz = gemmi.Mtz()
        mtz.spacegroup = self.sg
        mtz.cell = self.cell
        mtz.add_dataset('HKL_base')
        for label in ['H', 'K', 'L']: mtz.add_column(label, 'H')

        for lab in labs:
            if numpy.iscomplexobj(df[lab]):
                mtz.add_column(lab, "F")
                if phase_label_decorator is None:
                    plab = {"FWT": "PHWT", "DELFWT": "PHDELWT", "FAN":"PHAN", "DELFAN":"PHDELAN"}.get(lab, "PH"+lab)
                else:
                    plab = phase_label_decorator(lab)
                mtz.add_column(plab, "P")
            else:
                typ = types.get(lab)
                if typ is None:
                    if issubclass(df[lab].dtype.type, numpy.integer):
                        typ = "I"
                    else:
                        typ = "R"
                mtz.add_column(lab, typ)

        mtz.set_data(data)
        mtz.write_to_file(mtz_out)
    # write_mtz()