servalcat 0.4.131__cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. servalcat/__init__.py +10 -0
  2. servalcat/__main__.py +120 -0
  3. servalcat/ext.cpython-314t-x86_64-linux-gnu.so +0 -0
  4. servalcat/refine/__init__.py +0 -0
  5. servalcat/refine/cgsolve.py +100 -0
  6. servalcat/refine/refine.py +1162 -0
  7. servalcat/refine/refine_geom.py +245 -0
  8. servalcat/refine/refine_spa.py +400 -0
  9. servalcat/refine/refine_xtal.py +339 -0
  10. servalcat/refine/spa.py +151 -0
  11. servalcat/refine/xtal.py +312 -0
  12. servalcat/refmac/__init__.py +0 -0
  13. servalcat/refmac/exte.py +191 -0
  14. servalcat/refmac/refmac_keywords.py +660 -0
  15. servalcat/refmac/refmac_wrapper.py +423 -0
  16. servalcat/spa/__init__.py +0 -0
  17. servalcat/spa/fofc.py +488 -0
  18. servalcat/spa/fsc.py +391 -0
  19. servalcat/spa/localcc.py +197 -0
  20. servalcat/spa/realspcc_from_var.py +128 -0
  21. servalcat/spa/run_refmac.py +979 -0
  22. servalcat/spa/shift_maps.py +293 -0
  23. servalcat/spa/shiftback.py +137 -0
  24. servalcat/spa/translate.py +129 -0
  25. servalcat/utils/__init__.py +35 -0
  26. servalcat/utils/commands.py +1629 -0
  27. servalcat/utils/fileio.py +836 -0
  28. servalcat/utils/generate_operators.py +296 -0
  29. servalcat/utils/hkl.py +811 -0
  30. servalcat/utils/logger.py +140 -0
  31. servalcat/utils/maps.py +345 -0
  32. servalcat/utils/model.py +933 -0
  33. servalcat/utils/refmac.py +759 -0
  34. servalcat/utils/restraints.py +888 -0
  35. servalcat/utils/symmetry.py +298 -0
  36. servalcat/xtal/__init__.py +0 -0
  37. servalcat/xtal/french_wilson.py +262 -0
  38. servalcat/xtal/run_refmac_small.py +240 -0
  39. servalcat/xtal/sigmaa.py +1954 -0
  40. servalcat/xtal/twin.py +316 -0
  41. servalcat-0.4.131.dist-info/METADATA +60 -0
  42. servalcat-0.4.131.dist-info/RECORD +45 -0
  43. servalcat-0.4.131.dist-info/WHEEL +6 -0
  44. servalcat-0.4.131.dist-info/entry_points.txt +4 -0
  45. servalcat-0.4.131.dist-info/licenses/LICENSE +373 -0
servalcat/xtal/sigmaa.py
@@ -0,0 +1,1954 @@
+ """
+ Author: "Keitaro Yamashita, Garib N. Murshudov"
+ MRC Laboratory of Molecular Biology
+
+ This software is released under the
+ Mozilla Public License, version 2.0; see LICENSE.
+ """
+ from __future__ import absolute_import, division, print_function, generators
+ import argparse
+ import gemmi
+ import numpy
+ import pandas
+ import itertools
+ import time
+ import scipy.special
+ import scipy.optimize
+ from servalcat.utils import logger
+ from servalcat import utils
+ from servalcat import ext
+ from servalcat.xtal.twin import find_twin_domains_from_data, estimate_twin_fractions_from_model, mlopt_twin_fractions
+
+ """
+ DFc = sum_j D_j F_c,j
+ The last Fc,n is bulk solvent contribution.
+ """
+
+ integr = ext.IntensityIntegrator()
+
+ def add_arguments(parser):
+     parser.description = 'Sigma-A parameter estimation for crystallographic data'
+     parser.add_argument('--hklin', required=True,
+                         help='Input MTZ file')
+     parser.add_argument('--hklin_free',
+                         help='Input MTZ file for test flags')
+     parser.add_argument('--spacegroup',
+                         help='Override space group')
+     parser.add_argument('--labin',
+                         help='MTZ columns of --hklin for F,SIGF,FREE')
+     parser.add_argument('--labin_free',
+                         help='MTZ column of --hklin_free')
+     parser.add_argument('--free', type=int,
+                         help='flag number for test set')
+     parser.add_argument('--model', required=True, nargs="+", action="append",
+                         help='Input atomic model file(s)')
+     parser.add_argument("-d", '--d_min', type=float)
+     parser.add_argument('--d_max', type=float)
+     parser.add_argument('--nbins', type=int,
+                         help="Number of bins for statistics (default: auto)")
+     parser.add_argument('--nbins_ml', type=int,
+                         help="Number of bins for ML parameters (default: auto)")
+     parser.add_argument('-s', '--source', choices=["electron", "xray", "neutron"], required=True,
+                         help="Scattering factor choice")
+     parser.add_argument("--wavelength", type=float, help="For f_prime")
+     parser.add_argument('--D_trans', choices=["exp", "splus"],
+                         help="estimate D with positivity constraint")
+     parser.add_argument('--S_trans', choices=["exp", "splus"],
+                         help="estimate variance of unexplained signal with positivity constraint")
+     parser.add_argument('--no_solvent', action='store_true',
+                         help="Do not consider bulk solvent contribution")
+     parser.add_argument('--use_cc', action='store_true',
+                         help="Use CC(|F1|,|F2|) to CC(F1,F2) conversion to derive D and S")
+     parser.add_argument('--use', choices=["all", "work", "test"], default="all",
+                         help="Which reflections to be used for the parameter estimate.")
+     parser.add_argument('--twin', action="store_true", help="Turn on twin refinement")
+     parser.add_argument('--twin_mlalpha', action="store_true", help="Use ML optimisation for twin fractions")
+     parser.add_argument('--mask',
+                         help="A solvent mask (by default calculated from the coordinates)")
+     parser.add_argument('--keep_charges', action='store_true',
+                         help="Use scattering factor for charged atoms. Use it with care.")
+     parser.add_argument('-o','--output_prefix', default="sigmaa",
+                         help='output file name prefix (default: %(default)s)')
+ # add_arguments()
+
+ def parse_args(arg_list):
+     parser = argparse.ArgumentParser()
+     add_arguments(parser)
+     return parser.parse_args(arg_list)
+ # parse_args()
+
+ def nanaverage(cc, w):
+     sel = ~numpy.isnan(cc)
+     if numpy.sum(w[sel]) == 0:
+         return numpy.nan
+     return numpy.average(cc[sel], weights=w[sel])
+
+ def calc_r_and_cc(hkldata, twin_data=None):
+     has_int = "I" in hkldata.df
+     has_free = "FREE" in hkldata.df
+     has_llw = (hkldata.df.llweight != 1.0).any()
+     has_ano = not twin_data and ("I(+)" if has_int else "F(+)") in hkldata.df and "FC''" in hkldata.df
+     rlab = "R1" if has_int else "R"
+     cclab = "CCI" if has_int else "CCF"
+     olab = "Io" if has_int else "Fo"
+     clab = "Ic" if has_int else "Fc"
+     stats = hkldata.binned_df["stat"].copy()
+     stats[[f"Mn({olab})", f"Mn({clab})"]] = numpy.nan
+     stats[["n_obs", "n_all"]] = 0
+     if has_free:
+         stats[["n_work", "n_free"]] = 0
+     if has_llw:
+         for suf in ("_llw=0", "_llw>0"):
+             stats["n"+suf] = 0
+     if rlab == "R1":
+         if has_free:
+             for suf in ("work", "free"):
+                 stats["n_R1"+suf] = 0
+         else:
+             stats["n_R1"] = 0
+         if has_llw:
+             for suf in ("_llw=0", "_llw>0"):
+                 stats["n_R1"+suf] = 0
+     stats["Cmpl"] = 0.
+     if twin_data:
+         Fc = numpy.sqrt(twin_data.i_calc_twin()) * hkldata.df.k_aniso.to_numpy()
+     elif has_ano:
+         fcpp = hkldata.df["FC''"].to_numpy()[:,None] * numpy.array([1j, -1j])
+         Fc = numpy.abs(hkldata.df.FC.to_numpy()[:,None] + fcpp) * hkldata.df.k_aniso.to_numpy()[:,None]
+     else:
+         Fc = numpy.abs(hkldata.df.FC.to_numpy() * hkldata.df.k_aniso.to_numpy())
+     if has_int:
+         if has_ano:
+             obs = hkldata.df[["I(+)", "I(-)"]].to_numpy()
+             sigma = hkldata.df[["SIGI(+)", "SIGI(-)"]].to_numpy()
+         else:
+             obs = hkldata.df.I.to_numpy()
+             sigma = hkldata.df.SIGI.to_numpy()
+         obs_sqrt = numpy.sqrt(numpy.maximum(0, obs))
+         obs_sqrt[obs / sigma < 2] = numpy.nan # SHELX equivalent
+         calc = Fc**2
+         calc_sqrt = Fc
+     else:
+         if has_ano:
+             obs = hkldata.df[["F(+)", "F(-)"]].to_numpy()
+         else:
+             obs = obs_sqrt = hkldata.df.FP.to_numpy()
+         calc = calc_sqrt = Fc
+     if "CC*" in stats: # swap the positions
+         stats.insert(len(stats.columns)-1, "CC*", stats.pop("CC*"))
+     if has_free:
+         for lab in (cclab, rlab):
+             for suf in ("work", "free"):
+                 stats[lab+suf] = numpy.nan
+     else:
+         stats[cclab] = numpy.nan
+         stats[rlab] = numpy.nan
+     if has_llw:
+         for lab in (cclab, rlab):
+             for suf in ("_llw=0", "_llw>0"):
+                 stats[lab+suf] = numpy.nan
+
+     centric_and_selections = hkldata.centric_and_selections["stat"]
+     sel_llw = [hkldata.df.llweight == 0, hkldata.df.llweight > 0]
+     for i_bin, idxes in hkldata.binned("stat"):
+         stats.loc[i_bin, "n_obs"] = numpy.sum(numpy.isfinite(obs[idxes]))
+         if has_ano:
+             stats.loc[i_bin, "n_all"] = sum((len(work) + len(test)) * (1 if c == 1 else 2)
+                                             for c, work, test in centric_and_selections[i_bin])
+         else:
+             stats.loc[i_bin, "n_all"] = len(idxes)
+         stats.loc[i_bin, "Cmpl"] = stats.loc[i_bin, "n_obs"] / stats.loc[i_bin, "n_all"] * 100.
+         stats.loc[i_bin, f"Mn({olab})"] = numpy.nanmean(obs[idxes])
+         stats.loc[i_bin, f"Mn({clab})"] = numpy.nanmean(calc[idxes])
+         if has_free:
+             for j, suf in ((1, "work"), (2, "free")):
+                 idxes2 = numpy.concatenate([sel[j] for sel in centric_and_selections[i_bin]])
+                 stats.loc[i_bin, "n_"+suf] = numpy.sum(numpy.isfinite(obs[idxes2]))
+                 stats.loc[i_bin, cclab+suf] = utils.hkl.correlation(obs[idxes2], calc[idxes2])
+                 stats.loc[i_bin, rlab+suf] = utils.hkl.r_factor(obs_sqrt[idxes2], calc_sqrt[idxes2])
+                 if rlab == "R1":
+                     stats.loc[i_bin, "n_"+rlab+suf] = numpy.sum(numpy.isfinite(obs_sqrt[idxes2]))
+         else:
+             stats.loc[i_bin, cclab] = utils.hkl.correlation(obs[idxes], calc[idxes])
+             stats.loc[i_bin, rlab] = utils.hkl.r_factor(obs_sqrt[idxes], calc_sqrt[idxes])
+             if rlab == "R1":
+                 stats.loc[i_bin, "n_"+rlab] = numpy.sum(numpy.isfinite(obs_sqrt[idxes]))
+         if has_llw:
+             for j, suf in enumerate(("_llw=0", "_llw>0")):
+                 sel = sel_llw[j][idxes]
+                 stats.loc[i_bin, "n"+suf] = numpy.sum(numpy.isfinite(obs[idxes][sel]))
+                 stats.loc[i_bin, cclab+suf] = utils.hkl.correlation(obs[idxes][sel], calc[idxes][sel])
+                 stats.loc[i_bin, rlab+suf] = utils.hkl.r_factor(obs_sqrt[idxes][sel], calc_sqrt[idxes][sel])
+                 if rlab == "R1":
+                     stats.loc[i_bin, "n_"+rlab+suf] = numpy.sum(numpy.isfinite(obs_sqrt[idxes][sel]))
+     # Overall
+     ret = {}
+     if has_free:
+         for suf in ("work", "free"):
+             ret[cclab+suf+"avg"] = nanaverage(stats[cclab+suf], stats["n_"+suf])
+         for j, suf in ((1, "work"), (2, "free")):
+             idxes = numpy.concatenate([sel[j] for i_bin, _ in hkldata.binned("stat") for sel in centric_and_selections[i_bin]])
+             ret[rlab+suf] = utils.hkl.r_factor(obs_sqrt[idxes], calc_sqrt[idxes])
+     else:
+         ret[cclab+"avg"] = nanaverage(stats[cclab], stats["n_obs"])
+         ret[rlab] = utils.hkl.r_factor(obs_sqrt, calc_sqrt)
+     if has_llw:
+         for j, suf in enumerate(("_llw=0", "_llw>0")):
+             ret[cclab+suf+"_avg"] = nanaverage(stats[cclab+suf], stats["n"+suf])
+             sel = sel_llw[j]
+             ret[rlab+suf] = utils.hkl.r_factor(obs_sqrt[sel], calc_sqrt[sel])
+
+     return stats, ret
+ # calc_r_and_cc()
+
+ def subtract_common_aniso_from_model(sts):
+     adpdirs = utils.model.adp_constraints(sts[0].find_spacegroup().operations(), sts[0].cell, tr0=True)
+     aniso_all = [cra.atom.aniso.added_kI(-cra.atom.aniso.trace()/3).elements_pdb() for st in sts for cra in st[0].all() if cra.atom.aniso.nonzero()]
+     if not aniso_all: # no atoms with aniso ADP
+         return gemmi.SMat33f(0,0,0,0,0,0)
+
+     aniso_mean = numpy.mean(aniso_all, axis=0)
+     aniso_mean = adpdirs.dot(aniso_mean).dot(adpdirs)
+
+     if not numpy.any(aniso_mean):
+         return gemmi.SMat33f(0,0,0,0,0,0)
+
+     # correct atoms
+     smat_sub = gemmi.SMat33f(*aniso_mean)
+     for st in sts:
+         for cra in st[0].all():
+             if cra.atom.aniso.nonzero():
+                 cra.atom.aniso -= smat_sub
+
+     b_aniso = smat_sub.scaled(utils.model.u_to_b)
+     logger.writeln(f"Subtracting common anisotropic component from model: B= {b_aniso}")
+     return b_aniso
+ # subtract_common_aniso_from_model()
+
+ class VarTrans:
+     def __init__(self, D_trans, S_trans):
+         # splus (softplus) appears to be better than exp
+         # exp sometimes results in too large parameter value
+         trans_funcs = {"exp": (numpy.exp, # D = f(x)
+                                numpy.exp, # dD/dx
+                                numpy.log), # x = f^-1(D)
+                        "splus": (lambda x: numpy.logaddexp(0, x),
+                                  scipy.special.expit, # lambda x: 1. / (1. + numpy.exp(-x))
+                                  lambda x: x + numpy.log(-numpy.expm1(-x))),
+                        None: (lambda x: x,
+                               lambda x: 1,
+                               lambda x: x)}
+
+         self.D, self.D_deriv, self.D_inv = trans_funcs[D_trans]
+         self.S, self.S_deriv, self.S_inv = trans_funcs[S_trans]
+ # class VarTrans
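+
+ # [editorial note, not in the released file] The "splus" transform above is the
+ # softplus function: D = log(1 + exp(x)) = numpy.logaddexp(0, x), with derivative
+ # dD/dx = 1/(1 + exp(-x)) = scipy.special.expit(x). Its inverse is
+ #   x = log(exp(D) - 1) = D + log(1 - exp(-D)) = D + numpy.log(-numpy.expm1(-D)),
+ # which is the numerically stable expm1 form written in the table above; both
+ # transforms keep D and S strictly positive while the optimizer works on an
+ # unconstrained x.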
+
+ class LsqScale:
+     # parameter x = [k_overall, adp_pars, k_sol, B_sol]
+     def __init__(self, k_as_exp=False, func_type="log_cosh"):
+         assert func_type in ("sq", "log_cosh")
+         self.k_trans = lambda x: numpy.exp(x) if k_as_exp else x
+         self.k_trans_der = lambda x: numpy.exp(x) if k_as_exp else 1
+         self.k_trans_inv = lambda x: numpy.log(x) if k_as_exp else x
+         self.func_type = func_type
+         self.reset()
+
+     def reset(self):
+         self.k_sol = 0.35 # same default as gemmi/scaling.hpp # refmac seems to use 0.33 and 100? SCALE_LS_PART
+         self.b_sol = 46.
+         self.k_overall = None
+         self.b_iso = None
+         self.b_aniso = None
+         self.stats = {}
+
+     def set_data(self, hkldata, fc_list, use_int=False, sigma_cutoff=None, twin_data=None):
+         assert 0 < len(fc_list) < 3
+         self.use_int = use_int
+         if sigma_cutoff is not None:
+             if use_int:
+                 self.sel = hkldata.df.I / hkldata.df.SIGI > sigma_cutoff
+                 self.labcut = "(I/SIGI>{})".format(sigma_cutoff)
+             else:
+                 self.sel = hkldata.df.FP / hkldata.df.SIGFP > sigma_cutoff
+                 self.labcut = "(F/SIGF>{})".format(sigma_cutoff)
+         else:
+             self.sel = hkldata.df.index
+             self.labcut = ""
+         self.obs = hkldata.df["I" if use_int else "FP"].to_numpy(copy=True)
+         self.obs[~self.sel] = numpy.nan
+         self.calc = [x for x in fc_list]
+         self.s2mat = hkldata.ssq_mat()
+         self.s2 = 1. / hkldata.d_spacings().to_numpy()**2
+         self.adpdirs = utils.model.adp_constraints(hkldata.sg.operations(), hkldata.cell, tr0=False)
+         self.twin_data = twin_data
+         if use_int:
+             self.sqrt_obs = numpy.sqrt(self.obs)
+
+     def get_solvent_scale(self, k_sol, b_sol, s2=None):
+         if s2 is None: s2 = self.s2
+         return k_sol * numpy.exp(-b_sol * s2 / 4)
+
+     def fc_and_mask_grad(self, x):
+         fc0 = self.calc[0]
+         if len(self.calc) == 2:
+             if self.twin_data:
+                 r = self.twin_data.scaling_fc_and_mask_grad(self.calc[1], x[-2], x[-1])
+                 return r[:,0], r[:,1], r[:,2]
+             else:
+                 fmask = self.calc[1]
+                 temp_sol = numpy.exp(-x[-1] * self.s2 / 4)
+                 fbulk = x[-2] * temp_sol * fmask
+                 fc = fc0 + fbulk
+                 re_fmask_fcconj = (fmask * fc.conj()).real
+                 fc_abs = numpy.abs(fc)
+                 tmp = temp_sol / fc_abs * re_fmask_fcconj
+                 return fc_abs, tmp, -tmp * x[-2] * self.s2 / 4
+         else:
+             if self.twin_data:
+                 return numpy.sqrt(self.twin_data.i_calc_twin()), None, None
+             else:
+                 return numpy.abs(fc0), None, None
+
+     def scaled_fc(self, x):
+         fc = self.fc_and_mask_grad(x)[0]
+         nadp = self.adpdirs.shape[0]
+         B = numpy.dot(x[1:nadp+1], self.adpdirs)
+         kani = numpy.exp(numpy.dot(-B, self.s2mat))
+         return self.k_trans(x[0]) * kani * fc
+
+     def target(self, x):
+         y = self.scaled_fc(x)
+         if self.use_int:
+             diff = self.sqrt_obs - y
+             #y2 = y**2
+             #diff = self.obs - y2
+         else:
+             diff = self.obs - y
+
+         if self.func_type == "sq":
+             return numpy.nansum(diff**2)
+         elif self.func_type == "log_cosh":
+             return numpy.nansum(gemmi.log_cosh(diff))
+         else:
+             raise RuntimeError("bad func_type")
+
+     def grad(self, x):
+         g = numpy.zeros_like(x)
+         fc_abs, der_ksol, der_bsol = self.fc_and_mask_grad(x)
+         nadp = self.adpdirs.shape[0]
+         B = numpy.dot(x[1:nadp+1], self.adpdirs)
+         kani = numpy.exp(numpy.dot(-B, self.s2mat))
+         k = self.k_trans(x[0])
+         y = k * kani * fc_abs
+         if self.use_int:
+             diff = self.sqrt_obs - y
+             diff_der = -1
+             #diff = self.obs - y**2
+             #diff_der = -2 * y
+         else:
+             diff = self.obs - y
+             diff_der = -1
+         if self.func_type == "sq":
+             dfdy = 2 * diff * diff_der
+         elif self.func_type == "log_cosh":
+             dfdy = numpy.tanh(diff) * diff_der
+         else:
+             raise RuntimeError("bad func_type")
+
+         dfdb = numpy.nansum(-self.s2mat * k * fc_abs * kani * dfdy, axis=1)
+         g[0] = numpy.nansum(kani * fc_abs * dfdy * self.k_trans_der(x[0]))
+         g[1:nadp+1] = numpy.dot(dfdb, self.adpdirs.T)
+         if len(self.calc) == 2:
+             g[-2] = numpy.nansum(k * kani * der_ksol * dfdy)
+             g[-1] = numpy.nansum(k * kani * der_bsol * dfdy)
+
+         return g
+
+     def calc_shift(self, x):
+         # TODO: sort out code duplication, if we use this.
+         g = numpy.zeros((len(self.obs), len(x)))
+         H = numpy.zeros((len(x), len(x)))
+         fc_abs, der_ksol, der_bsol = self.fc_and_mask_grad(x)
+         nadp = self.adpdirs.shape[0]
+         B = numpy.dot(x[1:nadp+1], self.adpdirs)
+         kani = numpy.exp(numpy.dot(-B, self.s2mat))
+         k = self.k_trans(x[0])
+         y = k * kani * fc_abs
+         if self.use_int:
+             diff = self.sqrt_obs - y
+             diff_der = -1
+             diff_der2 = 0
+         else:
+             diff = self.obs - y
+             diff_der = -1.
+             diff_der2 = 0.
+
+         if self.func_type == "sq":
+             dfdy = 2 * diff * diff_der
+             dfdy2 = 2 * diff_der**2 + 2 * diff * diff_der2
+         elif self.func_type == "log_cosh":
+             dfdy = numpy.tanh(diff) * diff_der
+             #dfdy2 = 1 /numpy.cosh(diff)**2 * diff_der**2 + numpy.tanh(diff) * diff_der2 # problematic with large diff
+             #dfdy2 = numpy.where(diff==0, 1., numpy.abs(numpy.tanh(diff)) / gemmi.log_cosh(diff)) * diff_der**2 + numpy.tanh(diff) * diff_der2
+             dfdy2 = numpy.where(diff==0, 1., numpy.tanh(diff) / diff) * diff_der**2 + numpy.tanh(diff) * diff_der2
+         else:
+             raise RuntimeError("bad func_type")
+
+         dfdb = -self.s2mat * k * fc_abs * kani
+         g[:,0] = kani * fc_abs * self.k_trans_der(x[0])
+         g[:,1:nadp+1] = numpy.dot(dfdb.T, self.adpdirs.T)
+         if len(self.calc) == 2:
+             g[:,-2] = k * kani * der_ksol
+             g[:,-1] = k * kani * der_bsol
+
+         # no numpy.nandot..
+         g, dfdy, dfdy2 = g[self.sel, :], dfdy[self.sel], dfdy2[self.sel]
+         H = numpy.dot(g.T, g * dfdy2[:,None])
+         g = numpy.sum(dfdy[:,None] * g, axis=0)
+         dx = -numpy.dot(g, numpy.linalg.pinv(H))
+         return dx
+
+     def initial_kb(self):
+         fc_abs = self.fc_and_mask_grad([self.k_sol, self.b_sol])[0]
+         sel = self.obs > 0 # exclude nan as well
+         f1p, f2p, s2p = self.obs[sel], fc_abs[sel], self.s2[sel]
+         if self.use_int: f2p *= f2p
+         tmp = numpy.log(f2p) - numpy.log(f1p)
+         # g = [dT/dk, dT/db]
+         g = numpy.array([2 * numpy.sum(tmp), -numpy.sum(tmp*s2p)/2])
+         H = numpy.zeros((2,2))
+         H[0,0] = 2*len(f1p)
+         H[1,1] = numpy.sum(s2p**2/8)
+         H[0,1] = H[1,0] = -numpy.sum(s2p)/2
+         x = -numpy.dot(numpy.linalg.inv(H), g)
+         if self.use_int: x /= 2
+         k = numpy.exp(x[0])
+         b = x[1]
+         logger.writeln(" initial k,b = {:.2e} {:.2e}".format(k, b))
+         logger.writeln(" R{} = {:.4f}".format(self.labcut, utils.hkl.r_factor(f1p, f2p * k * numpy.exp(-b*self.s2[sel]/4))))
+         return k, b
+
+     def scale(self):
+         use_sol = len(self.calc) == 2
+         msg = "Scaling Fc to {} {} bulk solvent contribution".format("Io" if self.use_int else "Fo",
+                                                                      "with" if use_sol else "without")
+         logger.writeln(msg)
+         if self.k_overall is None or self.b_iso is None:
+             k, b = self.initial_kb()
+         else:
+             k, b = self.k_overall, self.b_iso
+         if self.b_aniso is None:
+             self.b_aniso = gemmi.SMat33d(b,b,b,0,0,0)
+         x0 = [self.k_trans_inv(k)]
+         bounds = [(0, None)]
+         x0.extend(numpy.dot(self.b_aniso.elements_pdb(), self.adpdirs.T))
+         bounds.extend([(None, None)]*(len(x0)-1))
+         if use_sol:
+             x0.extend([self.k_sol, self.b_sol])
+             bounds.extend([(1e-4, None), (10., 400.)])
+         if 0:
+             f0 = self.target(x0)
+             ader = self.grad(x0)
+             e = 1e-4
+             nder = []
+             for i in range(len(x0)):
+                 x = numpy.copy(x0)
+                 x[i] += e
+                 f1 = self.target(x)
+                 nder.append((f1 - f0) / e)
+             print("ADER NDER RATIO")
+             print(ader)
+             print(nder)
+             print(ader / nder)
+             quit()
+
+         t0 = time.time()
+         if 1:
+             x = x0
+             for i in range(40):
+                 x_ini = x.copy()
+                 f0 = f1 = self.target(x)
+                 dx = self.calc_shift(x)
+                 if numpy.max(numpy.abs(dx)) < 1e-6:
+                     break
+                 for s in (1, 0.5, 0.25):
+                     if 0:
+                         with open("debug.dat", "w") as ofs:
+                             for s in numpy.linspace(-2, 2, 100):
+                                 f1 = self.target(x+dx * s)
+                                 #print(dx, f0, f1, f0 - f1)
+                                 ofs.write("{:4e} {:4e}\n".format(s, f1))
+                     shift = dx * s
+                     x = x_ini + shift
+                     if x[0] < 0: x[0] = x0[0]
+                     if use_sol:
+                         if x[-1] < 10: x[-1] = 10
+                         elif x[-1] > 400: x[-1] = 400
+                         if x[-2] < 1e-4: x[-2] = 1e-4
+                     f1 = self.target(x)
+                     if f1 < f0: break
+                 #logger.writeln("cycle {} {} {} {} {} {}".format(i, f0, f1, s, shift, (f0 - f1) / f0))
+                 if 0 < (f0 - f1) / f0 < 1e-6:
+                     break
+             res_x = x
+             self.stats["fun"] = f1
+             self.stats["x"] = x
+         else:
+             res = scipy.optimize.minimize(fun=self.target, x0=x0, jac=self.grad, bounds=bounds)
+             #logger.writeln(str(res))
+             logger.writeln(" finished in {} iterations ({} evaluations)".format(res.nit, res.nfev))
+             res_x = res.x
+             self.stats["fun"] = res.fun
+             self.stats["x"] = res.x
+         logger.writeln(" time: {:.3f} sec".format(time.time() - t0))
+         self.k_overall = self.k_trans(res_x[0])
+         nadp = self.adpdirs.shape[0]
+         b_overall = gemmi.SMat33d(*numpy.dot(res_x[1:nadp+1], self.adpdirs))
+         self.b_iso = b_overall.trace() / 3
+         self.b_aniso = b_overall.added_kI(-self.b_iso) # subtract isotropic contribution
+
+         logger.writeln(" k_ov= {:.2e} B_iso= {:.2e} B_aniso= {}".format(self.k_overall, self.b_iso, self.b_aniso))
+         if use_sol:
+             self.k_sol = res_x[-2]
+             self.b_sol = res_x[-1]
+             logger.writeln(" k_sol= {:.2e} B_sol= {:.2e}".format(self.k_sol, self.b_sol))
+         calc = self.scaled_fc(res_x)
+         if self.use_int: calc *= calc
+         self.stats["cc"] = utils.hkl.correlation(self.obs, calc)
+         self.stats["r"] = utils.hkl.r_factor(self.obs, calc)
+         logger.writeln(" CC{} = {:.4f}".format(self.labcut, self.stats["cc"]))
+         logger.writeln(" R{} = {:.4f}".format(self.labcut, self.stats["r"]))
+ # class LsqScale
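+
+ # [editorial note, not in the released file] On the "log_cosh" target used by
+ # LsqScale: log(cosh(d)) behaves like d**2/2 for small residuals and like
+ # |d| - log(2) for large ones, so it is a smooth compromise between least squares
+ # and a robust L1-like loss. Its first derivative is tanh(d), which is why grad()
+ # and calc_shift() use numpy.tanh, and the curvature proxy tanh(d)/d used above
+ # stays bounded (-> 1) as d -> 0, avoiding the overflow noted in the commented-out
+ # 1/cosh(d)**2 form.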
+
+ def calc_abs_DFc(Ds, Fcs):
+     DFc = sum(Ds[i] * Fcs[i] for i in range(len(Ds)))
+     return numpy.abs(DFc)
+ # calc_abs_DFc()
+
+ #import line_profiler
+ #profile = line_profiler.LineProfiler()
+ #import atexit
+ #atexit.register(profile.print_stats)
+ #@profile
+ def mlf(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
+     DFc = (Ds * Fcs).sum(axis=1)
+     ll = numpy.nansum(ext.ll_amp(df.FP.to_numpy()[idxes], df.SIGFP.to_numpy()[idxes],
+                                  k_ani[idxes], S * df.epsilon.to_numpy()[idxes],
+                                  numpy.abs(DFc), df.centric.to_numpy()[idxes]+1,
+                                  df.llweight.to_numpy()[idxes]))
+     return numpy.nansum(ll)
+ # mlf()
+
+ #@profile
+ def deriv_mlf_wrt_D_S(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = [df[lab].to_numpy()[idxes] for lab in fc_labs]
+     r = ext.ll_amp_der1_DS(df.FP.to_numpy()[idxes], df.SIGFP.to_numpy()[idxes], k_ani[idxes], S,
+                            numpy.vstack(Fcs).T, Ds,
+                            df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes],
+                            df.llweight.to_numpy()[idxes])
+     g = numpy.zeros(len(fc_labs)+1)
+     g[:len(fc_labs)] = numpy.nansum(r[:,:len(fc_labs)], axis=0) # D
+     g[-1] = numpy.nansum(r[:,-1]) # S
+     return g
+ # deriv_mlf_wrt_D_S()
+
+ #@profile
+ def mlf_shift_S(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = [df[lab].to_numpy()[idxes] for lab in fc_labs]
+     r = ext.ll_amp_der1_DS(df.FP.to_numpy()[idxes], df.SIGFP.to_numpy()[idxes], k_ani[idxes], S,
+                            numpy.vstack(Fcs).T, Ds,
+                            df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes],
+                            df.llweight.to_numpy()[idxes])
+     g = numpy.nansum(r[:,-1])
+     H = numpy.nansum(r[:,-1]**2) # approximating expectation value of second derivative
+     return -g / H
+ # mlf_shift_S()
+
+ def mli(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
+     DFc = (Ds * Fcs).sum(axis=1)
+     ll = integr.ll_int(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes],
+                        k_ani[idxes], S * df.epsilon.to_numpy()[idxes],
+                        numpy.abs(DFc), df.centric.to_numpy()[idxes]+1,
+                        df.llweight.to_numpy()[idxes])
+     return numpy.nansum(ll)
+ # mli()
+
+ def deriv_mli_wrt_D_S(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
+     r = integr.ll_int_der1_DS(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes], k_ani[idxes], S,
+                               Fcs, Ds,
+                               df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes],
+                               df.llweight.to_numpy()[idxes])
+     g = numpy.zeros(len(fc_labs)+1)
+     g[:len(fc_labs)] = numpy.nansum(r[:,:len(fc_labs)], axis=0) # D
+     g[-1] = numpy.nansum(r[:,-1]) # S
+     return g
+ # deriv_mli_wrt_D_S()
+
+ def mli_shift_D(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
+     r = integr.ll_int_der1_DS(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes], k_ani[idxes], S,
+                               Fcs, Ds,
+                               df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes],
+                               df.llweight.to_numpy()[idxes])[:,:len(fc_labs)]
+     g = numpy.nansum(r, axis=0)# * trans.D_deriv(x[:len(fc_labs)]) # D
+     #tmp = numpy.hstack([r[:,:len(fc_labs)] #* trans.D_deriv(x[:len(fc_labs)]),
+     #                    r[:,-1,None] * trans.S_deriv(x[-1])])
+     H = numpy.nansum(numpy.matmul(r[:,:,None], r[:,None]), axis=0)
+     return -numpy.dot(g, numpy.linalg.pinv(H))
+ # mli_shift_D()
+
+ def mli_shift_S(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
+     r = integr.ll_int_der1_DS(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes], k_ani[idxes], S,
+                               Fcs, Ds,
+                               df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes],
+                               df.llweight.to_numpy()[idxes])
+     g = numpy.nansum(r[:,-1])
+     H = numpy.nansum(r[:,-1]**2) # approximating expectation value of second derivative
+     return -g / H
+ # mli_shift_S()
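+
+ # [editorial note, not in the released file] The *_shift_* helpers above take a
+ # Fisher-scoring step rather than a full Newton step: the Hessian is approximated
+ # by the sum of outer products of the per-reflection first derivatives,
+ # H ~ E[g g^T], so no second derivatives are needed and H stays positive
+ # semi-definite. A one-parameter sketch, assuming r holds per-reflection dLL/dS:
+ #   g = numpy.nansum(r); H = numpy.nansum(r**2); shift = -g / H
+ # which is exactly the -g/H update returned by mlf_shift_S and mli_shift_S, and
+ # the pinv(H) form in mli_shift_D is its multi-parameter analogue.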
+
+ #debug_twin_count = 0
+
+ def mltwin_est_ftrue(twin_data, df, k_ani, idxes):
+     kani2_inv = 1 / k_ani**2
+     i_sigi = numpy.empty((2, len(df.index)))
+     i_sigi[:] = numpy.nan
+     i_sigi[0, idxes] = (df.I.to_numpy() * kani2_inv)[idxes]
+     i_sigi[1, idxes] = (df.SIGI.to_numpy() * kani2_inv)[idxes]
+     #global debug_twin_count
+     #debug_twin_count += 1
+     # sed -i "s/,]/]/; s/nan/NaN/g" *json
+     #twin_data.debug_open(f"twin_debug_{debug_twin_count}.json")
+     twin_data.est_f_true(i_sigi[0,:], i_sigi[1,:], 100)
+     #twin_data.debug_close()
+     return i_sigi[0,:], i_sigi[1,:]
+ # mltwin_est_ftrue()
+
+ def mltwin(df, twin_data, Ds, S, k_ani, idxes, i_bin):
+     twin_data.ml_sigma[i_bin] = S
+     twin_data.ml_scale[i_bin, :] = Ds
+     Io, sigIo = mltwin_est_ftrue(twin_data, df, k_ani, idxes)
+     ret = twin_data.ll(Io, sigIo)
+     #print("-LL=", ret)
+     return ret
+ # mltwin()
+
+ def deriv_mltwin_wrt_D_S(df, twin_data, Ds, S, k_ani, idxes, i_bin):
+     twin_data.ml_sigma[i_bin] = S
+     twin_data.ml_scale[i_bin, :] = Ds
+     Io, sigIo = mltwin_est_ftrue(twin_data, df, k_ani, idxes)
+     r = twin_data.ll_der_D_S(Io, sigIo)
+     g = numpy.zeros(r.shape[1])
+     g[:-1] = numpy.nansum(r[:,:-1], axis=0) # D
+     g[-1] = numpy.nansum(r[:,-1]) # S
+     return g
+ # deriv_mltwin_wrt_D_S()
+
+ def mltwin_shift_S(df, twin_data, Ds, S, k_ani, idxes, i_bin):
+     twin_data.ml_sigma[i_bin] = S
+     twin_data.ml_scale[i_bin, :] = Ds
+     Io, sigIo = mltwin_est_ftrue(twin_data, df, k_ani, idxes)
+     r = twin_data.ll_der_D_S(Io, sigIo)
+     g = numpy.nansum(r[:,-1])
+     H = numpy.nansum(r[:,-1]**2) # approximating expectation value of second derivative
+     return -g / H
+ # mltwin_shift_S()
+
+ def determine_mlf_params_from_cc(hkldata, fc_labs, D_labs, use="all", smoothing="gauss"):
+     # theoretical values
+     cc_a = lambda cc: (numpy.pi/4*(1-cc**2)**2 * scipy.special.hyp2f1(3/2, 3/2, 1, cc**2) - numpy.pi/4) / (1-numpy.pi/4)
+     cc_c = lambda cc: 2/(numpy.pi-2) * (cc**2*numpy.sqrt(1-cc**2) + cc * numpy.arctan(cc/numpy.sqrt(1-cc**2)) + (1-cc**2)**(3/2)-1)
+     table_fsc = numpy.arange(0, 1, 1e-3)
+     table_cc = [cc_a(table_fsc), cc_c(table_fsc)]
+
+     for lab in D_labs: hkldata.binned_df["ml"][lab] = 1.
+     hkldata.binned_df["ml"]["S"] = 1.
+
+     stats = hkldata.binned_df["ml"][["d_max", "d_min"]].copy()
+     for i, labi in enumerate(fc_labs):
+         stats["CC(FP,{})".format(labi)] = numpy.nan
+     for i, labi in enumerate(fc_labs):
+         for j in range(i+1, len(fc_labs)):
+             labj = fc_labs[j]
+             stats["CC({},{})".format(labi, labj)] = numpy.nan
+
+     centric_and_selections = hkldata.centric_and_selections["ml"]
+     # sqrt of eps * c; c = 1 for acentrics and 2 for centrics
+     inv_sqrt_c_eps = 1. / numpy.sqrt(hkldata.df.epsilon.to_numpy() * (hkldata.df.centric.to_numpy() + 1))
+     for i_bin, _ in hkldata.binned("ml"):
+         # assume they are all acentrics.. only correct by c
+         if use == "all":
+             cidxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin] for i in (1,2)])
+         else:
+             i = 1 if use == "work" else 2
+             cidxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin]])
+         valid_sel = numpy.isfinite(hkldata.df.FP.to_numpy()[cidxes])
+         cidxes = cidxes[valid_sel]
+         factor = inv_sqrt_c_eps[cidxes]
+         k_ani = hkldata.df.k_aniso.to_numpy()[cidxes]
+         Fo = hkldata.df.FP.to_numpy()[cidxes] * factor / k_ani
+         mean_Fo2 = numpy.mean(Fo**2)
+         SigFo = hkldata.df.SIGFP.to_numpy()[cidxes] / k_ani
+         Fcs = [hkldata.df[lab].to_numpy()[cidxes] * factor for lab in fc_labs]
+         mean_Fk2 = numpy.array([numpy.mean(numpy.abs(fk)**2) for fk in Fcs])
+
+         # estimate D
+         cc_fo_fj = [numpy.corrcoef(numpy.abs(fj), Fo)[1,0] for fj in Fcs]
+         for i in range(len(fc_labs)): stats.loc[i_bin, "CC(FP,{})".format(fc_labs[i])] = cc_fo_fj[i]
+         mat = [[numpy.sqrt(numpy.mean(numpy.abs(fk)**2)/mean_Fo2) * numpy.real(numpy.corrcoef(fk, fj)[1,0])
+                 for fk in Fcs]
+                for fj in Fcs]
+         A = [[numpy.sqrt(numpy.mean(numpy.abs(fk)**2) * numpy.mean(numpy.abs(fj)**2))/mean_Fo2 * numpy.real(numpy.corrcoef(fk, fj)[1,0])
+               for fk in Fcs]
+              for fj in Fcs]
+         A = numpy.array([[numpy.real(numpy.corrcoef(fk, fj)[1,0]) for fk in Fcs] for fj in Fcs])
+         v = numpy.interp(cc_fo_fj, table_cc[0], table_fsc)
+
+         for i in range(len(fc_labs)):
+             labi = fc_labs[i]
+             for j in range(i+1, len(fc_labs)):
+                 labj = fc_labs[j]
+                 stats.loc[i_bin, "CC({},{})".format(labi, labj)] = numpy.real(numpy.corrcoef(Fcs[i], Fcs[j])[1,0])
+
+         # test all signs, fixing first Fc positive.
+         cc_max = -2
+         for v_test in itertools.product(*((x, -x) for x in v[1:])):
+             v_test = numpy.array((v[0],)+v_test)
+             Dj_test = numpy.dot(numpy.linalg.pinv(A), v_test) * numpy.sqrt(mean_Fo2 / mean_Fk2)
+             DFc_test = calc_abs_DFc(Dj_test, Fcs)
+             cc_test = numpy.corrcoef(Fo, numpy.abs(DFc_test))[1,0]
+             if cc_test > cc_max:
+                 cc_max = cc_test
+                 v_max = v_test
+                 DFc = DFc_test
+                 Dj = Dj_test
+
+         for lab, D in zip(D_labs, Dj):
+             hkldata.binned_df["ml"].loc[i_bin, lab] = D
+
+         # estimate S
+         mean_DFc2 = numpy.mean(DFc**2)
+         est_fsc_fo_fc = numpy.interp(numpy.corrcoef(Fo, DFc)[1,0], table_cc[0], table_fsc)
+         S = mean_Fo2 - 2 * numpy.sqrt(mean_Fo2 * mean_DFc2) * est_fsc_fo_fc + mean_DFc2 - numpy.mean(SigFo**2)
+         hkldata.binned_df["ml"].loc[i_bin, "S"] = S
+
+     logger.writeln("\nCC:")
+     logger.writeln(stats.to_string())
+     logger.writeln("\nEstimates:")
+     logger.writeln(hkldata.binned_df["ml"].to_string())
+     smooth_params(hkldata, D_labs, smoothing)
+ # determine_mlf_params_from_cc()
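+
+ # [editorial note, not in the released file] determine_mlf_params_from_cc() uses
+ # closed forms for the correlation of amplitudes, CC(|F1|,|F2|), implied by the
+ # correlation cc of the underlying complex structure factors: cc_a (acentric,
+ # via the Gauss hypergeometric function 2F1(3/2,3/2;1;cc^2)) and cc_c (centric,
+ # an arctan form). Both are monotone increasing on [0, 1), so rather than
+ # inverting them analytically the code tabulates them on a 1e-3 grid and inverts
+ # by interpolation, e.g. fsc_est = numpy.interp(cc_obs, table_cc[0], table_fsc).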
+
+ def initialize_ml_params(hkldata, use_int, D_labs, b_aniso, use, twin_data=None):
+     hkldata.binned_df["ml"]["n_ref"] = 0
+     # Initial values
+     for lab in D_labs: hkldata.binned_df["ml"][lab] = 1.
+     hkldata.binned_df["ml"]["S"] = 10000.
+     k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
+     lab_obs = "I" if use_int else "FP"
+     centric_and_selections = hkldata.centric_and_selections["ml"]
+     for i_bin, _ in hkldata.binned("ml"):
+         if use == "all":
+             idxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin] for i in (1,2)])
+         else:
+             i = 1 if use == "work" else 2
+             idxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin]])
+         valid_sel = numpy.isfinite(hkldata.df.loc[idxes, lab_obs]) # as there is no nan-safe numpy.corrcoef
+         valid_sel &= hkldata.df.llweight[idxes] > 0
+         hkldata.binned_df["ml"].loc[i_bin, "n_ref"] = valid_sel.sum()
+         if numpy.sum(valid_sel) < 2:
+             continue
+         idxes = idxes[valid_sel]
+         if use_int:
+             Io = hkldata.df.I.to_numpy()[idxes]
+         else:
+             Io = hkldata.df.FP.to_numpy()[idxes]**2
+         Io /= k_ani[idxes]**2
+         if twin_data:
+             Ic = twin_data.i_calc_twin()[idxes]
+         else:
+             Ic = numpy.abs(hkldata.df.FC.to_numpy()[idxes])**2
+         mean_Io = numpy.mean(Io)
+         mean_Ic = numpy.mean(Ic)
+         cc = numpy.corrcoef(Io, Ic)[1,0]
+         if cc > 0 and mean_Io > 0:
+             D = numpy.sqrt(mean_Io / mean_Ic * cc)
+         else:
+             D = 0 # will be taken care of later
+         hkldata.binned_df["ml"].loc[i_bin, D_labs[0]] = D
+         if mean_Io > 0:
+             S = mean_Io - 2 * numpy.sqrt(mean_Io * mean_Ic * numpy.maximum(0, cc)) + mean_Ic
+         else:
+             S = numpy.std(Io) # similar initial to french_wilson
+         hkldata.binned_df["ml"].loc[i_bin, "S"] = S
+
+     for D_lab in D_labs:
+         if hkldata.binned_df["ml"][D_lab].min() <= 0:
+             min_D = hkldata.binned_df["ml"][D_lab][hkldata.binned_df["ml"][D_lab] > 0].min() * 0.1
+             logger.writeln("WARNING: negative {} is detected from initial estimates. Replacing it using minimum positive value {:.2e}".format(D_lab, min_D))
+             hkldata.binned_df["ml"].loc[hkldata.binned_df["ml"][D_lab] <= 0, D_lab] = min_D # arbitrary
+
+     if twin_data:
+         twin_data.ml_scale[:] = hkldata.binned_df["ml"].loc[:, D_labs]
+         twin_data.ml_sigma[:] = hkldata.binned_df["ml"].loc[:, "S"]
+
+     logger.writeln("Initial estimates:")
+     logger.writeln(hkldata.binned_df["ml"].to_string())
+ # initialize_ml_params()
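+
+ # [editorial note, not in the released file] The initial values above come from
+ # second moments, assuming roughly <Io> ~ D**2 <Ic> + S and
+ # CC(Io, Ic) ~ D**2 <Ic> / <Io>, which rearranges to the code's
+ #   D = sqrt(<Io>/<Ic> * cc)
+ # and the S estimate mirrors the expansion of a residual variance,
+ #   S = <Io> - 2*sqrt(<Io><Ic>*max(cc, 0)) + <Ic>,
+ # i.e. the variance left after explaining Io by the scaled Ic.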
+
+ def refine_ml_params(hkldata, use_int, fc_labs, D_labs, b_aniso,
+                      D_trans=None, S_trans=None, use="all", n_cycle=1, twin_data=None):
+     trans = VarTrans(D_trans, S_trans)
+     lab_obs = "I" if use_int else "FP"
+     centric_and_selections = hkldata.centric_and_selections["ml"]
+     def get_idxes(i_bin):
+         if use == "all":
+             return numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin] for i in (1,2)])
+         else:
+             i = 1 if use == "work" else 2
+             return numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin]])
+
+     refpar = "all"
+     for i_cyc in range(n_cycle):
+         t0 = time.time()
+         nfev_total = 0
+         k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
+         for i_bin, _ in hkldata.binned("ml"):
+             idxes = get_idxes(i_bin)
+             valid_sel = numpy.isfinite(hkldata.df.loc[idxes, lab_obs]) # as there is no nan-safe numpy.corrcoef
+             if numpy.sum(valid_sel) < 5:
+                 logger.writeln("WARNING: bin {} has no sufficient reflections".format(i_bin))
+                 continue
+
+             def target(x):
+                 if refpar == "all":
+                     Ds = trans.D(x[:len(fc_labs)])
+                     S = trans.S(x[-1])
+                 elif refpar == "D":
+                     Ds = trans.D(x[:len(fc_labs)])
+                     S = hkldata.binned_df["ml"].loc[i_bin, "S"]
+                 else:
+                     Ds = [hkldata.binned_df["ml"].loc[i_bin, lab] for lab in D_labs]
+                     S = trans.S(x[-1])
+
+                 if twin_data:
+                     return mltwin(hkldata.df, twin_data, Ds, S, k_ani, idxes, i_bin)
+                 else:
+                     f = mli if use_int else mlf
+                     return f(hkldata.df, fc_labs, Ds, S, k_ani, idxes)
+
+             def grad(x):
+                 if refpar == "all":
+                     Ds = trans.D(x[:len(fc_labs)])
+                     S = trans.S(x[-1])
+                     n_par = len(fc_labs)+1
+                 elif refpar == "D":
+                     Ds = trans.D(x[:len(fc_labs)])
+                     S = hkldata.binned_df["ml"].loc[i_bin, "S"]
+                     n_par = len(fc_labs)
+                 else:
+                     Ds = [hkldata.binned_df["ml"].loc[i_bin, lab] for lab in D_labs]
+                     S = trans.S(x[-1])
+                     n_par = 1
+                 if twin_data:
+                     r = deriv_mltwin_wrt_D_S(hkldata.df, twin_data, Ds, S, k_ani, idxes, i_bin)
+                 else:
+                     calc_deriv = deriv_mli_wrt_D_S if use_int else deriv_mlf_wrt_D_S
+                     r = calc_deriv(hkldata.df, fc_labs, Ds, S, k_ani, idxes)
+                 g = numpy.zeros(n_par)
+                 if refpar in ("all", "D"):
+                     g[:len(fc_labs)] = r[:len(fc_labs)]
+                     g[:len(fc_labs)] *= trans.D_deriv(x[:len(fc_labs)])
+                 if refpar in ("all", "S"):
+                     g[-1] = r[-1]
+                     g[-1] *= trans.S_deriv(x[-1])
+                 return g
+
+             if 0:
+                 refpar = "S"
+                 x0 = trans.S_inv(hkldata.binned_df["ml"].loc[i_bin, "S"])
+                 with open("s_line_{}.dat".format(i_bin), "w") as ofs:
+                     for sval in numpy.linspace(1, x0*2, 100):
+                         ofs.write("{:.4e} {:.10e} {:.10e}\n".format(sval,
+                                                                     target([sval]),
+                                                                     grad([sval])[0]))
+                 continue
+             #print("Bin", i_bin)
+             if 1: # refine D and S iteratively
+                 vals_last = None
+                 for ids in range(10):
+                     refpar = "D"
+                     x0 = numpy.array([trans.D_inv(hkldata.binned_df["ml"].loc[i_bin, lab]) for lab in D_labs])
+                     #print("MLTWIN=", target(x0))
+                     #quit()
+                     if 0 and ids == 0: # debug
+                         x = x0.copy()
+                         with open(f"debug_d_{i_bin}.dat", "w") as ofs:
+                             ofs.write("x f der1 shift\n")
+                             for d in numpy.linspace(0.001, 1.2, 100):
+                                 x[0] = d
+                                 #shift = mli_shift_D(hkldata.df, fc_labs, trans.D(x), hkldata.binned_df.loc[i_bin, "S"], k_ani, idxes)[0]
+                                 shift = numpy.nan
+                                 der1 = grad(x)[0]
+                                 f = target(x)
+                                 ofs.write(f"{d} {f} {der1} {shift}\n")
+                     if 0:
+                         h = 1e-3
+                         f00 = target(x0)
+                         g00 = grad(x0)
+                         for ii in range(len(x0)):
+                             xx = x0.copy()
+                             xx[ii] += h
+                             f01 = target(xx)
+                             nder = (f01 - f00) / h
+                             logger.writeln(f"DEBUG_der_D bin_{i_bin} {ii} ad={g00[ii]} nd={nder} r={g00[ii]/nder}")
+                     vals_now = []
+                     if 0:
+                         f0 = target(x0)
+                         nfev_total += 1
+                         shift = mli_shift_D(hkldata.df, fc_labs, trans.D(x0), hkldata.binned_df["ml"].loc[i_bin, "S"], k_ani, idxes)
+                         shift /= trans.D_deriv(x0)
+                         #if abs(shift) < 1e-3: break
+                         for itry in range(10):
+                             x1 = x0 + shift
+                             if (D_trans and any(x1 < -3)) or (not D_trans and any(x1 < 5e-2)):
+                                 #print(i_bin, cyc_s, trans.S(x0), trans.S(x1), shift, "BAD")
+                                 shift /= 2
+                                 continue
+                             f1 = target(x1)
+                             nfev_total += 1
+                             if f1 > f0:
+                                 shift /= 2
+                                 continue
+                             else: # good
+                                 for i, lab in enumerate(D_labs):
+                                     hkldata.binned_df["ml"].loc[i_bin, lab] = trans.D(x1[i])
+                                     vals_now.append(hkldata.binned_df["ml"].loc[i_bin, lab])
+                                 break
+                         else:
+                             break
+                     else:
+                         #print(mli_shift_D(hkldata.df, fc_labs, trans.D(x0), hkldata.binned_df["ml"].S[i_bin], k_ani, idxes))
+                         res = scipy.optimize.minimize(fun=target, x0=x0, jac=grad,
+                                                       bounds=((-5 if D_trans else 1e-5, None),)*len(x0))
+                         nfev_total += res.nfev
+                         #print(i_bin, "mini cycle", ids, refpar)
+                         #print(res)
+                         for i, lab in enumerate(D_labs):
+                             hkldata.binned_df["ml"].loc[i_bin, lab] = trans.D(res.x[i])
+                             vals_now.append(hkldata.binned_df["ml"].loc[i_bin, lab])
+                         if twin_data:
+                             twin_data.ml_scale[i_bin, :] = trans.D(res.x)
+                     refpar = "S"
+                     if 1:
+                         for cyc_s in range(1):
+                             x0 = trans.S_inv(hkldata.binned_df["ml"].loc[i_bin, "S"])
+                             if 0:
+                                 h = 1e-1
+                                 f00 = target([x0])
+                                 g00 = grad([x0])
+                                 xx = x0 + h
+                                 f01 = target([xx])
+                                 nder = (f01 - f00) / h
+                                 logger.writeln(f"DEBUG_der_S bin_{i_bin} ad={g00} nd={nder} r={g00/nder}")
+
+                             f0 = target([x0])
+                             Ds = [hkldata.binned_df["ml"].loc[i_bin, lab] for lab in D_labs]
+                             nfev_total += 1
+                             if 0 and ids == 0: # debug
+                                 x = x0.copy()
+                                 with open(f"debug_s_{i_bin}.dat", "w") as ofs:
+                                     ofs.write("x f der1 shift\n")
+                                     for s in numpy.linspace(-5,5,100):
+                                         x = x0 * 10**s
+                                         if twin_data:
+                                             shift = mltwin_shift_S(hkldata.df, twin_data, Ds, trans.S(x), k_ani, idxes, i_bin)
+                                         else:
+                                             calc_shift_S = mli_shift_S if use_int else mlf_shift_S
+                                             shift = calc_shift_S(hkldata.df, fc_labs, Ds, trans.S(x), k_ani, idxes)
+                                         der1 = grad([x])[0]
+                                         f = target([x])
+                                         ofs.write(f"{x} {f} {der1} {shift}\n")
+                             if twin_data:
+                                 shift = mltwin_shift_S(hkldata.df, twin_data, Ds, trans.S(x0), k_ani, idxes, i_bin)
+                             else:
+                                 calc_shift_S = mli_shift_S if use_int else mlf_shift_S
+                                 shift = calc_shift_S(hkldata.df, fc_labs, Ds, trans.S(x0), k_ani, idxes)
+                             shift /= trans.S_deriv(x0)
+                             if abs(shift) < 1e-3: break
+                             for itry in range(10):
+                                 x1 = x0 + shift
+                                 if (S_trans and x1 < -3) or (not S_trans and x1 < 5e-2):
+                                     #print(i_bin, cyc_s, trans.S(x0), trans.S(x1), shift, "BAD")
+                                     shift /= 2
+                                     continue
+                                 f1 = target([x1])
+                                 nfev_total += 1
+                                 if f1 > f0:
+                                     shift /= 2
+                                     continue
+                                 else: # good
+                                     #print(i_bin, cyc_s, trans.S(x0), trans.S(x1), shift)
+                                     hkldata.binned_df["ml"].loc[i_bin, "S"] = trans.S(x1)
+                                     break
+                             else:
+                                 #print("all bad")
+                                 break
+                         if twin_data:
+                             twin_data.ml_sigma[i_bin] = hkldata.binned_df["ml"].loc[i_bin, "S"]
+                     else:
+                         # somehow this does not work well.
+                         x0 = [trans.S_inv(hkldata.binned_df["ml"].loc[i_bin, "S"])]
+                         res = scipy.optimize.minimize(fun=target, x0=x0, jac=grad,
+                                                       bounds=((-3 if S_trans else 5e-2, None),))
+                         nfev_total += res.nfev
+                         #print(i_bin, "mini cycle", ids, refpar)
+                         #print(res)
+                         hkldata.binned_df["ml"].loc[i_bin, "S"] = trans.S(res.x[-1])
+                         if twin_data:
+                             twin_data.ml_sigma[i_bin] = trans.S(res.x[-1])
+                     vals_now.append(hkldata.binned_df["ml"].loc[i_bin, "S"])
+                     vals_now = numpy.array(vals_now)
+                     if vals_last is not None and numpy.all(numpy.abs((vals_last - vals_now) / vals_now) < 1e-2):
+                         #logger.writeln("converged in mini cycle {}".format(ids+1))
+                         break
+                     vals_last = vals_now
+             else:
+                 x0 = [trans.D_inv(hkldata.binned_df["ml"].loc[i_bin, lab]) for lab in D_labs] + [trans.S_inv(hkldata.binned_df["ml"].loc[i_bin, "S"])]
+                 res = scipy.optimize.minimize(fun=target, x0=x0, jac=grad,
+                                               bounds=((-5 if D_trans else 1e-5, None), )*len(D_labs) + ((-3 if S_trans else 5e-2, None),))
+                 nfev_total += res.nfev
+                 #print(i_bin)
+                 #print(res)
+                 for i, lab in enumerate(D_labs):
+                     hkldata.binned_df["ml"].loc[i_bin, lab] = trans.D(res.x[i])
+                 hkldata.binned_df["ml"].loc[i_bin, "S"] = trans.S(res.x[-1])
+                 if twin_data:
+                     twin_data.ml_scale[i_bin, :] = trans.D(res.x[:-1])
+                     twin_data.ml_sigma[i_bin] = trans.S(res.x[-1])
+
+         if twin_data:
+             dfc = numpy.abs(twin_data.f_calc) * twin_data.ml_scale_array()
+             for i_bin, idxes in hkldata.binned("ml"):
+                 dfc_bin = dfc[numpy.asarray(twin_data.bin)==i_bin,:]
+                 mean_dfc = numpy.nanmean(dfc_bin, axis=0)
+                 for i, (dlab, fclab) in enumerate(zip(D_labs, fc_labs)):
+                     hkldata.binned_df["ml"].loc[i_bin, "Mn(|{}*{}|)".format(dlab, fclab)] = mean_dfc[i]
+         else:
+             for i_bin, idxes in hkldata.binned("ml"):
+                 for dlab, fclab in zip(D_labs, fc_labs):
+                     mean_dfc = numpy.nanmean(numpy.abs(hkldata.binned_df["ml"][dlab][i_bin] * hkldata.df[fclab][idxes]))
+                     hkldata.binned_df["ml"].loc[i_bin, "Mn(|{}*{}|)".format(dlab, fclab)] = mean_dfc
+
+         logger.writeln("Refined estimates:")
+         logger.writeln(hkldata.binned_df["ml"].to_string())
+         #numpy.testing.assert_allclose(hkldata.binned_df["ml"].S, twin_data.ml_sigma)
+         #numpy.testing.assert_allclose(hkldata.binned_df["ml"][D_labs], twin_data.ml_scale)
+         logger.writeln("time: {:.1f} sec ({} evaluations)".format(time.time() - t0, nfev_total))
+
+         if not use_int or twin_data:
+             break # did not implement MLF B_aniso optimization
+
+         # Refine b_aniso
+         adpdirs = utils.model.adp_constraints(hkldata.sg.operations(), hkldata.cell, tr0=True)
+         SMattolist = lambda B: [B.u11, B.u22, B.u33, B.u12, B.u13, B.u23]
+
+         def target_ani(x):
+             b = gemmi.SMat33d(*numpy.dot(x, adpdirs))
+             k_ani = hkldata.debye_waller_factors(b_cart=b)
+             ret = 0.
+             for i_bin, idxes in hkldata.binned("ml"):
+                 Ds = [hkldata.binned_df["ml"].loc[i_bin, lab] for lab in D_labs]
+                 ret += mli(hkldata.df, fc_labs, Ds, hkldata.binned_df["ml"].loc[i_bin, "S"], k_ani, idxes)
+             return ret
+         def grad_ani(x):
+             b = gemmi.SMat33d(*numpy.dot(x, adpdirs))
+             k_ani = hkldata.debye_waller_factors(b_cart=b)
+             S2mat = hkldata.ssq_mat() # ssqmat
+             g = numpy.zeros(6)
+             for i_bin, idxes in hkldata.binned("ml"):
+                 r = integr.ll_int_der1_ani(hkldata.df.I.to_numpy()[idxes], hkldata.df.SIGI.to_numpy()[idxes],
+                                            k_ani[idxes], hkldata.binned_df["ml"].loc[i_bin, "S"],
+                                            hkldata.df[fc_labs].to_numpy()[idxes], hkldata.binned_df["ml"].loc[i_bin, D_labs],
+                                            hkldata.df.centric.to_numpy()[idxes]+1, hkldata.df.epsilon.to_numpy()[idxes],
+                                            hkldata.df.llweight.to_numpy()[idxes])
+                 S2 = S2mat[:,idxes]
+                 g += -numpy.nansum(S2 * r[:,0], axis=1) # k_ani is already multiplied in r
+             return numpy.dot(g, adpdirs.T)
+         def shift_ani(x):
+             b = gemmi.SMat33d(*numpy.dot(x, adpdirs))
+             k_ani = hkldata.debye_waller_factors(b_cart=b)
+             S2mat = hkldata.ssq_mat() # ssqmat
+             g = numpy.zeros(6)
+             H = numpy.zeros((6, 6))
+             for i_bin, idxes in hkldata.binned("ml"):
+                 r = integr.ll_int_der1_ani(hkldata.df.I.to_numpy()[idxes], hkldata.df.SIGI.to_numpy()[idxes],
+                                            k_ani[idxes], hkldata.binned_df["ml"].loc[i_bin, "S"],
+                                            hkldata.df[fc_labs].to_numpy()[idxes], list(hkldata.binned_df["ml"].loc[i_bin, D_labs]),
+                                            hkldata.df.centric.to_numpy()[idxes]+1, hkldata.df.epsilon.to_numpy()[idxes],
+                                            hkldata.df.llweight.to_numpy()[idxes])
+                 S2 = S2mat[:,idxes]
+                 g += -numpy.nansum(S2 * r[:,0], axis=1) # k_ani is already multiplied in r
+                 H += numpy.nansum(numpy.matmul(S2[None,:].T, S2.T[:,None]) * (r[:,0]**2)[:,None,None], axis=0)
+
+             g, H = numpy.dot(g, adpdirs.T), numpy.dot(adpdirs, numpy.dot(H, adpdirs.T))
+             return -numpy.dot(g, numpy.linalg.pinv(H))
+
+         logger.writeln("Refining B_aniso. Current = {}".format(b_aniso))
+         if 0:
+             x0 = numpy.dot(SMattolist(b_aniso), numpy.linalg.pinv(adpdirs))
+             res = scipy.optimize.minimize(fun=target_ani, x0=x0, jac=grad_ani)
+             print(res)
+             b_aniso = gemmi.SMat33d(*numpy.dot(res.x, adpdirs))
+             f1 = res.fun
+         else:
+             B_converged = False
+             for j in range(10):
+                 x = numpy.dot(SMattolist(b_aniso), numpy.linalg.pinv(adpdirs))
+                 f0 = target_ani(x)
+                 shift = shift_ani(x)
+                 for i in range(3):
+                     ss = shift / 2**i
+                     f1 = target_ani(x + ss)
+                     #logger.writeln("{:2d} f0 = {:.3e} shift = {} df = {:.3e}".format(j, f0, ss, f1 - f0))
+                     if f1 < f0:
+                         b_aniso = gemmi.SMat33d(*numpy.dot(x+ss, adpdirs))
+                         if numpy.max(numpy.abs(ss)) < 1e-4: B_converged = True
+                         break
+                 else:
+                     B_converged = True
+                 if B_converged: break
+
+         logger.writeln("Refined B_aniso = {}".format(b_aniso))
+         logger.writeln("cycle {} f= {}".format(i_cyc, f1))
+
+     return b_aniso
+ # refine_ml_params()
+
+ def determine_ml_params(hkldata, use_int, fc_labs, D_labs, b_aniso,
+                         D_trans=None, S_trans=None, use="all", n_cycle=1, smoothing="gauss",
+                         twin_data=None):
+     assert use in ("all", "work", "test")
+     assert smoothing in (None, "gauss")
+     logger.write(f"Estimating sigma-A parameters from {'intensities' if use_int else 'amplitudes'} using {use} reflections")
+     logger.writeln(f"{' (twin)' if twin_data else ''}")
+     centric_and_selections = hkldata.centric_and_selections["ml"]
+
+     if not set(D_labs + ["S"]).issubset(hkldata.binned_df["ml"]):
+         initialize_ml_params(hkldata, use_int, D_labs, b_aniso, use, twin_data=twin_data)
+         for dlab, fclab in zip(D_labs, fc_labs):
+             hkldata.binned_df["ml"]["Mn(|{}*{}|)".format(dlab, fclab)] = numpy.nan
+
+     if twin_data or use_int:
+         t0 = time.time()
+         if use == "all":
+             idxes = numpy.concatenate([sel[i] for i_bin, _ in hkldata.binned("ml")
+                                        for sel in centric_and_selections[i_bin] for i in (1,2)])
+         else:
+             i = 1 if use == "work" else 2
+             idxes = numpy.concatenate([sel[i] for i_bin, _ in hkldata.binned("ml")
+                                        for sel in centric_and_selections[i_bin]])
+         k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
+         i_sigi = numpy.empty((2, len(hkldata.df.index)))
+         i_sigi[:] = numpy.nan
+         if twin_data:
+             kani2_inv = 1 / k_ani**2
+             i_sigi[0, idxes] = (hkldata.df.I.to_numpy() * kani2_inv)[idxes]
+             i_sigi[1, idxes] = (hkldata.df.SIGI.to_numpy() * kani2_inv)[idxes]
+             #twin_data.debug_open("twin_debug.json")
+             twin_data.ll_refine_D_S(i_sigi[0,:], i_sigi[1,:], 20)
+             #twin_data.debug_close()
+             dfc = numpy.abs(twin_data.f_calc) * twin_data.ml_scale_array()
+             for i_bin, idxes in hkldata.binned("ml"):
+                 hkldata.binned_df["ml"].loc[i_bin, D_labs] = twin_data.ml_scale[i_bin, :]
+                 hkldata.binned_df["ml"].loc[i_bin, "S"] = twin_data.ml_sigma[i_bin]
+                 dfc_bin = dfc[numpy.asarray(twin_data.bin)==i_bin,:]
+                 mean_dfc = numpy.nanmean(dfc_bin, axis=0)
+                 for i, (dlab, fclab) in enumerate(zip(D_labs, fc_labs)):
+                     hkldata.binned_df["ml"].loc[i_bin, "Mn(|{}*{}|)".format(dlab, fclab)] = mean_dfc[i]
+         else:
+             i_sigi[0, idxes] = hkldata.df.I.to_numpy()[idxes]
+             i_sigi[1, idxes] = hkldata.df.SIGI.to_numpy()[idxes]
+             DS = integr.ll_refine_D_S(i_sigi[0,:], i_sigi[1,:], k_ani,
+                                       hkldata.binned_df["ml"].loc[:, "S"].to_numpy(), hkldata.df[fc_labs].to_numpy(),
+                                       hkldata.binned_df["ml"].loc[:, D_labs].to_numpy(), hkldata.df.centric.to_numpy()+1,
+                                       hkldata.df.epsilon.to_numpy(), hkldata.df.llweight.to_numpy(),
+                                       hkldata.df.bin_ml.to_numpy(), 20)
+             for i_bin, idxes in hkldata.binned("ml"):
+                 hkldata.binned_df["ml"].loc[i_bin, D_labs] = DS[i_bin, :-1]
+                 hkldata.binned_df["ml"].loc[i_bin, "S"] = DS[i_bin, -1]
+                 for dlab, fclab in zip(D_labs, fc_labs):
+                     mean_dfc = numpy.nanmean(numpy.abs(hkldata.binned_df["ml"][dlab][i_bin] * hkldata.df[fclab][idxes]))
+                     hkldata.binned_df["ml"].loc[i_bin, "Mn(|{}*{}|)".format(dlab, fclab)] = mean_dfc
+         logger.writeln("Refined estimates:")
+         logger.writeln(hkldata.binned_df["ml"].to_string())
+         logger.writeln(f"time: {time.time()-t0:.1f} sec")
+     else:
+         b_aniso = refine_ml_params(hkldata, use_int, fc_labs, D_labs, b_aniso,
+                                    D_trans, S_trans, use, n_cycle, twin_data)
+     smooth_params(hkldata, D_labs, smoothing)
+     return b_aniso
+ # determine_ml_params()
+
+ def smooth_params(hkldata, D_labs, smoothing): # XXX twin_data
+     if smoothing is None or len(hkldata.binned("ml")) < 2:
+         for i, lab in enumerate(D_labs + ["S"]):
+             hkldata.df[lab] = hkldata.binned_data_as_array("ml", lab)
+
+     elif smoothing == "gauss":
+         bin_centers = (0.5 / hkldata.binned_df["ml"][["d_min", "d_max"]]**2).sum(axis=1).to_numpy()
+         vals = ext.smooth_gauss(bin_centers,
+                                 hkldata.binned_df["ml"][D_labs + ["S"]].to_numpy(),
+                                 1./hkldata.df.d.to_numpy()**2,
+                                 100, # min(n_ref?)
+                                 (bin_centers[1] - bin_centers[0]))
+         for i, lab in enumerate(D_labs + ["S"]):
+             hkldata.df[lab] = vals[:, i]
+         # Update smoothened average; this affects next refinement.
+         # TODO: update Mn(|Dj*FCj|) as well.
+         #for i_bin, idxes in hkldata.binned("ml"):
+         #    for lab in D_labs + ["S"]:
+         #        hkldata.binned_df["ml"].loc[i_bin, lab] = numpy.mean(hkldata.df[lab].to_numpy()[idxes])
+     else:
+         raise RuntimeError("unknown smoothing method: {}".format(smoothing))
+ # smooth_params()
+
+ def expected_F_from_int(Io, sigIo, k_ani, DFc, eps, c, S):
+     k_num = numpy.repeat(0.5 if c == 0 else 0., Io.size) # 0.5 if acentric
+     k_den = k_num - 0.5
+     if numpy.isscalar(c): c = numpy.repeat(c, Io.size)
+     to = Io / sigIo - sigIo / (c+1) / k_ani**2 / S / eps
+     tf = k_ani * numpy.abs(DFc) / numpy.sqrt(sigIo)
+     sig1 = k_ani**2 * S * eps / sigIo
+     f = ext.integ_J_ratio(k_num, k_den, True, to, tf, sig1, c+1, integr.exp2_threshold, integr.h, integr.N, integr.ewmax)
+     f *= numpy.sqrt(sigIo) / k_ani
+     m_proxy = ext.integ_J_ratio(k_num, k_num, True, to, tf, sig1, c+1, integr.exp2_threshold, integr.h, integr.N, integr.ewmax)
+     return f, m_proxy
+ # expected_F_from_int()
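+
+ # [editorial note, not in the released file] expected_F_from_int() appears to
+ # evaluate the posterior expectation of the amplitude under the intensity-based
+ # likelihood as a ratio of two moment integrals handled by ext.integ_J_ratio,
+ # where k_num/k_den select the moment orders (a half-order difference yields
+ # E[F | Io, sigIo, DFc, S], rescaled by sqrt(sigIo)/k_ani). The second call with
+ # equal orders but a different integrand flag combination produces m_proxy,
+ # used downstream as the FOM proxy |<F>| / <|F|> noted in calculate_maps_int().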
1232
+
1233
+ def calc_fc_dano(hkldata, D_labs, DFc):
1234
+ # assuming only first components have anomalous scatterers
1235
+ fcpp = hkldata.df["FC''"].to_numpy() * hkldata.df[D_labs[0]].to_numpy() * 1j
1236
+ fc_dano = numpy.abs(DFc + fcpp) - numpy.abs(DFc - fcpp) # this includes centrics (should be zero)
1237
+ return fc_dano
1238
+ # calc_fc_dano()
1239
+
+ def calculate_maps_int(hkldata, b_aniso, fc_labs, D_labs, use="all"):
+     nmodels = len(fc_labs)
+     hkldata.df["FWT"] = 0j * numpy.nan
+     hkldata.df["DELFWT"] = 0j * numpy.nan
+     hkldata.df["F_est"] = numpy.nan
+     hkldata.df["FOM"] = numpy.nan # FOM proxy, |<F>| / <|F|>
+     has_ano = "I(+)" in hkldata.df and "I(-)" in hkldata.df
+     Io = hkldata.df.I.to_numpy()
+     sigIo = hkldata.df.SIGI.to_numpy()
+     k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
+     eps = hkldata.df.epsilon.to_numpy()
+     Ds = numpy.vstack([hkldata.df[lab].to_numpy() for lab in D_labs]).T
+     Fcs = numpy.vstack([hkldata.df[lab].to_numpy() for lab in fc_labs]).T
+     DFc = (Ds * Fcs).sum(axis=1)
+     hkldata.df["DFC"] = DFc
+     if has_ano:
+         hkldata.df["FAN"] = 0j * numpy.nan
+         ano_data = hkldata.df[["I(+)", "SIGI(+)", "I(-)", "SIGI(-)"]].to_numpy()
+         if "FC''" in hkldata.df:
+             fc_dano = calc_fc_dano(hkldata, D_labs, DFc)
+             hkldata.df["DELFAN"] = 0j * numpy.nan
+     for i_bin, idxes in hkldata.binned("ml"):
+         for c, work, test in hkldata.centric_and_selections["ml"][i_bin]:
+             cidxes = numpy.concatenate([work, test])
+             S = hkldata.df["S"].to_numpy()[cidxes]
+             f, m_proxy = expected_F_from_int(Io[cidxes], sigIo[cidxes], k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
+             exp_ip = numpy.exp(numpy.angle(DFc[cidxes])*1j)
+             hkldata.df.loc[cidxes, "FWT"] = 2 * f * exp_ip - DFc[cidxes]
+             hkldata.df.loc[cidxes, "DELFWT"] = f * exp_ip - DFc[cidxes]
+             hkldata.df.loc[cidxes, "FOM"] = m_proxy
+             hkldata.df.loc[cidxes, "F_est"] = f
+             if has_ano:
+                 f_p, _ = expected_F_from_int(ano_data[cidxes,0], ano_data[cidxes,1],
+                                              k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
+                 f_m, _ = expected_F_from_int(ano_data[cidxes,2], ano_data[cidxes,3],
+                                              k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
+                 hkldata.df.loc[cidxes, "FAN"] = (f_p - f_m) * exp_ip / 2j
+             if has_ano and "FC''" in hkldata.df:
+                 # for centrics Fo_dano should be NaN; no need to mask fc_dano
+                 hkldata.df.loc[cidxes, "DELFAN"] = ((f_p - f_m) - fc_dano[cidxes]) * exp_ip / 2j
+             # remove reflections that should be hidden
+             if use != "all":
+                 # usually use == "work"
+                 tohide = test if use == "work" else work
+                 hkldata.df.loc[tohide, "FWT"] = 0j * numpy.nan
+                 hkldata.df.loc[tohide, "DELFWT"] = 0j * numpy.nan
+             fill_sel = numpy.isnan(hkldata.df["FWT"][cidxes].to_numpy())
+             hkldata.df.loc[cidxes[fill_sel], "FWT"] = DFc[cidxes][fill_sel]
+ # calculate_maps_int()
+
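+ # A note on the coefficients filled in above (summary, not new behaviour):
+ # with phi_c the phase of DFc, FWT = 2<F>exp(i phi_c) - DFc approximates the
+ # 2mFo-DFc synthesis and DELFWT = <F>exp(i phi_c) - DFc the mFo-DFc difference
+ # synthesis; missing or hidden reflections are filled with DFc so the Fourier
+ # summation stays complete.
+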
+ def calculate_maps_twin(hkldata, b_aniso, fc_labs, D_labs, twin_data, use="all"):
+     k_ani2_inv = 1 / hkldata.debye_waller_factors(b_cart=b_aniso)**2
+     Io = hkldata.df.I.to_numpy(copy=True) * k_ani2_inv
+     sigIo = hkldata.df.SIGI.to_numpy(copy=True) * k_ani2_inv
+     # Mask Io
+     for i_bin, idxes in hkldata.binned("ml"):
+         for c, work, test in hkldata.centric_and_selections["ml"][i_bin]:
+             if use != "all":
+                 tohide = test if use == "work" else work
+                 Io[tohide] = numpy.nan
+
+     twin_data.est_f_true(Io, sigIo)
+     Ds = twin_data.ml_scale_array()
+     DFc = (twin_data.f_calc * Ds).sum(axis=1)
+     exp_ip = numpy.exp(numpy.angle(DFc)*1j)
+     Ft = numpy.asarray(twin_data.f_true_max)
+     m = twin_data.calc_fom()
+     Fexp = twin_data.expected_F(Io, sigIo)
+     fwt = 2 * Fexp * exp_ip - DFc
+     delfwt = Fexp * exp_ip - DFc
+     sel = numpy.isnan(fwt)
+     fwt[sel] = DFc[sel]
+
+     hkldata2 = utils.hkl.HklData(hkldata.cell, hkldata.sg,
+                                  utils.hkl.df_from_twin_data(twin_data, fc_labs))
+     hkldata2.df["FWT"] = fwt
+     hkldata2.df["DELFWT"] = delfwt
+     hkldata2.df["FOM"] = m
+     hkldata2.df["F_est"] = Ft
+     hkldata2.df["F_exp"] = Fexp
+     hkldata2.df["FC"] = twin_data.f_calc.sum(axis=1)
+     hkldata2.df["DFC"] = DFc
+     hkldata2.df[D_labs] = Ds
+     hkldata2.df["S"] = twin_data.ml_sigma_array()
+     return hkldata2
+ # calculate_maps_twin()
+
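+ # Unlike calculate_maps_int(), calculate_maps_twin() does not mutate the input
+ # hkldata: it builds a fresh HklData laid out on the twin ASU (via
+ # utils.hkl.df_from_twin_data), so callers must rebind, as main() does below.
+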
+ def merge_models(sts): # simply merge models. no fix in chain ids etc.
+     st2 = sts[0].clone()
+     del st2[:]
+     model = gemmi.Model(1)
+     for st in sts:
+         for m in st:
+             for c in m:
+                 model.add_chain(c)
+     st2.add_model(model)
+     return st2
+ # merge_models()
+
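+ # Usage sketch (file names are placeholders):
+ #
+ # >>> sts = [utils.fileio.read_structure(f) for f in ("partA.pdb", "partB.pdb")]
+ # >>> merged = merge_models(sts)
+ # Every chain of every input model lands in a single gemmi.Model; duplicated
+ # chain IDs are kept as-is, which is fine for structure-factor calculation but
+ # not for deposition.
+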
+ def decide_mtz_labels(mtz, find_free=True, require=None, prefer_intensity=False):
+     if prefer_intensity:
+         obs_types = ("J", "F", "K", "G")
+     else:
+         obs_types = ("F", "J", "G", "K")
+     if require:
+         assert set(require).issubset(obs_types)
+     else:
+         require = obs_types
+     dlabs = utils.hkl.mtz_find_data_columns(mtz)
+     logger.writeln("Finding possible options from MTZ:")
+     for typ in dlabs:
+         for labs in dlabs[typ]:
+             logger.writeln(" --labin '{}'".format(",".join(labs)))
+     for typ in require:
+         if dlabs[typ]:
+             labin = dlabs[typ][0]
+             break
+     else:
+         raise RuntimeError("Observation or sigma columns not found in MTZ")
+     if find_free:
+         flabs = utils.hkl.mtz_find_free_columns(mtz)
+         if flabs:
+             labin += [flabs[0]]
+     logger.writeln("MTZ columns automatically selected: {}".format(labin))
+     return labin
+ # decide_mtz_labels()
+
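+ # Usage sketch (the MTZ path is a placeholder; output depends on the file):
+ #
+ # >>> mtz = gemmi.read_mtz_file("merged.mtz")
+ # >>> decide_mtz_labels(mtz, prefer_intensity=True)
+ # ['I', 'SIGI', 'FREE']  # for example, if mean intensities and a free flag exist
+ # The preference order is F, J, G, K (amplitudes first) by default and
+ # J, F, K, G when prefer_intensity=True.
+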
+ def decide_spacegroup(sg_user, sg_st, sg_hkl):
+     assert sg_hkl is not None
+     ret = None
+     if sg_user is not None:
+         ret = sg_user
+         logger.writeln(f"Space group overridden by user. Using {ret.xhm()}")
+     else:
+         ret = sg_hkl
+         if sg_hkl != sg_st:
+             if sg_st and sg_st.laue_str() != sg_hkl.laue_str():
+                 raise RuntimeError("Crystal symmetry mismatch between model and data")
+             logger.writeln("Warning: space group mismatch between model and mtz")
+             if sg_st and sg_st.laue_str() == sg_hkl.laue_str():
+                 logger.writeln(" using space group from model")
+                 ret = sg_st
+             else:
+                 logger.writeln(" using space group from mtz")
+             logger.writeln("")
+
+     return ret
+ # decide_spacegroup()
+
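+ # Behaviour sketch: the user's space group always wins; otherwise the mtz one
+ # is used unless the model disagrees only within the same Laue class, in which
+ # case the model wins. Differing Laue classes are an error. For example:
+ #
+ # >>> decide_spacegroup(None, gemmi.SpaceGroup("P 43 21 2"),
+ # ...                   gemmi.SpaceGroup("P 41 21 2")).xhm()
+ # 'P 43 21 2'  # same Laue class 4/mmm, so the model's choice is kept
+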
+ def process_input(hklin, labin, n_bins_ml, free, xyzins, d_max=None, d_min=None,
+                   n_per_mlbin=None, use="all", max_mlbins=None, cif_index=0, keep_charges=False,
+                   allow_unusual_occupancies=False, space_group=None,
+                   hklin_free=None, labin_free=None, labin_llweight=None, n_bins_stat=None, max_statbins=20):
+     if labin: assert 1 < len(labin) < 6
+     assert use in ("all", "work", "test")
+
+     if len(xyzins) > 0 and type(xyzins[0]) is gemmi.Structure:
+         sts = xyzins
+     else:
+         sts = []
+
+     if type(hklin) is gemmi.Mtz or utils.fileio.is_mmhkl_file(hklin):
+         if type(hklin) is gemmi.Mtz:
+             mtz = hklin
+         else:
+             mtz = utils.fileio.read_mmhkl(hklin, cif_index=cif_index)
+         if not sts:
+             sts = [utils.fileio.read_structure(f) for f in xyzins]
+     else:
+         assert len(xyzins) == 1
+         assert not sts
+         st, mtz = utils.fileio.read_small_molecule_files([hklin, xyzins[0]])
+         if None in (st, mtz):
+             raise SystemExit("Failed to read small molecule file(s)")
+         sts = [st]
+
+     for st in sts:
+         utils.model.check_occupancies(st, raise_error=not allow_unusual_occupancies)
+
+     sg_use = decide_spacegroup(sg_user=gemmi.SpaceGroup(space_group) if space_group else None,
+                                sg_st=sts[0].find_spacegroup() if sts else None,
+                                sg_hkl=mtz.spacegroup)
+     if not labin:
+         labin = decide_mtz_labels(mtz, find_free=hklin_free is None)
+     col_types = {x.label:x.type for x in mtz.columns}
+     if labin[0] not in col_types:
+         raise RuntimeError("MTZ column not found: {}".format(labin[0]))
+     labs_and_types = {"F": ("amplitude", ["FP","SIGFP"], ["F", "Q"]),
+                       "J": ("intensity", ["I","SIGI"], ["J", "Q"]),
+                       "G": ("anomalous amplitude", ["F(+)","SIGF(+)", "F(-)", "SIGF(-)"], ["G", "L", "G", "L"]),
+                       "K": ("anomalous intensity", ["I(+)","SIGI(+)", "I(-)", "SIGI(-)"], ["K", "M", "K", "M"])}
+     if col_types[labin[0]] not in labs_and_types:
+         raise RuntimeError("MTZ column {} is neither amplitude nor intensity".format(labin[0]))
+     if col_types[labin[0]] == "J": # may be unmerged data
+         if (d_min, d_max).count(None) != 2:
+             d_array = mtz.make_d_array()
+             sel = ((0 if d_min is None else d_min) < d_array) & (d_array < (numpy.inf if d_max is None else d_max))
+         else:
+             sel = ...
+         ints = gemmi.Intensities()
+         ints.set_data(mtz.cell, sg_use, mtz.make_miller_array()[sel],
+                       mtz.array[sel,mtz.column_labels().index(labin[0])],
+                       mtz.array[sel,mtz.column_labels().index(labin[1])])
+         dtype = ints.prepare_for_merging(gemmi.DataType.Mean) # do we want Anomalous?
+         ints_bak = ints.clone() # for stats
+         ints.merge_in_place(dtype)
+         if (ints.nobs_array > 1).any():
+             mtz = ints.prepare_merged_mtz(with_nobs=False)
+             labin = mtz.column_labels()[3:]
+             col_types = {x.label:x.type for x in mtz.columns}
+             mult = ints.nobs_array.mean()
+             logger.writeln(f"Input data were merged (multiplicity: {mult:.2f}). Overriding labin={','.join(labin)}")
+         else:
+             ints_bak = None
+     else:
+         ints_bak = None
+
+     name, newlabels, require_types = labs_and_types[col_types[labin[0]]]
+     logger.writeln("Observation type: {}".format(name))
+     if len(newlabels) < len(labin): newlabels.append("FREE")
+     hkldata = utils.hkl.hkldata_from_mtz(mtz, labin, newlabels=newlabels, require_types=require_types)
+     hkldata.sg = sg_use
+     hkldata.mask_invalid_obs_values(newlabels)
+     if newlabels[0] == "F(+)":
+         hkldata.merge_anomalous(newlabels[:4], ["FP", "SIGFP"])
+         newlabels = ["FP", "SIGFP"] + newlabels[4:]
+     elif newlabels[0] == "I(+)":
+         hkldata.merge_anomalous(newlabels[:4], ["I", "SIGI"])
+         newlabels = ["I", "SIGI"] + newlabels[4:]
+
+     if hkldata.df.empty:
+         raise RuntimeError("No data in hkl data")
+
+     if sts:
+         for st in sts:
+             if st[0].count_atom_sites() == 0:
+                 raise RuntimeError("No atom in the model")
+         if not hkldata.cell.approx(sts[0].cell, 1e-3):
+             logger.writeln("Warning: unit cell mismatch between model and reflection data")
+             logger.writeln(" using unit cell from mtz")
+
+     for st in sts:
+         st.cell = hkldata.cell # mtz cell is used in any case
+         st.spacegroup_hm = sg_use.xhm()
+         st.setup_cell_images()
+
+     if not keep_charges:
+         utils.model.remove_charge(sts)
+
+     hkldata.switch_to_asu()
+     hkldata.remove_systematic_absences()
+     #hkldata.df = hkldata.df.astype({name: 'float64' for name in ["I","SIGI","FP","SIGFP"] if name in hkldata.df})
+     d_min_max_data = hkldata.d_min_max(newlabels)
+     if d_min is None and hkldata.d_min_max()[0] != d_min_max_data[0]:
+         d_min = d_min_max_data[0]
+         logger.writeln(f"Changing resolution to {d_min:.3f} A")
+     if (d_min, d_max).count(None) != 2:
+         hkldata = hkldata.copy(d_min=d_min, d_max=d_max)
+         d_min_max_data = hkldata.d_min_max(newlabels)
+         if hkldata.df.empty:
+             raise RuntimeError("No data left in hkl data")
+
+     if hklin_free is not None:
+         mtz2 = utils.fileio.read_mmhkl(hklin_free)
+         for lab in (labin_free, labin_llweight):
+             if lab and lab not in mtz2.column_labels():
+                 raise RuntimeError(f"specified label ({lab}) not found in {hklin_free}")
+         if not labin_free:
+             tmp = utils.hkl.mtz_find_free_columns(mtz2)
+             if tmp:
+                 labin_free = tmp[0]
+             elif not labin_llweight:
+                 raise RuntimeError(f"Test flag label not found in {hklin_free}")
+         labs, newlabs = [], []
+         for lab, newlab in ((labin_free, "FREE"), (labin_llweight, "llweight")):
+             if lab:
+                 labs.append(lab)
+                 newlabs.append(newlab)
+         tmp = utils.hkl.hkldata_from_mtz(mtz2, labs, newlabels=newlabs)
+         tmp.sg = sg_use
+         tmp.switch_to_asu()
+         tmp.remove_systematic_absences()
+         tmp = tmp.copy(d_min=d_min_max_data[0], d_max=d_min_max_data[1])
+         hkldata.complete()
+         tmp.complete()
+         hkldata.merge(tmp.df[["H","K","L"] + newlabs])
+
+     hkldata.complete()
+     hkldata.sort_by_resolution()
+     hkldata.calc_epsilon()
+     hkldata.calc_centric()
+
+     # fill dummy values where the observation is also missing
+     for lab in ("FREE", "llweight"):
+         if lab in hkldata.df:
+             if (hkldata.df[lab].isna() & ~hkldata.df[newlabels[0]].isna()).any():
+                 raise RuntimeError(f"Missing {lab} reflection(s).")
+             hkldata.df[lab] = hkldata.df[lab].fillna(0)
+
+     if "llweight" not in hkldata.df:
+         hkldata.df["llweight"] = 1.
+
+     if "FREE" in hkldata.df and free is None:
+         free = hkldata.guess_free_number(newlabels[0]) # also check NaN
+
+     if n_bins_ml is None:
+         n_bins_ml, use = utils.hkl.decide_ml_binning(hkldata, data_label=newlabels[0],
+                                                      free_label="FREE", free=free,
+                                                      use=use, n_per_bin=n_per_mlbin,
+                                                      max_bins=max_mlbins)
+         if n_bins_ml < 3 and use == "test":
+             logger.writeln("Warning: not enough reflections for ML parameter estimation.")
+             logger.writeln("Switching to use=work, i.e. using working reflections for the ML estimation")
+             use = "work"
+             n_bins_ml, use = utils.hkl.decide_ml_binning(hkldata, data_label=newlabels[0],
+                                                          free_label="FREE", free=free,
+                                                          use=use, n_per_bin=n_per_mlbin,
+                                                          max_bins=max_mlbins)
+     if n_bins_stat is None:
+         sel = hkldata.df[newlabels[0]].notna()
+         if "FREE" in hkldata.df:
+             sel &= hkldata.df["FREE"] == free
+         s_array = 1/hkldata.d_spacings()[sel]
+         n_bins_stat = utils.hkl.decide_n_bins(10, s_array, min_bins=2, max_bins=max_statbins)
+
+     hkldata.setup_binning(n_bins=n_bins_ml, name="ml")
+     hkldata.setup_binning(n_bins=n_bins_stat, name="stat")
+     hkldata.setup_centric_and_selections("ml", data_lab=newlabels[0], free=free)
+     hkldata.setup_centric_and_selections("stat", data_lab=newlabels[0], free=free)
+     fc_labs = ["FC{}".format(i) for i, _ in enumerate(sts)]
+
+     # Table of per-bin data statistics
+     stats = hkldata.binned_df["stat"].copy()
+     stats["n_all"] = 0
+     stats["n_obs"] = 0
+     stats[newlabels[0]] = numpy.nan
+     snr = "I/sigma" if newlabels[0] == "I" else "F/sigma"
+     stats[snr] = numpy.nan
+     if newlabels[0] == "I":
+         stats["Mn(I)/Std(I)"] = numpy.nan
+     if "FREE" in hkldata.df:
+         stats["n_work"] = 0
+         stats["n_test"] = 0
+
+     for i_bin, idxes in hkldata.binned("stat"):
+         n_work, n_test = 0, 0
+         for c, work, test in hkldata.centric_and_selections["stat"][i_bin]:
+             n_work += numpy.sum(numpy.isfinite(hkldata.df.loc[work, newlabels[0]]))
+             n_test += numpy.sum(numpy.isfinite(hkldata.df.loc[test, newlabels[0]]))
+         n_obs = n_work + n_test
+         stats.loc[i_bin, "n_obs"] = n_obs
+         stats.loc[i_bin, "n_all"] = len(idxes)
+         obs = hkldata.df[newlabels[0]].to_numpy()[idxes]
+         sigma = hkldata.df[newlabels[1]].to_numpy()[idxes]
+         if n_obs > 0:
+             stats.loc[i_bin, snr] = numpy.nanmean(obs / sigma)
+             mean_obs = numpy.nanmean(obs)
+             stats.loc[i_bin, newlabels[0]] = mean_obs
+             if newlabels[0] == "I":
+                 stats.loc[i_bin, "Mn(I)/Std(I)"] = mean_obs / numpy.nanstd(obs)
+         if "FREE" in hkldata.df:
+             stats.loc[i_bin, "n_work"] = n_work
+             stats.loc[i_bin, "n_test"] = n_test
+
+     stats["completeness"] = stats["n_obs"] / stats["n_all"] * 100
+     logger.writeln("Data completeness: {:.2%}".format(stats["n_obs"].sum() / stats["n_all"].sum()))
+     if ints_bak is not None: # TODO ensure the same binning (use hkldata's binning)
+         binner = gemmi.Binner()
+         for name, n_bins in (("stat", n_bins_stat), ("ml", n_bins_ml)):
+             binner.setup(n_bins, gemmi.Binner.Method.Dstar2, ints_bak)
+             bin_stats = ints_bak.calculate_merging_stats(binner, use_weights="X")
+             cc12 = numpy.array([s.cc_half() for s in bin_stats])
+             if name == "stat": stats["CC1/2"] = cc12
+             hkldata.binned_df[name]["CC*"] = numpy.sqrt(2 * cc12 / (1 + cc12))
+
+     logger.writeln(stats.to_string())
+     return hkldata, sts, fc_labs, free, use
+ # process_input()
+
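+ # End-to-end usage sketch (paths and labels are placeholders):
+ #
+ # >>> hkldata, sts, fc_labs, free, use = process_input(
+ # ...     hklin="data.mtz", labin=["FP", "SIGFP", "FREE"], n_bins_ml=None,
+ # ...     free=None, xyzins=["model.pdb"])
+ # On return the data are reduced to the ASU and binned twice ("ml" for
+ # parameter estimation, "stat" for reporting); fc_labs holds one FC column
+ # name per input model, and the bulk-solvent label is appended later by the
+ # caller (see main()).
+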
+ def update_fc(st_list, fc_labs, d_min, monlib, source, mott_bethe, hkldata=None, twin_data=None, addends=None, addends2=None):
+     #assert (hkldata, twin_data).count(None) == 1
+     # hkldata is not updated when twin_data is given
+     if addends2:
+         hkldata.df["FC''"] = 0.
+     for i, st in enumerate(st_list):
+         if st.ncs:
+             st = st.clone()
+             st.expand_ncs(gemmi.HowToNameCopiedChain.Dup, merge_dist=0)
+         if twin_data:
+             hkl = twin_data.asu
+         else:
+             hkl = hkldata.miller_array()
+         fc = utils.model.calc_fc_fft(st, d_min - 1e-6,
+                                      monlib=monlib,
+                                      source=source,
+                                      mott_bethe=mott_bethe,
+                                      miller_array=hkl,
+                                      addends=addends)
+         if addends2:
+             fcpp = utils.model.calc_fcpp_fft(st, d_min - 1e-6, addends2, miller_array=hkl)
+             hkldata.df["FC''"] += fcpp
+         if twin_data:
+             twin_data.f_calc[:,i] = fc
+         else:
+             hkldata.df[fc_labs[i]] = fc
+     if not twin_data:
+         hkldata.df["FC"] = hkldata.df[fc_labs].sum(axis=1)
+ # update_fc()
+
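+ # Note: each model's Fc is computed by FFT from the real-space density; for
+ # electron sources the mott_bethe flag is passed down so that X-ray form
+ # factors are converted with the Mott-Bethe formula (presumably inside
+ # utils.model.calc_fc_fft), and structures with NCS are expanded first so the
+ # Fc of each component covers the full crystal content.
+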
+ def calc_Fmask(st, d_min, miller_array, use_non_binary_mask=False):
+     logger.writeln("Calculating solvent contribution..")
+     grid = gemmi.FloatGrid()
+     grid.setup_from(st, spacing=min(0.6, (d_min-1e-6) / 2 - 1e-9))
+     masker = gemmi.SolventMasker(gemmi.AtomicRadiiSet.Refmac)
+     if use_non_binary_mask:
+         logger.writeln("Using non-binary solvent mask")
+         masker.use_atom_occupancy = True
+         masker.island_min_volume = 0
+     masker.put_mask_on_float_grid(grid, st[0])
+     #utils.maps.write_ccp4_map("solmask.ccp4", grid)
+     fmask_gr = gemmi.transform_map_to_f_phi(grid)
+     # TODO remove this with gemmi 0.7.5
+     if not miller_array.flags.writeable:
+         miller_array = miller_array.copy()
+     Fmask = fmask_gr.get_value_by_hkl(miller_array)
+     return Fmask
+ # calc_Fmask()
+
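+ # A standalone sketch of the same mask -> structure-factor route using plain
+ # gemmi (the grid spacing and Miller indices here are illustrative):
+ #
+ # >>> st = utils.fileio.read_structure("model.pdb")
+ # >>> grid = gemmi.FloatGrid()
+ # >>> grid.setup_from(st, spacing=0.6)
+ # >>> gemmi.SolventMasker(gemmi.AtomicRadiiSet.Refmac).put_mask_on_float_grid(grid, st[0])
+ # >>> asu_f = gemmi.transform_map_to_f_phi(grid)
+ # >>> fmask = asu_f.get_value_by_hkl(numpy.array([[1, 0, 0], [0, 0, 2]]))
+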
+ def bulk_solvent_and_lsq_scales(hkldata, sts, fc_labs, use_solvent=True, use_int=False, mask=None, func_type="log_cosh", twin_data=None):
+     # fc_labs must have the solvent part at the end
+     miller_array = twin_data.asu if twin_data else hkldata.miller_array()
+     d_min = twin_data.d_min(sts[0].cell) if twin_data else hkldata.d_min_max()[0]
+     if use_solvent:
+         if mask is None:
+             Fmask = calc_Fmask(merge_models(sts), d_min, miller_array)
+         else:
+             fmask_gr = gemmi.transform_map_to_f_phi(mask)
+             # TODO remove this with gemmi 0.7.5
+             if not miller_array.flags.writeable:
+                 miller_array = miller_array.copy()
+             Fmask = fmask_gr.get_value_by_hkl(miller_array)
+         if twin_data:
+             fc_sum = twin_data.f_calc[:,:-1].sum(axis=1)
+         else:
+             fc_sum = hkldata.df[fc_labs[:-1]].sum(axis=1).to_numpy()
+         fc_list = [fc_sum, Fmask]
+     else:
+         if twin_data:
+             fc_list = [twin_data.f_calc.sum(axis=1)]
+         else:
+             fc_list = [hkldata.df[fc_labs].sum(axis=1).to_numpy()]
+
+     scaling = LsqScale(func_type=func_type)
+     scaling.set_data(hkldata, fc_list, use_int, sigma_cutoff=0, twin_data=twin_data)
+     scaling.scale()
+     b_iso = scaling.b_iso
+     k_aniso = hkldata.debye_waller_factors(b_cart=scaling.b_aniso)
+     hkldata.df["k_aniso"] = k_aniso # we need it later when calculating stats
+
+     if use_solvent:
+         if twin_data:
+             s2 = numpy.asarray(twin_data.s2_array)
+         else:
+             s2 = 1. / hkldata.d_spacings().to_numpy()**2
+         Fbulk = Fmask * scaling.get_solvent_scale(scaling.k_sol, scaling.b_sol, s2)
+         if twin_data:
+             twin_data.f_calc[:,-1] = Fbulk
+         else:
+             hkldata.df[fc_labs[-1]] = Fbulk
+
+     # Apply scales
+     if use_int:
+         # in the intensity case we try to refine b_aniso with ML; perhaps we should do the same in the amplitude case
+         o_labs = ["I", "SIGI", "I(+)","SIGI(+)", "I(-)", "SIGI(-)"]
+         hkldata.df[hkldata.df.columns.intersection(o_labs)] /= scaling.k_overall**2
+     else:
+         o_labs = ["FP", "SIGFP", "F(+)","SIGF(+)", "F(-)", "SIGF(-)"]
+         hkldata.df[hkldata.df.columns.intersection(o_labs)] /= scaling.k_overall
+     if twin_data:
+         twin_data.f_calc[:] *= twin_data.debye_waller_factors(b_iso=b_iso)[:,None]
+     else:
+         k_iso = hkldata.debye_waller_factors(b_iso=b_iso)
+         for lab in fc_labs + ["FC''"]:
+             if lab in hkldata.df:
+                 hkldata.df[lab] *= k_iso
+         # total Fc
+         hkldata.df["FC"] = hkldata.df[fc_labs].sum(axis=1)
+     return scaling
+ # bulk_solvent_and_lsq_scales()
+
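+ # The fitted scale model, in outline: observations are divided by the overall
+ # scale k_overall (squared for intensities), the anisotropic factor k_aniso is
+ # kept on the data side as hkldata.df["k_aniso"], the isotropic b_iso is
+ # pushed into the calculated structure factors, and the solvent term is
+ # Fbulk = k_sol * exp(-b_sol * s^2 / 4) * Fmask (assumed exponent convention;
+ # the exact form is whatever LsqScale.get_solvent_scale implements).
+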
+ def calculate_maps(hkldata, b_aniso, fc_labs, D_labs, log_out, use="all"):
+     nmodels = len(fc_labs)
+     hkldata.df["FWT"] = 0j * numpy.nan
+     hkldata.df["DELFWT"] = 0j * numpy.nan
+     hkldata.df["FOM"] = numpy.nan
+     hkldata.df["X"] = numpy.nan # for FOM
+     has_ano = "F(+)" in hkldata.df and "F(-)" in hkldata.df
+     stats_data = []
+     k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
+     Ds = numpy.vstack([hkldata.df[lab].to_numpy() for lab in D_labs]).T
+     Fcs = numpy.vstack([hkldata.df[lab].to_numpy() for lab in fc_labs]).T
+     DFc = (Ds * Fcs).sum(axis=1)
+     hkldata.df["DFC"] = DFc
+     if has_ano:
+         hkldata.df["FAN"] = 0j * numpy.nan
+         if "FC''" in hkldata.df:
+             fc_dano = calc_fc_dano(hkldata, D_labs, DFc)
+             hkldata.df["DELFAN"] = 0j * numpy.nan
+     for i_bin, idxes in hkldata.binned("ml"):
+         bin_d_min = hkldata.binned_df["ml"].d_min[i_bin]
+         bin_d_max = hkldata.binned_df["ml"].d_max[i_bin]
+         # 0: acentric 1: centric
+         mean_fom = [numpy.nan, numpy.nan]
+         nrefs = [0, 0]
+         for c, work, test in hkldata.centric_and_selections["ml"][i_bin]:
+             cidxes = numpy.concatenate([work, test])
+             S = hkldata.df["S"].to_numpy()[cidxes]
+             expip = numpy.exp(numpy.angle(DFc[cidxes])*1j)
+             Fo = hkldata.df.FP.to_numpy()[cidxes] / k_ani[cidxes]
+             SigFo = hkldata.df.SIGFP.to_numpy()[cidxes] / k_ani[cidxes]
+             epsilon = hkldata.df.epsilon.to_numpy()[cidxes]
+             nrefs[c] = numpy.sum(numpy.isfinite(Fo))
+             DFc_abs = numpy.abs(DFc[cidxes])
+             if c == 0:
+                 Sigma = 2 * SigFo**2 + epsilon * S
+                 X = 2 * Fo * DFc_abs / Sigma
+                 m = gemmi.bessel_i1_over_i0(X)
+             else:
+                 Sigma = SigFo**2 + epsilon * S
+                 X = Fo * DFc_abs / Sigma
+                 m = numpy.tanh(X)
+             hkldata.df.loc[cidxes, "FWT"] = (2 * m * Fo - DFc_abs) * expip
+             hkldata.df.loc[cidxes, "DELFWT"] = (m * Fo - DFc_abs) * expip
+             hkldata.df.loc[cidxes, "FOM"] = m
+             hkldata.df.loc[cidxes, "X"] = X
+             #hkldata.df.loc[cidxes, "LL"] = ext.ll_amp(hkldata.df.FP.to_numpy()[cidxes],
+             #                                          hkldata.df.SIGFP.to_numpy()[cidxes],
+             #                                          k_ani[cidxes], S * epsilon,
+             #                                          DFc_abs, numpy.full(cidxes.shape, c + 1),
+             #                                          hkldata.df.llweight.to_numpy()[cidxes])
+             if has_ano:
+                 Fo_dano = (hkldata.df["F(+)"].to_numpy()[cidxes] - hkldata.df["F(-)"].to_numpy()[cidxes]) / k_ani[cidxes]
+                 hkldata.df.loc[cidxes, "FAN"] = m * Fo_dano * expip / 2j
+             if has_ano and "FC''" in hkldata.df:
+                 # for centrics Fo_dano should be NaN; no need to mask fc_dano
+                 hkldata.df.loc[cidxes, "DELFAN"] = (m * Fo_dano - fc_dano[cidxes]) * expip / 2j
+             if nrefs[c] > 0: mean_fom[c] = numpy.nanmean(m)
+
+             # remove reflections that should be hidden
+             if use != "all":
+                 # usually use == "work"
+                 tohide = test if use == "work" else work
+                 hkldata.df.loc[tohide, "FWT"] = 0j * numpy.nan
+                 hkldata.df.loc[tohide, "DELFWT"] = 0j * numpy.nan
+             fill_sel = numpy.isnan(hkldata.df["FWT"][cidxes].to_numpy())
+             hkldata.df.loc[cidxes[fill_sel], "FWT"] = DFc[cidxes][fill_sel]
+
+         Fc = hkldata.df.FC.to_numpy()[idxes] * k_ani[idxes]
+         Fo = hkldata.df.FP.to_numpy()[idxes]
+         mean_DFc2 = numpy.nanmean(numpy.abs((Ds[idxes,:] * Fcs[idxes,:]).sum(axis=1) * k_ani[idxes])**2)
+         with numpy.errstate(divide="ignore"):
+             mean_log_DFcs = numpy.log(numpy.nanmean(numpy.abs(Ds[idxes,:] * Fcs[idxes,:] * k_ani[idxes,None]), axis=0)).tolist()
+         mean_Ds = numpy.nanmean(Ds[idxes,:], axis=0).tolist()
+         if sum(nrefs) > 0:
+             r = numpy.nansum(numpy.abs(numpy.abs(Fc)-Fo)) / numpy.nansum(Fo)
+             cc = utils.hkl.correlation(Fo, numpy.abs(Fc))
+             mean_Fo2 = numpy.nanmean(numpy.abs(Fo)**2)
+         else:
+             r, cc, mean_Fo2 = numpy.nan, numpy.nan, numpy.nan
+         stats_data.append([i_bin, nrefs[0], nrefs[1], bin_d_max, bin_d_min,
+                            numpy.log(mean_Fo2),
+                            numpy.log(numpy.nanmean(numpy.abs(Fc)**2)),
+                            numpy.log(mean_DFc2),
+                            numpy.log(numpy.mean(hkldata.df["S"].to_numpy()[idxes])),
+                            mean_fom[0], mean_fom[1], r, cc] + mean_Ds + mean_log_DFcs)
+
+     DFc_labs = ["log(Mn(|{}{}|))".format(dl,fl) for dl,fl in zip(D_labs, fc_labs)]
+     cols = ["bin", "n_a", "n_c", "d_max", "d_min",
+             "log(Mn(|Fo|^2))", "log(Mn(|Fc|^2))", "log(Mn(|DFc|^2))",
+             "log(Sigma)", "FOM_a", "FOM_c", "R", "CC(|Fo|,|Fc|)"] + D_labs + DFc_labs
+     stats = pandas.DataFrame(stats_data, columns=cols)
+     title_labs = [["log(Mn(|F|^2)) and variances", ["log(Mn(|Fo|^2))", "log(Mn(|Fc|^2))", "log(Mn(|DFc|^2))", "log(Sigma)"]],
+                   ["FOM", ["FOM_a", "FOM_c"]],
+                   ["D", D_labs],
+                   ["DFc", DFc_labs],
+                   ["R-factor", ["R"]],
+                   ["CC", ["CC(|Fo|,|Fc|)"]],
+                   ["number of reflections", ["n_a", "n_c"]]]
+     with open(log_out, "w") as ofs:
+         ofs.write(utils.make_loggraph_str(stats, main_title="Statistics",
+                                           title_labs=title_labs,
+                                           s2=1/stats["d_min"]**2))
+     logger.writeln("output log: {}".format(log_out))
+ # calculate_maps()
+
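+ # The figure-of-merit algebra used above, for reference: with the measurement
+ # error folded into the variance,
+ #   acentric: X = 2 Fo |DFc| / (2 SIGFP^2 + eps S), m = I1(X)/I0(X)
+ #   centric:  X =   Fo |DFc| / (  SIGFP^2 + eps S), m = tanh(X)
+ # which is the standard sigma-A weighting with inflated variances.
+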
+ def main(args):
+     if args.wavelength is not None and args.source != "xray":
+         raise SystemExit("Error: wavelength can only be specified for the X-ray source")
+     try:
+         hkldata, sts, fc_labs, free, args.use = process_input(
+             hklin=args.hklin,
+             labin=args.labin.split(",") if args.labin else None,
+             n_bins_ml=args.nbins_ml,
+             n_bins_stat=args.nbins,
+             free=args.free,
+             xyzins=sum(args.model, []),
+             d_max=args.d_max,
+             d_min=args.d_min,
+             use=args.use,
+             max_mlbins=30,
+             keep_charges=args.keep_charges,
+             space_group=args.spacegroup,
+             hklin_free=args.hklin_free,
+             labin_free=args.labin_free)
+     except RuntimeError as e:
+         raise SystemExit("Error: {}".format(e))
+
+     addends, addends2 = utils.model.check_atomsf(sts, args.source, mott_bethe=(args.source=="electron"), wavelength=args.wavelength)
+     for st in sts:
+         utils.model.find_special_positions(st, fix_occ=True, fix_pos=False, fix_adp=False)
+
+     if args.twin:
+         twin_data, _ = find_twin_domains_from_data(hkldata)
+     else:
+         twin_data = None
+     if twin_data:
+         twin_data.setup_f_calc(len(sts) + (0 if args.no_solvent else 1))
+
+     subtract_common_aniso_from_model(sts)
+     update_fc(sts, fc_labs, d_min=hkldata.d_min_max()[0], monlib=None,
+               source=args.source, mott_bethe=(args.source=="electron"),
+               hkldata=hkldata, twin_data=twin_data, addends=addends, addends2=addends2)
+     is_int = "I" in hkldata.df
+
+     if args.mask:
+         mask = utils.fileio.read_ccp4_map(args.mask)[0]
+     else:
+         mask = None
+
+     # Overall scaling & bulk solvent
+     # FP/SIGFP will be scaled. Total FC will be added.
+     if not args.no_solvent:
+         fc_labs.append("Fbulk")
+     lsq = bulk_solvent_and_lsq_scales(hkldata, sts, fc_labs, use_solvent=not args.no_solvent,
+                                       use_int=is_int, mask=mask, twin_data=twin_data)
+     b_aniso = lsq.b_aniso
+     # stats
+     stats, overall = calc_r_and_cc(hkldata, twin_data)
+     if is_int:
+         logger.writeln("R1 is calculated for reflections with I/sigma>2.")
+
+     if twin_data:
+         estimate_twin_fractions_from_model(twin_data, hkldata)
+         #del hkldata.df["FC"]
+         #del hkldata.df["Fbulk"]
+         # Need to redo scaling?
+         lsq = bulk_solvent_and_lsq_scales(hkldata, sts, fc_labs, use_solvent=not args.no_solvent,
+                                           use_int=is_int, mask=mask, twin_data=twin_data)
+         b_aniso = lsq.b_aniso
+         stats, overall = calc_r_and_cc(hkldata, twin_data)
+     for lab in ("R", "CC"):
+         logger.writeln(" ".join("{} = {:.4f}".format(x, overall[x]) for x in overall if x.startswith(lab)))
+     logger.writeln(stats.to_string() + "\n")
+
+     # Estimate ML parameters
+     D_labs = ["D{}".format(i) for i in range(len(fc_labs))]
+
+     if args.use_cc:
+         assert not is_int
+         assert not args.twin
+         logger.writeln("Estimating sigma-A parameters from CC..")
+         determine_mlf_params_from_cc(hkldata, fc_labs, D_labs, args.use)
+     else:
+         b_aniso = determine_ml_params(hkldata, is_int, fc_labs, D_labs, b_aniso, args.D_trans, args.S_trans, args.use,
+                                       twin_data=twin_data)
+         if twin_data and args.twin_mlalpha:
+             mlopt_twin_fractions(hkldata, twin_data, b_aniso)
+
+     use = {"all": "all", "work": "work", "test": "work"}[args.use]
+     if twin_data:
+         # replace hkldata
+         hkldata = calculate_maps_twin(hkldata, b_aniso, fc_labs, D_labs, twin_data, use)
+     elif is_int:
+         calculate_maps_int(hkldata, b_aniso, fc_labs, D_labs, use)
+     else:
+         log_out = "{}.log".format(args.output_prefix)
+         calculate_maps(hkldata, b_aniso, fc_labs, D_labs, log_out, use)
+
+     # Write mtz file
+     if twin_data:
+         labs = ["F_est", "F_exp"]
+     elif is_int:
+         labs = ["I", "SIGI", "F_est"]
+     else:
+         labs = ["FP", "SIGFP"]
+     labs.extend(["FOM", "FWT", "DELFWT", "FC", "DFC"])
+     if "FAN" in hkldata.df:
+         labs.append("FAN")
+     if "DELFAN" in hkldata.df:
+         labs.append("DELFAN")
+     if not args.no_solvent:
+         labs.append("Fbulk")
+     if "FREE" in hkldata.df:
+         labs.append("FREE")
+     if "F_true_est" in hkldata.df:
+         labs.append("F_true_est")
+     labs += D_labs + ["S"]
+     mtz_out = args.output_prefix+".mtz"
+     hkldata.write_mtz(mtz_out, labs=labs, types={"FOM": "W", "FP":"F", "SIGFP":"Q", "F_est": "F", "F_exp": "F"})
+     return hkldata
+ # main()
+
+ if __name__ == "__main__":
+     import sys
+     args = parse_args(sys.argv[1:])
+     main(args)
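+
+ # Example invocation (a sketch; option names as defined by this module's
+ # argument parser, paths are placeholders):
+ # python -m servalcat.xtal.sigmaa --hklin data.mtz --model model.pdb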