servalcat-0.4.88-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)
  1. servalcat/__init__.py +10 -0
  2. servalcat/__main__.py +120 -0
  3. servalcat/ext.cp313-win_amd64.pyd +0 -0
  4. servalcat/refine/__init__.py +0 -0
  5. servalcat/refine/cgsolve.py +100 -0
  6. servalcat/refine/refine.py +823 -0
  7. servalcat/refine/refine_geom.py +220 -0
  8. servalcat/refine/refine_spa.py +345 -0
  9. servalcat/refine/refine_xtal.py +268 -0
  10. servalcat/refine/spa.py +136 -0
  11. servalcat/refine/xtal.py +273 -0
  12. servalcat/refmac/__init__.py +0 -0
  13. servalcat/refmac/exte.py +182 -0
  14. servalcat/refmac/refmac_keywords.py +639 -0
  15. servalcat/refmac/refmac_wrapper.py +403 -0
  16. servalcat/spa/__init__.py +0 -0
  17. servalcat/spa/fofc.py +473 -0
  18. servalcat/spa/fsc.py +387 -0
  19. servalcat/spa/localcc.py +188 -0
  20. servalcat/spa/realspcc_from_var.py +128 -0
  21. servalcat/spa/run_refmac.py +972 -0
  22. servalcat/spa/shift_maps.py +293 -0
  23. servalcat/spa/shiftback.py +137 -0
  24. servalcat/spa/translate.py +129 -0
  25. servalcat/utils/__init__.py +35 -0
  26. servalcat/utils/commands.py +1397 -0
  27. servalcat/utils/fileio.py +737 -0
  28. servalcat/utils/generate_operators.py +296 -0
  29. servalcat/utils/hkl.py +712 -0
  30. servalcat/utils/logger.py +116 -0
  31. servalcat/utils/maps.py +345 -0
  32. servalcat/utils/model.py +782 -0
  33. servalcat/utils/refmac.py +760 -0
  34. servalcat/utils/restraints.py +782 -0
  35. servalcat/utils/symmetry.py +295 -0
  36. servalcat/xtal/__init__.py +0 -0
  37. servalcat/xtal/french_wilson.py +256 -0
  38. servalcat/xtal/run_refmac_small.py +240 -0
  39. servalcat/xtal/sigmaa.py +1622 -0
  40. servalcat/xtal/twin.py +115 -0
  41. servalcat-0.4.88.dist-info/METADATA +55 -0
  42. servalcat-0.4.88.dist-info/RECORD +45 -0
  43. servalcat-0.4.88.dist-info/WHEEL +5 -0
  44. servalcat-0.4.88.dist-info/entry_points.txt +4 -0
  45. servalcat-0.4.88.dist-info/licenses/LICENSE +373 -0
@@ -0,0 +1,1622 @@
+ """
+ Author: "Keitaro Yamashita, Garib N. Murshudov"
+ MRC Laboratory of Molecular Biology
+
+ This software is released under the
+ Mozilla Public License, version 2.0; see LICENSE.
+ """
+ from __future__ import absolute_import, division, print_function, generators
+ import argparse
+ import gemmi
+ import numpy
+ import pandas
+ import itertools
+ import time
+ import scipy.special
+ import scipy.optimize
+ from servalcat.utils import logger
+ from servalcat import utils
+ from servalcat import ext
+ from servalcat.xtal.twin import find_twin_domains_from_data, estimate_twin_fractions_from_model
+
+ """
+ DFc = sum_j D_j F_c,j
+ The last term, F_c,n, is the bulk-solvent contribution.
+ """
+
+ integr = ext.IntensityIntegrator()
+
+ def add_arguments(parser):
+     parser.description = 'Sigma-A parameter estimation for crystallographic data'
+     parser.add_argument('--hklin', required=True,
+                         help='Input MTZ file')
+     parser.add_argument('--labin',
+                         help='MTZ column for F,SIGF,FREE')
+     parser.add_argument('--free', type=int,
+                         help='flag number for test set')
+     parser.add_argument('--model', required=True, nargs="+", action="append",
+                         help='Input atomic model file(s)')
+     parser.add_argument("-d", '--d_min', type=float)
+     parser.add_argument('--d_max', type=float)
+     parser.add_argument('--nbins', type=int,
+                         help="Number of bins (default: auto)")
+     parser.add_argument('-s', '--source', choices=["electron", "xray", "neutron"], required=True,
+                         help="Scattering factor choice")
+     parser.add_argument('--D_trans', choices=["exp", "splus"],
+                         help="estimate D with positivity constraint")
+     parser.add_argument('--S_trans', choices=["exp", "splus"],
+                         help="estimate variance of unexplained signal with positivity constraint")
+     parser.add_argument('--no_solvent', action='store_true',
+                         help="Do not consider bulk solvent contribution")
+     parser.add_argument('--use_cc', action='store_true',
+                         help="Use CC(|F1|,|F2|) to CC(F1,F2) conversion to derive D and S")
+     parser.add_argument('--use', choices=["all", "work", "test"], default="all",
+                         help="Which reflections are used for the parameter estimation.")
+     parser.add_argument('--twin', action="store_true", help="Turn on twin refinement")
+     parser.add_argument('--mask',
+                         help="A solvent mask (by default calculated from the coordinates)")
+     parser.add_argument('--keep_charges', action='store_true',
+                         help="Use scattering factors for charged atoms. Use with care.")
+     parser.add_argument('-o', '--output_prefix', default="sigmaa",
+                         help='output file name prefix (default: %(default)s)')
+ # add_arguments()
+
+ def parse_args(arg_list):
+     parser = argparse.ArgumentParser()
+     add_arguments(parser)
+     return parser.parse_args(arg_list)
+ # parse_args()
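+ # Example invocation (illustrative; assumes this module is exposed as the
+ # "sigmaa" subcommand of servalcat, and uses hypothetical file names):
+ #   servalcat sigmaa --hklin data.mtz --labin FP,SIGFP,FREE --model model.pdb -s xray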
+
+ def nanaverage(cc, w):
+     sel = ~numpy.isnan(cc)
+     if numpy.sum(w[sel]) == 0:
+         return numpy.nan
+     return numpy.average(cc[sel], weights=w[sel])
+
+ def calc_r_and_cc(hkldata, centric_and_selections, twin_data=None):
+     has_int = "I" in hkldata.df
+     has_free = "FREE" in hkldata.df
+     stats = hkldata.binned_df.copy()
+     stats["n_obs"] = 0
+     if has_free:
+         stats[["n_work", "n_free"]] = 0
+     rlab = "R1" if has_int else "R"
+     cclab = "CCI" if has_int else "CCF"
+     if twin_data:
+         Fc = numpy.sqrt(twin_data.i_calc_twin())
+     else:
+         Fc = numpy.abs(hkldata.df.FC * hkldata.df.k_aniso)
+     if has_int:
+         obs = hkldata.df.I
+         obs_sqrt = numpy.sqrt(numpy.maximum(0, hkldata.df.I))
+         obs_sqrt[hkldata.df.I/hkldata.df.SIGI < 2] = numpy.nan # SHELX equivalent
+         calc = Fc**2
+         calc_sqrt = Fc
+     else:
+         obs = obs_sqrt = hkldata.df.FP
+         calc = calc_sqrt = Fc
+     if has_free:
+         for lab in (cclab, rlab):
+             for suf in ("work", "free"):
+                 stats[lab+suf] = numpy.nan
+     else:
+         stats[cclab] = numpy.nan
+         stats[rlab] = numpy.nan
+
+     for i_bin, idxes in hkldata.binned():
+         stats.loc[i_bin, "n_obs"] = numpy.sum(numpy.isfinite(obs[idxes]))
+         if has_free:
+             for j, suf in ((1, "work"), (2, "free")):
+                 idxes2 = numpy.concatenate([sel[j] for sel in centric_and_selections[i_bin]])
+                 stats.loc[i_bin, "n_"+suf] = numpy.sum(numpy.isfinite(obs[idxes2]))
+                 stats.loc[i_bin, cclab+suf] = utils.hkl.correlation(obs[idxes2], calc[idxes2])
+                 stats.loc[i_bin, rlab+suf] = utils.hkl.r_factor(obs_sqrt[idxes2], calc_sqrt[idxes2])
+         else:
+             stats.loc[i_bin, cclab] = utils.hkl.correlation(obs[idxes], calc[idxes])
+             stats.loc[i_bin, rlab] = utils.hkl.r_factor(obs_sqrt[idxes], calc_sqrt[idxes])
+
+     # Overall
+     ret = {}
+     if has_free:
+         for suf in ("work", "free"):
+             ret[cclab+suf+"avg"] = nanaverage(stats[cclab+suf], stats["n_"+suf])
+         for j, suf in ((1, "work"), (2, "free")):
+             idxes = numpy.concatenate([sel[j] for i_bin, _ in hkldata.binned() for sel in centric_and_selections[i_bin]])
+             ret[rlab+suf] = utils.hkl.r_factor(obs_sqrt[idxes], calc_sqrt[idxes])
+     else:
+         ret[cclab+"avg"] = nanaverage(stats[cclab], stats["n_obs"])
+         ret[rlab] = utils.hkl.r_factor(obs, calc)
+
+     return stats, ret
+ # calc_r_and_cc()
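+ # Note: with intensities, R1 is computed on sqrt(max(I, 0)) after discarding
+ # reflections with I/SIGI < 2 (the SHELX convention noted above), while the
+ # correlations (CCI/CCF) use all finite observations.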
+
+ class VarTrans:
+     def __init__(self, D_trans, S_trans):
+         # splus (softplus) appears to be better than exp;
+         # exp sometimes results in too large parameter values
+         trans_funcs = {"exp": (numpy.exp,   # D = f(x)
+                                numpy.exp,   # dD/dx
+                                numpy.log),  # x = f^-1(D)
+                        "splus": (lambda x: numpy.logaddexp(0, x),
+                                  scipy.special.expit, # lambda x: 1. / (1. + numpy.exp(-x))
+                                  lambda x: x + numpy.log(-numpy.expm1(-x))),
+                        None: (lambda x: x,
+                               lambda x: 1,
+                               lambda x: x)}
+
+         self.D, self.D_deriv, self.D_inv = trans_funcs[D_trans]
+         self.S, self.S_deriv, self.S_inv = trans_funcs[S_trans]
+ # class VarTrans
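+ # Illustrative check of the transforms: with "splus", D(x) = log(1 + exp(x)),
+ # its derivative is the logistic function expit(x), and D_inv(D(x)) == x, so
+ # the optimizer works in an unconstrained x while D (or S) stays positive.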
+
+ class LsqScale:
+     # parameter x = [k_overall, adp_pars, k_sol, B_sol]
+     def __init__(self, k_as_exp=False, func_type="log_cosh"):
+         assert func_type in ("sq", "log_cosh")
+         self.k_trans = lambda x: numpy.exp(x) if k_as_exp else x
+         self.k_trans_der = lambda x: numpy.exp(x) if k_as_exp else 1
+         self.k_trans_inv = lambda x: numpy.log(x) if k_as_exp else x
+         self.func_type = func_type
+         self.reset()
+
+     def reset(self):
+         self.k_sol = 0.35 # same default as gemmi/scaling.hpp # refmac seems to use 0.33 and 100? SCALE_LS_PART
+         self.b_sol = 46.
+         self.k_overall = None
+         self.b_iso = None
+         self.b_aniso = None
+         self.stats = {}
+
+     def set_data(self, hkldata, fc_list, use_int=False, sigma_cutoff=None, twin_data=None):
+         assert 0 < len(fc_list) < 3
+         self.use_int = use_int
+         if sigma_cutoff is not None:
+             if use_int:
+                 self.sel = hkldata.df.I / hkldata.df.SIGI > sigma_cutoff
+                 self.labcut = "(I/SIGI>{})".format(sigma_cutoff)
+             else:
+                 self.sel = hkldata.df.FP / hkldata.df.SIGFP > sigma_cutoff
+                 self.labcut = "(F/SIGF>{})".format(sigma_cutoff)
+         else:
+             self.sel = hkldata.df.index
+             self.labcut = ""
+         self.obs = hkldata.df["I" if use_int else "FP"].to_numpy(copy=True)
+         self.obs[~self.sel] = numpy.nan
+         self.calc = [x for x in fc_list]
+         self.s2mat = hkldata.ssq_mat()
+         self.s2 = 1. / hkldata.d_spacings().to_numpy()**2
+         self.adpdirs = utils.model.adp_constraints(hkldata.sg.operations(), hkldata.cell, tr0=False)
+         self.twin_data = twin_data
+         if use_int:
+             self.sqrt_obs = numpy.sqrt(self.obs)
+
+     def get_solvent_scale(self, k_sol, b_sol, s2=None):
+         if s2 is None: s2 = self.s2
+         return k_sol * numpy.exp(-b_sol * s2 / 4)
+
+     def fc_and_mask_grad(self, x):
+         fc0 = self.calc[0]
+         if len(self.calc) == 2:
+             if self.twin_data:
+                 r = self.twin_data.scaling_fc_and_mask_grad(self.calc[1], x[-2], x[-1])
+                 return r[:,0], r[:,1], r[:,2]
+             else:
+                 fmask = self.calc[1]
+                 temp_sol = numpy.exp(-x[-1] * self.s2 / 4)
+                 fbulk = x[-2] * temp_sol * fmask
+                 fc = fc0 + fbulk
+                 re_fmask_fcconj = (fmask * fc.conj()).real
+                 fc_abs = numpy.abs(fc)
+                 tmp = temp_sol / fc_abs * re_fmask_fcconj
+                 return fc_abs, tmp, -tmp * x[-2] * self.s2 / 4
+         else:
+             if self.twin_data:
+                 return numpy.sqrt(self.twin_data.i_calc_twin()), None, None
+             else:
+                 return numpy.abs(fc0), None, None
+
+     def scaled_fc(self, x):
+         fc = self.fc_and_mask_grad(x)[0]
+         nadp = self.adpdirs.shape[0]
+         B = numpy.dot(x[1:nadp+1], self.adpdirs)
+         kani = numpy.exp(numpy.dot(-B, self.s2mat))
+         return self.k_trans(x[0]) * kani * fc
+
+     def target(self, x):
+         y = self.scaled_fc(x)
+         if self.use_int:
+             diff = self.sqrt_obs - y
+             #y2 = y**2
+             #diff = self.obs - y2
+         else:
+             diff = self.obs - y
+
+         if self.func_type == "sq":
+             return numpy.nansum(diff**2)
+         elif self.func_type == "log_cosh":
+             return numpy.nansum(gemmi.log_cosh(diff))
+         else:
+             raise RuntimeError("bad func_type")
+
+     def grad(self, x):
+         g = numpy.zeros_like(x)
+         fc_abs, der_ksol, der_bsol = self.fc_and_mask_grad(x)
+         nadp = self.adpdirs.shape[0]
+         B = numpy.dot(x[1:nadp+1], self.adpdirs)
+         kani = numpy.exp(numpy.dot(-B, self.s2mat))
+         k = self.k_trans(x[0])
+         y = k * kani * fc_abs
+         if self.use_int:
+             diff = self.sqrt_obs - y
+             diff_der = -1
+             #diff = self.obs - y**2
+             #diff_der = -2 * y
+         else:
+             diff = self.obs - y
+             diff_der = -1
+         if self.func_type == "sq":
+             dfdy = 2 * diff * diff_der
+         elif self.func_type == "log_cosh":
+             dfdy = numpy.tanh(diff) * diff_der
+         else:
+             raise RuntimeError("bad func_type")
+
+         dfdb = numpy.nansum(-self.s2mat * k * fc_abs * kani * dfdy, axis=1)
+         g[0] = numpy.nansum(kani * fc_abs * dfdy * self.k_trans_der(x[0]))
+         g[1:nadp+1] = numpy.dot(dfdb, self.adpdirs.T)
+         if len(self.calc) == 2:
+             g[-2] = numpy.nansum(k * kani * der_ksol * dfdy)
+             g[-1] = numpy.nansum(k * kani * der_bsol * dfdy)
+
+         return g
+
+     def calc_shift(self, x):
+         # TODO: sort out code duplication, if we use this.
+         g = numpy.zeros((len(self.obs), len(x)))
+         H = numpy.zeros((len(x), len(x)))
+         fc_abs, der_ksol, der_bsol = self.fc_and_mask_grad(x)
+         nadp = self.adpdirs.shape[0]
+         B = numpy.dot(x[1:nadp+1], self.adpdirs)
+         kani = numpy.exp(numpy.dot(-B, self.s2mat))
+         k = self.k_trans(x[0])
+         y = k * kani * fc_abs
+         if self.use_int:
+             diff = self.sqrt_obs - y
+             diff_der = -1
+             diff_der2 = 0
+         else:
+             diff = self.obs - y
+             diff_der = -1.
+             diff_der2 = 0.
+
+         if self.func_type == "sq":
+             dfdy = 2 * diff * diff_der
+             dfdy2 = 2 * diff_der**2 + 2 * diff * diff_der2
+         elif self.func_type == "log_cosh":
+             dfdy = numpy.tanh(diff) * diff_der
+             #dfdy2 = 1 /numpy.cosh(diff)**2 * diff_der**2 + numpy.tanh(diff) * diff_der2 # problematic with large diff
+             #dfdy2 = numpy.where(diff==0, 1., numpy.abs(numpy.tanh(diff)) / gemmi.log_cosh(diff)) * diff_der**2 + numpy.tanh(diff) * diff_der2
+             dfdy2 = numpy.where(diff==0, 1., numpy.tanh(diff) / diff) * diff_der**2 + numpy.tanh(diff) * diff_der2
+         else:
+             raise RuntimeError("bad func_type")
+
+         dfdb = -self.s2mat * k * fc_abs * kani
+         g[:,0] = kani * fc_abs * self.k_trans_der(x[0])
+         g[:,1:nadp+1] = numpy.dot(dfdb.T, self.adpdirs.T)
+         if len(self.calc) == 2:
+             g[:,-2] = k * kani * der_ksol
+             g[:,-1] = k * kani * der_bsol
+
+         # no numpy.nandot..
+         g, dfdy, dfdy2 = g[self.sel, :], dfdy[self.sel], dfdy2[self.sel]
+         H = numpy.dot(g.T, g * dfdy2[:,None])
+         g = numpy.sum(dfdy[:,None] * g, axis=0)
+         dx = -numpy.dot(g, numpy.linalg.pinv(H))
+         return dx
+
+     def initial_kb(self):
+         fc_abs = self.fc_and_mask_grad([self.k_sol, self.b_sol])[0]
+         sel = self.obs > 0 # exclude nan as well
+         f1p, f2p, s2p = self.obs[sel], fc_abs[sel], self.s2[sel]
+         if self.use_int: f2p *= f2p
+         tmp = numpy.log(f2p) - numpy.log(f1p)
+         # g = [dT/dk, dT/db]
+         g = numpy.array([2 * numpy.sum(tmp), -numpy.sum(tmp*s2p)/2])
+         H = numpy.zeros((2,2))
+         H[0,0] = 2*len(f1p)
+         H[1,1] = numpy.sum(s2p**2/8)
+         H[0,1] = H[1,0] = -numpy.sum(s2p)/2
+         x = -numpy.dot(numpy.linalg.inv(H), g)
+         if self.use_int: x /= 2
+         k = numpy.exp(x[0])
+         b = x[1]
+         logger.writeln(" initial k,b = {:.2e} {:.2e}".format(k, b))
+         logger.writeln(" R{} = {:.4f}".format(self.labcut, utils.hkl.r_factor(f1p, f2p * k * numpy.exp(-b*self.s2[sel]/4))))
+         return k, b
+
+     def scale(self):
+         use_sol = len(self.calc) == 2
+         msg = "Scaling Fc to {} {} bulk solvent contribution".format("Io" if self.use_int else "Fo",
+                                                                      "with" if use_sol else "without")
+         logger.writeln(msg)
+         if self.k_overall is None or self.b_iso is None:
+             k, b = self.initial_kb()
+         else:
+             k, b = self.k_overall, self.b_iso
+         if self.b_aniso is None:
+             self.b_aniso = gemmi.SMat33d(b,b,b,0,0,0)
+         x0 = [self.k_trans_inv(k)]
+         bounds = [(0, None)]
+         x0.extend(numpy.dot(self.b_aniso.elements_pdb(), self.adpdirs.T))
+         bounds.extend([(None, None)]*(len(x0)-1))
+         if use_sol:
+             x0.extend([self.k_sol, self.b_sol])
+             bounds.extend([(1e-4, None), (10., 400.)])
+         if 0:
+             f0 = self.target(x0)
+             ader = self.grad(x0)
+             e = 1e-4
+             nder = []
+             for i in range(len(x0)):
+                 x = numpy.copy(x0)
+                 x[i] += e
+                 f1 = self.target(x)
+                 nder.append((f1 - f0) / e)
+             print("ADER NDER RATIO")
+             print(ader)
+             print(nder)
+             print(ader / nder)
+             quit()
+
+         t0 = time.time()
+         if 1:
+             x = x0
+             for i in range(40):
+                 x_ini = x.copy()
+                 f0 = self.target(x)
+                 dx = self.calc_shift(x)
+                 if numpy.max(numpy.abs(dx)) < 1e-6:
+                     break
+                 for s in (1, 0.5, 0.25):
+                     if 0:
+                         with open("debug.dat", "w") as ofs:
+                             for s in numpy.linspace(-2, 2, 100):
+                                 f1 = self.target(x+dx * s)
+                                 #print(dx, f0, f1, f0 - f1)
+                                 ofs.write("{:4e} {:4e}\n".format(s, f1))
+                     shift = dx * s
+                     x = x_ini + shift
+                     if x[0] < 0: x[0] = x0[0]
+                     if use_sol:
+                         if x[-1] < 10: x[-1] = 10
+                         elif x[-1] > 400: x[-1] = 400
+                         if x[-2] < 1e-4: x[-2] = 1e-4
+                     f1 = self.target(x)
+                     if f1 < f0: break
+                 #logger.writeln("cycle {} {} {} {} {} {}".format(i, f0, f1, s, shift, (f0 - f1) / f0))
+                 if 0 < (f0 - f1) / f0 < 1e-6:
+                     break
+             res_x = x
+             self.stats["fun"] = f1
+             self.stats["x"] = x
+         else:
+             res = scipy.optimize.minimize(fun=self.target, x0=x0, jac=self.grad, bounds=bounds)
+             #logger.writeln(str(res))
+             logger.writeln(" finished in {} iterations ({} evaluations)".format(res.nit, res.nfev))
+             res_x = res.x
+             self.stats["fun"] = res.fun
+             self.stats["x"] = res.x
+         logger.writeln(" time: {:.3f} sec".format(time.time() - t0))
+         self.k_overall = self.k_trans(res_x[0])
+         nadp = self.adpdirs.shape[0]
+         b_overall = gemmi.SMat33d(*numpy.dot(res_x[1:nadp+1], self.adpdirs))
+         self.b_iso = b_overall.trace() / 3
+         self.b_aniso = b_overall.added_kI(-self.b_iso) # subtract isotropic contribution
+
+         logger.writeln(" k_ov= {:.2e} B_iso= {:.2e} B_aniso= {}".format(self.k_overall, self.b_iso, self.b_aniso))
+         if use_sol:
+             self.k_sol = res_x[-2]
+             self.b_sol = res_x[-1]
+             logger.writeln(" k_sol= {:.2e} B_sol= {:.2e}".format(self.k_sol, self.b_sol))
+         calc = self.scaled_fc(res_x)
+         if self.use_int: calc *= calc
+         self.stats["cc"] = utils.hkl.correlation(self.obs, calc)
+         self.stats["r"] = utils.hkl.r_factor(self.obs, calc)
+         logger.writeln(" CC{} = {:.4f}".format(self.labcut, self.stats["cc"]))
+         logger.writeln(" R{} = {:.4f}".format(self.labcut, self.stats["r"]))
+ # class LsqScale
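+ # The model fitted by LsqScale is
+ #   Fo ~ k_overall * k_aniso(B_aniso; s) * |Fc + k_sol * exp(-B_sol s^2/4) * Fmask|
+ # (for intensities, sqrt(Io) is compared with the same amplitude-scale model),
+ # minimized with a squared or robust log-cosh residual; calc_shift implements
+ # a Gauss-Newton-style step with a pseudo-inverse normal matrix.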
+
+ def calc_abs_DFc(Ds, Fcs):
+     DFc = sum(Ds[i] * Fcs[i] for i in range(len(Ds)))
+     return numpy.abs(DFc)
+ # calc_abs_DFc()
+
+ #import line_profiler
+ #profile = line_profiler.LineProfiler()
+ #import atexit
+ #atexit.register(profile.print_stats)
+ #@profile
+ def mlf(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
+     DFc = (Ds * Fcs).sum(axis=1)
+     ll = numpy.nansum(ext.ll_amp(df.FP.to_numpy()[idxes], df.SIGFP.to_numpy()[idxes],
+                                  k_ani[idxes], S * df.epsilon.to_numpy()[idxes],
+                                  numpy.abs(DFc), df.centric.to_numpy()[idxes]+1))
+     return numpy.nansum(ll)
+ # mlf()
+
+ #@profile
+ def deriv_mlf_wrt_D_S(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = [df[lab].to_numpy()[idxes] for lab in fc_labs]
+     r = ext.ll_amp_der1_DS(df.FP.to_numpy()[idxes], df.SIGFP.to_numpy()[idxes], k_ani[idxes], S,
+                            numpy.vstack(Fcs).T, Ds,
+                            df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes])
+     g = numpy.zeros(len(fc_labs)+1)
+     g[:len(fc_labs)] = numpy.nansum(r[:,:len(fc_labs)], axis=0) # D
+     g[-1] = numpy.nansum(r[:,-1]) # S
+     return g
+ # deriv_mlf_wrt_D_S()
+
+ #@profile
+ def mlf_shift_S(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = [df[lab].to_numpy()[idxes] for lab in fc_labs]
+     r = ext.ll_amp_der1_DS(df.FP.to_numpy()[idxes], df.SIGFP.to_numpy()[idxes], k_ani[idxes], S,
+                            numpy.vstack(Fcs).T, Ds,
+                            df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes])
+     g = numpy.nansum(r[:,-1])
+     H = numpy.nansum(r[:,-1]**2) # approximating the expectation value of the second derivative
+     return -g / H
+ # mlf_shift_S()
+
+ def mli(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
+     DFc = (Ds * Fcs).sum(axis=1)
+     ll = integr.ll_int(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes],
+                        k_ani[idxes], S * df.epsilon.to_numpy()[idxes],
+                        numpy.abs(DFc), df.centric.to_numpy()[idxes]+1)
+     return numpy.nansum(ll)
+ # mli()
+
+ def deriv_mli_wrt_D_S(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
+     r = integr.ll_int_der1_DS(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes], k_ani[idxes], S,
+                               Fcs, Ds,
+                               df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes])
+     g = numpy.zeros(len(fc_labs)+1)
+     g[:len(fc_labs)] = numpy.nansum(r[:,:len(fc_labs)], axis=0) # D
+     g[-1] = numpy.nansum(r[:,-1]) # S
+     return g
+ # deriv_mli_wrt_D_S()
+
+ def mli_shift_D(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
+     r = integr.ll_int_der1_DS(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes], k_ani[idxes], S,
+                               Fcs, Ds,
+                               df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes])[:,:len(fc_labs)]
+     g = numpy.nansum(r, axis=0)# * trans.D_deriv(x[:len(fc_labs)]) # D
+     #tmp = numpy.hstack([r[:,:len(fc_labs)] #* trans.D_deriv(x[:len(fc_labs)]),
+     #                    r[:,-1,None] * trans.S_deriv(x[-1])])
+     H = numpy.nansum(numpy.matmul(r[:,:,None], r[:,None]), axis=0)
+     return -numpy.dot(g, numpy.linalg.pinv(H))
+ # mli_shift_D()
+
+ def mli_shift_S(df, fc_labs, Ds, S, k_ani, idxes):
+     Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
+     r = integr.ll_int_der1_DS(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes], k_ani[idxes], S,
+                               Fcs, Ds,
+                               df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes])
+     g = numpy.nansum(r[:,-1])
+     H = numpy.nansum(r[:,-1]**2) # approximating the expectation value of the second derivative
+     return -g / H
+ # mli_shift_S()
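+ # The *_shift_S / *_shift_D helpers above return a -g/H update in which H is
+ # the sum of squared first derivatives, i.e. a Fisher-scoring approximation
+ # to the expected second derivative (see the inline comments).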
+
+ def mltwin_est_ftrue(twin_data, df, k_ani, idxes):
+     kani2_inv = 1 / k_ani**2
+     i_sigi = numpy.empty((2, len(df.index)))
+     i_sigi[:] = numpy.nan
+     i_sigi[0, idxes] = (df.I.to_numpy() * kani2_inv)[idxes]
+     i_sigi[1, idxes] = (df.SIGI.to_numpy() * kani2_inv)[idxes]
+     twin_data.est_f_true(i_sigi[0,:], i_sigi[1,:])
+ # mltwin_est_ftrue()
+
+ def mltwin(df, twin_data, Ds, S, k_ani, idxes, i_bin):
+     twin_data.ml_sigma[i_bin] = S
+     twin_data.ml_scale[i_bin, :] = Ds
+     mltwin_est_ftrue(twin_data, df, k_ani, idxes)
+     return twin_data.ll()
+ # mltwin()
+
+ def deriv_mltwin_wrt_D_S(df, twin_data, Ds, S, k_ani, idxes, i_bin):
+     twin_data.ml_sigma[i_bin] = S
+     twin_data.ml_scale[i_bin, :] = Ds
+     mltwin_est_ftrue(twin_data, df, k_ani, idxes)
+     r = twin_data.ll_der_D_S()
+     g = numpy.zeros(r.shape[1])
+     g[:-1] = numpy.nansum(r[:,:-1], axis=0) # D
+     g[-1] = numpy.nansum(r[:,-1]) # S
+     return g
+ # deriv_mltwin_wrt_D_S()
+
+ def mltwin_shift_S(df, twin_data, Ds, S, k_ani, idxes, i_bin):
+     twin_data.ml_sigma[i_bin] = S
+     twin_data.ml_scale[i_bin, :] = Ds
+     mltwin_est_ftrue(twin_data, df, k_ani, idxes)
+     r = twin_data.ll_der_D_S()
+     g = numpy.nansum(r[:,-1])
+     H = numpy.nansum(r[:,-1]**2) # approximating the expectation value of the second derivative
+     return -g / H
+ # mltwin_shift_S()
+
+ def determine_mlf_params_from_cc(hkldata, fc_labs, D_labs, centric_and_selections, use="all", smoothing="gauss"):
+     # theoretical values
+     cc_a = lambda cc: (numpy.pi/4*(1-cc**2)**2 * scipy.special.hyp2f1(3/2, 3/2, 1, cc**2) - numpy.pi/4) / (1-numpy.pi/4)
+     cc_c = lambda cc: 2/(numpy.pi-2) * (cc**2*numpy.sqrt(1-cc**2) + cc * numpy.arctan(cc/numpy.sqrt(1-cc**2)) + (1-cc**2)**(3/2)-1)
+     table_fsc = numpy.arange(0, 1, 1e-3)
+     table_cc = [cc_a(table_fsc), cc_c(table_fsc)]
+
+     for lab in D_labs: hkldata.binned_df[lab] = 1.
+     hkldata.binned_df["S"] = 1.
+
+     stats = hkldata.binned_df[["d_max", "d_min"]].copy()
+     for i, labi in enumerate(fc_labs):
+         stats["CC(FP,{})".format(labi)] = numpy.nan
+     for i, labi in enumerate(fc_labs):
+         for j in range(i+1, len(fc_labs)):
+             labj = fc_labs[j]
+             stats["CC({},{})".format(labi, labj)] = numpy.nan
+
+     # sqrt of eps * c; c = 1 for acentrics and 2 for centrics
+     inv_sqrt_c_eps = 1. / numpy.sqrt(hkldata.df.epsilon.to_numpy() * (hkldata.df.centric.to_numpy() + 1))
+     for i_bin, _ in hkldata.binned():
+         # assume they are all acentrics.. only correct by c
+         if use == "all":
+             cidxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin] for i in (1,2)])
+         else:
+             i = 1 if use == "work" else 2
+             cidxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin]])
+         valid_sel = numpy.isfinite(hkldata.df.FP.to_numpy()[cidxes])
+         cidxes = cidxes[valid_sel]
+         factor = inv_sqrt_c_eps[cidxes]
+         k_ani = hkldata.df.k_aniso.to_numpy()[cidxes]
+         Fo = hkldata.df.FP.to_numpy()[cidxes] * factor / k_ani
+         mean_Fo2 = numpy.mean(Fo**2)
+         SigFo = hkldata.df.SIGFP.to_numpy()[cidxes] / k_ani
+         Fcs = [hkldata.df[lab].to_numpy()[cidxes] * factor for lab in fc_labs]
+         mean_Fk2 = numpy.array([numpy.mean(numpy.abs(fk)**2) for fk in Fcs])
+
+         # estimate D
+         cc_fo_fj = [numpy.corrcoef(numpy.abs(fj), Fo)[1,0] for fj in Fcs]
+         for i in range(len(fc_labs)): stats.loc[i_bin, "CC(FP,{})".format(fc_labs[i])] = cc_fo_fj[i]
+         mat = [[numpy.sqrt(numpy.mean(numpy.abs(fk)**2)/mean_Fo2) * numpy.real(numpy.corrcoef(fk, fj)[1,0])
+                 for fk in Fcs]
+                for fj in Fcs]
+         A = [[numpy.sqrt(numpy.mean(numpy.abs(fk)**2) * numpy.mean(numpy.abs(fj)**2))/mean_Fo2 * numpy.real(numpy.corrcoef(fk, fj)[1,0])
+               for fk in Fcs]
+              for fj in Fcs]
+         A = numpy.array([[numpy.real(numpy.corrcoef(fk, fj)[1,0]) for fk in Fcs] for fj in Fcs])
+         v = numpy.interp(cc_fo_fj, table_cc[0], table_fsc)
+
+         for i in range(len(fc_labs)):
+             labi = fc_labs[i]
+             for j in range(i+1, len(fc_labs)):
+                 labj = fc_labs[j]
+                 stats.loc[i_bin, "CC({},{})".format(labi, labj)] = numpy.real(numpy.corrcoef(Fcs[i], Fcs[j])[1,0])
+
+         # test all signs, fixing the first Fc positive.
+         cc_max = -2
+         for v_test in itertools.product(*((x, -x) for x in v[1:])):
+             v_test = numpy.array((v[0],)+v_test)
+             Dj_test = numpy.dot(numpy.linalg.pinv(A), v_test) * numpy.sqrt(mean_Fo2 / mean_Fk2)
+             DFc_test = calc_abs_DFc(Dj_test, Fcs)
+             cc_test = numpy.corrcoef(Fo, numpy.abs(DFc_test))[1,0]
+             if cc_test > cc_max:
+                 cc_max = cc_test
+                 v_max = v_test
+                 DFc = DFc_test
+                 Dj = Dj_test
+
+         for lab, D in zip(D_labs, Dj):
+             hkldata.binned_df.loc[i_bin, lab] = D
+
+         # estimate S
+         mean_DFc2 = numpy.mean(DFc**2)
+         est_fsc_fo_fc = numpy.interp(numpy.corrcoef(Fo, DFc)[1,0], table_cc[0], table_fsc)
+         S = mean_Fo2 - 2 * numpy.sqrt(mean_Fo2 * mean_DFc2) * est_fsc_fo_fc + mean_DFc2 - numpy.mean(SigFo**2)
+         hkldata.binned_df.loc[i_bin, "S"] = S
+
+     logger.writeln("\nCC:")
+     logger.writeln(stats.to_string())
+     logger.writeln("\nEstimates:")
+     logger.writeln(hkldata.binned_df.to_string())
+     smooth_params(hkldata, D_labs, smoothing)
+ # determine_mlf_params_from_cc()
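+ # cc_a/cc_c above are the theoretical relations between CC(|F1|,|F2|) and
+ # CC(F1,F2) for acentric/centric reflections; the tabulated curve is inverted
+ # by interpolation (numpy.interp) so D and S can be estimated from amplitude
+ # correlations without likelihood optimization.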
+
+ def initialize_ml_params(hkldata, use_int, D_labs, b_aniso, centric_and_selections, use, twin_data=None):
+     # Initial values
+     for lab in D_labs: hkldata.binned_df[lab] = 1.
+     hkldata.binned_df["S"] = 10000.
+     k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
+     lab_obs = "I" if use_int else "FP"
+     for i_bin, _ in hkldata.binned():
+         if use == "all":
+             idxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin] for i in (1,2)])
+         else:
+             i = 1 if use == "work" else 2
+             idxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin]])
+         valid_sel = numpy.isfinite(hkldata.df.loc[idxes, lab_obs]) # as there is no nan-safe numpy.corrcoef
+         if numpy.sum(valid_sel) < 2:
+             continue
+         idxes = idxes[valid_sel]
+         if use_int:
+             Io = hkldata.df.I.to_numpy()[idxes]
+         else:
+             Io = hkldata.df.FP.to_numpy()[idxes]**2
+         Io /= k_ani[idxes]**2
+         if twin_data:
+             Ic = twin_data.i_calc_twin()[idxes]
+         else:
+             Ic = numpy.abs(hkldata.df.FC.to_numpy()[idxes])**2
+         mean_Io = numpy.mean(Io)
+         mean_Ic = numpy.mean(Ic)
+         cc = numpy.corrcoef(Io, Ic)[1,0]
+         if cc > 0 and mean_Io > 0:
+             D = numpy.sqrt(mean_Io / mean_Ic * cc)
+         else:
+             D = 0 # will be taken care of later
+         hkldata.binned_df.loc[i_bin, D_labs[0]] = D
+         if mean_Io > 0:
+             S = mean_Io - 2 * numpy.sqrt(mean_Io * mean_Ic * numpy.maximum(0, cc)) + mean_Ic
+         else:
+             S = numpy.std(Io) # similar initial value to french_wilson
+         hkldata.binned_df.loc[i_bin, "S"] = S
+
+     for D_lab in D_labs:
+         if hkldata.binned_df[D_lab].min() <= 0:
+             min_D = hkldata.binned_df[D_lab][hkldata.binned_df[D_lab] > 0].min() * 0.1
+             logger.writeln("WARNING: non-positive {} detected in the initial estimates. Replacing it with the minimum positive value {:.2e}".format(D_lab, min_D))
+             hkldata.binned_df[D_lab].where(hkldata.binned_df[D_lab] > 0, min_D, inplace=True) # arbitrary
+
+     if twin_data:
+         twin_data.ml_scale[:] = hkldata.binned_df.loc[:, D_labs]
+         twin_data.ml_sigma[:] = hkldata.binned_df.loc[:, "S"]
+
+     logger.writeln("Initial estimates:")
+     logger.writeln(hkldata.binned_df.to_string())
+ # initialize_ml_params()
+
+ def determine_ml_params(hkldata, use_int, fc_labs, D_labs, b_aniso, centric_and_selections,
+                         D_trans=None, S_trans=None, use="all", n_cycle=1, smoothing="gauss",
+                         twin_data=None):
+     assert use in ("all", "work", "test")
+     assert smoothing in (None, "gauss")
+     logger.writeln("Estimating sigma-A parameters using {}..".format(
+         ("intensities" if use_int else "amplitudes") + (" (twin)" if twin_data else "")))
+     trans = VarTrans(D_trans, S_trans)
+     lab_obs = "I" if use_int else "FP"
+     def get_idxes(i_bin):
+         if use == "all":
+             return numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin] for i in (1,2)])
+         else:
+             i = 1 if use == "work" else 2
+             return numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin]])
+
+     if not set(D_labs + ["S"]).issubset(hkldata.binned_df):
+         initialize_ml_params(hkldata, use_int, D_labs, b_aniso, centric_and_selections, use, twin_data=twin_data)
+     for dlab, fclab in zip(D_labs, fc_labs):
+         hkldata.binned_df["Mn(|{}*{}|)".format(dlab, fclab)] = numpy.nan
+
+     refpar = "all"
+     for i_cyc in range(n_cycle):
+         t0 = time.time()
+         nfev_total = 0
+         k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
+         for i_bin, _ in hkldata.binned():
+             idxes = get_idxes(i_bin)
+             valid_sel = numpy.isfinite(hkldata.df.loc[idxes, lab_obs])
+             if numpy.sum(valid_sel) < 5:
+                 logger.writeln("WARNING: bin {} does not have sufficient reflections".format(i_bin))
+                 continue
+
+             def target(x):
+                 if refpar == "all":
+                     Ds = trans.D(x[:len(fc_labs)])
+                     S = trans.S(x[-1])
+                 elif refpar == "D":
+                     Ds = trans.D(x[:len(fc_labs)])
+                     S = hkldata.binned_df.loc[i_bin, "S"]
+                 else:
+                     Ds = [hkldata.binned_df.loc[i_bin, lab] for lab in D_labs]
+                     S = trans.S(x[-1])
+
+                 if twin_data:
+                     return mltwin(hkldata.df, twin_data, Ds, S, k_ani, idxes, i_bin)
+                 else:
+                     f = mli if use_int else mlf
+                     return f(hkldata.df, fc_labs, Ds, S, k_ani, idxes)
+
+             def grad(x):
+                 if refpar == "all":
+                     Ds = trans.D(x[:len(fc_labs)])
+                     S = trans.S(x[-1])
+                     n_par = len(fc_labs)+1
+                 elif refpar == "D":
+                     Ds = trans.D(x[:len(fc_labs)])
+                     S = hkldata.binned_df.loc[i_bin, "S"]
+                     n_par = len(fc_labs)
+                 else:
+                     Ds = [hkldata.binned_df.loc[i_bin, lab] for lab in D_labs]
+                     S = trans.S(x[-1])
+                     n_par = 1
+                 if twin_data:
+                     r = deriv_mltwin_wrt_D_S(hkldata.df, twin_data, Ds, S, k_ani, idxes, i_bin)
+                 else:
+                     calc_deriv = deriv_mli_wrt_D_S if use_int else deriv_mlf_wrt_D_S
+                     r = calc_deriv(hkldata.df, fc_labs, Ds, S, k_ani, idxes)
+                 g = numpy.zeros(n_par)
+                 if refpar in ("all", "D"):
+                     g[:len(fc_labs)] = r[:len(fc_labs)]
+                     g[:len(fc_labs)] *= trans.D_deriv(x[:len(fc_labs)])
+                 if refpar in ("all", "S"):
+                     g[-1] = r[-1]
+                     g[-1] *= trans.S_deriv(x[-1])
+                 return g
+
+             if 0:
+                 refpar = "S"
+                 x0 = trans.S_inv(hkldata.binned_df.loc[i_bin, "S"])
+                 with open("s_line_{}.dat".format(i_bin), "w") as ofs:
+                     for sval in numpy.linspace(1, x0*2, 100):
+                         ofs.write("{:.4e} {:.10e} {:.10e}\n".format(sval,
+                                                                     target([sval]),
+                                                                     grad([sval])[0]))
+                 continue
+             #print("Bin", i_bin)
+             if 1: # refine D and S iteratively
+                 vals_last = None
+                 for ids in range(10):
+                     refpar = "D"
+                     x0 = numpy.array([trans.D_inv(hkldata.binned_df.loc[i_bin, lab]) for lab in D_labs])
+                     #print("MLTWIN=", target(x0))
+                     #quit()
+                     if 0:
+                         h = 1e-3
+                         f00 = target(x0)
+                         g00 = grad(x0)
+                         for ii in range(len(x0)):
+                             xx = x0.copy()
+                             xx[ii] += h
+                             f01 = target(xx)
+                             nder = (f01 - f00) / h
+                             logger.writeln(f"DEBUG_der_D bin_{i_bin} {ii} ad={g00[ii]} nd={nder} r={g00[ii]/nder}")
+                     vals_now = []
+                     if 0:
+                         f0 = target(x0)
+                         nfev_total += 1
+                         shift = mli_shift_D(hkldata.df, fc_labs, trans.D(x0), hkldata.binned_df.loc[i_bin, "S"], k_ani, idxes)
+                         shift /= trans.D_deriv(x0)
+                         #if abs(shift) < 1e-3: break
+                         for itry in range(10):
+                             x1 = x0 + shift
+                             if (D_trans and any(x1 < -3)) or (not D_trans and any(x1 < 5e-2)):
+                                 #print(i_bin, cyc_s, trans.S(x0), trans.S(x1), shift, "BAD")
+                                 shift /= 2
+                                 continue
+                             f1 = target(x1)
+                             nfev_total += 1
+                             if f1 > f0:
+                                 shift /= 2
+                                 continue
+                             else: # good
+                                 for i, lab in enumerate(D_labs):
+                                     hkldata.binned_df.loc[i_bin, lab] = trans.D(x1[i])
+                                     vals_now.append(hkldata.binned_df.loc[i_bin, lab])
+                                 break
+                         else:
+                             break
+                     else:
+                         #print(mli_shift_D(hkldata.df, fc_labs, trans.D(x0), hkldata.binned_df.S[i_bin], k_ani, idxes))
+                         res = scipy.optimize.minimize(fun=target, x0=x0, jac=grad,
+                                                       bounds=((-5 if D_trans else 1e-5, None),)*len(x0))
+                         nfev_total += res.nfev
+                         #print(i_bin, "mini cycle", ids, refpar)
+                         #print(res)
+                         for i, lab in enumerate(D_labs):
+                             hkldata.binned_df.loc[i_bin, lab] = trans.D(res.x[i])
+                             vals_now.append(hkldata.binned_df.loc[i_bin, lab])
+                         if twin_data:
+                             twin_data.ml_scale[i_bin, :] = trans.D(res.x)
+                     refpar = "S"
+                     if 1:
+                         for cyc_s in range(1):
+                             x0 = trans.S_inv(hkldata.binned_df.loc[i_bin, "S"])
+                             if 0:
+                                 h = 1e-1
+                                 f00 = target([x0])
+                                 g00 = grad([x0])
+                                 xx = x0 + h
+                                 f01 = target([xx])
+                                 nder = (f01 - f00) / h
+                                 logger.writeln(f"DEBUG_der_S bin_{i_bin} ad={g00} nd={nder} r={g00/nder}")
+
+                             f0 = target([x0])
+                             Ds = [hkldata.binned_df.loc[i_bin, lab] for lab in D_labs]
+                             nfev_total += 1
+                             if twin_data:
+                                 shift = mltwin_shift_S(hkldata.df, twin_data, Ds, trans.S(x0), k_ani, idxes, i_bin)
+                             else:
+                                 calc_shift_S = mli_shift_S if use_int else mlf_shift_S
+                                 shift = calc_shift_S(hkldata.df, fc_labs, Ds, trans.S(x0), k_ani, idxes)
+                             shift /= trans.S_deriv(x0)
+                             if abs(shift) < 1e-3: break
+                             for itry in range(10):
+                                 x1 = x0 + shift
+                                 if (S_trans and x1 < -3) or (not S_trans and x1 < 5e-2):
+                                     #print(i_bin, cyc_s, trans.S(x0), trans.S(x1), shift, "BAD")
+                                     shift /= 2
+                                     continue
+                                 f1 = target([x1])
+                                 nfev_total += 1
+                                 if f1 > f0:
+                                     shift /= 2
+                                     continue
+                                 else: # good
+                                     #print(i_bin, cyc_s, trans.S(x0), trans.S(x1), shift)
+                                     hkldata.binned_df.loc[i_bin, "S"] = trans.S(x1)
+                                     break
+                             else:
+                                 #print("all bad")
+                                 break
+                         if twin_data:
+                             twin_data.ml_sigma[i_bin] = hkldata.binned_df.loc[i_bin, "S"]
+                     else:
+                         # somehow this does not work well.
+                         x0 = [trans.S_inv(hkldata.binned_df.loc[i_bin, "S"])]
+                         res = scipy.optimize.minimize(fun=target, x0=x0, jac=grad,
+                                                       bounds=((-3 if S_trans else 5e-2, None),))
+                         nfev_total += res.nfev
+                         #print(i_bin, "mini cycle", ids, refpar)
+                         #print(res)
+                         hkldata.binned_df.loc[i_bin, "S"] = trans.S(res.x[-1])
+                         if twin_data:
+                             twin_data.ml_sigma[i_bin] = trans.S(res.x[-1])
+                     vals_now.append(hkldata.binned_df.loc[i_bin, "S"])
+                     vals_now = numpy.array(vals_now)
+                     if vals_last is not None and numpy.all(numpy.abs((vals_last - vals_now) / vals_now) < 1e-2):
+                         #logger.writeln("converged in mini cycle {}".format(ids+1))
+                         break
+                     vals_last = vals_now
+             else:
+                 x0 = [trans.D_inv(hkldata.binned_df.loc[i_bin, lab]) for lab in D_labs] + [trans.S_inv(hkldata.binned_df.loc[i_bin, "S"])]
+                 res = scipy.optimize.minimize(fun=target, x0=x0, jac=grad,
+                                               bounds=((-5 if D_trans else 1e-5, None), )*len(D_labs) + ((-3 if S_trans else 5e-2, None),))
+                 nfev_total += res.nfev
+                 #print(i_bin)
+                 #print(res)
+                 for i, lab in enumerate(D_labs):
+                     hkldata.binned_df.loc[i_bin, lab] = trans.D(res.x[i])
+                 hkldata.binned_df.loc[i_bin, "S"] = trans.S(res.x[-1])
+                 if twin_data:
+                     twin_data.ml_scale[i_bin, :] = trans.D(res.x[:-1])
+                     twin_data.ml_sigma[i_bin] = trans.S(res.x[-1])
+
+         if twin_data:
+             dfc = numpy.abs(twin_data.f_calc) * twin_data.ml_scale_array()
+             for i_bin, idxes in hkldata.binned():
+                 dfc_bin = dfc[numpy.asarray(twin_data.bin)==i_bin,:]
+                 mean_dfc = numpy.nanmean(dfc_bin, axis=0)
+                 for i, (dlab, fclab) in enumerate(zip(D_labs, fc_labs)):
+                     hkldata.binned_df.loc[i_bin, "Mn(|{}*{}|)".format(dlab, fclab)] = mean_dfc[i]
+         else:
+             for i_bin, idxes in hkldata.binned():
+                 for dlab, fclab in zip(D_labs, fc_labs):
+                     mean_dfc = numpy.nanmean(numpy.abs(hkldata.binned_df[dlab][i_bin] * hkldata.df[fclab][idxes]))
+                     hkldata.binned_df.loc[i_bin, "Mn(|{}*{}|)".format(dlab, fclab)] = mean_dfc
+
+         logger.writeln("Refined estimates:")
+         logger.writeln(hkldata.binned_df.to_string())
+         #numpy.testing.assert_allclose(hkldata.binned_df.S, twin_data.ml_sigma)
+         #numpy.testing.assert_allclose(hkldata.binned_df[D_labs], twin_data.ml_scale)
+         logger.writeln("time: {:.1f} sec ({} evaluations)".format(time.time() - t0, nfev_total))
+
+         if not use_int or twin_data:
+             break # MLF B_aniso optimization is not implemented
+
+         # Refine b_aniso
+         adpdirs = utils.model.adp_constraints(hkldata.sg.operations(), hkldata.cell, tr0=True)
+         SMattolist = lambda B: [B.u11, B.u22, B.u33, B.u12, B.u13, B.u23]
+
+         def target_ani(x):
+             b = gemmi.SMat33d(*numpy.dot(x, adpdirs))
+             k_ani = hkldata.debye_waller_factors(b_cart=b)
+             ret = 0.
+             for i_bin, idxes in hkldata.binned():
+                 Ds = [hkldata.binned_df.loc[i_bin, lab] for lab in D_labs]
+                 ret += mli(hkldata.df, fc_labs, Ds, hkldata.binned_df.loc[i_bin, "S"], k_ani, idxes)
+             return ret
+         def grad_ani(x):
+             b = gemmi.SMat33d(*numpy.dot(x, adpdirs))
+             k_ani = hkldata.debye_waller_factors(b_cart=b)
+             S2mat = hkldata.ssq_mat() # ssqmat
+             g = numpy.zeros(6)
+             for i_bin, idxes in hkldata.binned():
+                 r = integr.ll_int_der1_ani(hkldata.df.I.to_numpy()[idxes], hkldata.df.SIGI.to_numpy()[idxes],
+                                            k_ani[idxes], hkldata.binned_df.loc[i_bin, "S"],
+                                            hkldata.df[fc_labs].to_numpy()[idxes], hkldata.binned_df.loc[i_bin, D_labs],
+                                            hkldata.df.centric.to_numpy()[idxes]+1, hkldata.df.epsilon.to_numpy()[idxes])
+                 S2 = S2mat[:,idxes]
+                 g += -numpy.nansum(S2 * r[:,0], axis=1) # k_ani is already multiplied in r
+             return numpy.dot(g, adpdirs.T)
+         def shift_ani(x):
+             b = gemmi.SMat33d(*numpy.dot(x, adpdirs))
+             k_ani = hkldata.debye_waller_factors(b_cart=b)
+             S2mat = hkldata.ssq_mat() # ssqmat
+             g = numpy.zeros(6)
+             H = numpy.zeros((6, 6))
+             for i_bin, idxes in hkldata.binned():
+                 r = integr.ll_int_der1_ani(hkldata.df.I.to_numpy()[idxes], hkldata.df.SIGI.to_numpy()[idxes],
+                                            k_ani[idxes], hkldata.binned_df.loc[i_bin, "S"],
+                                            hkldata.df[fc_labs].to_numpy()[idxes], list(hkldata.binned_df.loc[i_bin, D_labs]),
+                                            hkldata.df.centric.to_numpy()[idxes]+1, hkldata.df.epsilon.to_numpy()[idxes])
+                 S2 = S2mat[:,idxes]
+                 g += -numpy.nansum(S2 * r[:,0], axis=1) # k_ani is already multiplied in r
+                 H += numpy.nansum(numpy.matmul(S2[None,:].T, S2.T[:,None]) * (r[:,0]**2)[:,None,None], axis=0)
+
+             g, H = numpy.dot(g, adpdirs.T), numpy.dot(adpdirs, numpy.dot(H, adpdirs.T))
+             return -numpy.dot(g, numpy.linalg.pinv(H))
+
+         logger.writeln("Refining B_aniso. Current = {}".format(b_aniso))
+         if 0:
+             x0 = numpy.dot(SMattolist(b_aniso), numpy.linalg.pinv(adpdirs))
+             res = scipy.optimize.minimize(fun=target_ani, x0=x0, jac=grad_ani)
+             print(res)
+             b_aniso = gemmi.SMat33d(*numpy.dot(res.x, adpdirs))
+             f1 = res.fun
+         else:
+             B_converged = False
+             for j in range(10):
+                 x = numpy.dot(SMattolist(b_aniso), numpy.linalg.pinv(adpdirs))
+                 f0 = target_ani(x)
+                 shift = shift_ani(x)
+                 for i in range(3):
+                     ss = shift / 2**i
+                     f1 = target_ani(x + ss)
+                     #logger.writeln("{:2d} f0 = {:.3e} shift = {} df = {:.3e}".format(j, f0, ss, f1 - f0))
+                     if f1 < f0:
+                         b_aniso = gemmi.SMat33d(*numpy.dot(x+ss, adpdirs))
+                         if numpy.max(numpy.abs(ss)) < 1e-4: B_converged = True
+                         break
+                 else:
+                     B_converged = True
+                 if B_converged: break
+
+         logger.writeln("Refined B_aniso = {}".format(b_aniso))
+         logger.writeln("cycle {} f= {}".format(i_cyc, f1))
+
+     smooth_params(hkldata, D_labs, smoothing)
+     return b_aniso
+ # determine_ml_params()
+
+ def smooth_params(hkldata, D_labs, smoothing): # XXX twin_data
+     if smoothing is None or len(hkldata.binned()) < 2:
+         for i, lab in enumerate(D_labs + ["S"]):
+             hkldata.df[lab] = hkldata.binned_data_as_array(lab)
+
+     elif smoothing == "gauss":
+         bin_centers = (0.5 / hkldata.binned_df[["d_min", "d_max"]]**2).sum(axis=1).to_numpy()
+         vals = ext.smooth_gauss(bin_centers,
+                                 hkldata.binned_df[D_labs + ["S"]].to_numpy(),
+                                 1./hkldata.df.d.to_numpy()**2,
+                                 100, # min(n_ref?)
+                                 (bin_centers[1] - bin_centers[0]))
+         for i, lab in enumerate(D_labs + ["S"]):
+             hkldata.df[lab] = vals[:, i]
+         # Update the smoothed averages; this affects the next refinement.
+         # TODO: update Mn(|Dj*FCj|) as well.
+         #for i_bin, idxes in hkldata.binned():
+         #    for lab in D_labs + ["S"]:
+         #        hkldata.binned_df.loc[i_bin, lab] = numpy.mean(hkldata.df[lab].to_numpy()[idxes])
+     else:
+         raise RuntimeError("unknown smoothing method: {}".format(smoothing))
+ # smooth_params()
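+ # Smoothing operates on 1/d^2: the bin-wise D and S values located at bin
+ # centers are spread to per-reflection values with a Gaussian kernel
+ # (ext.smooth_gauss), so downstream map calculations see smoothly varying
+ # parameters instead of step functions at bin boundaries.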
+
+ def expected_F_from_int(Io, sigIo, k_ani, DFc, eps, c, S):
+     if c == 0: # acentric
+         k_num, k_den = 0.5, 0.
+     else:
+         k_num, k_den = 0., -0.5
+     to = Io / sigIo - sigIo / (c+1) / k_ani**2 / S / eps
+     tf = k_ani * numpy.abs(DFc) / numpy.sqrt(sigIo)
+     sig1 = k_ani**2 * S * eps / sigIo
+     f = ext.integ_J_ratio(k_num, k_den, True, to, tf, sig1, c+1, integr.exp2_threshold, integr.h, integr.N, integr.ewmax)
+     f *= numpy.sqrt(sigIo) / k_ani
+     m_proxy = ext.integ_J_ratio(k_num, k_num, True, to, tf, sig1, c+1, integr.exp2_threshold, integr.h, integr.N, integr.ewmax)
+     return f, m_proxy
+ # expected_F_from_int()
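+ # expected_F_from_int evaluates the expected amplitude <F> under the
+ # intensity-based likelihood (the moments are computed numerically via
+ # ext.integ_J_ratio), together with a figure-of-merit proxy |<F>|/<|F|>
+ # that is stored as "FOM" and used below for the map coefficients.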
+
+ def calculate_maps_int(hkldata, b_aniso, fc_labs, D_labs, centric_and_selections, use="all"):
+     nmodels = len(fc_labs)
+     hkldata.df["FWT"] = 0j * numpy.nan
+     hkldata.df["DELFWT"] = 0j * numpy.nan
+     hkldata.df["FOM"] = numpy.nan # FOM proxy, |<F>| / <|F|>
+     has_ano = "I(+)" in hkldata.df and "I(-)" in hkldata.df
+     if has_ano:
+         hkldata.df["FAN"] = 0j * numpy.nan
+         ano_data = hkldata.df[["I(+)", "SIGI(+)", "I(-)", "SIGI(-)"]].to_numpy()
+     Io = hkldata.df.I.to_numpy()
+     sigIo = hkldata.df.SIGI.to_numpy()
+     k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
+     eps = hkldata.df.epsilon.to_numpy()
+     Ds = numpy.vstack([hkldata.df[lab].to_numpy() for lab in D_labs]).T
+     Fcs = numpy.vstack([hkldata.df[lab].to_numpy() for lab in fc_labs]).T
+     DFc = (Ds * Fcs).sum(axis=1)
+     hkldata.df["DFC"] = DFc
+     for i_bin, idxes in hkldata.binned():
+         for c, work, test in centric_and_selections[i_bin]:
+             cidxes = numpy.concatenate([work, test])
+             S = hkldata.df["S"].to_numpy()[cidxes]
+             f, m_proxy = expected_F_from_int(Io[cidxes], sigIo[cidxes], k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
+             exp_ip = numpy.exp(numpy.angle(DFc[cidxes])*1j)
+             if c == 0:
+                 hkldata.df.loc[cidxes, "FWT"] = 2 * f * exp_ip - DFc[cidxes]
+             else:
+                 hkldata.df.loc[cidxes, "FWT"] = f * exp_ip
+             hkldata.df.loc[cidxes, "DELFWT"] = f * exp_ip - DFc[cidxes]
+             hkldata.df.loc[cidxes, "FOM"] = m_proxy
+             if has_ano:
+                 f_p, _ = expected_F_from_int(ano_data[cidxes,0], ano_data[cidxes,1],
+                                              k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
+                 f_m, _ = expected_F_from_int(ano_data[cidxes,2], ano_data[cidxes,3],
+                                              k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
+                 hkldata.df.loc[cidxes, "FAN"] = (f_p - f_m) * exp_ip / 2j
+             # remove reflections that should be hidden
+             if use != "all":
+                 # usually use == "work"
+                 tohide = test if use == "work" else work
+                 hkldata.df.loc[tohide, "FWT"] = 0j * numpy.nan
+                 hkldata.df.loc[tohide, "DELFWT"] = 0j * numpy.nan
+             fill_sel = numpy.isnan(hkldata.df["FWT"][cidxes].to_numpy())
+             hkldata.df.loc[cidxes[fill_sel], "FWT"] = DFc[cidxes][fill_sel]
+ # calculate_maps_int()
+
+ def calculate_maps_twin(hkldata, b_aniso, fc_labs, D_labs, twin_data, centric_and_selections, use="all"):
+     k_ani2_inv = 1 / hkldata.debye_waller_factors(b_cart=b_aniso)**2
+     Io = hkldata.df.I.to_numpy(copy=True) * k_ani2_inv
+     sigIo = hkldata.df.SIGI.to_numpy(copy=True) * k_ani2_inv
+     # Mask Io
+     for i_bin, idxes in hkldata.binned():
+         for c, work, test in centric_and_selections[i_bin]:
+             if use != "all":
+                 tohide = test if use == "work" else work
+                 Io[tohide] = numpy.nan
+
+     twin_data.est_f_true(Io, sigIo)
+     F_true = numpy.asarray(twin_data.f_true_max)
+     Ds = twin_data.ml_scale_array()
+     DFc = (twin_data.f_calc * Ds).sum(axis=1)
+     exp_ip = numpy.exp(numpy.angle(DFc)*1j)
+     Ft = numpy.asarray(twin_data.f_true_max)
+     m = twin_data.calc_fom()
+     fwt = numpy.where(numpy.asarray(twin_data.centric) == 0,
+                       2 * m * Ft * exp_ip - DFc, m * Ft * exp_ip)
+     delfwt = m * Ft * exp_ip - DFc
+     sel = numpy.isnan(fwt)
+     fwt[sel] = DFc[sel]
+
+     hkldata2 = utils.hkl.HklData(hkldata.cell, hkldata.sg,
+                                  utils.hkl.df_from_twin_data(twin_data, fc_labs))
+     hkldata2.df["FWT"] = fwt
+     hkldata2.df["DELFWT"] = delfwt
+     hkldata2.df["FOM"] = m
+     hkldata2.df["F_est"] = F_true
+     hkldata2.df["FC"] = twin_data.f_calc.sum(axis=1)
+     hkldata2.df["DFC"] = DFc
+     hkldata2.df[D_labs] = Ds
+     hkldata2.df["S"] = twin_data.ml_sigma_array()
+     return hkldata2
+ # calculate_maps_twin()
+
+ def merge_models(sts): # simply merge models; no fix of chain ids etc.
+     st2 = sts[0].clone()
+     del st2[:]
+     model = gemmi.Model("1")
+     for st in sts:
+         for m in st:
+             for c in m:
+                 model.add_chain(c)
+     st2.add_model(model)
+     return st2
+ # merge_models()
+
+ def decide_mtz_labels(mtz, find_free=True, require=None):
+     # F is preferred for now by default
+     obs_types = ("F", "J", "G", "K")
+     if require:
+         assert set(require).issubset(obs_types)
+     else:
+         require = obs_types
+     dlabs = utils.hkl.mtz_find_data_columns(mtz)
+     logger.writeln("Finding possible options from MTZ:")
+     for typ in dlabs:
+         for labs in dlabs[typ]:
+             logger.writeln(" --labin '{}'".format(",".join(labs)))
+     for typ in require:
+         if dlabs[typ]:
+             labin = dlabs[typ][0]
+             break
+     else:
+         raise RuntimeError("Data not found in mtz")
+     if find_free:
+         flabs = utils.hkl.mtz_find_free_columns(mtz)
+         if flabs:
+             labin += [flabs[0]]
+     logger.writeln("MTZ columns automatically selected: {}".format(labin))
+     return labin
+ # decide_mtz_labels()
+
+ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=None,
+                   n_per_bin=None, use="all", max_bins=None, cif_index=0, keep_charges=False,
+                   allow_unusual_occupancies=False):
+     if labin: assert 1 < len(labin) < 6
+     assert use in ("all", "work", "test")
+     assert n_bins or n_per_bin # if n_bins is not set, n_per_bin must be given
+
+     if len(xyzins) > 0 and type(xyzins[0]) is gemmi.Structure:
+         sts = xyzins
+     else:
+         sts = []
+
+     if type(hklin) is gemmi.Mtz or utils.fileio.is_mmhkl_file(hklin):
+         if type(hklin) is gemmi.Mtz:
+             mtz = hklin
+         else:
+             mtz = utils.fileio.read_mmhkl(hklin, cif_index=cif_index)
+         if not sts:
+             sts = [utils.fileio.read_structure(f) for f in xyzins]
+     else:
+         assert len(xyzins) == 1
+         assert not sts
+         st, mtz = utils.fileio.read_small_molecule_files([hklin, xyzins[0]])
+         sts = [st]
+
+     for st in sts:
+         utils.model.check_occupancies(st, raise_error=not allow_unusual_occupancies)
+
+     if not labin:
+         labin = decide_mtz_labels(mtz)
+     col_types = {x.label:x.type for x in mtz.columns}
+     if labin[0] not in col_types:
+         raise RuntimeError("MTZ column not found: {}".format(labin[0]))
+     labs_and_types = {"F": ("amplitude", ["FP","SIGFP"], ["F", "Q"]),
+                       "J": ("intensity", ["I","SIGI"], ["J", "Q"]),
+                       "G": ("anomalous amplitude", ["F(+)","SIGF(+)", "F(-)", "SIGF(-)"], ["G", "L", "G", "L"]),
+                       "K": ("anomalous intensity", ["I(+)","SIGI(+)", "I(-)", "SIGI(-)"], ["K", "M", "K", "M"])}
+     if col_types[labin[0]] not in labs_and_types:
+         raise RuntimeError("MTZ column {} is neither amplitude nor intensity".format(labin[0]))
+     name, newlabels, require_types = labs_and_types[col_types[labin[0]]]
+     logger.writeln("Observation type: {}".format(name))
+     if len(newlabels) < len(labin): newlabels.append("FREE")
+     hkldata = utils.hkl.hkldata_from_mtz(mtz, labin, newlabels=newlabels, require_types=require_types)
+     if newlabels[0] == "F(+)":
+         hkldata.merge_anomalous(newlabels[:4], ["FP", "SIGFP"])
+         newlabels = ["FP", "SIGFP"] + newlabels[4:]
+     elif newlabels[0] == "I(+)":
+         hkldata.merge_anomalous(newlabels[:4], ["I", "SIGI"])
+         newlabels = ["I", "SIGI"] + newlabels[4:]
+
+     if hkldata.df.empty:
+         raise RuntimeError("No data in hkl data")
+
+     if sts:
+         assert source in ["electron", "xray", "neutron"]
+         for st in sts:
+             if st[0].count_atom_sites() == 0:
+                 raise RuntimeError("No atom in the model")
+         if not hkldata.cell.approx(sts[0].cell, 1e-3):
+             logger.writeln("Warning: unit cell mismatch between model and reflection data")
+             logger.writeln(" using unit cell from mtz")
+
+         for st in sts: st.cell = hkldata.cell # mtz cell is used in any case
+
+         sg_st = sts[0].find_spacegroup() # may be None
+         sg_use = hkldata.sg
+         if hkldata.sg != sg_st:
+             if st.cell.is_crystal() and sg_st and sg_st.laue_str() != hkldata.sg.laue_str():
+                 raise RuntimeError("Crystal symmetry mismatch between model and data")
+             logger.writeln("Warning: space group mismatch between model and mtz")
+             if sg_st and sg_st.laue_str() == hkldata.sg.laue_str():
+                 logger.writeln(" using space group from model")
+                 sg_use = sg_st
+             else:
+                 logger.writeln(" using space group from mtz")
+             logger.writeln("")
+
+         for st in sts:
+             st.spacegroup_hm = sg_use.xhm()
+             st.setup_cell_images()
+         hkldata.sg = sg_use
+
+         if not keep_charges:
+             utils.model.remove_charge(sts)
+         utils.model.check_atomsf(sts, source)
+
+     if newlabels[0] == "FP":
+         hkldata.remove_nonpositive(newlabels[0])
+         hkldata.remove_nonpositive(newlabels[1])
+     hkldata.switch_to_asu()
+     hkldata.remove_systematic_absences()
+     #hkldata.df = hkldata.df.astype({name: 'float64' for name in ["I","SIGI","FP","SIGFP"] if name in hkldata.df})
+
+     if (d_min, d_max).count(None) != 2:
+         hkldata = hkldata.copy(d_min=d_min, d_max=d_max)
+         if hkldata.df.empty:
+             raise RuntimeError("No data left in hkl data")
+         d_min, d_max = hkldata.d_min_max()
+
+     hkldata.complete()
+     hkldata.sort_by_resolution()
+     hkldata.calc_epsilon()
+     hkldata.calc_centric()
+
+     if "FREE" in hkldata.df and free is None:
+         free = hkldata.guess_free_number(newlabels[0])
+
+     if n_bins is None:
+         sel = hkldata.df[newlabels[0]].notna()
+         if use == "work":
+             sel &= hkldata.df.FREE != free
+         elif use == "test":
+             sel &= hkldata.df.FREE == free
+         s_array = 1/hkldata.d_spacings()[sel]
+         if len(s_array) == 0:
+             raise RuntimeError("no reflections in {} set".format(use))
+         n_bins = utils.hkl.decide_n_bins(n_per_bin, s_array, max_bins=max_bins)
+         logger.writeln("n_per_bin={} requested for {}. n_bins set to {}".format(n_per_bin, use, n_bins))
+
+     hkldata.setup_binning(n_bins=n_bins)
+     logger.writeln("Data completeness: {:.2f}%".format(hkldata.completeness()*100.))
+
+     fc_labs = ["FC{}".format(i) for i, _ in enumerate(sts)]
+
+     # Create a centric selection table for faster lookup
+     centric_and_selections = {}
+     stats = hkldata.binned_df.copy()
+     stats["n_all"] = 0
+     stats["n_obs"] = 0
+     stats[newlabels[0]] = numpy.nan
+     snr = "I/sigma" if newlabels[0] == "I" else "F/sigma"
+     stats[snr] = numpy.nan
+     if newlabels[0] == "I":
+         stats["Mn(I)/Std(I)"] = numpy.nan
+     if "FREE" in hkldata.df:
+         stats["n_work"] = 0
+         stats["n_test"] = 0
+
+     for i_bin, idxes in hkldata.binned():
+         centric_and_selections[i_bin] = []
+         n_obs = 0
+         n_work, n_test = 0, 0
+         for c, g2 in hkldata.df.loc[idxes].groupby("centric", sort=False):
+             valid_sel = numpy.isfinite(g2[newlabels[0]])
+             if "FREE" in g2:
+                 test_sel = (g2.FREE == free).fillna(False)
+                 test = g2.index[test_sel]
+                 work = g2.index[~test_sel]
+                 n_work += (valid_sel & ~test_sel).sum()
+                 n_test += (valid_sel & test_sel).sum()
+             else:
+                 work = g2.index
+                 test = type(work)([], dtype=work.dtype)
+             centric_and_selections[i_bin].append((c, work, test))
+             n_obs += numpy.sum(valid_sel)
+
+         stats.loc[i_bin, "n_obs"] = n_obs
+         stats.loc[i_bin, "n_all"] = len(idxes)
+         obs = hkldata.df[newlabels[0]].to_numpy()[idxes]
+         sigma = hkldata.df[newlabels[1]].to_numpy()[idxes]
+         if n_obs > 0:
+             stats.loc[i_bin, snr] = numpy.nanmean(obs / sigma)
+             mean_obs = numpy.nanmean(obs)
+             stats.loc[i_bin, newlabels[0]] = mean_obs
+             if newlabels[0] == "I":
+                 stats.loc[i_bin, "Mn(I)/Std(I)"] = mean_obs / numpy.nanstd(obs)
+         if "FREE" in hkldata.df:
+             stats.loc[i_bin, "n_work"] = n_work
+             stats.loc[i_bin, "n_test"] = n_test
+
+     stats["completeness"] = stats["n_obs"] / stats["n_all"] * 100
+     logger.writeln(stats.to_string())
+     return hkldata, sts, fc_labs, centric_and_selections, free
+ # process_input()
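+ # process_input returns (hkldata, sts, fc_labs, centric_and_selections, free);
+ # centric_and_selections maps each bin to a list of (centric flag, work
+ # indices, test indices) tuples, the lookup table used throughout this module.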
+
+ def update_fc(st_list, fc_labs, d_min, monlib, source, mott_bethe, hkldata=None, twin_data=None):
+     #assert (hkldata, twin_data).count(None) == 1
+     # hkldata is not updated when twin_data is given
+     for i, st in enumerate(st_list):
+         if st.ncs:
+             st = st.clone()
+             st.expand_ncs(gemmi.HowToNameCopiedChain.Dup, merge_dist=0)
+         if twin_data:
+             hkl = twin_data.asu
+         else:
+             hkl = hkldata.miller_array()
+         fc = utils.model.calc_fc_fft(st, d_min - 1e-6,
+                                      monlib=monlib,
+                                      source=source,
+                                      mott_bethe=mott_bethe,
+                                      miller_array=hkl)
+         if twin_data:
+             twin_data.f_calc[:, i] = fc
+         else:
+             hkldata.df[fc_labs[i]] = fc
+     if not twin_data:
+         hkldata.df["FC"] = hkldata.df[fc_labs].sum(axis=1)
+ # update_fc()
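+
+ # Usage sketch (hypothetical, not part of servalcat): recompute the per-model
+ # structure factors for electron diffraction data. The Mott-Bethe conversion
+ # from X-ray to electron form factors happens inside calc_fc_fft, which is
+ # why source and mott_bethe travel together, exactly as in main() below.
+ def _update_fc_electron_example(sts, fc_labs, hkldata):
+     update_fc(sts, fc_labs, d_min=hkldata.d_min_max()[0], monlib=None,
+               source="electron", mott_bethe=True, hkldata=hkldata)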
+
+ def calc_Fmask(st, d_min, miller_array):
+     logger.writeln("Calculating solvent contribution..")
+     grid = gemmi.FloatGrid()
+     grid.setup_from(st, spacing=min(0.6, (d_min - 1e-6) / 2 - 1e-9))
+     masker = gemmi.SolventMasker(gemmi.AtomicRadiiSet.Refmac)
+     masker.put_mask_on_float_grid(grid, st[0])
+     fmask_gr = gemmi.transform_map_to_f_phi(grid)
+     Fmask = fmask_gr.get_value_by_hkl(miller_array)
+     return Fmask
+ # calc_Fmask()
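+
+ # Usage sketch (hypothetical inputs): computing Fmask for a model read from
+ # file, reusing exactly the gemmi calls above. miller_array is an (N, 3)
+ # array of hkl indices, as returned by e.g. hkldata.miller_array().
+ def _fmask_from_file_example(xyzin, miller_array, d_min):
+     st = gemmi.read_structure(xyzin)
+     st.setup_entities()  # make sure the model/chain hierarchy is usable
+     return calc_Fmask(st, d_min, miller_array)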
+
+ def bulk_solvent_and_lsq_scales(hkldata, sts, fc_labs, use_solvent=True, use_int=False, mask=None, func_type="log_cosh", twin_data=None):
+     # fc_labs must have the solvent part at the end
+     miller_array = twin_data.asu if twin_data else hkldata.miller_array()
+     d_min = twin_data.d_min(sts[0].cell) if twin_data else hkldata.d_min_max()[0]
+     if use_solvent:
+         if mask is None:
+             Fmask = calc_Fmask(merge_models(sts), d_min, miller_array)
+         else:
+             fmask_gr = gemmi.transform_map_to_f_phi(mask)
+             Fmask = fmask_gr.get_value_by_hkl(miller_array)
+         if twin_data:
+             fc_sum = twin_data.f_calc[:, :-1].sum(axis=1)
+         else:
+             fc_sum = hkldata.df[fc_labs[:-1]].sum(axis=1).to_numpy()
+         fc_list = [fc_sum, Fmask]
+     else:
+         if twin_data:
+             fc_list = [twin_data.f_calc.sum(axis=1)]
+         else:
+             fc_list = [hkldata.df[fc_labs].sum(axis=1).to_numpy()]
+
+     scaling = LsqScale(func_type=func_type)
+     scaling.set_data(hkldata, fc_list, use_int, sigma_cutoff=0, twin_data=twin_data)
+     scaling.scale()
+     b_iso = scaling.b_iso
+     k_aniso = hkldata.debye_waller_factors(b_cart=scaling.b_aniso)
+     hkldata.df["k_aniso"] = k_aniso  # we need it later when calculating stats
+
+     if use_solvent:
+         if twin_data:
+             s2 = numpy.asarray(twin_data.s2_array)
+         else:
+             s2 = 1. / hkldata.d_spacings().to_numpy()**2
+         Fbulk = Fmask * scaling.get_solvent_scale(scaling.k_sol, scaling.b_sol, s2)
+         if twin_data:
+             twin_data.f_calc[:, -1] = Fbulk
+         else:
+             hkldata.df[fc_labs[-1]] = Fbulk
+
+     # Apply scales
+     if use_int:
+         # In the intensity case we refine b_aniso with ML; perhaps we should do the same in the amplitude case.
+         o_labs = ["I", "SIGI", "I(+)", "SIGI(+)", "I(-)", "SIGI(-)"]
+         hkldata.df[hkldata.df.columns.intersection(o_labs)] /= scaling.k_overall**2
+     else:
+         o_labs = ["FP", "SIGFP", "F(+)", "SIGF(+)", "F(-)", "SIGF(-)"]
+         hkldata.df[hkldata.df.columns.intersection(o_labs)] /= scaling.k_overall
+     if twin_data:
+         twin_data.f_calc[:] *= twin_data.debye_waller_factors(b_iso=b_iso)[:, None]
+     else:
+         k_iso = hkldata.debye_waller_factors(b_iso=b_iso)
+         for lab in fc_labs:
+             hkldata.df[lab] *= k_iso
+         # total Fc
+         hkldata.df["FC"] = hkldata.df[fc_labs].sum(axis=1)
+     return scaling
+ # bulk_solvent_and_lsq_scales()
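+
+ # For reference: flat bulk-solvent models conventionally scale Fmask by
+ # k_sol * exp(-B_sol * s^2 / 4). A minimal sketch of such a scale factor is
+ # given below, assuming LsqScale.get_solvent_scale (defined earlier in this
+ # file) follows that convention; the authoritative form is the LsqScale one.
+ def _solvent_scale_sketch(k_sol, b_sol, s2):
+     return k_sol * numpy.exp(-b_sol * s2 / 4)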
+
+ def calculate_maps(hkldata, b_aniso, centric_and_selections, fc_labs, D_labs, log_out, use="all"):
+     nmodels = len(fc_labs)
+     hkldata.df["FWT"] = 0j * numpy.nan
+     hkldata.df["DELFWT"] = 0j * numpy.nan
+     hkldata.df["FOM"] = numpy.nan
+     hkldata.df["X"] = numpy.nan  # for FOM
+     has_ano = "F(+)" in hkldata.df and "F(-)" in hkldata.df
+     if has_ano:
+         hkldata.df["FAN"] = 0j * numpy.nan
+     stats_data = []
+     k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
+     Ds = numpy.vstack([hkldata.df[lab].to_numpy() for lab in D_labs]).T
+     Fcs = numpy.vstack([hkldata.df[lab].to_numpy() for lab in fc_labs]).T
+     DFc = (Ds * Fcs).sum(axis=1)
+     hkldata.df["DFC"] = DFc
+     for i_bin, idxes in hkldata.binned():
+         bin_d_min = hkldata.binned_df.d_min[i_bin]
+         bin_d_max = hkldata.binned_df.d_max[i_bin]
+         # 0: acentric 1: centric
+         mean_fom = [numpy.nan, numpy.nan]
+         nrefs = [0, 0]
+         for c, work, test in centric_and_selections[i_bin]:
+             cidxes = numpy.concatenate([work, test])
+             S = hkldata.df["S"].to_numpy()[cidxes]
+             expip = numpy.exp(numpy.angle(DFc[cidxes]) * 1j)
+             Fo = hkldata.df.FP.to_numpy()[cidxes] / k_ani[cidxes]
+             SigFo = hkldata.df.SIGFP.to_numpy()[cidxes] / k_ani[cidxes]
+             epsilon = hkldata.df.epsilon.to_numpy()[cidxes]
+             nrefs[c] = numpy.sum(numpy.isfinite(Fo))
+             DFc_abs = numpy.abs(DFc[cidxes])
+             if c == 0:
+                 # acentric: m = I1(X)/I0(X); 2mFo-DFc coefficients
+                 Sigma = 2 * SigFo**2 + epsilon * S
+                 X = 2 * Fo * DFc_abs / Sigma
+                 m = gemmi.bessel_i1_over_i0(X)
+                 hkldata.df.loc[cidxes, "FWT"] = (2 * m * Fo - DFc_abs) * expip
+             else:
+                 # centric: m = tanh(X); mFo coefficients
+                 Sigma = SigFo**2 + epsilon * S
+                 X = Fo * DFc_abs / Sigma
+                 m = numpy.tanh(X)
+                 hkldata.df.loc[cidxes, "FWT"] = (m * Fo) * expip
+
+             hkldata.df.loc[cidxes, "DELFWT"] = (m * Fo - DFc_abs) * expip
+             hkldata.df.loc[cidxes, "FOM"] = m
+             hkldata.df.loc[cidxes, "X"] = X
+             if has_ano:
+                 Fo_dano = (hkldata.df["F(+)"].to_numpy()[cidxes] - hkldata.df["F(-)"].to_numpy()[cidxes]) / k_ani[cidxes]
+                 hkldata.df.loc[cidxes, "FAN"] = m * Fo_dano * expip / 2j
+             if nrefs[c] > 0: mean_fom[c] = numpy.nanmean(m)
+
+             # remove reflections that should be hidden
+             if use != "all":
+                 # usually use == "work"
+                 tohide = test if use == "work" else work
+                 hkldata.df.loc[tohide, "FWT"] = 0j * numpy.nan
+                 hkldata.df.loc[tohide, "DELFWT"] = 0j * numpy.nan
+             # fill unobserved (or hidden) reflections with DFc
+             fill_sel = numpy.isnan(hkldata.df["FWT"][cidxes].to_numpy())
+             hkldata.df.loc[cidxes[fill_sel], "FWT"] = DFc[cidxes][fill_sel]
+
+         Fc = hkldata.df.FC.to_numpy()[idxes] * k_ani[idxes]
+         Fo = hkldata.df.FP.to_numpy()[idxes]
+         mean_DFc2 = numpy.nanmean(numpy.abs((Ds[idxes, :] * Fcs[idxes, :]).sum(axis=1) * k_ani[idxes])**2)
+         mean_log_DFcs = numpy.log(numpy.nanmean(numpy.abs(Ds[idxes, :] * Fcs[idxes, :] * k_ani[idxes, None]), axis=0)).tolist()
+         mean_Ds = numpy.nanmean(Ds[idxes, :], axis=0).tolist()
+         if sum(nrefs) > 0:
+             r = numpy.nansum(numpy.abs(numpy.abs(Fc) - Fo)) / numpy.nansum(Fo)
+             cc = utils.hkl.correlation(Fo, numpy.abs(Fc))
+             mean_Fo2 = numpy.nanmean(numpy.abs(Fo)**2)
+         else:
+             r, cc, mean_Fo2 = numpy.nan, numpy.nan, numpy.nan
+         stats_data.append([i_bin, nrefs[0], nrefs[1], bin_d_max, bin_d_min,
+                            numpy.log(mean_Fo2),
+                            numpy.log(numpy.nanmean(numpy.abs(Fc)**2)),
+                            numpy.log(mean_DFc2),
+                            numpy.log(numpy.mean(hkldata.df["S"].to_numpy()[idxes])),
+                            mean_fom[0], mean_fom[1], r, cc] + mean_Ds + mean_log_DFcs)
+
+     DFc_labs = ["log(Mn(|{}{}|))".format(dl, fl) for dl, fl in zip(D_labs, fc_labs)]
+     cols = ["bin", "n_a", "n_c", "d_max", "d_min",
+             "log(Mn(|Fo|^2))", "log(Mn(|Fc|^2))", "log(Mn(|DFc|^2))",
+             "log(Sigma)", "FOM_a", "FOM_c", "R", "CC(|Fo|,|Fc|)"] + D_labs + DFc_labs
+     stats = pandas.DataFrame(stats_data, columns=cols)
+     title_labs = [["log(Mn(|F|^2)) and variances", ["log(Mn(|Fo|^2))", "log(Mn(|Fc|^2))", "log(Mn(|DFc|^2))", "log(Sigma)"]],
+                   ["FOM", ["FOM_a", "FOM_c"]],
+                   ["D", D_labs],
+                   ["DFc", DFc_labs],
+                   ["R-factor", ["R"]],
+                   ["CC", ["CC(|Fo|,|Fc|)"]],
+                   ["number of reflections", ["n_a", "n_c"]]]
+     with open(log_out, "w") as ofs:
+         ofs.write(utils.make_loggraph_str(stats, main_title="Statistics",
+                                           title_labs=title_labs,
+                                           s2=1 / stats["d_min"]**2))
+     logger.writeln("output log: {}".format(log_out))
+ # calculate_maps()
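+
+ # Numerical sanity check (illustrative only): the acentric figure of merit
+ # used above is m = I1(X)/I0(X). gemmi.bessel_i1_over_i0 can be cross-checked
+ # against scipy's exponentially scaled Bessel functions, whose scaling
+ # cancels in the ratio and avoids overflow for large X.
+ def _fom_cross_check(X):
+     m_gemmi = gemmi.bessel_i1_over_i0(X)
+     m_scipy = scipy.special.i1e(X) / scipy.special.i0e(X)
+     return m_gemmi, m_scipy  # expect close agreement; gemmi uses a fast approximation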
+
+ def main(args):
+     n_per_bin = {"all": 500, "work": 500, "test": 50}[args.use]
+     try:
+         hkldata, sts, fc_labs, centric_and_selections, free = process_input(hklin=args.hklin,
+                                                                             labin=args.labin.split(",") if args.labin else None,
+                                                                             n_bins=args.nbins,
+                                                                             free=args.free,
+                                                                             xyzins=sum(args.model, []),
+                                                                             source=args.source,
+                                                                             d_max=args.d_max,
+                                                                             d_min=args.d_min,
+                                                                             n_per_bin=n_per_bin,
+                                                                             use=args.use,
+                                                                             max_bins=30,
+                                                                             keep_charges=args.keep_charges)
+     except RuntimeError as e:
+         raise SystemExit("Error: {}".format(e))
+
+     if args.twin:
+         twin_data = find_twin_domains_from_data(hkldata)
+     else:
+         twin_data = None
+     if twin_data:
+         twin_data.setup_f_calc(len(sts) + (0 if args.no_solvent else 1))
+
+     update_fc(sts, fc_labs, d_min=hkldata.d_min_max()[0], monlib=None,
+               source=args.source, mott_bethe=(args.source == "electron"),
+               hkldata=hkldata, twin_data=twin_data)
+     is_int = "I" in hkldata.df
+
+     if args.mask:
+         mask = utils.fileio.read_ccp4_map(args.mask)[0]
+     else:
+         mask = None
+
+     # Overall scaling & bulk solvent
+     # FP/SIGFP will be scaled. Total FC will be added.
+     if not args.no_solvent:
+         fc_labs.append("Fbulk")
+     lsq = bulk_solvent_and_lsq_scales(hkldata, sts, fc_labs, use_solvent=not args.no_solvent,
+                                       use_int=is_int, mask=mask, twin_data=twin_data)
+     b_aniso = lsq.b_aniso
+     # stats
+     stats, overall = calc_r_and_cc(hkldata, centric_and_selections, twin_data)
+     for lab in "R", "CC":
+         logger.writeln(" ".join("{} = {:.4f}".format(x, overall[x]) for x in overall if x.startswith(lab)))
+     if is_int:
+         logger.writeln("R1 is calculated for reflections with I/sigma>2.")
+
+     if twin_data:
+         estimate_twin_fractions_from_model(twin_data, hkldata)
+         #del hkldata.df["FC"]
+         #del hkldata.df["Fbulk"]
+         # Need to redo scaling?
+         lsq = bulk_solvent_and_lsq_scales(hkldata, sts, fc_labs, use_solvent=not args.no_solvent,
+                                           use_int=is_int, mask=mask, twin_data=twin_data)
+         b_aniso = lsq.b_aniso
+         stats, overall = calc_r_and_cc(hkldata, centric_and_selections, twin_data)
+         for lab in "R", "CC":
+             logger.writeln(" ".join("{} = {:.4f}".format(x, overall[x]) for x in overall if x.startswith(lab)))
+
+     # Estimate ML parameters
+     D_labs = ["D{}".format(i) for i in range(len(fc_labs))]
+
+     if args.use_cc:
+         assert not is_int
+         assert not args.twin
+         logger.writeln("Estimating sigma-A parameters from CC..")
+         determine_mlf_params_from_cc(hkldata, fc_labs, D_labs, centric_and_selections, args.use)
+     else:
+         b_aniso = determine_ml_params(hkldata, is_int, fc_labs, D_labs, b_aniso, centric_and_selections, args.D_trans, args.S_trans, args.use,
+                                       twin_data=twin_data)
+     use = {"all": "all", "work": "work", "test": "work"}[args.use]
+     if twin_data:
+         # replace hkldata
+         hkldata = calculate_maps_twin(hkldata, b_aniso, fc_labs, D_labs, twin_data, centric_and_selections, use)
+     elif is_int:
+         calculate_maps_int(hkldata, b_aniso, fc_labs, D_labs, centric_and_selections, use)
+     else:
+         log_out = "{}.log".format(args.output_prefix)
+         calculate_maps(hkldata, b_aniso, centric_and_selections, fc_labs, D_labs, log_out, use)
+
+     # Write mtz file
+     if twin_data:
+         labs = ["F_est"]
+     elif is_int:
+         labs = ["I", "SIGI"]
+     else:
+         labs = ["FP", "SIGFP"]
+     labs.extend(["FOM", "FWT", "DELFWT", "FC", "DFC"])
+     if "FAN" in hkldata.df:
+         labs.append("FAN")
+     if not args.no_solvent:
+         labs.append("Fbulk")
+     if "FREE" in hkldata.df:
+         labs.append("FREE")
+     if "F_true_est" in hkldata.df:
+         labs.append("F_true_est")
+     labs += D_labs + ["S"]
+     mtz_out = args.output_prefix + ".mtz"
+     hkldata.write_mtz(mtz_out, labs=labs, types={"FOM": "W", "FP": "F", "SIGFP": "Q", "F_est": "F"})
+     return hkldata
+ # main()
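+
+ # Example invocation (sketch only; flag spellings are inferred from the
+ # argparse attributes used in main() above, so treat them as assumptions
+ # rather than authoritative CLI documentation):
+ #   servalcat sigmaa --hklin data.mtz --labin FP,SIGFP,FREE \
+ #       --model model.pdb --d_min 1.8 --output_prefix sigmaa_out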
+
+ if __name__ == "__main__":
+     import sys
+     args = parse_args(sys.argv[1:])
+     main(args)