servalcat 0.4.99__cp38-cp38-macosx_10_14_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of servalcat might be problematic. Click here for more details.
- servalcat/__init__.py +10 -0
- servalcat/__main__.py +120 -0
- servalcat/ext.cpython-38-darwin.so +0 -0
- servalcat/refine/__init__.py +0 -0
- servalcat/refine/cgsolve.py +100 -0
- servalcat/refine/refine.py +906 -0
- servalcat/refine/refine_geom.py +233 -0
- servalcat/refine/refine_spa.py +366 -0
- servalcat/refine/refine_xtal.py +281 -0
- servalcat/refine/spa.py +144 -0
- servalcat/refine/xtal.py +276 -0
- servalcat/refmac/__init__.py +0 -0
- servalcat/refmac/exte.py +182 -0
- servalcat/refmac/refmac_keywords.py +639 -0
- servalcat/refmac/refmac_wrapper.py +395 -0
- servalcat/spa/__init__.py +0 -0
- servalcat/spa/fofc.py +479 -0
- servalcat/spa/fsc.py +385 -0
- servalcat/spa/localcc.py +188 -0
- servalcat/spa/realspcc_from_var.py +128 -0
- servalcat/spa/run_refmac.py +977 -0
- servalcat/spa/shift_maps.py +293 -0
- servalcat/spa/shiftback.py +137 -0
- servalcat/spa/translate.py +129 -0
- servalcat/utils/__init__.py +35 -0
- servalcat/utils/commands.py +1547 -0
- servalcat/utils/fileio.py +744 -0
- servalcat/utils/generate_operators.py +296 -0
- servalcat/utils/hkl.py +714 -0
- servalcat/utils/logger.py +140 -0
- servalcat/utils/maps.py +345 -0
- servalcat/utils/model.py +782 -0
- servalcat/utils/refmac.py +760 -0
- servalcat/utils/restraints.py +781 -0
- servalcat/utils/symmetry.py +295 -0
- servalcat/xtal/__init__.py +0 -0
- servalcat/xtal/french_wilson.py +258 -0
- servalcat/xtal/run_refmac_small.py +240 -0
- servalcat/xtal/sigmaa.py +1644 -0
- servalcat/xtal/twin.py +121 -0
- servalcat-0.4.99.dist-info/METADATA +55 -0
- servalcat-0.4.99.dist-info/RECORD +45 -0
- servalcat-0.4.99.dist-info/WHEEL +5 -0
- servalcat-0.4.99.dist-info/entry_points.txt +4 -0
- servalcat-0.4.99.dist-info/licenses/LICENSE +373 -0
servalcat/xtal/sigmaa.py
ADDED
|
@@ -0,0 +1,1644 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Author: "Keitaro Yamashita, Garib N. Murshudov"
|
|
3
|
+
MRC Laboratory of Molecular Biology
|
|
4
|
+
|
|
5
|
+
This software is released under the
|
|
6
|
+
Mozilla Public License, version 2.0; see LICENSE.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import absolute_import, division, print_function, generators
|
|
9
|
+
import argparse
|
|
10
|
+
import gemmi
|
|
11
|
+
import numpy
|
|
12
|
+
import pandas
|
|
13
|
+
import itertools
|
|
14
|
+
import time
|
|
15
|
+
import scipy.special
|
|
16
|
+
import scipy.optimize
|
|
17
|
+
from servalcat.utils import logger
|
|
18
|
+
from servalcat import utils
|
|
19
|
+
from servalcat import ext
|
|
20
|
+
from servalcat.xtal.twin import find_twin_domains_from_data, estimate_twin_fractions_from_model
|
|
21
|
+
|
|
22
|
+
"""
|
|
23
|
+
DFc = sum_j D_j F_c,j
|
|
24
|
+
The last Fc,n is bulk solvent contribution.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
integr = ext.IntensityIntegrator()
|
|
28
|
+
|
|
29
|
+
def add_arguments(parser):
    """Register the sigmaa command-line options on *parser*."""
    parser.description = 'Sigma-A parameter estimation for crystallographic data'
    parser.add_argument('--hklin', required=True,
                        help='Input MTZ file')
    parser.add_argument('--spacegroup',
                        help='Override space group')
    parser.add_argument('--labin',
                        help='MTZ column for F,SIGF,FREE')
    parser.add_argument('--free', type=int,
                        help='flag number for test set')
    parser.add_argument('--model', required=True, nargs="+", action="append",
                        help='Input atomic model file(s)')
    parser.add_argument("-d", '--d_min', type=float)
    parser.add_argument('--d_max', type=float)
    parser.add_argument('--nbins', type=int,
                        help="Number of bins (default: auto)")
    parser.add_argument('-s', '--source', choices=["electron", "xray", "neutron"], required=True,
                        help="Scattering factor choice")
    parser.add_argument('--D_trans', choices=["exp", "splus"],
                        help="estimate D with positivity constraint")
    parser.add_argument('--S_trans', choices=["exp", "splus"],
                        help="estimate variance of unexplained signal with positivity constraint")
    parser.add_argument('--no_solvent', action='store_true',
                        help="Do not consider bulk solvent contribution")
    parser.add_argument('--use_cc', action='store_true',
                        help="Use CC(|F1|,|F2|) to CC(F1,F2) conversion to derive D and S")
    parser.add_argument('--use', choices=["all", "work", "test"], default="all",
                        help="Which reflections to be used for the parameter estimate.")
    parser.add_argument('--twin', action="store_true", help="Turn on twin refinement")
    parser.add_argument('--mask',
                        help="A solvent mask (by default calculated from the coordinates)")
    parser.add_argument('--keep_charges', action='store_true',
                        help="Use scattering factor for charged atoms. Use it with care.")
    parser.add_argument('-o', '--output_prefix', default="sigmaa",
                        help='output file name prefix (default: %(default)s)')
# add_arguments()
|
|
65
|
+
|
|
66
|
+
def parse_args(arg_list):
    """Parse *arg_list* with a parser configured by add_arguments()."""
    parser = argparse.ArgumentParser()
    add_arguments(parser)
    return parser.parse_args(arg_list)
# parse_args()
|
|
71
|
+
|
|
72
|
+
def nanaverage(cc, w):
    """Weighted mean of *cc* ignoring NaN entries.

    Returns NaN when the weights of the non-NaN entries sum to zero
    (including the all-NaN case).
    """
    valid = ~numpy.isnan(cc)
    if numpy.sum(w[valid]) == 0:
        return numpy.nan
    return numpy.average(cc[valid], weights=w[valid])
|
|
77
|
+
|
|
78
|
+
def calc_r_and_cc(hkldata, centric_and_selections, twin_data=None):
    """Per-bin and overall R factors and obs/calc correlations.

    Works on intensities (R1/CCI) when an "I" column is present, otherwise
    on amplitudes (R/CCF).  With a FREE column, work/free sets are reported
    separately.  When twin_data is given, |Fc| comes from the twinned
    calculated intensities.

    Returns (stats, ret): a per-bin DataFrame and a dict of overall values.
    """
    has_int = "I" in hkldata.df
    has_free = "FREE" in hkldata.df
    stats = hkldata.binned_df.copy()
    stats[["n_obs", "n_all"]] = 0
    if has_free:
        stats[["n_work", "n_free"]] = 0
    rlab = "R1" if has_int else "R"
    cclab = "CCI" if has_int else "CCF"
    if twin_data:
        Fc = numpy.sqrt(twin_data.i_calc_twin())
    else:
        Fc = numpy.abs(hkldata.df.FC * hkldata.df.k_aniso)
    if has_int:
        obs = hkldata.df.I
        obs_sqrt = numpy.sqrt(numpy.maximum(0, hkldata.df.I))
        obs_sqrt[hkldata.df.I/hkldata.df.SIGI < 2] = numpy.nan # SHELX equivalent
        calc, calc_sqrt = Fc**2, Fc
    else:
        obs = obs_sqrt = hkldata.df.FP
        calc = calc_sqrt = Fc
    if has_free:
        for lab in (cclab, rlab):
            for suf in ("work", "free"):
                stats[lab+suf] = numpy.nan
    else:
        stats[cclab] = numpy.nan
        stats[rlab] = numpy.nan

    # per-bin statistics
    for i_bin, idxes in hkldata.binned():
        stats.loc[i_bin, "n_obs"] = numpy.sum(numpy.isfinite(obs[idxes]))
        stats.loc[i_bin, "n_all"] = len(idxes)
        if has_free:
            # sel[1] = work set, sel[2] = free set
            for j, suf in ((1, "work"), (2, "free")):
                idxes2 = numpy.concatenate([sel[j] for sel in centric_and_selections[i_bin]])
                stats.loc[i_bin, "n_"+suf] = numpy.sum(numpy.isfinite(obs[idxes2]))
                stats.loc[i_bin, cclab+suf] = utils.hkl.correlation(obs[idxes2], calc[idxes2])
                stats.loc[i_bin, rlab+suf] = utils.hkl.r_factor(obs_sqrt[idxes2], calc_sqrt[idxes2])
        else:
            stats.loc[i_bin, cclab] = utils.hkl.correlation(obs[idxes], calc[idxes])
            stats.loc[i_bin, rlab] = utils.hkl.r_factor(obs_sqrt[idxes], calc_sqrt[idxes])

    # Overall
    ret = {}
    if has_free:
        for suf in ("work", "free"):
            ret[cclab+suf+"avg"] = nanaverage(stats[cclab+suf], stats["n_"+suf])
        for j, suf in ((1, "work"), (2, "free")):
            idxes = numpy.concatenate([sel[j] for i_bin, _ in hkldata.binned() for sel in centric_and_selections[i_bin]])
            ret[rlab+suf] = utils.hkl.r_factor(obs_sqrt[idxes], calc_sqrt[idxes])
    else:
        ret[cclab+"avg"] = nanaverage(stats[cclab], stats["n_obs"])
        ret[rlab] = utils.hkl.r_factor(obs, calc)

    return stats, ret
# calc_r_and_cc()
|
|
135
|
+
|
|
136
|
+
class VarTrans:
    """Variable transformations used to constrain D and S to positive values.

    For each of D and S, exposes the forward transform f(x), its derivative
    df/dx, and the inverse f^-1.  None means identity (no constraint).
    """
    def __init__(self, D_trans, S_trans):
        # splus (softplus) appears to be better behaved than exp, which can
        # sometimes drive the parameter to very large values.
        funcs = {
            "exp": (numpy.exp,                              # D = f(x)
                    numpy.exp,                              # dD/dx
                    numpy.log),                             # x = f^-1(D)
            "splus": (lambda x: numpy.logaddexp(0, x),      # softplus
                      scipy.special.expit,                  # sigmoid = d softplus/dx
                      lambda x: x + numpy.log(-numpy.expm1(-x))),  # softplus inverse
            None: (lambda x: x,
                   lambda x: 1,
                   lambda x: x),
        }
        self.D, self.D_deriv, self.D_inv = funcs[D_trans]
        self.S, self.S_deriv, self.S_inv = funcs[S_trans]
# class VarTrans
|
|
153
|
+
|
|
154
|
+
class LsqScale:
    """Least-squares scaling of calculated to observed structure factors.

    Refines an overall scale, an anisotropic overall B (symmetry
    constrained), and optionally a bulk-solvent model (k_sol, B_sol).
    Parameter vector: x = [k_overall, adp_pars..., k_sol, B_sol].
    """
    def __init__(self, k_as_exp=False, func_type="log_cosh"):
        assert func_type in ("sq", "log_cosh")
        # optionally parameterise the overall scale as exp(x) to keep it positive
        self.k_trans = lambda x: numpy.exp(x) if k_as_exp else x
        self.k_trans_der = lambda x: numpy.exp(x) if k_as_exp else 1
        self.k_trans_inv = lambda x: numpy.log(x) if k_as_exp else x
        self.func_type = func_type
        self.reset()

    def reset(self):
        """Restore default solvent parameters and clear refined values."""
        self.k_sol = 0.35 # same default as gemmi/scaling.hpp # refmac seems to use 0.33 and 100? SCALE_LS_PART
        self.b_sol = 46.
        self.k_overall = None
        self.b_iso = None
        self.b_aniso = None
        self.stats = {}

    def set_data(self, hkldata, fc_list, use_int=False, sigma_cutoff=None, twin_data=None):
        """Attach observed data and one or two Fc arrays (second = mask Fc)."""
        assert 0 < len(fc_list) < 3
        self.use_int = use_int
        if sigma_cutoff is not None:
            # observations below the cutoff are excluded (set to NaN below)
            if use_int:
                self.sel = hkldata.df.I / hkldata.df.SIGI > sigma_cutoff
                self.labcut = "(I/SIGI>{})".format(sigma_cutoff)
            else:
                self.sel = hkldata.df.FP / hkldata.df.SIGFP > sigma_cutoff
                self.labcut = "(F/SIGF>{})".format(sigma_cutoff)
        else:
            self.sel = hkldata.df.index
            self.labcut = ""
        self.obs = hkldata.df["I" if use_int else "FP"].to_numpy(copy=True)
        self.obs[~self.sel] = numpy.nan
        self.calc = [x for x in fc_list]
        self.s2mat = hkldata.ssq_mat()
        self.s2 = 1. / hkldata.d_spacings().to_numpy()**2
        self.adpdirs = utils.model.adp_constraints(hkldata.sg.operations(), hkldata.cell, tr0=False)
        self.twin_data = twin_data
        if use_int:
            self.sqrt_obs = numpy.sqrt(self.obs)

    def get_solvent_scale(self, k_sol, b_sol, s2=None):
        """Bulk-solvent scale k_sol * exp(-B_sol * s^2 / 4)."""
        if s2 is None:
            s2 = self.s2
        return k_sol * numpy.exp(-b_sol * s2 / 4)

    def fc_and_mask_grad(self, x):
        """Return (|Fc_total|, d|Fc|/dk_sol, d|Fc|/dB_sol).

        The derivatives are None when there is no solvent contribution.
        """
        fc0 = self.calc[0]
        if len(self.calc) == 2:
            if self.twin_data:
                r = self.twin_data.scaling_fc_and_mask_grad(self.calc[1], x[-2], x[-1])
                return r[:, 0], r[:, 1], r[:, 2]
            fmask = self.calc[1]
            temp_sol = numpy.exp(-x[-1] * self.s2 / 4)
            fbulk = x[-2] * temp_sol * fmask
            fc = fc0 + fbulk
            re_fmask_fcconj = (fmask * fc.conj()).real
            fc_abs = numpy.abs(fc)
            tmp = temp_sol / fc_abs * re_fmask_fcconj
            return fc_abs, tmp, -tmp * x[-2] * self.s2 / 4
        if self.twin_data:
            return numpy.sqrt(self.twin_data.i_calc_twin()), None, None
        return numpy.abs(fc0), None, None

    def scaled_fc(self, x):
        """|Fc| after applying overall scale and anisotropic B."""
        fc = self.fc_and_mask_grad(x)[0]
        nadp = self.adpdirs.shape[0]
        B = numpy.dot(x[1:nadp+1], self.adpdirs)
        kani = numpy.exp(numpy.dot(-B, self.s2mat))
        return self.k_trans(x[0]) * kani * fc

    def target(self, x):
        """Scaling residual: sum of squared or log-cosh differences."""
        y = self.scaled_fc(x)
        # with intensities, compare on the sqrt(I) scale
        diff = (self.sqrt_obs - y) if self.use_int else (self.obs - y)
        if self.func_type == "sq":
            return numpy.nansum(diff**2)
        if self.func_type == "log_cosh":
            return numpy.nansum(gemmi.log_cosh(diff))
        raise RuntimeError("bad func_type")

    def grad(self, x):
        """Analytical gradient of target() with respect to x."""
        g = numpy.zeros_like(x)
        fc_abs, der_ksol, der_bsol = self.fc_and_mask_grad(x)
        nadp = self.adpdirs.shape[0]
        B = numpy.dot(x[1:nadp+1], self.adpdirs)
        kani = numpy.exp(numpy.dot(-B, self.s2mat))
        k = self.k_trans(x[0])
        y = k * kani * fc_abs
        diff = (self.sqrt_obs if self.use_int else self.obs) - y
        diff_der = -1
        if self.func_type == "sq":
            dfdy = 2 * diff * diff_der
        elif self.func_type == "log_cosh":
            dfdy = numpy.tanh(diff) * diff_der
        else:
            raise RuntimeError("bad func_type")

        dfdb = numpy.nansum(-self.s2mat * k * fc_abs * kani * dfdy, axis=1)
        g[0] = numpy.nansum(kani * fc_abs * dfdy * self.k_trans_der(x[0]))
        g[1:nadp+1] = numpy.dot(dfdb, self.adpdirs.T)
        if len(self.calc) == 2:
            g[-2] = numpy.nansum(k * kani * der_ksol * dfdy)
            g[-1] = numpy.nansum(k * kani * der_bsol * dfdy)
        return g

    def calc_shift(self, x):
        """Gauss-Newton style shift -g H^-1 using per-reflection derivatives."""
        # TODO: sort out code duplication, if we use this.
        g = numpy.zeros((len(self.obs), len(x)))
        fc_abs, der_ksol, der_bsol = self.fc_and_mask_grad(x)
        nadp = self.adpdirs.shape[0]
        B = numpy.dot(x[1:nadp+1], self.adpdirs)
        kani = numpy.exp(numpy.dot(-B, self.s2mat))
        k = self.k_trans(x[0])
        y = k * kani * fc_abs
        if self.use_int:
            diff = self.sqrt_obs - y
            diff_der = -1
            diff_der2 = 0
        else:
            diff = self.obs - y
            diff_der = -1.
            diff_der2 = 0.

        if self.func_type == "sq":
            dfdy = 2 * diff * diff_der
            dfdy2 = 2 * diff_der**2 + 2 * diff * diff_der2
        elif self.func_type == "log_cosh":
            dfdy = numpy.tanh(diff) * diff_der
            # tanh(d)/d form stays finite for large |diff| (sech^2 underflows)
            dfdy2 = numpy.where(diff == 0, 1., numpy.tanh(diff) / diff) * diff_der**2 + numpy.tanh(diff) * diff_der2
        else:
            raise RuntimeError("bad func_type")

        dfdb = -self.s2mat * k * fc_abs * kani
        g[:, 0] = kani * fc_abs * self.k_trans_der(x[0])
        g[:, 1:nadp+1] = numpy.dot(dfdb.T, self.adpdirs.T)
        if len(self.calc) == 2:
            g[:, -2] = k * kani * der_ksol
            g[:, -1] = k * kani * der_bsol

        # no numpy.nandot..
        g, dfdy, dfdy2 = g[self.sel, :], dfdy[self.sel], dfdy2[self.sel]
        H = numpy.dot(g.T, g * dfdy2[:, None])
        g = numpy.sum(dfdy[:, None] * g, axis=0)
        return -numpy.dot(g, numpy.linalg.pinv(H))

    def initial_kb(self):
        """Initial overall k, B from a linear fit of log(Fc/Fo) vs s^2."""
        fc_abs = self.fc_and_mask_grad([self.k_sol, self.b_sol])[0]
        sel = self.obs > 0 # exclude nan as well
        f1p, f2p, s2p = self.obs[sel], fc_abs[sel], self.s2[sel]
        if self.use_int:
            f2p *= f2p
        tmp = numpy.log(f2p) - numpy.log(f1p)
        # g = [dT/dk, dT/db]
        g = numpy.array([2 * numpy.sum(tmp), -numpy.sum(tmp*s2p)/2])
        H = numpy.zeros((2, 2))
        H[0, 0] = 2*len(f1p)
        H[1, 1] = numpy.sum(s2p**2/8)
        H[0, 1] = H[1, 0] = -numpy.sum(s2p)/2
        x = -numpy.dot(numpy.linalg.inv(H), g)
        if self.use_int:
            x /= 2
        k = numpy.exp(x[0])
        b = x[1]
        logger.writeln(" initial k,b = {:.2e} {:.2e}".format(k, b))
        logger.writeln(" R{} = {:.4f}".format(self.labcut, utils.hkl.r_factor(f1p, f2p * k * numpy.exp(-b*self.s2[sel]/4))))
        return k, b

    def scale(self):
        """Run the scaling minimisation and store results on the instance."""
        use_sol = len(self.calc) == 2
        msg = "Scaling Fc to {} {} bulk solvent contribution".format("Io" if self.use_int else "Fo",
                                                                     "with" if use_sol else "without")
        logger.writeln(msg)
        if self.k_overall is None or self.b_iso is None:
            k, b = self.initial_kb()
        else:
            k, b = self.k_overall, self.b_iso
        if self.b_aniso is None:
            self.b_aniso = gemmi.SMat33d(b, b, b, 0, 0, 0)
        x0 = [self.k_trans_inv(k)]
        bounds = [(0, None)]
        x0.extend(numpy.dot(self.b_aniso.elements_pdb(), self.adpdirs.T))
        bounds.extend([(None, None)]*(len(x0)-1))
        if use_sol:
            x0.extend([self.k_sol, self.b_sol])
            bounds.extend([(1e-4, None), (10., 400.)])

        t0 = time.time()
        # custom Newton iteration with step halving
        # (a scipy.optimize.minimize path existed but was disabled dead code)
        x = x0
        for i in range(40):
            x_ini = x.copy()
            f0 = self.target(x)
            dx = self.calc_shift(x)
            if numpy.max(numpy.abs(dx)) < 1e-6:
                # NOTE(review): if this triggers on the very first iteration,
                # f1 below is unbound — presumably never happens in practice; verify.
                break
            for s in (1, 0.5, 0.25):
                x = x_ini + dx * s
                # clamp parameters back into their feasible ranges
                if x[0] < 0:
                    x[0] = x0[0]
                if use_sol:
                    if x[-1] < 10:
                        x[-1] = 10
                    elif x[-1] > 400:
                        x[-1] = 400
                    if x[-2] < 1e-4:
                        x[-2] = 1e-4
                f1 = self.target(x)
                if f1 < f0:
                    break
            if 0 < (f0 - f1) / f0 < 1e-6:
                break
        res_x = x
        self.stats["fun"] = f1
        self.stats["x"] = x
        logger.writeln(" time: {:.3f} sec".format(time.time() - t0))

        self.k_overall = self.k_trans(res_x[0])
        nadp = self.adpdirs.shape[0]
        b_overall = gemmi.SMat33d(*numpy.dot(res_x[1:nadp+1], self.adpdirs))
        self.b_iso = b_overall.trace() / 3
        self.b_aniso = b_overall.added_kI(-self.b_iso) # subtract isotropic contribution

        logger.writeln(" k_ov= {:.2e} B_iso= {:.2e} B_aniso= {}".format(self.k_overall, self.b_iso, self.b_aniso))
        if use_sol:
            self.k_sol = res_x[-2]
            self.b_sol = res_x[-1]
            logger.writeln(" k_sol= {:.2e} B_sol= {:.2e}".format(self.k_sol, self.b_sol))
        calc = self.scaled_fc(res_x)
        if self.use_int:
            calc *= calc
        self.stats["cc"] = utils.hkl.correlation(self.obs, calc)
        self.stats["r"] = utils.hkl.r_factor(self.obs, calc)
        logger.writeln(" CC{} = {:.4f}".format(self.labcut, self.stats["cc"]))
        logger.writeln(" R{} = {:.4f}".format(self.labcut, self.stats["r"]))
# class LsqScale
|
|
430
|
+
|
|
431
|
+
def calc_abs_DFc(Ds, Fcs):
    """|DFc| where DFc = sum_j D_j * Fc_j."""
    DFc = sum(d * f for d, f in zip(Ds, Fcs))
    return numpy.abs(DFc)
# calc_abs_DFc()
|
|
435
|
+
|
|
436
|
+
#import line_profiler
|
|
437
|
+
#profile = line_profiler.LineProfiler()
|
|
438
|
+
#import atexit
|
|
439
|
+
#atexit.register(profile.print_stats)
|
|
440
|
+
#@profile
|
|
441
|
+
def mlf(df, fc_labs, Ds, S, k_ani, idxes):
    """Amplitude-based ML target summed over the selected reflections.

    Fix: the original applied numpy.nansum twice — ext.ll_amp's per-reflection
    values were reduced once into `ll` and then nansum'd again on return.
    One reduction suffices (matching the sibling mli()).
    """
    Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
    DFc = (Ds * Fcs).sum(axis=1)
    ll = ext.ll_amp(df.FP.to_numpy()[idxes], df.SIGFP.to_numpy()[idxes],
                    k_ani[idxes], S * df.epsilon.to_numpy()[idxes],
                    numpy.abs(DFc), df.centric.to_numpy()[idxes]+1)
    return numpy.nansum(ll)
# mlf()
|
|
449
|
+
|
|
450
|
+
#@profile
|
|
451
|
+
def deriv_mlf_wrt_D_S(df, fc_labs, Ds, S, k_ani, idxes):
    """Gradient of the amplitude-based ML target w.r.t. D_j and S."""
    Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
    r = ext.ll_amp_der1_DS(df.FP.to_numpy()[idxes], df.SIGFP.to_numpy()[idxes], k_ani[idxes], S,
                           Fcs, Ds,
                           df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes])
    g = numpy.zeros(len(fc_labs)+1)
    g[:len(fc_labs)] = numpy.nansum(r[:, :len(fc_labs)], axis=0) # D
    g[-1] = numpy.nansum(r[:, -1]) # S
    return g
# deriv_mlf_wrt_D_S()
|
|
461
|
+
|
|
462
|
+
#@profile
|
|
463
|
+
def mlf_shift_S(df, fc_labs, Ds, S, k_ani, idxes):
    """Newton shift for S from the amplitude-based ML derivatives."""
    Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
    r = ext.ll_amp_der1_DS(df.FP.to_numpy()[idxes], df.SIGFP.to_numpy()[idxes], k_ani[idxes], S,
                           Fcs, Ds,
                           df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes])
    g = numpy.nansum(r[:, -1])
    H = numpy.nansum(r[:, -1]**2) # approximating expectation value of second derivative
    return -g / H
# mlf_shift_S()
|
|
472
|
+
|
|
473
|
+
def mli(df, fc_labs, Ds, S, k_ani, idxes):
    """Intensity-based ML target summed over the selected reflections."""
    Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
    DFc = (Ds * Fcs).sum(axis=1)
    ll = integr.ll_int(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes],
                       k_ani[idxes], S * df.epsilon.to_numpy()[idxes],
                       numpy.abs(DFc), df.centric.to_numpy()[idxes]+1)
    return numpy.nansum(ll)
# mli()
|
|
481
|
+
|
|
482
|
+
def deriv_mli_wrt_D_S(df, fc_labs, Ds, S, k_ani, idxes):
    """Gradient of the intensity-based ML target w.r.t. D_j and S."""
    Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
    r = integr.ll_int_der1_DS(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes], k_ani[idxes], S,
                              Fcs, Ds,
                              df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes])
    g = numpy.zeros(len(fc_labs)+1)
    g[:len(fc_labs)] = numpy.nansum(r[:, :len(fc_labs)], axis=0) # D
    g[-1] = numpy.nansum(r[:, -1]) # S
    return g
# deriv_mli_wrt_D_S()
|
|
492
|
+
|
|
493
|
+
def mli_shift_D(df, fc_labs, Ds, S, k_ani, idxes):
    """Newton shift for the D parameters from intensity-based ML derivatives."""
    Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
    r = integr.ll_int_der1_DS(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes], k_ani[idxes], S,
                              Fcs, Ds,
                              df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes])[:, :len(fc_labs)]
    g = numpy.nansum(r, axis=0) # D
    # H approximated by the outer product of first derivatives (Fisher-like)
    H = numpy.nansum(numpy.matmul(r[:, :, None], r[:, None]), axis=0)
    return -numpy.dot(g, numpy.linalg.pinv(H))
# mli_shift_D()
|
|
504
|
+
|
|
505
|
+
def mli_shift_S(df, fc_labs, Ds, S, k_ani, idxes):
    """Newton shift for S from the intensity-based ML derivatives."""
    Fcs = numpy.vstack([df[lab].to_numpy()[idxes] for lab in fc_labs]).T
    r = integr.ll_int_der1_DS(df.I.to_numpy()[idxes], df.SIGI.to_numpy()[idxes], k_ani[idxes], S,
                              Fcs, Ds,
                              df.centric.to_numpy()[idxes]+1, df.epsilon.to_numpy()[idxes])
    g = numpy.nansum(r[:, -1])
    H = numpy.nansum(r[:, -1]**2) # approximating expectation value of second derivative
    return -g / H
# mli_shift_S()
|
|
514
|
+
|
|
515
|
+
def mltwin_est_ftrue(twin_data, df, k_ani, idxes):
    """Update twin_data's F_true estimates from anisotropy-corrected I/SIGI.

    Reflections outside idxes are passed as NaN.
    """
    kani2_inv = 1 / k_ani**2
    i_sigi = numpy.full((2, len(df.index)), numpy.nan)
    i_sigi[0, idxes] = (df.I.to_numpy() * kani2_inv)[idxes]
    i_sigi[1, idxes] = (df.SIGI.to_numpy() * kani2_inv)[idxes]
    twin_data.est_f_true(i_sigi[0, :], i_sigi[1, :])
# mltwin_est_ftrue()
|
|
523
|
+
|
|
524
|
+
def mltwin(df, twin_data, Ds, S, k_ani, idxes, i_bin):
    """Set the ML parameters for bin i_bin, refresh F_true, return the -LL."""
    twin_data.ml_sigma[i_bin] = S
    twin_data.ml_scale[i_bin, :] = Ds
    mltwin_est_ftrue(twin_data, df, k_ani, idxes)
    return twin_data.ll()
# mltwin()
|
|
530
|
+
|
|
531
|
+
def deriv_mltwin_wrt_D_S(df, twin_data, Ds, S, k_ani, idxes, i_bin):
    """Gradient of the twin -log-likelihood w.r.t. D_j and S for bin i_bin.

    Fix: the trailing marker comment wrongly said deriv_mlf_wrt_D_S()
    (copy-paste from the non-twin version).
    """
    twin_data.ml_sigma[i_bin] = S
    twin_data.ml_scale[i_bin, :] = Ds
    mltwin_est_ftrue(twin_data, df, k_ani, idxes)
    r = twin_data.ll_der_D_S()
    g = numpy.zeros(r.shape[1])
    g[:-1] = numpy.nansum(r[:, :-1], axis=0) # D
    g[-1] = numpy.nansum(r[:, -1]) # S
    return g
# deriv_mltwin_wrt_D_S()
|
|
541
|
+
|
|
542
|
+
def mltwin_shift_S(df, twin_data, Ds, S, k_ani, idxes, i_bin):
    """Newton shift for S from the twin -log-likelihood derivatives.

    Fix: the trailing marker comment wrongly said mlf_shift_S()
    (copy-paste from the non-twin version).
    """
    twin_data.ml_sigma[i_bin] = S
    twin_data.ml_scale[i_bin, :] = Ds
    mltwin_est_ftrue(twin_data, df, k_ani, idxes)
    r = twin_data.ll_der_D_S()
    g = numpy.nansum(r[:, -1])
    H = numpy.nansum(r[:, -1]**2) # approximating expectation value of second derivative
    return -g / H
# mltwin_shift_S()
|
|
551
|
+
|
|
552
|
+
def determine_mlf_params_from_cc(hkldata, fc_labs, D_labs, centric_and_selections, use="all", smoothing="gauss"):
    """Estimate per-bin D_j and S from amplitude correlations.

    Uses the theoretical relation between CC(|F1|,|F2|) and CC(F1,F2) to
    convert observed amplitude correlations into complex correlations,
    solves a linear system for the D_j, and derives S from the residual
    variance.  Results are written into hkldata.binned_df and smoothed.

    Fixes vs. original: removed dead code (a `mat` list-of-lists and a first
    `A` list-of-lists, both immediately overwritten by the final
    `A = numpy.array(...)`); fixed comment typo "theorhetical".
    """
    # theoretical values
    cc_a = lambda cc: (numpy.pi/4*(1-cc**2)**2 * scipy.special.hyp2f1(3/2, 3/2, 1, cc**2) - numpy.pi/4) / (1-numpy.pi/4)
    cc_c = lambda cc: 2/(numpy.pi-2) * (cc**2*numpy.sqrt(1-cc**2) + cc * numpy.arctan(cc/numpy.sqrt(1-cc**2)) + (1-cc**2)**(3/2)-1)
    table_fsc = numpy.arange(0, 1, 1e-3)
    table_cc = [cc_a(table_fsc), cc_c(table_fsc)]

    for lab in D_labs: hkldata.binned_df[lab] = 1.
    hkldata.binned_df["S"] = 1.

    stats = hkldata.binned_df[["d_max", "d_min"]].copy()
    for i, labi in enumerate(fc_labs):
        stats["CC(FP,{})".format(labi)] = numpy.nan
    for i, labi in enumerate(fc_labs):
        for j in range(i+1, len(fc_labs)):
            labj = fc_labs[j]
            stats["CC({},{})".format(labi, labj)] = numpy.nan

    # sqrt of eps * c; c = 1 for acentrics and 2 for centrics
    inv_sqrt_c_eps = 1. / numpy.sqrt(hkldata.df.epsilon.to_numpy() * (hkldata.df.centric.to_numpy() + 1))
    for i_bin, _ in hkldata.binned():
        # assume they are all acentrics.. only correct by c
        if use == "all":
            cidxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin] for i in (1,2)])
        else:
            i = 1 if use == "work" else 2
            cidxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin]])
        valid_sel = numpy.isfinite(hkldata.df.FP.to_numpy()[cidxes])
        cidxes = cidxes[valid_sel]
        factor = inv_sqrt_c_eps[cidxes]
        k_ani = hkldata.df.k_aniso.to_numpy()[cidxes]
        Fo = hkldata.df.FP.to_numpy()[cidxes] * factor / k_ani
        mean_Fo2 = numpy.mean(Fo**2)
        SigFo = hkldata.df.SIGFP.to_numpy()[cidxes] / k_ani
        Fcs = [hkldata.df[lab].to_numpy()[cidxes] * factor for lab in fc_labs]
        mean_Fk2 = numpy.array([numpy.mean(numpy.abs(fk)**2) for fk in Fcs])

        # estimate D
        cc_fo_fj = [numpy.corrcoef(numpy.abs(fj), Fo)[1,0] for fj in Fcs]
        for i in range(len(fc_labs)): stats.loc[i_bin, "CC(FP,{})".format(fc_labs[i])] = cc_fo_fj[i]
        A = numpy.array([[numpy.real(numpy.corrcoef(fk, fj)[1,0]) for fk in Fcs] for fj in Fcs])
        v = numpy.interp(cc_fo_fj, table_cc[0], table_fsc)

        for i in range(len(fc_labs)):
            labi = fc_labs[i]
            for j in range(i+1, len(fc_labs)):
                labj = fc_labs[j]
                stats.loc[i_bin, "CC({},{})".format(labi, labj)] = numpy.real(numpy.corrcoef(Fcs[i], Fcs[j])[1,0])

        # test all signs, fixing first Fc positive.
        cc_max = -2
        for v_test in itertools.product(*((x, -x) for x in v[1:])):
            v_test = numpy.array((v[0],)+v_test)
            Dj_test = numpy.dot(numpy.linalg.pinv(A), v_test) * numpy.sqrt(mean_Fo2 / mean_Fk2)
            DFc_test = calc_abs_DFc(Dj_test, Fcs)
            cc_test = numpy.corrcoef(Fo, numpy.abs(DFc_test))[1,0]
            if cc_test > cc_max:
                cc_max = cc_test
                v_max = v_test
                DFc = DFc_test
                Dj = Dj_test

        for lab, D in zip(D_labs, Dj):
            hkldata.binned_df.loc[i_bin, lab] = D

        # estimate S
        mean_DFc2 = numpy.mean(DFc**2)
        est_fsc_fo_fc = numpy.interp(numpy.corrcoef(Fo, DFc)[1,0], table_cc[0], table_fsc)
        S = mean_Fo2 - 2 * numpy.sqrt(mean_Fo2 * mean_DFc2) * est_fsc_fo_fc + mean_DFc2 - numpy.mean(SigFo**2)
        hkldata.binned_df.loc[i_bin, "S"] = S

    logger.writeln("\nCC:")
    logger.writeln(stats.to_string())
    logger.writeln("\nEstimates:")
    logger.writeln(hkldata.binned_df.to_string())
    smooth_params(hkldata, D_labs, smoothing)
# determine_mlf_params_from_cc()
|
|
635
|
+
|
|
636
|
+
def initialize_ml_params(hkldata, use_int, D_labs, b_aniso, centric_and_selections, use, twin_data=None):
    """Set initial per-bin estimates of the ML parameters D (one per Fc component)
    and S (variance term) from intensity/amplitude vs. Fcalc correlation.

    Results are written into hkldata.binned_df[D_labs] and ["S"]; when twin_data
    is given, its ml_scale / ml_sigma arrays are synchronised as well.
    """
    # Initial values
    for lab in D_labs: hkldata.binned_df[lab] = 1.
    hkldata.binned_df["S"] = 10000.
    k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
    lab_obs = "I" if use_int else "FP"
    for i_bin, _ in hkldata.binned():
        # sel = (centric_flag, work_indices, test_indices); pick subset per `use`
        if use == "all":
            idxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin] for i in (1,2)])
        else:
            i = 1 if use == "work" else 2
            idxes = numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin]])
        valid_sel = numpy.isfinite(hkldata.df.loc[idxes, lab_obs]) # as there is no nan-safe numpy.corrcoef
        if numpy.sum(valid_sel) < 2:
            continue
        idxes = idxes[valid_sel]
        if use_int:
            Io = hkldata.df.I.to_numpy()[idxes]
        else:
            Io = hkldata.df.FP.to_numpy()[idxes]**2
        Io /= k_ani[idxes]**2  # undo anisotropic scale before comparing with Ic
        if twin_data:
            Ic = twin_data.i_calc_twin()[idxes]
        else:
            Ic = numpy.abs(hkldata.df.FC.to_numpy()[idxes])**2
        mean_Io = numpy.mean(Io)
        mean_Ic = numpy.mean(Ic)
        cc = numpy.corrcoef(Io, Ic)[1,0]
        if cc > 0 and mean_Io > 0:
            D = numpy.sqrt(mean_Io / mean_Ic * cc)
        else:
            D = 0 # will be taken care later
        # Only the first D gets a data-driven start; the rest stay at 1.
        hkldata.binned_df.loc[i_bin, D_labs[0]] = D
        if mean_Io > 0:
            S = mean_Io - 2 * numpy.sqrt(mean_Io * mean_Ic * numpy.maximum(0, cc)) + mean_Ic
        else:
            S = numpy.std(Io) # similar initial to french_wilson
        hkldata.binned_df.loc[i_bin, "S"] = S

    for D_lab in D_labs:
        if hkldata.binned_df[D_lab].min() <= 0:
            min_D = hkldata.binned_df[D_lab][hkldata.binned_df[D_lab] > 0].min() * 0.1
            logger.writeln("WARNING: negative {} is detected from initial estimates. Replacing it using minimum positive value {:.2e}".format(D_lab, min_D))
            # FIX: Series.where(..., inplace=True) on a column selection is
            # deprecated chained mutation in modern pandas and may silently
            # not propagate; assign the result back explicitly instead.
            hkldata.binned_df[D_lab] = hkldata.binned_df[D_lab].where(hkldata.binned_df[D_lab] > 0, min_D) # arbitrary

    if twin_data:
        twin_data.ml_scale[:] = hkldata.binned_df.loc[:, D_labs]
        twin_data.ml_sigma[:] = hkldata.binned_df.loc[:, "S"]

    logger.writeln("Initial estimates:")
    logger.writeln(hkldata.binned_df.to_string())
# initialize_ml_params()
|
|
688
|
+
|
|
689
|
+
def determine_ml_params(hkldata, use_int, fc_labs, D_labs, b_aniso, centric_and_selections,
                        D_trans=None, S_trans=None, use="all", n_cycle=1, smoothing="gauss",
                        twin_data=None):
    """Refine the per-bin ML (sigma-A style) parameters D_j and S, and for
    intensity data (non-twin) also the overall anisotropic B.

    Parameters are stored per bin in hkldata.binned_df and finally smoothed
    into per-reflection columns of hkldata.df via smooth_params().
    Returns the (possibly updated) b_aniso.
    """
    assert use in ("all", "work", "test")
    assert smoothing in (None, "gauss")
    # BUG FIX: original expression `A + " (twin)" if twin_data else ""` binds as
    # `(A + " (twin)") if twin_data else ""`, so without twinning the message
    # lost the "intensities"/"amplitudes" word. Parenthesised correctly here.
    logger.writeln("Estimating sigma-A parameters using {}..".format(
        ("intensities" if use_int else "amplitudes") + (" (twin)" if twin_data else "")))
    trans = VarTrans(D_trans, S_trans)
    lab_obs = "I" if use_int else "FP"
    def get_idxes(i_bin):
        # reflection indices of the requested subset (all/work/test) in this bin
        if use == "all":
            return numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin] for i in (1,2)])
        else:
            i = 1 if use == "work" else 2
            return numpy.concatenate([sel[i] for sel in centric_and_selections[i_bin]])

    if not set(D_labs + ["S"]).issubset(hkldata.binned_df):
        initialize_ml_params(hkldata, use_int, D_labs, b_aniso, centric_and_selections, use, twin_data=twin_data)
    for dlab, fclab in zip(D_labs, fc_labs):
        hkldata.binned_df["Mn(|{}*{}|)".format(dlab, fclab)] = numpy.nan

    refpar = "all"
    for i_cyc in range(n_cycle):
        t0 = time.time()
        nfev_total = 0
        k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
        for i_bin, _ in hkldata.binned():
            idxes = get_idxes(i_bin)
            valid_sel = numpy.isfinite(hkldata.df.loc[idxes, lab_obs]) # as there is no nan-safe numpy.corrcoef
            if numpy.sum(valid_sel) < 5:
                logger.writeln("WARNING: bin {} has no sufficient reflections".format(i_bin))
                continue

            def target(x):
                # -LL for current bin; which parameters live in x depends on refpar
                if refpar == "all":
                    Ds = trans.D(x[:len(fc_labs)])
                    S = trans.S(x[-1])
                elif refpar == "D":
                    Ds = trans.D(x[:len(fc_labs)])
                    S = hkldata.binned_df.loc[i_bin, "S"]
                else:
                    Ds = [hkldata.binned_df.loc[i_bin, lab] for lab in D_labs]
                    S = trans.S(x[-1])

                if twin_data:
                    return mltwin(hkldata.df, twin_data, Ds, S, k_ani, idxes, i_bin)
                else:
                    f = mli if use_int else mlf
                    return f(hkldata.df, fc_labs, Ds, S, k_ani, idxes)

            def grad(x):
                # gradient of target() w.r.t. the (transformed) parameters in x
                if refpar == "all":
                    Ds = trans.D(x[:len(fc_labs)])
                    S = trans.S(x[-1])
                    n_par = len(fc_labs)+1
                elif refpar == "D":
                    Ds = trans.D(x[:len(fc_labs)])
                    S = hkldata.binned_df.loc[i_bin, "S"]
                    n_par = len(fc_labs)
                else:
                    Ds = [hkldata.binned_df.loc[i_bin, lab] for lab in D_labs]
                    S = trans.S(x[-1])
                    n_par = 1
                if twin_data:
                    r = deriv_mltwin_wrt_D_S(hkldata.df, twin_data, Ds, S, k_ani, idxes, i_bin)
                else:
                    calc_deriv = deriv_mli_wrt_D_S if use_int else deriv_mlf_wrt_D_S
                    r = calc_deriv(hkldata.df, fc_labs, Ds, S, k_ani, idxes)
                g = numpy.zeros(n_par)
                if refpar in ("all", "D"):
                    g[:len(fc_labs)] = r[:len(fc_labs)]
                    g[:len(fc_labs)] *= trans.D_deriv(x[:len(fc_labs)])  # chain rule for transform
                if refpar in ("all", "S"):
                    g[-1] = r[-1]
                    g[-1] *= trans.S_deriv(x[-1])
                return g

            if 0:  # debug: dump S line scan for this bin
                refpar = "S"
                x0 = trans.S_inv(hkldata.binned_df.loc[i_bin, "S"])
                with open("s_line_{}.dat".format(i_bin), "w") as ofs:
                    for sval in numpy.linspace(1, x0*2, 100):
                        ofs.write("{:.4e} {:.10e} {:.10e}\n".format(sval,
                                                                    target([sval]),
                                                                    grad([sval])[0]))
                continue
            #print("Bin", i_bin)
            if 1: # refine D and S iteratively
                vals_last = None
                for ids in range(10):  # alternating D / S mini-cycles
                    refpar = "D"
                    x0 = numpy.array([trans.D_inv(hkldata.binned_df.loc[i_bin, lab]) for lab in D_labs])
                    #print("MLTWIN=", target(x0))
                    #quit()
                    if 0:  # debug: compare analytical vs numerical D derivative
                        h = 1e-3
                        f00 = target(x0)
                        g00 = grad(x0)
                        for ii in range(len(x0)):
                            xx = x0.copy()
                            xx[ii] += h
                            f01 = target(xx)
                            nder = (f01 - f00) / h
                            logger.writeln(f"DEBUG_der_D bin_{i_bin} {ii} ad={g00[ii]} nd={nder} r={g00[ii]/nder}")
                    vals_now = []
                    if 0:  # alternative: Newton-like shift with step halving (disabled)
                        f0 = target(x0)
                        nfev_total += 1
                        shift = mli_shift_D(hkldata.df, fc_labs, trans.D(x0), hkldata.binned_df.loc[i_bin, "S"], k_ani, idxes)
                        shift /= trans.D_deriv(x0)
                        #if abs(shift) < 1e-3: break
                        for itry in range(10):
                            x1 = x0 + shift
                            if (D_trans and any(x1 < -3)) or (not D_trans and any(x1 < 5e-2)):
                                #print(i_bin, cyc_s, trans.S(x0), trans.S(x1), shift, "BAD")
                                shift /= 2
                                continue
                            f1 = target(x1)
                            nfev_total += 1
                            if f1 > f0:
                                shift /= 2
                                continue
                            else: # good
                                for i, lab in enumerate(D_labs):
                                    hkldata.binned_df.loc[i_bin, lab] = trans.D(x1[i])
                                    vals_now.append(hkldata.binned_df.loc[i_bin, lab])
                                break
                        else:
                            break
                    else:
                        #print(mli_shift_D(hkldata.df, fc_labs, trans.D(x0), hkldata.binned_df.S[i_bin], k_ani, idxes))
                        res = scipy.optimize.minimize(fun=target, x0=x0, jac=grad,
                                                      bounds=((-5 if D_trans else 1e-5, None),)*len(x0))
                        nfev_total += res.nfev
                        #print(i_bin, "mini cycle", ids, refpar)
                        #print(res)
                        for i, lab in enumerate(D_labs):
                            hkldata.binned_df.loc[i_bin, lab] = trans.D(res.x[i])
                            vals_now.append(hkldata.binned_df.loc[i_bin, lab])
                        if twin_data:
                            twin_data.ml_scale[i_bin, :] = trans.D(res.x)
                    refpar = "S"
                    if 1:  # Newton-like S update with step halving
                        for cyc_s in range(1):
                            x0 = trans.S_inv(hkldata.binned_df.loc[i_bin, "S"])
                            if 0:  # debug: compare analytical vs numerical S derivative
                                h = 1e-1
                                f00 = target([x0])
                                g00 = grad([x0])
                                xx = x0 + h
                                f01 = target([xx])
                                nder = (f01 - f00) / h
                                logger.writeln(f"DEBUG_der_S bin_{i_bin} ad={g00} nd={nder} r={g00/nder}")

                            f0 = target([x0])
                            Ds = [hkldata.binned_df.loc[i_bin, lab] for lab in D_labs]
                            nfev_total += 1
                            if twin_data:
                                shift = mltwin_shift_S(hkldata.df, twin_data, Ds, trans.S(x0), k_ani, idxes, i_bin)
                            else:
                                calc_shift_S = mli_shift_S if use_int else mlf_shift_S
                                shift = calc_shift_S(hkldata.df, fc_labs, Ds, trans.S(x0), k_ani, idxes)
                            shift /= trans.S_deriv(x0)
                            if abs(shift) < 1e-3: break
                            for itry in range(10):
                                x1 = x0 + shift
                                if (S_trans and x1 < -3) or (not S_trans and x1 < 5e-2):
                                    #print(i_bin, cyc_s, trans.S(x0), trans.S(x1), shift, "BAD")
                                    shift /= 2
                                    continue
                                f1 = target([x1])
                                nfev_total += 1
                                if f1 > f0:
                                    shift /= 2
                                    continue
                                else: # good
                                    #print(i_bin, cyc_s, trans.S(x0), trans.S(x1), shift)
                                    hkldata.binned_df.loc[i_bin, "S"] = trans.S(x1)
                                    break
                            else:
                                #print("all bad")
                                break
                        if twin_data:
                            twin_data.ml_sigma[i_bin] = hkldata.binned_df.loc[i_bin, "S"]
                    else:
                        # somehow this does not work well.
                        x0 = [trans.S_inv(hkldata.binned_df.loc[i_bin, "S"])]
                        res = scipy.optimize.minimize(fun=target, x0=x0, jac=grad,
                                                      bounds=((-3 if S_trans else 5e-2, None),))
                        nfev_total += res.nfev
                        #print(i_bin, "mini cycle", ids, refpar)
                        #print(res)
                        hkldata.binned_df.loc[i_bin, "S"] = trans.S(res.x[-1])
                        if twin_data:
                            twin_data.ml_sigma[i_bin] = trans.S(res.x[-1])
                    vals_now.append(hkldata.binned_df.loc[i_bin, "S"])
                    vals_now = numpy.array(vals_now)
                    # stop mini-cycling once all parameters changed by < 1%
                    if vals_last is not None and numpy.all(numpy.abs((vals_last - vals_now) / vals_now) < 1e-2):
                        #logger.writeln("converged in mini cycle {}".format(ids+1))
                        break
                    vals_last = vals_now
            else:  # joint refinement of D and S (disabled)
                x0 = [trans.D_inv(hkldata.binned_df.loc[i_bin, lab]) for lab in D_labs] + [trans.S_inv(hkldata.binned_df.loc[i_bin, "S"])]
                res = scipy.optimize.minimize(fun=target, x0=x0, jac=grad,
                                              bounds=((-5 if D_trans else 1e-5, None), )*len(D_labs) + ((-3 if S_trans else 5e-2, None),))
                nfev_total += res.nfev
                #print(i_bin)
                #print(res)
                for i, lab in enumerate(D_labs):
                    hkldata.binned_df.loc[i_bin, lab] = trans.D(res.x[i])
                hkldata.binned_df.loc[i_bin, "S"] = trans.S(res.x[-1])
                if twin_data:
                    twin_data.ml_scale[i_bin, :] = trans.D(res.x[:-1])
                    twin_data.ml_sigma[i_bin] = trans.S(res.x[-1])

        # report per-bin mean |D_j * Fc_j| for diagnostics
        if twin_data:
            dfc = numpy.abs(twin_data.f_calc) * twin_data.ml_scale_array()
            for i_bin, idxes in hkldata.binned():
                dfc_bin = dfc[numpy.asarray(twin_data.bin)==i_bin,:]
                mean_dfc = numpy.nanmean(dfc_bin, axis=0)
                for i, (dlab, fclab) in enumerate(zip(D_labs, fc_labs)):
                    hkldata.binned_df.loc[i_bin, "Mn(|{}*{}|)".format(dlab, fclab)] = mean_dfc[i]
        else:
            for i_bin, idxes in hkldata.binned():
                for dlab, fclab in zip(D_labs, fc_labs):
                    mean_dfc = numpy.nanmean(numpy.abs(hkldata.binned_df[dlab][i_bin] * hkldata.df[fclab][idxes]))
                    hkldata.binned_df.loc[i_bin, "Mn(|{}*{}|)".format(dlab, fclab)] = mean_dfc

        logger.writeln("Refined estimates:")
        logger.writeln(hkldata.binned_df.to_string())
        #numpy.testing.assert_allclose(hkldata.binned_df.S, twin_data.ml_sigma)
        #numpy.testing.assert_allclose(hkldata.binned_df[D_labs], twin_data.ml_scale)
        logger.writeln("time: {:.1f} sec ({} evaluations)".format(time.time() - t0, nfev_total))

        if not use_int or twin_data:
            break # did not implement MLF B_aniso optimization

        # Refine b_aniso
        adpdirs = utils.model.adp_constraints(hkldata.sg.operations(), hkldata.cell, tr0=True)
        SMattolist = lambda B: [B.u11, B.u22, B.u33, B.u12, B.u13, B.u23]

        def target_ani(x):
            # total -LL over all bins for anisotropic B parameterised by x
            b = gemmi.SMat33d(*numpy.dot(x, adpdirs))
            k_ani = hkldata.debye_waller_factors(b_cart=b)
            ret = 0.
            for i_bin, idxes in hkldata.binned():
                Ds = [hkldata.binned_df.loc[i_bin, lab] for lab in D_labs]
                ret += mli(hkldata.df, fc_labs, Ds, hkldata.binned_df.loc[i_bin, "S"], k_ani, idxes)
            return ret
        def grad_ani(x):
            b = gemmi.SMat33d(*numpy.dot(x, adpdirs))
            k_ani = hkldata.debye_waller_factors(b_cart=b)
            S2mat = hkldata.ssq_mat() # ssqmat
            g = numpy.zeros(6)
            for i_bin, idxes in hkldata.binned():
                r = integr.ll_int_der1_ani(hkldata.df.I.to_numpy()[idxes], hkldata.df.SIGI.to_numpy()[idxes],
                                           k_ani[idxes], hkldata.binned_df.loc[i_bin, "S"],
                                           hkldata.df[fc_labs].to_numpy()[idxes], hkldata.binned_df.loc[i_bin, D_labs],
                                           hkldata.df.centric.to_numpy()[idxes]+1, hkldata.df.epsilon.to_numpy()[idxes])
                S2 = S2mat[:,idxes]
                g += -numpy.nansum(S2 * r[:,0], axis=1) # k_ani is already multiplied in r
            return numpy.dot(g, adpdirs.T)
        def shift_ani(x):
            # Gauss-Newton style shift: -g H^-1 in the symmetry-constrained basis
            b = gemmi.SMat33d(*numpy.dot(x, adpdirs))
            k_ani = hkldata.debye_waller_factors(b_cart=b)
            S2mat = hkldata.ssq_mat() # ssqmat
            g = numpy.zeros(6)
            H = numpy.zeros((6, 6))
            for i_bin, idxes in hkldata.binned():
                r = integr.ll_int_der1_ani(hkldata.df.I.to_numpy()[idxes], hkldata.df.SIGI.to_numpy()[idxes],
                                           k_ani[idxes], hkldata.binned_df.loc[i_bin, "S"],
                                           hkldata.df[fc_labs].to_numpy()[idxes], list(hkldata.binned_df.loc[i_bin, D_labs]),
                                           hkldata.df.centric.to_numpy()[idxes]+1, hkldata.df.epsilon.to_numpy()[idxes])
                S2 = S2mat[:,idxes]
                g += -numpy.nansum(S2 * r[:,0], axis=1) # k_ani is already multiplied in r
                H += numpy.nansum(numpy.matmul(S2[None,:].T, S2.T[:,None]) * (r[:,0]**2)[:,None,None], axis=0)

            g, H = numpy.dot(g, adpdirs.T), numpy.dot(adpdirs, numpy.dot(H, adpdirs.T))
            return -numpy.dot(g, numpy.linalg.pinv(H))

        logger.writeln("Refining B_aniso. Current = {}".format(b_aniso))
        if 0:  # scipy minimizer path (disabled)
            x0 = numpy.dot(SMattolist(b_aniso), numpy.linalg.pinv(adpdirs))
            res = scipy.optimize.minimize(fun=target_ani, x0=x0, jac=grad_ani)
            print(res)
            b_aniso = gemmi.SMat33d(*numpy.dot(res.x, adpdirs))
            f1 = res.fun
        else:  # hand-rolled Newton iterations with step halving
            B_converged = False
            for j in range(10):
                x = numpy.dot(SMattolist(b_aniso), numpy.linalg.pinv(adpdirs))
                f0 = target_ani(x)
                shift = shift_ani(x)
                for i in range(3):
                    ss = shift / 2**i
                    f1 = target_ani(x + ss)
                    #logger.writeln("{:2d} f0 = {:.3e} shift = {} df = {:.3e}".format(j, f0, ss, f1 - f0))
                    if f1 < f0:
                        b_aniso = gemmi.SMat33d(*numpy.dot(x+ss, adpdirs))
                        if numpy.max(numpy.abs(ss)) < 1e-4: B_converged = True
                        break
                else:
                    B_converged = True
                if B_converged: break

        logger.writeln("Refined B_aniso = {}".format(b_aniso))
        logger.writeln("cycle {} f= {}".format(i_cyc, f1))

    smooth_params(hkldata, D_labs, smoothing)
    return b_aniso
# determine_ml_params()
|
|
999
|
+
|
|
1000
|
+
def smooth_params(hkldata, D_labs, smoothing): # XXX twin_data
    """Propagate per-bin D/S estimates to per-reflection columns of hkldata.df.

    smoothing=None copies the bin value to every reflection in the bin;
    "gauss" interpolates smoothly over 1/d^2 via ext.smooth_gauss.
    Raises RuntimeError for any other smoothing value.
    """
    if smoothing is None or len(hkldata.binned()) < 2:
        # no smoothing possible/requested: broadcast bin values as-is
        for i, lab in enumerate(D_labs + ["S"]):
            hkldata.df[lab] = hkldata.binned_data_as_array(lab)

    elif smoothing == "gauss":
        # bin centers in 1/d^2 units: mean of 1/d_min^2 and 1/d_max^2
        bin_centers = (0.5 / hkldata.binned_df[["d_min", "d_max"]]**2).sum(axis=1).to_numpy()
        # Gaussian kernel width taken from the first bin spacing
        vals = ext.smooth_gauss(bin_centers,
                                hkldata.binned_df[D_labs + ["S"]].to_numpy(),
                                1./hkldata.df.d.to_numpy()**2,
                                100, # min(n_ref?)
                                (bin_centers[1] - bin_centers[0]))
        for i, lab in enumerate(D_labs + ["S"]):
            hkldata.df[lab] = vals[:, i]
        # Update smoothened average; this affects next refinement.
        # TODO: update Mn(|Dj*FCj|) as well.
        #for i_bin, idxes in hkldata.binned():
        #    for lab in D_labs + ["S"]:
        #        hkldata.binned_df.loc[i_bin, lab] = numpy.mean(hkldata.df[lab].to_numpy()[idxes])
    else:
        raise RuntimeError("unknown smoothing method: {}".format(smoothing))
# smooth_params()
|
|
1022
|
+
|
|
1023
|
+
def expected_F_from_int(Io, sigIo, k_ani, DFc, eps, c, S):
    """Expected amplitude <F> (and a FOM proxy) from intensity data.

    Io, sigIo : observed intensities and sigmas (arrays)
    k_ani     : anisotropic scale per reflection
    DFc       : complex sum of D_j*Fc_j per reflection
    eps       : epsilon factors; c: centric flag (0 = acentric), scalar or array
    S         : per-reflection variance parameter
    Returns (f, m_proxy): f is <|F|> (unscaled by k_ani), m_proxy is
    <F>/<|F|>-style figure-of-merit proxy.  Integrals are evaluated by the
    C++ extension ext.integ_J_ratio with the module-level integr settings.
    """
    # exponent offsets: J^{k_num}/J^{k_den} ratio; 0.5 shift only for acentrics
    k_num = numpy.repeat(0.5 if c == 0 else 0., Io.size) # 0.5 if acentric
    k_den = k_num - 0.5
    if numpy.isscalar(c): c = numpy.repeat(c, Io.size)
    # standardised variables of the intensity likelihood integrand
    to = Io / sigIo - sigIo / (c+1) / k_ani**2 / S / eps
    tf = k_ani * numpy.abs(DFc) / numpy.sqrt(sigIo)
    sig1 = k_ani**2 * S * eps / sigIo
    f = ext.integ_J_ratio(k_num, k_den, True, to, tf, sig1, c+1, integr.exp2_threshold, integr.h, integr.N, integr.ewmax)
    f *= numpy.sqrt(sigIo) / k_ani  # back to absolute F scale
    # same-exponent ratio gives the FOM proxy
    m_proxy = ext.integ_J_ratio(k_num, k_num, True, to, tf, sig1, c+1, integr.exp2_threshold, integr.h, integr.N, integr.ewmax)
    return f, m_proxy
# expected_F_from_int()
|
|
1035
|
+
|
|
1036
|
+
def calculate_maps_int(hkldata, b_aniso, fc_labs, D_labs, centric_and_selections, use="all"):
    """Compute map coefficients (FWT/DELFWT, and FAN if anomalous data exist)
    from intensity data, writing results into hkldata.df in place.

    use="work"/"test" hides the other subset's coefficients and fills the
    hidden/missing reflections of FWT with DFc.
    """
    nmodels = len(fc_labs)
    hkldata.df["FWT"] = 0j * numpy.nan      # complex NaN initialisation
    hkldata.df["DELFWT"] = 0j * numpy.nan
    hkldata.df["FOM"] = numpy.nan # FOM proxy, |<F>| / <|F|>
    has_ano = "I(+)" in hkldata.df and "I(-)" in hkldata.df
    if has_ano:
        hkldata.df["FAN"] = 0j * numpy.nan
        ano_data = hkldata.df[["I(+)", "SIGI(+)", "I(-)", "SIGI(-)"]].to_numpy()
    Io = hkldata.df.I.to_numpy()
    sigIo = hkldata.df.SIGI.to_numpy()
    k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
    eps = hkldata.df.epsilon.to_numpy()
    # DFc = sum_j D_j * Fc_j per reflection
    Ds = numpy.vstack([hkldata.df[lab].to_numpy() for lab in D_labs]).T
    Fcs = numpy.vstack([hkldata.df[lab].to_numpy() for lab in fc_labs]).T
    DFc = (Ds * Fcs).sum(axis=1)
    hkldata.df["DFC"] = DFc
    for i_bin, idxes in hkldata.binned():
        for c, work, test in centric_and_selections[i_bin]:
            cidxes = numpy.concatenate([work, test])
            S = hkldata.df["S"].to_numpy()[cidxes]
            f, m_proxy = expected_F_from_int(Io[cidxes], sigIo[cidxes], k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
            # phase taken from DFc
            exp_ip = numpy.exp(numpy.angle(DFc[cidxes])*1j)
            if c == 0:  # acentric: 2mFo-DFc type
                hkldata.df.loc[cidxes, "FWT"] = 2 * f * exp_ip - DFc[cidxes]
            else:       # centric: mFo type
                hkldata.df.loc[cidxes, "FWT"] = f * exp_ip
            hkldata.df.loc[cidxes, "DELFWT"] = f * exp_ip - DFc[cidxes]
            hkldata.df.loc[cidxes, "FOM"] = m_proxy
            if has_ano:
                # anomalous difference map from I(+)/I(-) expectations
                f_p, _ = expected_F_from_int(ano_data[cidxes,0], ano_data[cidxes,1],
                                             k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
                f_m, _ = expected_F_from_int(ano_data[cidxes,2], ano_data[cidxes,3],
                                             k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
                hkldata.df.loc[cidxes, "FAN"] = (f_p - f_m) * exp_ip / 2j
            # remove reflections that should be hidden
            if use != "all":
                # usually use == "work"
                tohide = test if use == "work" else work
                hkldata.df.loc[tohide, "FWT"] = 0j * numpy.nan
                hkldata.df.loc[tohide, "DELFWT"] = 0j * numpy.nan
            # fill hidden/unobserved FWT with DFc so maps stay complete
            fill_sel = numpy.isnan(hkldata.df["FWT"][cidxes].to_numpy())
            hkldata.df.loc[cidxes[fill_sel], "FWT"] = DFc[cidxes][fill_sel]
# calculate_maps_int()
|
|
1080
|
+
|
|
1081
|
+
def calculate_maps_twin(hkldata, b_aniso, fc_labs, D_labs, twin_data, centric_and_selections, use="all"):
    """Compute map coefficients for twinned data via twin_data's detwinning.

    Returns a NEW HklData (hkldata2) built from twin_data's asu, carrying
    FWT/DELFWT/FOM and diagnostic columns; hkldata itself is not modified.
    """
    # put observations on absolute scale (remove anisotropic B)
    k_ani2_inv = 1 / hkldata.debye_waller_factors(b_cart=b_aniso)**2
    Io = hkldata.df.I.to_numpy(copy=True) * k_ani2_inv
    sigIo = hkldata.df.SIGI.to_numpy(copy=True) * k_ani2_inv
    # Mask Io
    for i_bin, idxes in hkldata.binned():
        for c, work, test in centric_and_selections[i_bin]:
            if use != "all":
                tohide = test if use == "work" else work
                Io[tohide] = numpy.nan

    twin_data.est_f_true(Io, sigIo)
    Ds = twin_data.ml_scale_array()
    DFc = (twin_data.f_calc * Ds).sum(axis=1)
    exp_ip = numpy.exp(numpy.angle(DFc)*1j)  # phase from DFc
    Ft = numpy.asarray(twin_data.f_true_max)
    m = twin_data.calc_fom()
    Fexp = twin_data.expected_F(Io, sigIo)
    if 1:  # current choice: FOM-weighted maximum-likelihood F estimate
        fwt = numpy.where(numpy.asarray(twin_data.centric) == 0,
                          2 * m * Ft * exp_ip - DFc,
                          m * Ft * exp_ip)
        delfwt = m * Ft * exp_ip - DFc
    else: # based on "more accurate" evaluation of <m|F|>
        fwt = numpy.where(numpy.asarray(twin_data.centric) == 0,
                          2 * Fexp * exp_ip - DFc,
                          m * Fexp * exp_ip)
        delfwt = Fexp * exp_ip - DFc

    # fill missing/hidden coefficients with DFc
    sel = numpy.isnan(fwt)
    fwt[sel] = DFc[sel]

    hkldata2 = utils.hkl.HklData(hkldata.cell, hkldata.sg,
                                 utils.hkl.df_from_twin_data(twin_data, fc_labs))
    hkldata2.df["FWT"] = fwt
    hkldata2.df["DELFWT"] = delfwt
    hkldata2.df["FOM"] = m
    hkldata2.df["F_est"] = Ft
    hkldata2.df["F_exp"] = Fexp
    hkldata2.df["FC"] = twin_data.f_calc.sum(axis=1)
    hkldata2.df["DFC"] = DFc
    hkldata2.df[D_labs] = Ds
    hkldata2.df["S"] = twin_data.ml_sigma_array()
    return hkldata2
# calculate_maps_twin()
|
|
1126
|
+
|
|
1127
|
+
def merge_models(sts): # simply merge models. no fix in chain ids etc.
    """Return a new structure containing one model holding every chain
    from every model of every structure in *sts* (chain ids untouched)."""
    merged = sts[0].clone()
    del merged[:]  # keep metadata of the first structure, drop its models
    combined = gemmi.Model(1)
    all_chains = (chain for st in sts for mdl in st for chain in mdl)
    for chain in all_chains:
        combined.add_chain(chain)
    merged.add_model(combined)
    return merged
# merge_models()
|
|
1138
|
+
|
|
1139
|
+
def decide_mtz_labels(mtz, find_free=True, require=None):
    """Pick MTZ column labels to use as observations.

    require : optional subset of ("F","J","G","K") restricting (and ordering)
              acceptable observation types; default tries all, F first.
    Returns the chosen label list, with a free-flag column appended when
    find_free is True and one is found.  Raises RuntimeError when no suitable
    data columns exist.
    """
    # F is preferred for now by default
    obs_types = ("F", "J", "G", "K")
    if require:
        assert set(require).issubset(obs_types)
    else:
        require = obs_types
    dlabs = utils.hkl.mtz_find_data_columns(mtz)
    logger.writeln("Finding possible options from MTZ:")
    for typ in dlabs:
        for labs in dlabs[typ]:
            logger.writeln(" --labin '{}'".format(",".join(labs)))
    for typ in require:
        if dlabs[typ]:
            # FIX: copy the list. The original aliased dlabs[typ][0] and the
            # later `labin += [...]` mutated the list stored inside dlabs.
            labin = list(dlabs[typ][0])
            break
    else:
        raise RuntimeError("Data not found from mtz")
    if find_free:
        flabs = utils.hkl.mtz_find_free_columns(mtz)
        if flabs:
            labin += [flabs[0]]
    logger.writeln("MTZ columns automatically selected: {}".format(labin))
    return labin
# decide_mtz_labels()
|
|
1164
|
+
|
|
1165
|
+
def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=None,
                  n_per_bin=None, use="all", max_bins=None, cif_index=0, keep_charges=False,
                  allow_unusual_occupancies=False, space_group=None):
    """Read reflection data and model(s), reconcile cell/space group, set up
    resolution binning and per-bin centric/work/test index tables.

    Returns (hkldata, sts, fc_labs, centric_and_selections, free).
    """
    if labin: assert 1 < len(labin) < 6
    assert use in ("all", "work", "test")
    assert n_bins or n_per_bin #if n_bins not set, n_per_bin should be given

    # xyzins may already contain parsed structures instead of file names
    if len(xyzins) > 0 and type(xyzins[0]) is gemmi.Structure:
        sts = xyzins
    else:
        sts = []

    if type(hklin) is gemmi.Mtz or utils.fileio.is_mmhkl_file(hklin):
        if type(hklin) is gemmi.Mtz:
            mtz = hklin
        else:
            mtz = utils.fileio.read_mmhkl(hklin, cif_index=cif_index)
        if not sts:
            sts = [utils.fileio.read_structure(f) for f in xyzins]
    else:
        # small-molecule path: single model + hkl file read together
        assert len(xyzins) == 1
        assert not sts
        st, mtz = utils.fileio.read_small_molecule_files([hklin, xyzins[0]])
        sts = [st]

    for st in sts:
        utils.model.check_occupancies(st, raise_error=not allow_unusual_occupancies)

    if not labin:
        labin = decide_mtz_labels(mtz)
    col_types = {x.label:x.type for x in mtz.columns}
    if labin[0] not in col_types:
        raise RuntimeError("MTZ column not found: {}".format(labin[0]))
    # mtz column type -> (description, canonical labels, required column types)
    labs_and_types = {"F": ("amplitude", ["FP","SIGFP"], ["F", "Q"]),
                      "J": ("intensity", ["I","SIGI"], ["J", "Q"]),
                      "G": ("anomalous amplitude", ["F(+)","SIGF(+)", "F(-)", "SIGF(-)"], ["G", "L", "G", "L"]),
                      "K": ("anomalous intensity", ["I(+)","SIGI(+)", "I(-)", "SIGI(-)"], ["K", "M", "K", "M"])}
    if col_types[labin[0]] not in labs_and_types:
        raise RuntimeError("MTZ column {} is neither amplitude nor intensity".format(labin[0]))
    name, newlabels, require_types = labs_and_types[col_types[labin[0]]]
    logger.writeln("Observation type: {}".format(name))
    if len(newlabels) < len(labin): newlabels.append("FREE")
    hkldata = utils.hkl.hkldata_from_mtz(mtz, labin, newlabels=newlabels, require_types=require_types)
    # collapse anomalous pairs into mean FP/SIGFP or I/SIGI
    if newlabels[0] == "F(+)":
        hkldata.merge_anomalous(newlabels[:4], ["FP", "SIGFP"])
        newlabels = ["FP", "SIGFP"] + newlabels[4:]
    elif newlabels[0] == "I(+)":
        hkldata.merge_anomalous(newlabels[:4], ["I", "SIGI"])
        newlabels = ["I", "SIGI"] + newlabels[4:]

    if hkldata.df.empty:
        raise RuntimeError("No data in hkl data")

    if space_group is None:
        sg_use = None
    else:
        sg_use = gemmi.SpaceGroup(space_group)
        logger.writeln(f"Space group overridden by user. Using {sg_use.xhm()}")

    if sts:
        assert source in ["electron", "xray", "neutron"]
        for st in sts:
            if st[0].count_atom_sites() == 0:
                raise RuntimeError("No atom in the model")
        if not hkldata.cell.approx(sts[0].cell, 1e-3):
            logger.writeln("Warning: unit cell mismatch between model and reflection data")
            logger.writeln("         using unit cell from mtz")

        for st in sts: st.cell = hkldata.cell # mtz cell is used in any case

        sg_st = sts[0].find_spacegroup() # may be None
        if sg_use is None:
            sg_use = hkldata.sg
            if hkldata.sg != sg_st:
                # different Laue class is fatal; same class is just a warning
                if st.cell.is_crystal() and sg_st and sg_st.laue_str() != hkldata.sg.laue_str():
                    raise RuntimeError("Crystal symmetry mismatch between model and data")
                logger.writeln("Warning: space group mismatch between model and mtz")
                if sg_st and sg_st.laue_str() == hkldata.sg.laue_str():
                    logger.writeln("         using space group from model")
                    sg_use = sg_st
                else:
                    logger.writeln("         using space group from mtz")
                logger.writeln("")

        for st in sts:
            st.spacegroup_hm = sg_use.xhm()
            st.setup_cell_images()

        if not keep_charges:
            utils.model.remove_charge(sts)
        utils.model.check_atomsf(sts, source)

    if sg_use is not None:
        hkldata.sg = sg_use
    if newlabels[0] == "FP":
        # amplitudes must be positive; intensities may legitimately be negative
        hkldata.remove_nonpositive(newlabels[0])
    hkldata.remove_nonpositive(newlabels[1])
    hkldata.switch_to_asu()
    hkldata.remove_systematic_absences()
    #hkldata.df = hkldata.df.astype({name: 'float64' for name in ["I","SIGI","FP","SIGFP"] if name in hkldata.df})
    d_min_data = hkldata.d_min_max(newlabels)[0]
    if d_min is None and hkldata.d_min_max()[0] != d_min_data:
        d_min = d_min_data
        logger.writeln(f"Changing resolution to {d_min:.3f} A")
    if (d_min, d_max).count(None) != 2:
        hkldata = hkldata.copy(d_min=d_min, d_max=d_max)
        if hkldata.df.empty:
            raise RuntimeError("No data left in hkl data")

    hkldata.complete()
    hkldata.sort_by_resolution()
    hkldata.calc_epsilon()
    hkldata.calc_centric()

    if "FREE" in hkldata.df and free is None:
        free = hkldata.guess_free_number(newlabels[0])

    if n_bins is None:
        # derive bin count from requested reflections per bin
        sel = hkldata.df[newlabels[0]].notna()
        if use == "work":
            sel &= hkldata.df.FREE != free
        elif use == "test":
            sel &= hkldata.df.FREE == free
        s_array = 1/hkldata.d_spacings()[sel]
        if len(s_array) == 0:
            raise RuntimeError("no reflections in {} set".format(use))
        n_bins = utils.hkl.decide_n_bins(n_per_bin, s_array, max_bins=max_bins)
        logger.writeln("n_per_bin={} requested for {}. n_bins set to {}".format(n_per_bin, use, n_bins))

    hkldata.setup_binning(n_bins=n_bins)
    logger.writeln("Data completeness: {:.2f}%".format(hkldata.completeness()*100.))

    fc_labs = ["FC{}".format(i) for i, _ in enumerate(sts)]

    # Create a centric selection table for faster look up
    centric_and_selections = {}
    stats = hkldata.binned_df.copy()
    stats["n_all"] = 0
    stats["n_obs"] = 0
    stats[newlabels[0]] = numpy.nan
    snr = "I/sigma" if newlabels[0] == "I" else "F/sigma"
    stats[snr] = numpy.nan
    if newlabels[0] == "I":
        stats["Mn(I)/Std(I)"] = numpy.nan
    if "FREE" in hkldata.df:
        stats["n_work"] = 0
        stats["n_test"] = 0

    for i_bin, idxes in hkldata.binned():
        centric_and_selections[i_bin] = []
        n_obs = 0
        n_work, n_test = 0, 0
        for c, g2 in hkldata.df.loc[idxes].groupby("centric", sort=False):
            valid_sel = numpy.isfinite(g2[newlabels[0]])
            if "FREE" in g2:
                test_sel = (g2.FREE == free).fillna(False)
                test = g2.index[test_sel]
                work = g2.index[~test_sel]
                n_work += (valid_sel & ~test_sel).sum()
                n_test += (valid_sel & test_sel).sum()
            else:
                work = g2.index
                test = type(work)([], dtype=work.dtype)  # empty index of the same type
            centric_and_selections[i_bin].append((c, work, test))
            n_obs += numpy.sum(valid_sel)

        stats.loc[i_bin, "n_obs"] = n_obs
        stats.loc[i_bin, "n_all"] = len(idxes)
        obs = hkldata.df[newlabels[0]].to_numpy()[idxes]
        sigma = hkldata.df[newlabels[1]].to_numpy()[idxes]
        if n_obs > 0:
            stats.loc[i_bin, snr] = numpy.nanmean(obs / sigma)
            mean_obs = numpy.nanmean(obs)
            stats.loc[i_bin, newlabels[0]] = mean_obs
            if newlabels[0] == "I":
                stats.loc[i_bin, "Mn(I)/Std(I)"] = mean_obs / numpy.nanstd(obs)
        if "FREE" in hkldata.df:
            stats.loc[i_bin, "n_work"] = n_work
            stats.loc[i_bin, "n_test"] = n_test

    stats["completeness"] = stats["n_obs"] / stats["n_all"] * 100
    logger.writeln(stats.to_string())
    return hkldata, sts, fc_labs, centric_and_selections, free
# process_input()
|
|
1349
|
+
|
|
1350
|
+
def update_fc(st_list, fc_labs, d_min, monlib, source, mott_bethe, hkldata=None, twin_data=None):
    """Recalculate structure factors for each model and store them.

    Parameters
    ----------
    st_list : list of structures (gemmi.Structure)
        Models whose Fc contributions are computed.
    fc_labs : list of str
        Column names in hkldata.df that receive the per-model Fc
        (unused when twin_data is given).
    d_min : float
        High-resolution limit for the FFT-based Fc calculation.
    monlib : monomer library forwarded to utils.model.calc_fc_fft (may be None).
    source : str
        Scattering source, forwarded as-is (e.g. "electron" enables the
        Mott-Bethe path in callers).
    mott_bethe : bool
        Forwarded to calc_fc_fft.
    hkldata, twin_data :
        Destination for the results; when twin_data is given the Fc go into
        twin_data.f_calc and hkldata is NOT updated.
    """
    #assert (hkldata, twin_data).count(None) == 1
    # hkldata not updated when twin_data is given
    # The target reflection list is the same for every model -- compute it
    # once instead of once per loop iteration (loop-invariant hoisted).
    hkl = twin_data.asu if twin_data else hkldata.miller_array()
    for i, st in enumerate(st_list):
        if st.ncs:
            # Expand strict NCS on a copy so the caller's model stays untouched.
            st = st.clone()
            st.expand_ncs(gemmi.HowToNameCopiedChain.Dup, merge_dist=0)
        fc = utils.model.calc_fc_fft(st, d_min - 1e-6,
                                     monlib=monlib,
                                     source=source,
                                     mott_bethe=mott_bethe,
                                     miller_array=hkl)
        if twin_data:
            twin_data.f_calc[:,i] = fc
        else:
            hkldata.df[fc_labs[i]] = fc
    if not twin_data:
        # Total Fc over all models.
        hkldata.df["FC"] = hkldata.df[fc_labs].sum(axis=1)
# update_fc()
|
|
1373
|
+
|
|
1374
|
+
def calc_Fmask(st, d_min, miller_array):
    """Return bulk-solvent mask structure factors for the given reflections.

    A solvent mask (Refmac atomic radii) is placed on a real-space grid for
    the first model of *st*, Fourier-transformed, and sampled at the Miller
    indices in *miller_array*.
    """
    logger.writeln("Calculating solvent contribution..")
    # Grid spacing: no coarser than 0.6 A, and fine enough for d_min
    # (small epsilons guard against rounding at the resolution limit).
    spacing = min(0.6, (d_min - 1e-6) / 2 - 1e-9)
    mask_grid = gemmi.FloatGrid()
    mask_grid.setup_from(st, spacing=spacing)
    masker = gemmi.SolventMasker(gemmi.AtomicRadiiSet.Refmac)
    masker.put_mask_on_float_grid(mask_grid, st[0])
    sf_grid = gemmi.transform_map_to_f_phi(mask_grid)
    return sf_grid.get_value_by_hkl(miller_array)
# calc_Fmask()
|
|
1384
|
+
|
|
1385
|
+
def bulk_solvent_and_lsq_scales(hkldata, sts, fc_labs, use_solvent=True, use_int=False, mask=None, func_type="log_cosh", twin_data=None):
    """Determine bulk-solvent and overall least-squares scales, then apply them.

    Side effects (in place):
      - adds hkldata.df["k_aniso"], the anisotropic scale per reflection
      - divides the observed columns (I/F and sigmas) by the overall scale
      - multiplies Fc columns (or twin_data.f_calc) by the isotropic B factor
      - stores the solvent contribution in fc_labs[-1] (or twin_data.f_calc[:,-1])
      - recomputes hkldata.df["FC"] as the sum of fc_labs columns
    Returns the fitted LsqScale object.
    """
    # fc_labs must have solvent part at the end
    miller_array = twin_data.asu if twin_data else hkldata.miller_array()
    d_min = twin_data.d_min(sts[0].cell) if twin_data else hkldata.d_min_max()[0]
    if use_solvent:
        if mask is None:
            # No user-supplied mask: build one from the merged models.
            Fmask = calc_Fmask(merge_models(sts), d_min, miller_array)
        else:
            # Use the provided mask grid directly.
            fmask_gr = gemmi.transform_map_to_f_phi(mask)
            Fmask = fmask_gr.get_value_by_hkl(miller_array)
        if twin_data:
            # All columns except the last (solvent) slot.
            fc_sum = twin_data.f_calc[:,:-1].sum(axis=1)
        else:
            fc_sum = hkldata.df[fc_labs[:-1]].sum(axis=1).to_numpy()
        # Two components to scale: atomic Fc and the solvent Fmask.
        fc_list = [fc_sum, Fmask]
    else:
        if twin_data:
            fc_list = [twin_data.f_calc.sum(axis=1)]
        else:
            fc_list = [hkldata.df[fc_labs].sum(axis=1).to_numpy()]

    scaling = LsqScale(func_type=func_type)
    scaling.set_data(hkldata, fc_list, use_int, sigma_cutoff=0, twin_data=twin_data)
    scaling.scale()
    b_iso = scaling.b_iso
    k_aniso = hkldata.debye_waller_factors(b_cart=scaling.b_aniso)
    hkldata.df["k_aniso"] = k_aniso # we need it later when calculating stats

    if use_solvent:
        if twin_data:
            s2 = numpy.asarray(twin_data.s2_array)
        else:
            # s^2 = 1/d^2 for the solvent scale term.
            s2 = 1. / hkldata.d_spacings().to_numpy()**2
        Fbulk = Fmask * scaling.get_solvent_scale(scaling.k_sol, scaling.b_sol, s2)
        if twin_data:
            twin_data.f_calc[:,-1] = Fbulk
        else:
            hkldata.df[fc_labs[-1]] = Fbulk

    # Apply scales
    if use_int:
        # in intensity case, we try to refine b_aniso with ML. perhaps we should do it in amplitude case also
        o_labs = ["I", "SIGI", "I(+)","SIGI(+)", "I(-)", "SIGI(-)"]
        # Intensities scale with the square of the amplitude scale.
        hkldata.df[hkldata.df.columns.intersection(o_labs)] /= scaling.k_overall**2
    else:
        o_labs = ["FP", "SIGFP", "F(+)","SIGF(+)", "F(-)", "SIGF(-)"]
        hkldata.df[hkldata.df.columns.intersection(o_labs)] /= scaling.k_overall
    if twin_data:
        twin_data.f_calc[:] *= twin_data.debye_waller_factors(b_iso=b_iso)[:,None]
    else:
        k_iso = hkldata.debye_waller_factors(b_iso=b_iso)
        for lab in fc_labs: hkldata.df[lab] *= k_iso
    # total Fc
    # NOTE(review): in the twin_data branch the per-model columns fc_labs are
    # not updated above -- verify they exist in hkldata.df before this sum.
    hkldata.df["FC"] = hkldata.df[fc_labs].sum(axis=1)
    return scaling
# bulk_solvent_and_lsq_scales()
|
|
1441
|
+
|
|
1442
|
+
def calculate_maps(hkldata, b_aniso, centric_and_selections, fc_labs, D_labs, log_out, use="all"):
    """Compute 2mFo-DFc / mFo-DFc style map coefficients and per-bin statistics.

    Fills the FWT (weighted map), DELFWT (difference map), FOM, X and DFC
    columns of hkldata.df in place and writes a loggraph-formatted statistics
    table to *log_out*.  `use` selects which reflections contribute
    ("all"/"work"/"test"); reflections hidden by that choice (and unmeasured
    ones) get their FWT filled with DFc.
    """
    # NOTE(review): nmodels is currently unused in this function.
    nmodels = len(fc_labs)
    # Initialise output columns with complex/real NaN ("missing") markers.
    hkldata.df["FWT"] = 0j * numpy.nan
    hkldata.df["DELFWT"] = 0j * numpy.nan
    hkldata.df["FOM"] = numpy.nan
    hkldata.df["X"] = numpy.nan # for FOM
    has_ano = "F(+)" in hkldata.df and "F(-)" in hkldata.df
    if has_ano:
        hkldata.df["FAN"] = 0j * numpy.nan
    stats_data = []
    k_ani = hkldata.debye_waller_factors(b_cart=b_aniso)
    # Per-reflection D parameters and Fc contributions, one column per model.
    Ds = numpy.vstack([hkldata.df[lab].to_numpy() for lab in D_labs]).T
    Fcs = numpy.vstack([hkldata.df[lab].to_numpy() for lab in fc_labs]).T
    DFc = (Ds * Fcs).sum(axis=1)
    hkldata.df["DFC"] = DFc
    for i_bin, idxes in hkldata.binned():
        bin_d_min = hkldata.binned_df.d_min[i_bin]
        bin_d_max = hkldata.binned_df.d_max[i_bin]
        # 0: acentric 1: centric
        mean_fom = [numpy.nan, numpy.nan]
        nrefs = [0, 0]
        for c, work, test in centric_and_selections[i_bin]:
            cidxes = numpy.concatenate([work, test])
            S = hkldata.df["S"].to_numpy()[cidxes]
            # Unit phase factor of DFc: e^{i*phi_calc}.
            expip = numpy.exp(numpy.angle(DFc[cidxes])*1j)
            # Observations corrected for the anisotropic scale.
            Fo = hkldata.df.FP.to_numpy()[cidxes] / k_ani[cidxes]
            SigFo = hkldata.df.SIGFP.to_numpy()[cidxes] / k_ani[cidxes]
            epsilon = hkldata.df.epsilon.to_numpy()[cidxes]
            nrefs[c] = numpy.sum(numpy.isfinite(Fo))
            DFc_abs = numpy.abs(DFc[cidxes])
            if c == 0:
                # Acentric: FOM m = I1(X)/I0(X); map coeff 2mFo - DFc.
                Sigma = 2 * SigFo**2 + epsilon * S
                X = 2 * Fo * DFc_abs / Sigma
                m = gemmi.bessel_i1_over_i0(X)
                hkldata.df.loc[cidxes, "FWT"] = (2 * m * Fo - DFc_abs) * expip
            else:
                # Centric: FOM m = tanh(X); map coeff mFo.
                Sigma = SigFo**2 + epsilon * S
                X = Fo * DFc_abs / Sigma
                m = numpy.tanh(X)
                hkldata.df.loc[cidxes, "FWT"] = (m * Fo) * expip

            hkldata.df.loc[cidxes, "DELFWT"] = (m * Fo - DFc_abs) * expip
            hkldata.df.loc[cidxes, "FOM"] = m
            hkldata.df.loc[cidxes, "X"] = X
            if has_ano:
                # Weighted F(+)-F(-) difference coefficients, rotated by 1/(2i).
                Fo_dano = (hkldata.df["F(+)"].to_numpy()[cidxes] - hkldata.df["F(-)"].to_numpy()[cidxes]) / k_ani[cidxes]
                hkldata.df.loc[cidxes, "FAN"] = m * Fo_dano * expip / 2j
            if nrefs[c] > 0: mean_fom[c] = numpy.nanmean(m)

            # remove reflections that should be hidden
            if use != "all":
                # usually use == "work"
                tohide = test if use == "work" else work
                hkldata.df.loc[tohide, "FWT"] = 0j * numpy.nan
                hkldata.df.loc[tohide, "DELFWT"] = 0j * numpy.nan
            # Fill hidden/unmeasured FWT with DFc so the map has no holes.
            fill_sel = numpy.isnan(hkldata.df["FWT"][cidxes].to_numpy())
            hkldata.df.loc[cidxes[fill_sel], "FWT"] = DFc[cidxes][fill_sel]

        # Per-bin statistics (R, CC, log-mean intensities, FOMs, Ds).
        Fc = hkldata.df.FC.to_numpy()[idxes] * k_ani[idxes]
        Fo = hkldata.df.FP.to_numpy()[idxes]
        mean_DFc2 = numpy.nanmean(numpy.abs((Ds[idxes,:] * Fcs[idxes,:]).sum(axis=1) * k_ani[idxes])**2)
        mean_log_DFcs = numpy.log(numpy.nanmean(numpy.abs(Ds[idxes,:] * Fcs[idxes,:] * k_ani[idxes,None]), axis=0)).tolist()
        mean_Ds = numpy.nanmean(Ds[idxes,:], axis=0).tolist()
        if sum(nrefs) > 0:
            r = numpy.nansum(numpy.abs(numpy.abs(Fc)-Fo)) / numpy.nansum(Fo)
            cc = utils.hkl.correlation(Fo, numpy.abs(Fc))
            mean_Fo2 = numpy.nanmean(numpy.abs(Fo)**2)
        else:
            r, cc, mean_Fo2 = numpy.nan, numpy.nan, numpy.nan
        stats_data.append([i_bin, nrefs[0], nrefs[1], bin_d_max, bin_d_min,
                           numpy.log(mean_Fo2),
                           numpy.log(numpy.nanmean(numpy.abs(Fc)**2)),
                           numpy.log(mean_DFc2),
                           numpy.log(numpy.mean(hkldata.df["S"].to_numpy()[idxes])),
                           mean_fom[0], mean_fom[1], r, cc] + mean_Ds + mean_log_DFcs)

    DFc_labs = ["log(Mn(|{}{}|))".format(dl,fl) for dl,fl in zip(D_labs, fc_labs)]
    cols = ["bin", "n_a", "n_c", "d_max", "d_min",
            "log(Mn(|Fo|^2))", "log(Mn(|Fc|^2))", "log(Mn(|DFc|^2))",
            "log(Sigma)", "FOM_a", "FOM_c", "R", "CC(|Fo|,|Fc|)"] + D_labs + DFc_labs
    stats = pandas.DataFrame(stats_data, columns=cols)
    # Graph titles and the column groups shown in each loggraph plot.
    title_labs = [["log(Mn(|F|^2)) and variances", ["log(Mn(|Fo|^2))", "log(Mn(|Fc|^2))", "log(Mn(|DFc|^2))", "log(Sigma)"]],
                  ["FOM", ["FOM_a", "FOM_c"]],
                  ["D", D_labs],
                  ["DFc", DFc_labs],
                  ["R-factor", ["R"]],
                  ["CC", ["CC(|Fo|,|Fc|)"]],
                  ["number of reflections", ["n_a", "n_c"]]]
    with open(log_out, "w") as ofs:
        ofs.write(utils.make_loggraph_str(stats, main_title="Statistics",
                                          title_labs=title_labs,
                                          s2=1/stats["d_min"]**2))
    logger.writeln("output log: {}".format(log_out))
# calculate_maps()
|
|
1536
|
+
|
|
1537
|
+
def main(args):
    """Entry point: read data and models, scale, estimate ML parameters,
    compute map coefficients, and write the output MTZ.

    Returns the final hkldata (possibly replaced in the twin case).
    """
    # Fewer reflections per bin for "test" -- presumably because the free
    # set is much smaller than the work set; TODO confirm.
    n_per_bin = {"all": 500, "work": 500, "test": 50}[args.use]
    try:
        hkldata, sts, fc_labs, centric_and_selections,free = process_input(hklin=args.hklin,
                                                                           labin=args.labin.split(",") if args.labin else None,
                                                                           n_bins=args.nbins,
                                                                           free=args.free,
                                                                           xyzins=sum(args.model, []),
                                                                           source=args.source,
                                                                           d_max=args.d_max,
                                                                           d_min=args.d_min,
                                                                           n_per_bin=n_per_bin,
                                                                           use=args.use,
                                                                           max_bins=30,
                                                                           keep_charges=args.keep_charges,
                                                                           space_group=args.spacegroup)
    except RuntimeError as e:
        raise SystemExit("Error: {}".format(e))

    if args.twin:
        twin_data = find_twin_domains_from_data(hkldata)
    else:
        twin_data = None
    if twin_data:
        # One f_calc column per model, plus one for the solvent if used.
        twin_data.setup_f_calc(len(sts) + (0 if args.no_solvent else 1))

    update_fc(sts, fc_labs, d_min=hkldata.d_min_max()[0], monlib=None,
              source=args.source, mott_bethe=(args.source=="electron"),
              hkldata=hkldata, twin_data=twin_data)
    is_int = "I" in hkldata.df

    if args.mask:
        mask = utils.fileio.read_ccp4_map(args.mask)[0]
    else:
        mask = None

    # Overall scaling & bulk solvent
    # FP/SIGFP will be scaled. Total FC will be added.
    if not args.no_solvent:
        fc_labs.append("Fbulk")
    lsq = bulk_solvent_and_lsq_scales(hkldata, sts, fc_labs, use_solvent=not args.no_solvent,
                                      use_int=is_int, mask=mask, twin_data=twin_data)
    b_aniso = lsq.b_aniso
    # stats
    stats, overall = calc_r_and_cc(hkldata, centric_and_selections, twin_data)
    for lab in "R", "CC":
        logger.writeln(" ".join("{} = {:.4f}".format(x, overall[x]) for x in overall if x.startswith(lab)))
    if is_int:
        logger.writeln("R1 is calculated for reflections with I/sigma>2.")

    if twin_data:
        estimate_twin_fractions_from_model(twin_data, hkldata)
        #del hkldata.df["FC"]
        #del hkldata.df["Fbulk"]
        # Need to redo scaling?
        # Rescale and report stats again with the refined twin fractions.
        lsq = bulk_solvent_and_lsq_scales(hkldata, sts, fc_labs, use_solvent=not args.no_solvent,
                                          use_int=is_int, mask=mask, twin_data=twin_data)
        b_aniso = lsq.b_aniso
        stats, overall = calc_r_and_cc(hkldata, centric_and_selections, twin_data)
        for lab in "R", "CC":
            logger.writeln(" ".join("{} = {:.4f}".format(x, overall[x]) for x in overall if x.startswith(lab)))

    # Estimate ML parameters
    D_labs = ["D{}".format(i) for i in range(len(fc_labs))]

    if args.use_cc:
        # CC-based sigma-A estimation only supports amplitudes, no twinning.
        assert not is_int
        assert not args.twin
        logger.writeln("Estimating sigma-A parameters from CC..")
        determine_mlf_params_from_cc(hkldata, fc_labs, D_labs, centric_and_selections, args.use)
    else:
        b_aniso = determine_ml_params(hkldata, is_int, fc_labs, D_labs, b_aniso, centric_and_selections, args.D_trans, args.S_trans, args.use,
                                      twin_data=twin_data)
    # For map calculation "test" means: show work reflections only.
    use = {"all": "all", "work": "work", "test": "work"}[args.use]
    if twin_data:
        # replace hkldata
        hkldata = calculate_maps_twin(hkldata, b_aniso, fc_labs, D_labs, twin_data, centric_and_selections, use)
    elif is_int:
        calculate_maps_int(hkldata, b_aniso, fc_labs, D_labs, centric_and_selections, use)
    else:
        log_out = "{}.log".format(args.output_prefix)
        calculate_maps(hkldata, b_aniso, centric_and_selections, fc_labs, D_labs, log_out, use)

    # Write mtz file
    if twin_data:
        labs = ["F_est", "F_exp"]
    elif is_int:
        labs = ["I", "SIGI"]
    else:
        labs = ["FP", "SIGFP"]
    labs.extend(["FOM", "FWT", "DELFWT", "FC", "DFC"])
    if "FAN" in hkldata.df:
        labs.append("FAN")
    if not args.no_solvent:
        labs.append("Fbulk")
    if "FREE" in hkldata.df:
        labs.append("FREE")
    if "F_true_est" in hkldata.df:
        labs.append("F_true_est")
    labs += D_labs + ["S"]
    mtz_out = args.output_prefix+".mtz"
    hkldata.write_mtz(mtz_out, labs=labs, types={"FOM": "W", "FP":"F", "SIGFP":"Q", "F_est": "F", "F_exp": "F"})
    return hkldata
# main()
|
|
1641
|
+
if __name__ == "__main__":
    # Script entry: parse the command line and run.
    import sys
    main(parse_args(sys.argv[1:]))
|