servalcat 0.4.99__cp39-cp39-win_amd64.whl → 0.4.105__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of servalcat might be problematic. See the package registry's advisory page for more details.

servalcat/utils/hkl.py CHANGED
@@ -493,6 +493,31 @@ class HklData:
493
493
  self.df = self.df[~sel]
494
494
  # remove_nonpositive()
495
495
 
496
+ def mask_invalid_obs_values(self, labels):
497
+ assert 1 < len(labels) < 6
498
+ assert labels[1].startswith("SIG")
499
+ def do_mask(label, target_labels):
500
+ sel = self.df[label] <= 0
501
+ n_bad = sel.sum()
502
+ if n_bad > 0:
503
+ logger.writeln("Removing {} reflections with {}<=0".format(n_bad, label))
504
+ self.df.loc[sel, target_labels] = numpy.nan
505
+ # If any element within target_labels is non-finite, mask all elements
506
+ self.df.loc[(~numpy.isfinite(self.df[target_labels])).any(axis=1), target_labels] = numpy.nan
507
+
508
+ if len(labels) < 4: # F/SIGF or I/SIGI
509
+ if labels[0].startswith("F"):
510
+ do_mask(labels[0], labels[:2]) # bad F
511
+ do_mask(labels[1], labels[:2]) # bad sigma
512
+ else: # I(+)/SIGI(+)/I(-)/SIGI(-) or F...
513
+ assert labels[3].startswith("SIG")
514
+ if labels[0].startswith("F"):
515
+ do_mask(labels[0], labels[:2]) # bad F+
516
+ do_mask(labels[2], labels[2:4]) # bad F-
517
+ do_mask(labels[1], labels[:2]) # bad sigma+
518
+ do_mask(labels[3], labels[2:4]) # bad sigma-
519
+ # mask_invalid_obs_values()
520
+
496
521
  def remove_systematic_absences(self):
497
522
  is_absent = self.sg.operations().systematic_absences(self.miller_array())
498
523
  n_absent = numpy.sum(is_absent)
@@ -501,12 +526,22 @@ class HklData:
501
526
  self.df = self.df[~is_absent]
502
527
  # remove_systematic_absences()
503
528
 
504
- def merge_anomalous(self, labs, newlabs):
529
+ def merge_anomalous(self, labs, newlabs, method="weighted"):
530
+ assert method in ("weighted", "simple")
505
531
  assert len(labs) == 4 # i+,sigi+,i-,sigi- for example
506
532
  assert len(newlabs) == 2
507
- # skipna=True is default, so missing value is handled nicely.
508
- self.df[newlabs[0]] = self.df[[labs[0], labs[2]]].mean(axis=1)
509
- self.df[newlabs[1]] = self.df[[labs[1], labs[3]]].pow(2).mean(axis=1).pow(0.5)
533
+ if method == "simple":
534
+ # skipna=True is default, so missing value is handled nicely.
535
+ self.df[newlabs[0]] = self.df[[labs[0], labs[2]]].mean(axis=1)
536
+ self.df[newlabs[1]] = self.df[[labs[1], labs[3]]].pow(2).mean(axis=1).pow(0.5)
537
+ else:
538
+ obs = self.df[[labs[0], labs[2]]].to_numpy()
539
+ weights = 1. / self.df[[labs[1], labs[3]]].to_numpy()**2
540
+ sum_w = numpy.nansum(weights, axis=1)
541
+ sum_w[sum_w == 0] = numpy.nan # mask when both are nan
542
+ self.df[newlabs[0]] = numpy.nansum(obs * weights, axis=1) / sum_w
543
+ self.df[newlabs[1]] = numpy.sqrt(1. / sum_w)
544
+ # merge_anomalous()
510
545
 
511
546
  def as_asu_data(self, label=None, data=None, label_sigma=None):
512
547
  if label is None: assert data is not None
servalcat/utils/logger.py CHANGED
@@ -37,7 +37,7 @@ class Logger(object):
37
37
  def write(self, l, end="", flush=True, fs=None, print_fs=sys.stdout):
38
38
  if self.stopped: return
39
39
  if self.prefix:
40
- l = "\n".join(self.prefix + x for x in l.splitlines(keepends=True))
40
+ l = "".join(self.prefix + x for x in l.splitlines(keepends=True))
41
41
  print(l, end=end, file=print_fs, flush=flush)
42
42
  for f in (self.ofs, fs):
43
43
  if f is not None:
servalcat/utils/model.py CHANGED
@@ -73,11 +73,12 @@ def remove_charge(sts):
73
73
  def check_atomsf(sts, source, mott_bethe=True):
74
74
  assert source in ("xray", "electron", "neutron")
75
75
  if source != "electron": mott_bethe = False
76
- logger.writeln("Atomic scattering factors for {}".format("electron (Mott-Bethe)" if mott_bethe else source))
76
+ logger.writeln("Atomic scattering factors for {}".format("xray (use Mott-Bethe to convert to electrons)" if mott_bethe else source))
77
77
  if source != "xray" and not mott_bethe:
78
78
  logger.writeln(" Note that charges will be ignored")
79
79
  el_charges = {(cra.atom.element, cra.atom.charge) for st in sts for cra in st[0].all()}
80
80
  elems = {x[0] for x in el_charges}
81
+ tmp = {}
81
82
  if source == "xray" or mott_bethe:
82
83
  shown = set()
83
84
  for el, charge in sorted(el_charges, key=lambda x: (x[0].atomic_number, x[1])):
@@ -88,12 +89,16 @@ def check_atomsf(sts, source, mott_bethe=True):
88
89
  charge = 0
89
90
  if (el, charge) in shown: continue
90
91
  label = el.name if charge == 0 else "{}{:+}".format(el.name, charge)
91
- logger.writeln(" {} {}".format(label, tuple(sf.get_coefs())))
92
92
  shown.add((el, charge))
93
+ tmp[label] = {**{f"{k}{i+1}": x for k in ("a", "b") for i, x in enumerate(getattr(sf, k))}, "c": sf.c}
93
94
  else:
94
95
  for el in sorted(elems, key=lambda x: x.atomic_number):
95
- sf = el.c4322 if source == "electron" else el.neutron92
96
- logger.writeln(" {} {}".format(el.name, tuple(sf.get_coefs())))
96
+ if source == "electron":
97
+ tmp[el.name] = {f"{k}{i+1}": x for k in ("a", "b") for i, x in enumerate(getattr(el.c4322, k))}
98
+ else:
99
+ tmp[el.name] = {"a": el.neutron92.get_coefs()[0]}
100
+ with logger.with_prefix(" "):
101
+ logger.writeln(pandas.DataFrame(tmp).T.to_string())
97
102
  logger.writeln("")
98
103
  # check_atomsf()
99
104
 
@@ -489,9 +494,10 @@ def filter_contacting_ncs(st, cutoff=5.):
489
494
  st.setup_cell_images()
490
495
  ns = gemmi.NeighborSearch(st[0], st.cell, cutoff*2).populate() # This is considered crystallographic cell if not 1 1 1. Undesirable result may be seen.
491
496
  cs = gemmi.ContactSearch(cutoff)
497
+ cs.twice = True # since we need all image_idx
492
498
  cs.ignore = gemmi.ContactSearch.Ignore.SameAsu
493
499
  results = cs.find_contacts(ns)
494
- indices = set([r.image_idx for r in results])
500
+ indices = {r.image_idx for r in results}
495
501
  logger.writeln(" contacting copies: {}".format(indices))
496
502
  ops = [st.ncs[i-1] for i in indices] # XXX is this correct? maybe yes as long as identity operator is not there
497
503
  st.ncs.clear()
@@ -553,7 +559,9 @@ def reset_adp(model, bfactor=None, adp_mode="iso"):
553
559
  if adp_mode == "iso" or (adp_mode == "fix" and bfactor is not None):
554
560
  cra.atom.aniso = gemmi.SMat33f(0,0,0,0,0,0)
555
561
  elif adp_mode == "aniso":
556
- if not cra.atom.aniso.nonzero() or bfactor is not None:
562
+ if cra.atom.aniso.nonzero() and bfactor is None: # just in case
563
+ cra.atom.b_iso = numpy.mean(cra.atom.aniso.calculate_eigenvalues()) * u_to_b
564
+ else:
557
565
  u = cra.atom.b_iso * b_to_u
558
566
  cra.atom.aniso = gemmi.SMat33f(u, u, u, 0, 0, 0)
559
567
  # reset_adp()
@@ -39,8 +39,11 @@ def update_ncs_from_args(args, st, map_and_start=None, filter_contacting=False,
39
39
  ncsops = ncsops_from_args(args, st.cell, map_and_start=map_and_start, st=st,
40
40
  helical_min_n=helical_min_n, helical_max_n=helical_max_n)
41
41
 
42
- st.ncs.clear()
43
- st.ncs.extend([x for x in ncsops if not x.tr.is_identity()])
42
+ st.ncs = [x for x in ncsops if not x.tr.is_identity()]
43
+ # To write identity op to the output model
44
+ idop_id = next((x.id for x in ncsops if x.tr.is_identity()), None)
45
+ if idop_id:
46
+ st.info["_struct_ncs_oper.id"] = idop_id
44
47
 
45
48
  if filter_contacting:
46
49
  model.filter_contacting_ncs(st)
servalcat/xtal/sigmaa.py CHANGED
@@ -30,10 +30,14 @@ def add_arguments(parser):
30
30
  parser.description = 'Sigma-A parameter estimation for crystallographic data'
31
31
  parser.add_argument('--hklin', required=True,
32
32
  help='Input MTZ file')
33
+ parser.add_argument('--hklin_free',
34
+ help='Input MTZ file for test flags')
33
35
  parser.add_argument('--spacegroup',
34
36
  help='Override space group')
35
37
  parser.add_argument('--labin',
36
- help='MTZ column for F,SIGF,FREE')
38
+ help='MTZ columns of --hklin for F,SIGF,FREE')
39
+ parser.add_argument('--labin_free',
40
+ help='MTZ column of --hklin_free')
37
41
  parser.add_argument('--free', type=int,
38
42
  help='flag number for test set')
39
43
  parser.add_argument('--model', required=True, nargs="+", action="append",
@@ -97,6 +101,8 @@ def calc_r_and_cc(hkldata, centric_and_selections, twin_data=None):
97
101
  else:
98
102
  obs = obs_sqrt = hkldata.df.FP
99
103
  calc = calc_sqrt = Fc
104
+ if "CC*" in stats: # swap the positions
105
+ stats.insert(len(stats.columns)-1, "CC*", stats.pop("CC*"))
100
106
  if has_free:
101
107
  for lab in (cclab, rlab):
102
108
  for suf in ("work", "free"):
@@ -1037,6 +1043,7 @@ def calculate_maps_int(hkldata, b_aniso, fc_labs, D_labs, centric_and_selections
1037
1043
  nmodels = len(fc_labs)
1038
1044
  hkldata.df["FWT"] = 0j * numpy.nan
1039
1045
  hkldata.df["DELFWT"] = 0j * numpy.nan
1046
+ hkldata.df["F_est"] = numpy.nan
1040
1047
  hkldata.df["FOM"] = numpy.nan # FOM proxy, |<F>| / <|F|>
1041
1048
  has_ano = "I(+)" in hkldata.df and "I(-)" in hkldata.df
1042
1049
  if has_ano:
@@ -1056,12 +1063,10 @@ def calculate_maps_int(hkldata, b_aniso, fc_labs, D_labs, centric_and_selections
1056
1063
  S = hkldata.df["S"].to_numpy()[cidxes]
1057
1064
  f, m_proxy = expected_F_from_int(Io[cidxes], sigIo[cidxes], k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
1058
1065
  exp_ip = numpy.exp(numpy.angle(DFc[cidxes])*1j)
1059
- if c == 0:
1060
- hkldata.df.loc[cidxes, "FWT"] = 2 * f * exp_ip - DFc[cidxes]
1061
- else:
1062
- hkldata.df.loc[cidxes, "FWT"] = f * exp_ip
1066
+ hkldata.df.loc[cidxes, "FWT"] = 2 * f * exp_ip - DFc[cidxes]
1063
1067
  hkldata.df.loc[cidxes, "DELFWT"] = f * exp_ip - DFc[cidxes]
1064
1068
  hkldata.df.loc[cidxes, "FOM"] = m_proxy
1069
+ hkldata.df.loc[cidxes, "F_est"] = f
1065
1070
  if has_ano:
1066
1071
  f_p, _ = expected_F_from_int(ano_data[cidxes,0], ano_data[cidxes,1],
1067
1072
  k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
@@ -1162,12 +1167,34 @@ def decide_mtz_labels(mtz, find_free=True, require=None):
1162
1167
  return labin
1163
1168
  # decide_mtz_labels()
1164
1169
 
1170
+ def decide_spacegroup(sg_user, sg_st, sg_hkl):
1171
+ assert sg_hkl is not None
1172
+ ret = None
1173
+ if sg_user is not None:
1174
+ ret = sg_user
1175
+ logger.writeln(f"Space group overridden by user. Using {ret.xhm()}")
1176
+ else:
1177
+ ret = sg_hkl
1178
+ if sg_hkl != sg_st:
1179
+ if sg_st and sg_st.laue_str() != sg_hkl.laue_str():
1180
+ raise RuntimeError("Crystal symmetry mismatch between model and data")
1181
+ logger.writeln("Warning: space group mismatch between model and mtz")
1182
+ if sg_st and sg_st.laue_str() == sg_hkl.laue_str():
1183
+ logger.writeln(" using space group from model")
1184
+ ret = sg_st
1185
+ else:
1186
+ logger.writeln(" using space group from mtz")
1187
+ logger.writeln("")
1188
+
1189
+ return ret
1190
+ # decide_spacegroup
1191
+
1165
1192
  def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=None,
1166
1193
  n_per_bin=None, use="all", max_bins=None, cif_index=0, keep_charges=False,
1167
- allow_unusual_occupancies=False, space_group=None):
1194
+ allow_unusual_occupancies=False, space_group=None,
1195
+ hklin_free=None, labin_free=None):
1168
1196
  if labin: assert 1 < len(labin) < 6
1169
1197
  assert use in ("all", "work", "test")
1170
- assert n_bins or n_per_bin #if n_bins not set, n_per_bin should be given
1171
1198
 
1172
1199
  if len(xyzins) > 0 and type(xyzins[0]) is gemmi.Structure:
1173
1200
  sts = xyzins
@@ -1185,13 +1212,18 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
1185
1212
  assert len(xyzins) == 1
1186
1213
  assert not sts
1187
1214
  st, mtz = utils.fileio.read_small_molecule_files([hklin, xyzins[0]])
1215
+ if None in (st, mtz):
1216
+ raise SystemExit("Failed to read small molecule file(s)")
1188
1217
  sts = [st]
1189
1218
 
1190
1219
  for st in sts:
1191
1220
  utils.model.check_occupancies(st, raise_error=not allow_unusual_occupancies)
1192
1221
 
1222
+ sg_use = decide_spacegroup(sg_user=gemmi.SpaceGroup(space_group) if space_group else None,
1223
+ sg_st=sts[0].find_spacegroup() if sts else None,
1224
+ sg_hkl=mtz.spacegroup)
1193
1225
  if not labin:
1194
- labin = decide_mtz_labels(mtz)
1226
+ labin = decide_mtz_labels(mtz, find_free=hklin_free is None)
1195
1227
  col_types = {x.label:x.type for x in mtz.columns}
1196
1228
  if labin[0] not in col_types:
1197
1229
  raise RuntimeError("MTZ column not found: {}".format(labin[0]))
@@ -1201,10 +1233,31 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
1201
1233
  "K": ("anomalous intensity", ["I(+)","SIGI(+)", "I(-)", "SIGI(-)"], ["K", "M", "K", "M"])}
1202
1234
  if col_types[labin[0]] not in labs_and_types:
1203
1235
  raise RuntimeError("MTZ column {} is neither amplitude nor intensity".format(labin[0]))
1236
+ if col_types[labin[0]] == "J": # may be unmerged data
1237
+ ints = gemmi.Intensities()
1238
+ ints.set_data(mtz.cell, sg_use, mtz.make_miller_array(),
1239
+ mtz.array[:,mtz.column_labels().index(labin[0])],
1240
+ mtz.array[:,mtz.column_labels().index(labin[1])])
1241
+ dtype = ints.prepare_for_merging(gemmi.DataType.Mean) # do we want Anomalous?
1242
+ ints_bak = ints.clone() # for stats
1243
+ ints.merge_in_place(dtype)
1244
+ if (ints.nobs_array > 1).any():
1245
+ mtz = ints.prepare_merged_mtz(with_nobs=False)
1246
+ labin = mtz.column_labels()[3:]
1247
+ col_types = {x.label:x.type for x in mtz.columns}
1248
+ mult = ints.nobs_array.mean()
1249
+ logger.writeln(f"Input data were merged (multiplicity: {mult:.2f}). Overriding labin={','.join(labin)}")
1250
+ else:
1251
+ ints_bak = None
1252
+ else:
1253
+ ints_bak = None
1254
+
1204
1255
  name, newlabels, require_types = labs_and_types[col_types[labin[0]]]
1205
1256
  logger.writeln("Observation type: {}".format(name))
1206
1257
  if len(newlabels) < len(labin): newlabels.append("FREE")
1207
1258
  hkldata = utils.hkl.hkldata_from_mtz(mtz, labin, newlabels=newlabels, require_types=require_types)
1259
+ hkldata.sg = sg_use
1260
+ hkldata.mask_invalid_obs_values(newlabels)
1208
1261
  if newlabels[0] == "F(+)":
1209
1262
  hkldata.merge_anomalous(newlabels[:4], ["FP", "SIGFP"])
1210
1263
  newlabels = ["FP", "SIGFP"] + newlabels[4:]
@@ -1214,13 +1267,7 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
1214
1267
 
1215
1268
  if hkldata.df.empty:
1216
1269
  raise RuntimeError("No data in hkl data")
1217
-
1218
- if space_group is None:
1219
- sg_use = None
1220
- else:
1221
- sg_use = gemmi.SpaceGroup(space_group)
1222
- logger.writeln(f"Space group overridden by user. Using {sg_use.xhm()}")
1223
-
1270
+
1224
1271
  if sts:
1225
1272
  assert source in ["electron", "xray", "neutron"]
1226
1273
  for st in sts:
@@ -1230,23 +1277,8 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
1230
1277
  logger.writeln("Warning: unit cell mismatch between model and reflection data")
1231
1278
  logger.writeln(" using unit cell from mtz")
1232
1279
 
1233
- for st in sts: st.cell = hkldata.cell # mtz cell is used in any case
1234
-
1235
- sg_st = sts[0].find_spacegroup() # may be None
1236
- if sg_use is None:
1237
- sg_use = hkldata.sg
1238
- if hkldata.sg != sg_st:
1239
- if st.cell.is_crystal() and sg_st and sg_st.laue_str() != hkldata.sg.laue_str():
1240
- raise RuntimeError("Crystal symmetry mismatch between model and data")
1241
- logger.writeln("Warning: space group mismatch between model and mtz")
1242
- if sg_st and sg_st.laue_str() == hkldata.sg.laue_str():
1243
- logger.writeln(" using space group from model")
1244
- sg_use = sg_st
1245
- else:
1246
- logger.writeln(" using space group from mtz")
1247
- logger.writeln("")
1248
-
1249
1280
  for st in sts:
1281
+ st.cell = hkldata.cell # mtz cell is used in any case
1250
1282
  st.spacegroup_hm = sg_use.xhm()
1251
1283
  st.setup_cell_images()
1252
1284
 
@@ -1254,22 +1286,36 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
1254
1286
  utils.model.remove_charge(sts)
1255
1287
  utils.model.check_atomsf(sts, source)
1256
1288
 
1257
- if sg_use is not None:
1258
- hkldata.sg = sg_use
1259
- if newlabels[0] == "FP":
1260
- hkldata.remove_nonpositive(newlabels[0])
1261
- hkldata.remove_nonpositive(newlabels[1])
1262
1289
  hkldata.switch_to_asu()
1263
1290
  hkldata.remove_systematic_absences()
1264
1291
  #hkldata.df = hkldata.df.astype({name: 'float64' for name in ["I","SIGI","FP","SIGFP"] if name in hkldata.df})
1265
- d_min_data = hkldata.d_min_max(newlabels)[0]
1266
- if d_min is None and hkldata.d_min_max()[0] != d_min_data:
1267
- d_min = d_min_data
1292
+ d_min_max_data = hkldata.d_min_max(newlabels)
1293
+ if d_min is None and hkldata.d_min_max()[0] != d_min_max_data[0]:
1294
+ d_min = d_min_max_data[0]
1268
1295
  logger.writeln(f"Changing resolution to {d_min:.3f} A")
1269
1296
  if (d_min, d_max).count(None) != 2:
1270
1297
  hkldata = hkldata.copy(d_min=d_min, d_max=d_max)
1271
1298
  if hkldata.df.empty:
1272
1299
  raise RuntimeError("No data left in hkl data")
1300
+
1301
+ if hklin_free is not None:
1302
+ mtz2 = utils.fileio.read_mmhkl(hklin_free)
1303
+ if labin_free and labin_free not in mtz2.column_labels():
1304
+ raise RuntimeError(f"specified label ({labin_free}) not found in {hklin_free}")
1305
+ if not labin_free:
1306
+ tmp = utils.hkl.mtz_find_free_columns(mtz2)
1307
+ if tmp:
1308
+ labin_free = tmp[0]
1309
+ else:
1310
+ raise RuntimeError(f"Test flag label not found in {hklin_free}")
1311
+ tmp = utils.hkl.hkldata_from_mtz(mtz2, [labin_free], newlabels=["FREE"])
1312
+ tmp.sg = sg_use
1313
+ tmp.switch_to_asu()
1314
+ tmp.remove_systematic_absences()
1315
+ tmp = tmp.copy(d_min=d_min_max_data[0], d_max=d_min_max_data[1])
1316
+ hkldata.complete()
1317
+ tmp.complete()
1318
+ hkldata.merge(tmp.df[["H","K","L","FREE"]])
1273
1319
 
1274
1320
  hkldata.complete()
1275
1321
  hkldata.sort_by_resolution()
@@ -1277,9 +1323,20 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
1277
1323
  hkldata.calc_centric()
1278
1324
 
1279
1325
  if "FREE" in hkldata.df and free is None:
1280
- free = hkldata.guess_free_number(newlabels[0])
1326
+ free = hkldata.guess_free_number(newlabels[0]) # also check NaN
1281
1327
 
1282
1328
  if n_bins is None:
1329
+ if n_per_bin is None:
1330
+ if use == "all" or "FREE" not in hkldata.df:
1331
+ n_per_bin = 100
1332
+ use = "all"
1333
+ elif use == "work":
1334
+ n_per_bin = 100
1335
+ elif use == "test":
1336
+ n_per_bin = 50
1337
+ else:
1338
+ raise RuntimeError(f"should not happen: {use=}")
1339
+
1283
1340
  sel = hkldata.df[newlabels[0]].notna()
1284
1341
  if use == "work":
1285
1342
  sel &= hkldata.df.FREE != free
@@ -1292,8 +1349,6 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
1292
1349
  logger.writeln("n_per_bin={} requested for {}. n_bins set to {}".format(n_per_bin, use, n_bins))
1293
1350
 
1294
1351
  hkldata.setup_binning(n_bins=n_bins)
1295
- logger.writeln("Data completeness: {:.2f}%".format(hkldata.completeness()*100.))
1296
-
1297
1352
  fc_labs = ["FC{}".format(i) for i, _ in enumerate(sts)]
1298
1353
 
1299
1354
  # Create a centric selection table for faster look up
@@ -1343,6 +1398,14 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
1343
1398
  stats.loc[i_bin, "n_test"] = n_test
1344
1399
 
1345
1400
  stats["completeness"] = stats["n_obs"] / stats["n_all"] * 100
1401
+ logger.writeln("Data completeness: {:.2%}".format(stats["n_obs"].sum() / stats["n_all"].sum()))
1402
+ if ints_bak is not None:
1403
+ binner = gemmi.Binner()
1404
+ binner.setup(n_bins, gemmi.Binner.Method.Dstar2, ints_bak)
1405
+ bin_stats = ints_bak.calculate_merging_stats(binner, use_weights="X")
1406
+ stats["CC1/2"] = [stats.cc_half() for stats in bin_stats]
1407
+ hkldata.binned_df["CC*"] = numpy.sqrt(2 * stats["CC1/2"] / (1 + stats["CC1/2"]))
1408
+
1346
1409
  logger.writeln(stats.to_string())
1347
1410
  return hkldata, sts, fc_labs, centric_and_selections, free
1348
1411
  # process_input()
@@ -1473,13 +1536,11 @@ def calculate_maps(hkldata, b_aniso, centric_and_selections, fc_labs, D_labs, lo
1473
1536
  Sigma = 2 * SigFo**2 + epsilon * S
1474
1537
  X = 2 * Fo * DFc_abs / Sigma
1475
1538
  m = gemmi.bessel_i1_over_i0(X)
1476
- hkldata.df.loc[cidxes, "FWT"] = (2 * m * Fo - DFc_abs) * expip
1477
1539
  else:
1478
1540
  Sigma = SigFo**2 + epsilon * S
1479
1541
  X = Fo * DFc_abs / Sigma
1480
1542
  m = numpy.tanh(X)
1481
- hkldata.df.loc[cidxes, "FWT"] = (m * Fo) * expip
1482
-
1543
+ hkldata.df.loc[cidxes, "FWT"] = (2 * m * Fo - DFc_abs) * expip
1483
1544
  hkldata.df.loc[cidxes, "DELFWT"] = (m * Fo - DFc_abs) * expip
1484
1545
  hkldata.df.loc[cidxes, "FOM"] = m
1485
1546
  hkldata.df.loc[cidxes, "X"] = X
@@ -1500,7 +1561,8 @@ def calculate_maps(hkldata, b_aniso, centric_and_selections, fc_labs, D_labs, lo
1500
1561
  Fc = hkldata.df.FC.to_numpy()[idxes] * k_ani[idxes]
1501
1562
  Fo = hkldata.df.FP.to_numpy()[idxes]
1502
1563
  mean_DFc2 = numpy.nanmean(numpy.abs((Ds[idxes,:] * Fcs[idxes,:]).sum(axis=1) * k_ani[idxes])**2)
1503
- mean_log_DFcs = numpy.log(numpy.nanmean(numpy.abs(Ds[idxes,:] * Fcs[idxes,:] * k_ani[idxes,None]), axis=0)).tolist()
1564
+ with numpy.errstate(divide="ignore"):
1565
+ mean_log_DFcs = numpy.log(numpy.nanmean(numpy.abs(Ds[idxes,:] * Fcs[idxes,:] * k_ani[idxes,None]), axis=0)).tolist()
1504
1566
  mean_Ds = numpy.nanmean(Ds[idxes,:], axis=0).tolist()
1505
1567
  if sum(nrefs) > 0:
1506
1568
  r = numpy.nansum(numpy.abs(numpy.abs(Fc)-Fo)) / numpy.nansum(Fo)
@@ -1535,7 +1597,6 @@ def calculate_maps(hkldata, b_aniso, centric_and_selections, fc_labs, D_labs, lo
1535
1597
  # calculate_maps()
1536
1598
 
1537
1599
  def main(args):
1538
- n_per_bin = {"all": 500, "work": 500, "test": 50}[args.use]
1539
1600
  try:
1540
1601
  hkldata, sts, fc_labs, centric_and_selections,free = process_input(hklin=args.hklin,
1541
1602
  labin=args.labin.split(",") if args.labin else None,
@@ -1545,16 +1606,17 @@ def main(args):
1545
1606
  source=args.source,
1546
1607
  d_max=args.d_max,
1547
1608
  d_min=args.d_min,
1548
- n_per_bin=n_per_bin,
1549
1609
  use=args.use,
1550
1610
  max_bins=30,
1551
1611
  keep_charges=args.keep_charges,
1552
- space_group=args.spacegroup)
1612
+ space_group=args.spacegroup,
1613
+ hklin_free=args.hklin_free,
1614
+ labin_free=args.labin_free)
1553
1615
  except RuntimeError as e:
1554
1616
  raise SystemExit("Error: {}".format(e))
1555
1617
 
1556
1618
  if args.twin:
1557
- twin_data = find_twin_domains_from_data(hkldata)
1619
+ twin_data, _ = find_twin_domains_from_data(hkldata)
1558
1620
  else:
1559
1621
  twin_data = None
1560
1622
  if twin_data:
@@ -1621,7 +1683,7 @@ def main(args):
1621
1683
  if twin_data:
1622
1684
  labs = ["F_est", "F_exp"]
1623
1685
  elif is_int:
1624
- labs = ["I", "SIGI"]
1686
+ labs = ["I", "SIGI", "F_est"]
1625
1687
  else:
1626
1688
  labs = ["FP", "SIGFP"]
1627
1689
  labs.extend(["FOM", "FWT", "DELFWT", "FC", "DFC"])
servalcat/xtal/twin.py CHANGED
@@ -21,39 +21,42 @@ def find_twin_domains_from_data(hkldata, max_oblique=5, min_alpha=0.05):
21
21
  #for op in ops:
22
22
  # logger.writeln(f" {op.triplet()}")
23
23
  if not ops:
24
- return
24
+ logger.writeln("")
25
+ return None, None
25
26
  twin_data = ext.TwinData()
26
27
  twin_data.setup(hkldata.miller_array(), hkldata.df.bin, hkldata.sg, hkldata.cell, ops)
27
28
  if "I" in hkldata.df:
28
29
  Io = hkldata.df.I.to_numpy()
29
30
  else:
30
31
  Io = hkldata.df.FP.to_numpy()**2
31
- alphas = []
32
32
  ccs, nums = [], []
33
+ tmp = []
33
34
  for i_bin, bin_idxes in hkldata.binned():
34
35
  ratios = [1.]
35
36
  ccs.append([])
36
37
  nums.append([])
38
+ rs = []
37
39
  for i_op, op in enumerate(ops):
40
+ cc = r = numpy.nan
38
41
  ii = numpy.array(twin_data.pairs(i_op, i_bin))
39
- val = numpy.all(numpy.isfinite(Io[ii]), axis=1)
40
- if numpy.sum(val) == 0:
41
- cc = numpy.nan
42
- else:
42
+ val = numpy.all(numpy.isfinite(Io[ii]), axis=1) if ii.size != 0 else []
43
+ if numpy.sum(val) != 0:
43
44
  cc = numpy.corrcoef(Io[ii][val].T)[0,1]
44
- rr = (1 - numpy.sqrt(1 - cc**2)) / cc
45
- ratios.append(rr)
45
+ r = numpy.sum(numpy.abs(Io[ii][val, 0] - Io[ii][val, 1])) / numpy.sum(Io[ii][val])
46
+ ratio = (1 - numpy.sqrt(1 - cc**2)) / cc
47
+ ratios.append(ratio)
46
48
  ccs[-1].append(cc)
49
+ rs.append(r)
47
50
  nums[-1].append(len(val))
48
- alphas.append(numpy.array(ratios) / numpy.nansum(ratios))
49
- alphas = numpy.maximum(0, numpy.mean(alphas, axis=0))
50
- alphas /= numpy.nansum(alphas)
51
+ tmp.append(rs + ccs[-1] + nums[-1] + (numpy.array(ratios) / numpy.nansum(ratios)).tolist()[1:])
52
+ df = pandas.DataFrame(tmp, columns=[f"{n}_op{i+1}" for n in ("R", "CC", "num", "raw_est") for i in range(len(ops))])
53
+ with logger.with_prefix(" "):
54
+ logger.writeln(df.to_string(float_format="%.4f"))
51
55
  ccs = numpy.array(ccs)
52
56
  nums = numpy.array(nums)
53
57
  tmp = [{"Operator": gemmi.Op().triplet(),
54
58
  "R_twin_obs": 0,
55
- "CC_mean": 1,
56
- "Alpha_from_CC": alphas[0]}]
59
+ "CC_mean": 1}]
57
60
  for i_op, op in enumerate(ops):
58
61
  ii = numpy.array(twin_data.pairs(i_op))
59
62
  val = numpy.all(numpy.isfinite(Io[ii]), axis=1)
@@ -61,61 +64,80 @@ def find_twin_domains_from_data(hkldata, max_oblique=5, min_alpha=0.05):
61
64
  r_obs = numpy.nan
62
65
  else:
63
66
  r_obs = numpy.sum(numpy.abs(Io[ii][val, 0] - Io[ii][val, 1])) / numpy.sum(Io[ii][val])
67
+ cc = numpy.sum(nums[:,i_op] * ccs[:,i_op]) / numpy.sum(nums[:,i_op])
64
68
  tmp.append({"Operator": op.triplet(),
65
- "CC_mean": numpy.sum(nums[:,i_op] * ccs[:,i_op]) / numpy.sum(nums[:,i_op]),
69
+ "CC_mean": cc,
66
70
  "R_twin_obs": r_obs,
67
- "Alpha_from_CC": alphas[i_op+1],
68
71
  })
69
72
  df = pandas.DataFrame(tmp)
70
- logger.writeln(df.to_string(float_format="%.2f"))
73
+ df["Alpha_from_CC"] = (1 - numpy.sqrt(1 - df["CC_mean"]**2)) / df["CC_mean"]
74
+ df["Alpha_from_CC"] /= numpy.nansum(df["Alpha_from_CC"])
75
+ logger.writeln("\n Initial twin fraction estimates:")
76
+ with logger.with_prefix(" "):
77
+ logger.writeln(df.to_string(float_format="%.2f"))
71
78
 
72
- sel_idxes = [i for i, a in enumerate(alphas) if i > 0 and a > min_alpha]
73
- if not sel_idxes:
74
- logger.writeln(" No twinning detected")
75
- return
79
+ sel = df["Alpha_from_CC"].to_numpy() > min_alpha
80
+ if sel[1:].sum() == 0:
81
+ logger.writeln(" No twinning detected\n")
82
+ return None, None
76
83
 
77
- if len(sel_idxes) + 1 != len(alphas):
78
- ops = [ops[i-1] for i in sel_idxes]
79
- logger.writeln(" Twin operators after filtering small fractions")
80
- alphas = numpy.array([alphas[0]] + [alphas[i] for i in sel_idxes])
81
- alphas /= numpy.sum(alphas)
82
- df = pandas.DataFrame({"Operator": [x.triplet() for x in [gemmi.Op()]+ops],
83
- "Alpha": alphas})
84
- logger.writeln(df.to_string(float_format="%.2f"))
84
+ if not sel.all():
85
+ ops = [ops[i] for i in range(len(ops)) if sel[i+1]]
86
+ logger.writeln(f"\n Twin operators after filtering small fractions (<= {min_alpha})")
87
+ df = df[sel]
88
+ df["Alpha_from_CC"] /= numpy.nansum(df["Alpha_from_CC"])
89
+ with logger.with_prefix(" "):
90
+ logger.writeln(df.to_string(float_format="%.2f"))
85
91
  twin_data = ext.TwinData()
86
92
  twin_data.setup(hkldata.miller_array(), hkldata.df.bin, hkldata.sg, hkldata.cell, ops)
87
- twin_data.alphas = alphas
93
+ twin_data.alphas = df["Alpha_from_CC"].tolist()
88
94
  if "I" not in hkldata.df:
89
95
  logger.writeln('Generating "observed" intensities for twin refinement: Io = Fo**2, SigIo = 2*F*SigFo')
90
96
  hkldata.df["I"] = hkldata.df.FP**2
91
97
  hkldata.df["SIGI"] = 2 * hkldata.df.FP * hkldata.df.SIGFP
92
- return twin_data
98
+ logger.writeln("")
99
+ return twin_data, df
93
100
 
94
101
  # find_twin_domains_from_data()
95
102
 
96
103
  def estimate_twin_fractions_from_model(twin_data, hkldata):
97
104
  logger.writeln("Estimating twin fractions")
98
105
  Ic = numpy.abs(twin_data.f_calc.sum(axis=1))**2
99
- Ic_all = Ic[twin_data.twin_related(hkldata.sg)]
106
+ idx_all = twin_data.twin_related(hkldata.sg)
107
+ Ic_all = Ic[idx_all]
108
+ Ic_all[(idx_all < 0).any(axis=1)] = numpy.nan
100
109
  rr = twin_data.obs_related_asu()
101
110
  tmp = []
111
+ P_list, cc_oc_list, weight_list = [], [], []
112
+ n_ops = len(twin_data.ops) + 1
113
+ tidxes = numpy.triu_indices(n_ops, 1)
102
114
  for i_bin, bin_idxes in hkldata.binned():
103
- cc_o_c = []
104
115
  i_tmp = Ic_all[numpy.asarray(twin_data.bin)==i_bin,:]
116
+ i_tmp = i_tmp[numpy.isfinite(i_tmp).all(axis=1)]
105
117
  P = numpy.corrcoef(i_tmp.T)
106
118
  iobs = hkldata.df.I.to_numpy()[bin_idxes]
107
119
  ic_bin = Ic[rr[bin_idxes,:]]
108
- val = numpy.isfinite(iobs) & numpy.isfinite(ic_bin).all(axis=1)
120
+ val = numpy.isfinite(iobs) & numpy.isfinite(ic_bin).all(axis=1) & numpy.all(rr[bin_idxes,:]>=0, axis=1)
109
121
  iobs, ic_bin = iobs[val], ic_bin[val,:]
110
- cc_o_c = [numpy.corrcoef(iobs, ic_bin[:,i])[0,1] for i in range(len(twin_data.ops)+1)]
111
- frac_est = numpy.dot(numpy.linalg.pinv(P), cc_o_c)
112
- tmp.append(frac_est.tolist())
113
-
114
- df = pandas.DataFrame(tmp)
115
- df.iloc[:,:] /= df.sum(axis=1).to_numpy()[:,None]
116
- mean_alphas = numpy.maximum(0, df.mean())
117
- mean_alphas /= numpy.sum(mean_alphas)
118
- logger.write(" Estimated fractions from data-model correlations: ")
119
- logger.writeln(" ".join("%.2f"%x for x in mean_alphas))
120
- twin_data.alphas = mean_alphas
122
+ cc_oc = [numpy.corrcoef(iobs, ic_bin[:,i])[0,1] for i in range(n_ops)]
123
+ P_list.append(P)
124
+ cc_oc_list.append(cc_oc)
125
+ weight_list.append(numpy.sum(val))
126
+ frac_est = numpy.dot(numpy.linalg.pinv(P), cc_oc)
127
+ frac_est /= frac_est.sum()
128
+ tmp.append(P[tidxes].tolist() + cc_oc + [weight_list[-1]] + frac_est.tolist())
121
129
 
130
+ P = numpy.average(P_list, axis=0, weights=weight_list)
131
+ cc_oc = numpy.average(cc_oc_list, axis=0, weights=weight_list)
132
+ frac_est = numpy.dot(numpy.linalg.pinv(P), cc_oc)
133
+ frac_est = numpy.maximum(0, frac_est)
134
+ frac_est /= frac_est.sum()
135
+ df = pandas.DataFrame(tmp, columns=[f"cc_{i+1}_{j+1}" for i, j in zip(*tidxes)] +
136
+ [f"cc_o_{i+1}" for i in range(n_ops)] +
137
+ ["nref"] + [f"raw_est_{i+1}" for i in range(n_ops)])
138
+ with logger.with_prefix(" "):
139
+ logger.writeln(df.to_string(float_format="%.4f"))
140
+ logger.write(" Final twin fraction estimate: ")
141
+ logger.writeln(" ".join("%.2f"%x for x in frac_est))
142
+ twin_data.alphas = frac_est
143
+ return df