PyPI - servalcat - Versions diffs - 0.4.99__cp312-cp312-win_amd64.whl → 0.4.105__cp312-cp312-win_amd64.whl - Mend

servalcat 0.4.99__cp312-cp312-win_amd64.whl → 0.4.105__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of servalcat might be problematic. See the original Mend diff page for more details.

Files changed (22)

servalcat/__init__.py +2 -2
servalcat/ext.cp312-win_amd64.pyd +0 -0
servalcat/refine/refine.py +11 -1
servalcat/refine/refine_spa.py +10 -9
servalcat/refine/refine_xtal.py +31 -26
servalcat/refine/xtal.py +4 -4
servalcat/refmac/refmac_wrapper.py +36 -2
servalcat/spa/fofc.py +4 -4
servalcat/spa/run_refmac.py +3 -3
servalcat/utils/commands.py +4 -4
servalcat/utils/fileio.py +78 -39
servalcat/utils/hkl.py +39 -4
servalcat/utils/logger.py +1 -1
servalcat/utils/model.py +14 -6
servalcat/utils/symmetry.py +5 -2
servalcat/xtal/sigmaa.py +113 -51
servalcat/xtal/twin.py +66 -44
{servalcat-0.4.99.dist-info → servalcat-0.4.105.dist-info}/METADATA +5 -5
{servalcat-0.4.99.dist-info → servalcat-0.4.105.dist-info}/RECORD +22 -22
{servalcat-0.4.99.dist-info → servalcat-0.4.105.dist-info}/WHEEL +1 -1
{servalcat-0.4.99.dist-info → servalcat-0.4.105.dist-info}/entry_points.txt +0 -0
{servalcat-0.4.99.dist-info → servalcat-0.4.105.dist-info}/licenses/LICENSE +0 -0

servalcat/utils/hkl.py CHANGED

@@ -493,6 +493,31 @@ class HklData:
             self.df = self.df[~sel]
     # remove_nonpositive()
+    def mask_invalid_obs_values(self, labels):
+        assert 1 < len(labels) < 6
+        assert labels[1].startswith("SIG")
+        def do_mask(label, target_labels):
+            sel = self.df[label] <= 0
+            n_bad = sel.sum()
+            if n_bad > 0:
+                logger.writeln("Removing {} reflections with {}<=0".format(n_bad, label))
+                self.df.loc[sel, target_labels] = numpy.nan
+            # If any element within target_labels is non-finite, mask all elements
+            self.df.loc[(~numpy.isfinite(self.df[target_labels])).any(axis=1), target_labels] = numpy.nan
+        if len(labels) < 4: # F/SIGF or I/SIGI
+            if labels[0].startswith("F"):
+                do_mask(labels[0], labels[:2]) # bad F
+            do_mask(labels[1], labels[:2]) # bad sigma
+        else: # I(+)/SIGI(+)/I(-)/SIGI(-) or F...
+            assert labels[3].startswith("SIG")
+            if labels[0].startswith("F"):
+                do_mask(labels[0], labels[:2]) # bad F+
+                do_mask(labels[2], labels[2:4]) # bad F-
+            do_mask(labels[1], labels[:2]) # bad sigma+
+            do_mask(labels[3], labels[2:4]) # bad sigma-
+    # mask_invalid_obs_values()
     def remove_systematic_absences(self):
         is_absent = self.sg.operations().systematic_absences(self.miller_array())
         n_absent = numpy.sum(is_absent)
@@ -501,12 +526,22 @@ class HklData:
             self.df = self.df[~is_absent]
     # remove_systematic_absences()
-    def merge_anomalous(self, labs, newlabs):
+    def merge_anomalous(self, labs, newlabs, method="weighted"):
+        assert method in ("weighted", "simple")
         assert len(labs) == 4 # i+,sigi+,i-,sigi- for example
         assert len(newlabs) == 2
-        # skipna=True is default, so missing value is handled nicely.
-        self.df[newlabs[0]] = self.df[[labs[0], labs[2]]].mean(axis=1)
-        self.df[newlabs[1]] = self.df[[labs[1], labs[3]]].pow(2).mean(axis=1).pow(0.5)
+        if method == "simple":
+            # skipna=True is default, so missing value is handled nicely.
+            self.df[newlabs[0]] = self.df[[labs[0], labs[2]]].mean(axis=1)
+            self.df[newlabs[1]] = self.df[[labs[1], labs[3]]].pow(2).mean(axis=1).pow(0.5)
+        else:
+            obs = self.df[[labs[0], labs[2]]].to_numpy()
+            weights = 1. / self.df[[labs[1], labs[3]]].to_numpy()**2
+            sum_w = numpy.nansum(weights, axis=1)
+            sum_w[sum_w == 0] = numpy.nan # mask when both are nan
+            self.df[newlabs[0]] = numpy.nansum(obs * weights, axis=1) / sum_w
+            self.df[newlabs[1]] = numpy.sqrt(1. / sum_w)
+    # merge_anomalous()
     def as_asu_data(self, label=None, data=None, label_sigma=None):
         if label is None: assert data is not None

servalcat/utils/logger.py CHANGED

@@ -37,7 +37,7 @@ class Logger(object):
     def write(self, l, end="", flush=True, fs=None, print_fs=sys.stdout):
         if self.stopped: return
         if self.prefix:
-            l = "\n".join(self.prefix + x for x in l.splitlines(keepends=True))
+            l = "".join(self.prefix + x for x in l.splitlines(keepends=True))
         print(l, end=end, file=print_fs, flush=flush)
         for f in (self.ofs, fs):
             if f is not None:

servalcat/utils/model.py CHANGED

@@ -73,11 +73,12 @@ def remove_charge(sts):
 def check_atomsf(sts, source, mott_bethe=True):
     assert source in ("xray", "electron", "neutron")
     if source != "electron": mott_bethe = False
-    logger.writeln("Atomic scattering factors for {}".format("electron (Mott-Bethe)" if mott_bethe else source))
+    logger.writeln("Atomic scattering factors for {}".format("xray (use Mott-Bethe to convert to electrons)" if mott_bethe else source))
     if source != "xray" and not mott_bethe:
         logger.writeln("  Note that charges will be ignored")
     el_charges = {(cra.atom.element, cra.atom.charge) for st in sts for cra in st[0].all()}
     elems = {x[0] for x in el_charges}
+    tmp = {}
     if source == "xray" or mott_bethe:
         shown = set()
         for el, charge in sorted(el_charges, key=lambda x: (x[0].atomic_number, x[1])):
@@ -88,12 +89,16 @@ def check_atomsf(sts, source, mott_bethe=True):
                 charge = 0
             if (el, charge) in shown: continue
             label = el.name if charge == 0 else "{}{:+}".format(el.name, charge)
-            logger.writeln("  {} {}".format(label, tuple(sf.get_coefs())))
             shown.add((el, charge))
+            tmp[label] = {**{f"{k}{i+1}": x for k in ("a", "b") for i, x in enumerate(getattr(sf, k))}, "c": sf.c}
     else:
         for el in sorted(elems, key=lambda x: x.atomic_number):
-            sf = el.c4322 if source == "electron" else el.neutron92
-            logger.writeln("  {} {}".format(el.name, tuple(sf.get_coefs())))
+            if source == "electron":
+                tmp[el.name] = {f"{k}{i+1}": x for k in ("a", "b") for i, x in enumerate(getattr(el.c4322, k))}
+            else:
+                tmp[el.name] = {"a": el.neutron92.get_coefs()[0]}
+    with logger.with_prefix("  "):
+        logger.writeln(pandas.DataFrame(tmp).T.to_string())
     logger.writeln("")
 # check_atomsf()
@@ -489,9 +494,10 @@ def filter_contacting_ncs(st, cutoff=5.):
     st.setup_cell_images()
     ns = gemmi.NeighborSearch(st[0], st.cell, cutoff*2).populate() # This is considered crystallographic cell if not 1 1 1. Undesirable result may be seen.
     cs = gemmi.ContactSearch(cutoff)
+    cs.twice = True # since we need all image_idx
     cs.ignore = gemmi.ContactSearch.Ignore.SameAsu
     results = cs.find_contacts(ns)
-    indices = set([r.image_idx for r in results])
+    indices = {r.image_idx for r in results}
     logger.writeln(" contacting copies: {}".format(indices))
     ops = [st.ncs[i-1] for i in indices] # XXX is this correct? maybe yes as long as identity operator is not there
     st.ncs.clear()
@@ -553,7 +559,9 @@ def reset_adp(model, bfactor=None, adp_mode="iso"):
         if adp_mode == "iso" or (adp_mode == "fix" and bfactor is not None):
             cra.atom.aniso = gemmi.SMat33f(0,0,0,0,0,0)
         elif adp_mode == "aniso":
-            if not cra.atom.aniso.nonzero() or bfactor is not None:
+            if cra.atom.aniso.nonzero() and bfactor is None: # just in case
+                cra.atom.b_iso = numpy.mean(cra.atom.aniso.calculate_eigenvalues()) * u_to_b
+            else:
                 u = cra.atom.b_iso * b_to_u
                 cra.atom.aniso = gemmi.SMat33f(u, u, u, 0, 0, 0)
 # reset_adp()

servalcat/utils/symmetry.py CHANGED

@@ -39,8 +39,11 @@ def update_ncs_from_args(args, st, map_and_start=None, filter_contacting=False,
     ncsops = ncsops_from_args(args, st.cell, map_and_start=map_and_start, st=st,
                               helical_min_n=helical_min_n, helical_max_n=helical_max_n)
-    st.ncs.clear()
-    st.ncs.extend([x for x in ncsops if not x.tr.is_identity()])
+    st.ncs = [x for x in ncsops if not x.tr.is_identity()]
+    # To write identity op to the output model
+    idop_id = next((x.id for x in ncsops if x.tr.is_identity()), None)
+    if idop_id:
+        st.info["_struct_ncs_oper.id"] = idop_id
     if filter_contacting:
         model.filter_contacting_ncs(st)

servalcat/xtal/sigmaa.py CHANGED

@@ -30,10 +30,14 @@ def add_arguments(parser):
     parser.description = 'Sigma-A parameter estimation for crystallographic data'
     parser.add_argument('--hklin', required=True,
                         help='Input MTZ file')
+    parser.add_argument('--hklin_free',
+                        help='Input MTZ file for test flags')
     parser.add_argument('--spacegroup',
                         help='Override space group')
     parser.add_argument('--labin',
-                        help='MTZ column for F,SIGF,FREE')
+                        help='MTZ columns of --hklin for F,SIGF,FREE')
+    parser.add_argument('--labin_free',
+                        help='MTZ column of --hklin_free')
     parser.add_argument('--free', type=int,
                         help='flag number for test set')
     parser.add_argument('--model', required=True, nargs="+", action="append",
@@ -97,6 +101,8 @@ def calc_r_and_cc(hkldata, centric_and_selections, twin_data=None):
     else:
         obs = obs_sqrt = hkldata.df.FP
         calc = calc_sqrt = Fc
+    if "CC*" in stats: # swap the positions
+        stats.insert(len(stats.columns)-1, "CC*", stats.pop("CC*"))
     if has_free:
         for lab in (cclab, rlab):
             for suf in ("work", "free"):
@@ -1037,6 +1043,7 @@ def calculate_maps_int(hkldata, b_aniso, fc_labs, D_labs, centric_and_selections
     nmodels = len(fc_labs)
     hkldata.df["FWT"] = 0j * numpy.nan
     hkldata.df["DELFWT"] = 0j * numpy.nan
+    hkldata.df["F_est"] = numpy.nan
     hkldata.df["FOM"] = numpy.nan # FOM proxy, |<F>| / <|F|>
     has_ano = "I(+)" in hkldata.df and "I(-)" in hkldata.df
     if has_ano:
@@ -1056,12 +1063,10 @@ def calculate_maps_int(hkldata, b_aniso, fc_labs, D_labs, centric_and_selections
             S = hkldata.df["S"].to_numpy()[cidxes]
             f, m_proxy = expected_F_from_int(Io[cidxes], sigIo[cidxes], k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
             exp_ip = numpy.exp(numpy.angle(DFc[cidxes])*1j)
-            if c == 0:
-                hkldata.df.loc[cidxes, "FWT"] = 2 * f * exp_ip - DFc[cidxes]
-            else:
-                hkldata.df.loc[cidxes, "FWT"] = f * exp_ip
+            hkldata.df.loc[cidxes, "FWT"] = 2 * f * exp_ip - DFc[cidxes]
             hkldata.df.loc[cidxes, "DELFWT"] = f * exp_ip - DFc[cidxes]
             hkldata.df.loc[cidxes, "FOM"] = m_proxy
+            hkldata.df.loc[cidxes, "F_est"] = f
             if has_ano:
                 f_p, _ = expected_F_from_int(ano_data[cidxes,0], ano_data[cidxes,1],
                                              k_ani[cidxes], DFc[cidxes], eps[cidxes], c, S)
@@ -1162,12 +1167,34 @@ def decide_mtz_labels(mtz, find_free=True, require=None):
     return labin
 # decide_mtz_labels()
+def decide_spacegroup(sg_user, sg_st, sg_hkl):
+    assert sg_hkl is not None
+    ret = None
+    if sg_user is not None:
+        ret = sg_user
+        logger.writeln(f"Space group overridden by user. Using {ret.xhm()}")
+    else:
+        ret = sg_hkl
+        if sg_hkl != sg_st:
+            if sg_st and sg_st.laue_str() != sg_hkl.laue_str():
+                raise RuntimeError("Crystal symmetry mismatch between model and data")
+            logger.writeln("Warning: space group mismatch between model and mtz")
+            if sg_st and sg_st.laue_str() == sg_hkl.laue_str():
+                logger.writeln("         using space group from model")
+                ret = sg_st
+            else:
+                logger.writeln("         using space group from mtz")
+            logger.writeln("")
+    return ret
+# decide_spacegroup
 def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=None,
                   n_per_bin=None, use="all", max_bins=None, cif_index=0, keep_charges=False,
-                  allow_unusual_occupancies=False, space_group=None):
+                  allow_unusual_occupancies=False, space_group=None,
+                  hklin_free=None, labin_free=None):
     if labin: assert 1 < len(labin) < 6
     assert use in ("all", "work", "test")
-    assert n_bins or n_per_bin #if n_bins not set, n_per_bin should be given
     if len(xyzins) > 0 and type(xyzins[0]) is gemmi.Structure:
         sts = xyzins
@@ -1185,13 +1212,18 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
         assert len(xyzins) == 1
         assert not sts
         st, mtz = utils.fileio.read_small_molecule_files([hklin, xyzins[0]])
+        if None in (st, mtz):
+            raise SystemExit("Failed to read small molecule file(s)")
         sts = [st]
     for st in sts:
         utils.model.check_occupancies(st, raise_error=not allow_unusual_occupancies)
+    sg_use = decide_spacegroup(sg_user=gemmi.SpaceGroup(space_group) if space_group else None,
+                               sg_st=sts[0].find_spacegroup() if sts else None,
+                               sg_hkl=mtz.spacegroup)
     if not labin:
-        labin = decide_mtz_labels(mtz)
+        labin = decide_mtz_labels(mtz, find_free=hklin_free is None)
     col_types = {x.label:x.type for x in mtz.columns}
     if labin[0] not in col_types:
         raise RuntimeError("MTZ column not found: {}".format(labin[0]))
@@ -1201,10 +1233,31 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
                       "K": ("anomalous intensity", ["I(+)","SIGI(+)", "I(-)", "SIGI(-)"], ["K", "M", "K", "M"])}
     if col_types[labin[0]] not in labs_and_types:
         raise RuntimeError("MTZ column {} is neither amplitude nor intensity".format(labin[0]))
+    if col_types[labin[0]] == "J": # may be unmerged data
+        ints = gemmi.Intensities()
+        ints.set_data(mtz.cell, sg_use, mtz.make_miller_array(),
+                      mtz.array[:,mtz.column_labels().index(labin[0])],
+                      mtz.array[:,mtz.column_labels().index(labin[1])])
+        dtype = ints.prepare_for_merging(gemmi.DataType.Mean) # do we want Anomalous?
+        ints_bak = ints.clone() # for stats
+        ints.merge_in_place(dtype)
+        if (ints.nobs_array > 1).any():
+            mtz = ints.prepare_merged_mtz(with_nobs=False)
+            labin = mtz.column_labels()[3:]
+            col_types = {x.label:x.type for x in mtz.columns}
+            mult = ints.nobs_array.mean()
+            logger.writeln(f"Input data were merged (multiplicity: {mult:.2f}). Overriding labin={','.join(labin)}")
+        else:
+            ints_bak = None
+    else:
+        ints_bak = None
     name, newlabels, require_types = labs_and_types[col_types[labin[0]]]
     logger.writeln("Observation type: {}".format(name))
     if len(newlabels) < len(labin): newlabels.append("FREE")
     hkldata = utils.hkl.hkldata_from_mtz(mtz, labin, newlabels=newlabels, require_types=require_types)
+    hkldata.sg = sg_use
+    hkldata.mask_invalid_obs_values(newlabels)
     if newlabels[0] == "F(+)":
         hkldata.merge_anomalous(newlabels[:4], ["FP", "SIGFP"])
         newlabels = ["FP", "SIGFP"] + newlabels[4:]
@@ -1214,13 +1267,7 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
     if hkldata.df.empty:
         raise RuntimeError("No data in hkl data")
-    if space_group is None:
-        sg_use = None
-    else:
-        sg_use = gemmi.SpaceGroup(space_group)
-        logger.writeln(f"Space group overridden by user. Using {sg_use.xhm()}")
     if sts:
         assert source in ["electron", "xray", "neutron"]
         for st in sts:
@@ -1230,23 +1277,8 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
             logger.writeln("Warning: unit cell mismatch between model and reflection data")
             logger.writeln("         using unit cell from mtz")
-        for st in sts: st.cell = hkldata.cell # mtz cell is used in any case
-        sg_st = sts[0].find_spacegroup() # may be None
-        if sg_use is None:
-            sg_use = hkldata.sg
-            if hkldata.sg != sg_st:
-                if st.cell.is_crystal() and sg_st and sg_st.laue_str() != hkldata.sg.laue_str():
-                    raise RuntimeError("Crystal symmetry mismatch between model and data")
-                logger.writeln("Warning: space group mismatch between model and mtz")
-                if sg_st and sg_st.laue_str() == hkldata.sg.laue_str():
-                    logger.writeln("         using space group from model")
-                    sg_use = sg_st
-                else:
-                    logger.writeln("         using space group from mtz")
-                logger.writeln("")
         for st in sts:
+            st.cell = hkldata.cell # mtz cell is used in any case
             st.spacegroup_hm = sg_use.xhm()
             st.setup_cell_images()
@@ -1254,22 +1286,36 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
             utils.model.remove_charge(sts)
         utils.model.check_atomsf(sts, source)
-    if sg_use is not None:
-        hkldata.sg = sg_use
-    if newlabels[0] == "FP":
-        hkldata.remove_nonpositive(newlabels[0])
-    hkldata.remove_nonpositive(newlabels[1])
     hkldata.switch_to_asu()
     hkldata.remove_systematic_absences()
     #hkldata.df = hkldata.df.astype({name: 'float64' for name in ["I","SIGI","FP","SIGFP"] if name in hkldata.df})
-    d_min_data = hkldata.d_min_max(newlabels)[0]
-    if d_min is None and hkldata.d_min_max()[0] != d_min_data:
-        d_min = d_min_data
+    d_min_max_data = hkldata.d_min_max(newlabels)
+    if d_min is None and hkldata.d_min_max()[0] != d_min_max_data[0]:
+        d_min = d_min_max_data[0]
         logger.writeln(f"Changing resolution to {d_min:.3f} A")
     if (d_min, d_max).count(None) != 2:
         hkldata = hkldata.copy(d_min=d_min, d_max=d_max)
     if hkldata.df.empty:
         raise RuntimeError("No data left in hkl data")
+    if hklin_free is not None:
+        mtz2 = utils.fileio.read_mmhkl(hklin_free)
+        if labin_free and labin_free not in mtz2.column_labels():
+            raise RuntimeError(f"specified label ({labin_free}) not found in {hklin_free}")
+        if not labin_free:
+            tmp = utils.hkl.mtz_find_free_columns(mtz2)
+            if tmp:
+                labin_free = tmp[0]
+            else:
+                raise RuntimeError(f"Test flag label not found in {hklin_free}")
+        tmp = utils.hkl.hkldata_from_mtz(mtz2, [labin_free], newlabels=["FREE"])
+        tmp.sg = sg_use
+        tmp.switch_to_asu()
+        tmp.remove_systematic_absences()
+        tmp = tmp.copy(d_min=d_min_max_data[0], d_max=d_min_max_data[1])
+        hkldata.complete()
+        tmp.complete()
+        hkldata.merge(tmp.df[["H","K","L","FREE"]])
     hkldata.complete()
     hkldata.sort_by_resolution()
@@ -1277,9 +1323,20 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
     hkldata.calc_centric()
     if "FREE" in hkldata.df and free is None:
-        free = hkldata.guess_free_number(newlabels[0])
+        free = hkldata.guess_free_number(newlabels[0]) # also check NaN
     if n_bins is None:
+        if n_per_bin is None:
+            if use == "all" or "FREE" not in hkldata.df:
+                n_per_bin = 100
+                use = "all"
+            elif use == "work":
+                n_per_bin = 100
+            elif use == "test":
+                n_per_bin = 50
+            else:
+                raise RuntimeError(f"should not happen: {use=}")
         sel = hkldata.df[newlabels[0]].notna()
         if use == "work":
             sel &= hkldata.df.FREE != free
@@ -1292,8 +1349,6 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
         logger.writeln("n_per_bin={} requested for {}. n_bins set to {}".format(n_per_bin, use, n_bins))
     hkldata.setup_binning(n_bins=n_bins)
-    logger.writeln("Data completeness: {:.2f}%".format(hkldata.completeness()*100.))
     fc_labs = ["FC{}".format(i)  for i, _ in enumerate(sts)]
     # Create a centric selection table for faster look up
@@ -1343,6 +1398,14 @@ def process_input(hklin, labin, n_bins, free, xyzins, source, d_max=None, d_min=
             stats.loc[i_bin, "n_test"] = n_test
     stats["completeness"] = stats["n_obs"] / stats["n_all"] * 100
+    logger.writeln("Data completeness: {:.2%}".format(stats["n_obs"].sum() / stats["n_all"].sum()))
+    if ints_bak is not None:
+        binner = gemmi.Binner()
+        binner.setup(n_bins, gemmi.Binner.Method.Dstar2, ints_bak)
+        bin_stats = ints_bak.calculate_merging_stats(binner, use_weights="X")
+        stats["CC1/2"] = [stats.cc_half() for stats in bin_stats]
+        hkldata.binned_df["CC*"] = numpy.sqrt(2 * stats["CC1/2"] / (1 + stats["CC1/2"]))
     logger.writeln(stats.to_string())
     return hkldata, sts, fc_labs, centric_and_selections, free
 # process_input()
@@ -1473,13 +1536,11 @@ def calculate_maps(hkldata, b_aniso, centric_and_selections, fc_labs, D_labs, lo
                 Sigma = 2 * SigFo**2 + epsilon * S
                 X = 2 * Fo * DFc_abs / Sigma
                 m = gemmi.bessel_i1_over_i0(X)
-                hkldata.df.loc[cidxes, "FWT"] = (2 * m * Fo - DFc_abs) * expip
             else:
                 Sigma = SigFo**2 + epsilon * S
                 X = Fo * DFc_abs / Sigma
                 m = numpy.tanh(X)
-                hkldata.df.loc[cidxes, "FWT"] = (m * Fo) * expip
+            hkldata.df.loc[cidxes, "FWT"] = (2 * m * Fo - DFc_abs) * expip
             hkldata.df.loc[cidxes, "DELFWT"] = (m * Fo - DFc_abs) * expip
             hkldata.df.loc[cidxes, "FOM"] = m
             hkldata.df.loc[cidxes, "X"] = X
@@ -1500,7 +1561,8 @@ def calculate_maps(hkldata, b_aniso, centric_and_selections, fc_labs, D_labs, lo
         Fc = hkldata.df.FC.to_numpy()[idxes] * k_ani[idxes]
         Fo = hkldata.df.FP.to_numpy()[idxes]
         mean_DFc2 = numpy.nanmean(numpy.abs((Ds[idxes,:] * Fcs[idxes,:]).sum(axis=1) * k_ani[idxes])**2)
-        mean_log_DFcs = numpy.log(numpy.nanmean(numpy.abs(Ds[idxes,:] * Fcs[idxes,:] * k_ani[idxes,None]), axis=0)).tolist()
+        with numpy.errstate(divide="ignore"):
+            mean_log_DFcs = numpy.log(numpy.nanmean(numpy.abs(Ds[idxes,:] * Fcs[idxes,:] * k_ani[idxes,None]), axis=0)).tolist()
         mean_Ds = numpy.nanmean(Ds[idxes,:], axis=0).tolist()
         if sum(nrefs) > 0:
             r = numpy.nansum(numpy.abs(numpy.abs(Fc)-Fo)) / numpy.nansum(Fo)
@@ -1535,7 +1597,6 @@ def calculate_maps(hkldata, b_aniso, centric_and_selections, fc_labs, D_labs, lo
 # calculate_maps()
 def main(args):
-    n_per_bin = {"all": 500, "work": 500, "test": 50}[args.use]
     try:
         hkldata, sts, fc_labs, centric_and_selections,free = process_input(hklin=args.hklin,
                                                                            labin=args.labin.split(",") if args.labin else None,
@@ -1545,16 +1606,17 @@ def main(args):
                                                                            source=args.source,
                                                                            d_max=args.d_max,
                                                                            d_min=args.d_min,
-                                                                           n_per_bin=n_per_bin,
                                                                            use=args.use,
                                                                            max_bins=30,
                                                                            keep_charges=args.keep_charges,
-                                                                           space_group=args.spacegroup)
+                                                                           space_group=args.spacegroup,
+                                                                           hklin_free=args.hklin_free,
+                                                                           labin_free=args.labin_free)
     except RuntimeError as e:
         raise SystemExit("Error: {}".format(e))
     if args.twin:
-        twin_data = find_twin_domains_from_data(hkldata)
+        twin_data, _ = find_twin_domains_from_data(hkldata)
     else:
         twin_data = None
     if twin_data:
@@ -1621,7 +1683,7 @@ def main(args):
     if twin_data:
         labs = ["F_est", "F_exp"]
     elif is_int:
-        labs = ["I", "SIGI"]
+        labs = ["I", "SIGI", "F_est"]
     else:
         labs = ["FP", "SIGFP"]
     labs.extend(["FOM", "FWT", "DELFWT", "FC", "DFC"])

servalcat/xtal/twin.py CHANGED

@@ -21,39 +21,42 @@ def find_twin_domains_from_data(hkldata, max_oblique=5, min_alpha=0.05):
     #for op in ops:
     #    logger.writeln(f"  {op.triplet()}")
     if not ops:
-        return
+        logger.writeln("")
+        return None, None
     twin_data = ext.TwinData()
     twin_data.setup(hkldata.miller_array(), hkldata.df.bin, hkldata.sg, hkldata.cell, ops)
     if "I" in hkldata.df:
         Io = hkldata.df.I.to_numpy()
     else:
         Io = hkldata.df.FP.to_numpy()**2
-    alphas = []
     ccs, nums = [], []
+    tmp = []
     for i_bin, bin_idxes in hkldata.binned():
         ratios = [1.]
         ccs.append([])
         nums.append([])
+        rs = []
         for i_op, op in enumerate(ops):
+            cc = r = numpy.nan
             ii = numpy.array(twin_data.pairs(i_op, i_bin))
-            val = numpy.all(numpy.isfinite(Io[ii]), axis=1)
-            if numpy.sum(val) == 0:
-                cc = numpy.nan
-            else:
+            val = numpy.all(numpy.isfinite(Io[ii]), axis=1) if ii.size != 0 else []
+            if numpy.sum(val) != 0:
                 cc = numpy.corrcoef(Io[ii][val].T)[0,1]
-            rr = (1 - numpy.sqrt(1 - cc**2)) / cc
-            ratios.append(rr)
+                r = numpy.sum(numpy.abs(Io[ii][val, 0] - Io[ii][val, 1])) / numpy.sum(Io[ii][val])
+            ratio = (1 - numpy.sqrt(1 - cc**2)) / cc
+            ratios.append(ratio)
             ccs[-1].append(cc)
+            rs.append(r)
             nums[-1].append(len(val))
-        alphas.append(numpy.array(ratios) / numpy.nansum(ratios))
-    alphas = numpy.maximum(0, numpy.mean(alphas, axis=0))
-    alphas /= numpy.nansum(alphas)
+        tmp.append(rs + ccs[-1] + nums[-1] + (numpy.array(ratios) / numpy.nansum(ratios)).tolist()[1:])
+    df = pandas.DataFrame(tmp, columns=[f"{n}_op{i+1}" for n in ("R", "CC", "num", "raw_est") for i in range(len(ops))])
+    with logger.with_prefix(" "):
+        logger.writeln(df.to_string(float_format="%.4f"))
     ccs = numpy.array(ccs)
     nums = numpy.array(nums)
     tmp = [{"Operator": gemmi.Op().triplet(),
             "R_twin_obs": 0,
-            "CC_mean": 1,
-            "Alpha_from_CC": alphas[0]}]
+            "CC_mean": 1}]
     for i_op, op in enumerate(ops):
         ii = numpy.array(twin_data.pairs(i_op))
         val = numpy.all(numpy.isfinite(Io[ii]), axis=1)
@@ -61,61 +64,80 @@ def find_twin_domains_from_data(hkldata, max_oblique=5, min_alpha=0.05):
             r_obs = numpy.nan
         else:
             r_obs = numpy.sum(numpy.abs(Io[ii][val, 0] - Io[ii][val, 1])) / numpy.sum(Io[ii][val])
+        cc = numpy.sum(nums[:,i_op] * ccs[:,i_op]) / numpy.sum(nums[:,i_op])
         tmp.append({"Operator": op.triplet(),
-                    "CC_mean": numpy.sum(nums[:,i_op] * ccs[:,i_op]) / numpy.sum(nums[:,i_op]),
+                    "CC_mean": cc,
                     "R_twin_obs": r_obs,
-                    "Alpha_from_CC": alphas[i_op+1],
                     })
     df = pandas.DataFrame(tmp)
-    logger.writeln(df.to_string(float_format="%.2f"))
+    df["Alpha_from_CC"] = (1 - numpy.sqrt(1 - df["CC_mean"]**2)) / df["CC_mean"]
+    df["Alpha_from_CC"] /= numpy.nansum(df["Alpha_from_CC"])
+    logger.writeln("\n Initial twin fraction estimates:")
+    with logger.with_prefix(" "):
+        logger.writeln(df.to_string(float_format="%.2f"))
-    sel_idxes = [i for i, a in enumerate(alphas) if i > 0 and a > min_alpha]
-    if not sel_idxes:
-        logger.writeln(" No twinning detected")
-        return
+    sel = df["Alpha_from_CC"].to_numpy() > min_alpha
+    if sel[1:].sum() == 0:
+        logger.writeln(" No twinning detected\n")
+        return None, None
-    if len(sel_idxes) + 1 != len(alphas):
-        ops = [ops[i-1] for i in sel_idxes]
-        logger.writeln(" Twin operators after filtering small fractions")
-        alphas = numpy.array([alphas[0]] + [alphas[i] for i in sel_idxes])
-        alphas /= numpy.sum(alphas)
-        df = pandas.DataFrame({"Operator": [x.triplet() for x in [gemmi.Op()]+ops],
-                               "Alpha": alphas})
-        logger.writeln(df.to_string(float_format="%.2f"))
+    if not sel.all():
+        ops = [ops[i] for i in range(len(ops)) if sel[i+1]]
+        logger.writeln(f"\n Twin operators after filtering small fractions (<= {min_alpha})")
+        df = df[sel]
+        df["Alpha_from_CC"] /= numpy.nansum(df["Alpha_from_CC"])
+        with logger.with_prefix(" "):
+            logger.writeln(df.to_string(float_format="%.2f"))
         twin_data = ext.TwinData()
         twin_data.setup(hkldata.miller_array(), hkldata.df.bin, hkldata.sg, hkldata.cell, ops)
-    twin_data.alphas = alphas
+    twin_data.alphas = df["Alpha_from_CC"].tolist()
     if "I" not in hkldata.df:
         logger.writeln('Generating "observed" intensities for twin refinement: Io = Fo**2, SigIo = 2*F*SigFo')
         hkldata.df["I"] = hkldata.df.FP**2
         hkldata.df["SIGI"] = 2 * hkldata.df.FP * hkldata.df.SIGFP
-    return twin_data
+    logger.writeln("")
+    return twin_data, df
 # find_twin_domains_from_data()
 def estimate_twin_fractions_from_model(twin_data, hkldata):
     logger.writeln("Estimating twin fractions")
     Ic = numpy.abs(twin_data.f_calc.sum(axis=1))**2
-    Ic_all = Ic[twin_data.twin_related(hkldata.sg)]
+    idx_all = twin_data.twin_related(hkldata.sg)
+    Ic_all = Ic[idx_all]
+    Ic_all[(idx_all < 0).any(axis=1)] = numpy.nan
     rr = twin_data.obs_related_asu()
     tmp = []
+    P_list, cc_oc_list, weight_list = [], [], []
+    n_ops = len(twin_data.ops) + 1
+    tidxes = numpy.triu_indices(n_ops, 1)
     for i_bin, bin_idxes in hkldata.binned():
-        cc_o_c = []
         i_tmp = Ic_all[numpy.asarray(twin_data.bin)==i_bin,:]
+        i_tmp = i_tmp[numpy.isfinite(i_tmp).all(axis=1)]
         P = numpy.corrcoef(i_tmp.T)
         iobs = hkldata.df.I.to_numpy()[bin_idxes]
         ic_bin = Ic[rr[bin_idxes,:]]
-        val = numpy.isfinite(iobs) & numpy.isfinite(ic_bin).all(axis=1)
+        val = numpy.isfinite(iobs) & numpy.isfinite(ic_bin).all(axis=1) & numpy.all(rr[bin_idxes,:]>=0, axis=1)
         iobs, ic_bin = iobs[val], ic_bin[val,:]
-        cc_o_c = [numpy.corrcoef(iobs, ic_bin[:,i])[0,1] for i in range(len(twin_data.ops)+1)]
-        frac_est = numpy.dot(numpy.linalg.pinv(P), cc_o_c)
-        tmp.append(frac_est.tolist())
-    df = pandas.DataFrame(tmp)
-    df.iloc[:,:] /= df.sum(axis=1).to_numpy()[:,None]
-    mean_alphas = numpy.maximum(0, df.mean())
-    mean_alphas /= numpy.sum(mean_alphas)
-    logger.write(" Estimated fractions from data-model correlations: ")
-    logger.writeln(" ".join("%.2f"%x for x in mean_alphas))
-    twin_data.alphas = mean_alphas
+        cc_oc = [numpy.corrcoef(iobs, ic_bin[:,i])[0,1] for i in range(n_ops)]
+        P_list.append(P)
+        cc_oc_list.append(cc_oc)
+        weight_list.append(numpy.sum(val))
+        frac_est = numpy.dot(numpy.linalg.pinv(P), cc_oc)
+        frac_est /= frac_est.sum()
+        tmp.append(P[tidxes].tolist() + cc_oc + [weight_list[-1]] + frac_est.tolist())
+    P = numpy.average(P_list, axis=0, weights=weight_list)
+    cc_oc = numpy.average(cc_oc_list, axis=0, weights=weight_list)
+    frac_est = numpy.dot(numpy.linalg.pinv(P), cc_oc)
+    frac_est = numpy.maximum(0, frac_est)
+    frac_est /= frac_est.sum()
+    df = pandas.DataFrame(tmp, columns=[f"cc_{i+1}_{j+1}" for i, j in zip(*tidxes)] +
+                          [f"cc_o_{i+1}" for i in range(n_ops)] +
+                          ["nref"] + [f"raw_est_{i+1}" for i in range(n_ops)])
+    with logger.with_prefix(" "):
+        logger.writeln(df.to_string(float_format="%.4f"))
+    logger.write(" Final twin fraction estimate: ")
+    logger.writeln(" ".join("%.2f"%x for x in frac_est))
+    twin_data.alphas = frac_est
+    return df