PyPI - bvalcalc - Versions diffs - 0.6.2__tar.gz - Mend

bvalcalc 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)

bvalcalc-0.6.2/Bvalcalc/__init__.py +16 -0
bvalcalc-0.6.2/Bvalcalc/__main__.py +4 -0
bvalcalc-0.6.2/Bvalcalc/cli.py +68 -0
bvalcalc-0.6.2/Bvalcalc/core/__init__.py +20 -0
bvalcalc-0.6.2/Bvalcalc/core/calculateB.py +351 -0
bvalcalc-0.6.2/Bvalcalc/core/chromBcalc.py +194 -0
bvalcalc-0.6.2/Bvalcalc/core/deprecated/Bcalc_stdOut.py +118 -0
bvalcalc-0.6.2/Bvalcalc/core/deprecated/calculate_B_analytically_Eq3_mine_demography.py +80 -0
bvalcalc-0.6.2/Bvalcalc/core/deprecated/findFlankLen.py +23 -0
bvalcalc-0.6.2/Bvalcalc/core/deprecated/old_calculateB.py +35 -0
bvalcalc-0.6.2/Bvalcalc/core/deprecated/plotB_figures.py +273 -0
bvalcalc-0.6.2/Bvalcalc/core/deprecated/plotB_figures_200kb.py +208 -0
bvalcalc-0.6.2/Bvalcalc/core/geneBcalc.py +73 -0
bvalcalc-0.6.2/Bvalcalc/core/genomeBcalc.py +50 -0
bvalcalc-0.6.2/Bvalcalc/core/helpers/__init__.py +19 -0
bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_B_from_chunks.py +89 -0
bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_B_in_genes.py +104 -0
bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_B_in_hri_region.py +39 -0
bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_B_precise_noninterfering.py +170 -0
bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_L_per_chunk.py +65 -0
bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_R_len_dist.py +122 -0
bvalcalc-0.6.2/Bvalcalc/core/helpers/demography_helpers.py +45 -0
bvalcalc-0.6.2/Bvalcalc/core/helpers/extend_hri_regions_correction.py +113 -0
bvalcalc-0.6.2/Bvalcalc/core/helpers/process_single_chunk.py +131 -0
bvalcalc-0.6.2/Bvalcalc/core/plotB.py +174 -0
bvalcalc-0.6.2/Bvalcalc/core/plotChromB.py +47 -0
bvalcalc-0.6.2/Bvalcalc/core/positionsBstats.py +131 -0
bvalcalc-0.6.2/Bvalcalc/core/regionBcalc.py +49 -0
bvalcalc-0.6.2/Bvalcalc/core/siteBcalc.py +19 -0
bvalcalc-0.6.2/Bvalcalc/templates/ArabidopsisParams.py +0 -0
bvalcalc-0.6.2/Bvalcalc/templates/CelegansParams.py +1 -0
bvalcalc-0.6.2/Bvalcalc/templates/DrosophilaParams.py +29 -0
bvalcalc-0.6.2/Bvalcalc/templates/HumanParams.py +28 -0
bvalcalc-0.6.2/Bvalcalc/templates/MouseParams.py +0 -0
bvalcalc-0.6.2/Bvalcalc/templates/PfalciparumParams.py +29 -0
bvalcalc-0.6.2/Bvalcalc/templates/SelfingParams.py +29 -0
bvalcalc-0.6.2/Bvalcalc/utils/__init__.py +26 -0
bvalcalc-0.6.2/Bvalcalc/utils/bin_outputs.py +37 -0
bvalcalc-0.6.2/Bvalcalc/utils/dfe_helper.py +117 -0
bvalcalc-0.6.2/Bvalcalc/utils/generateParams.py +46 -0
bvalcalc-0.6.2/Bvalcalc/utils/load_Bmap.py +39 -0
bvalcalc-0.6.2/Bvalcalc/utils/load_bed_gff.py +94 -0
bvalcalc-0.6.2/Bvalcalc/utils/load_chr_sizes.py +28 -0
bvalcalc-0.6.2/Bvalcalc/utils/load_rec_map.py +81 -0
bvalcalc-0.6.2/Bvalcalc/utils/load_vcf.py +38 -0
bvalcalc-0.6.2/Bvalcalc/utils/parseArgs.py +161 -0
bvalcalc-0.6.2/Bvalcalc/utils/write_chrom_B_to_file.py +67 -0
bvalcalc-0.6.2/Bvalcalc.egg-info/PKG-INFO +714 -0
bvalcalc-0.6.2/Bvalcalc.egg-info/SOURCES.txt +64 -0
bvalcalc-0.6.2/Bvalcalc.egg-info/dependency_links.txt +1 -0
bvalcalc-0.6.2/Bvalcalc.egg-info/entry_points.txt +3 -0
bvalcalc-0.6.2/Bvalcalc.egg-info/requires.txt +11 -0
bvalcalc-0.6.2/Bvalcalc.egg-info/top_level.txt +1 -0
bvalcalc-0.6.2/LICENSE +675 -0
bvalcalc-0.6.2/PKG-INFO +714 -0
bvalcalc-0.6.2/README.md +6 -0
bvalcalc-0.6.2/pyproject.toml +63 -0
bvalcalc-0.6.2/setup.cfg +4 -0
bvalcalc-0.6.2/tests/test_calculateB.py +6 -0
bvalcalc-0.6.2/tests/test_cli.py +271 -0

bvalcalc-0.6.2/Bvalcalc/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""
+bvalcalc: calculate relative diversity (B) under background selection.
+"""
+__version__ = "0.6.2"
+# Expose main entry point
+from .cli import main
+from .core.calculateB import calculateB_linear, calculateB_recmap, calculateB_unlinked, get_params
+__all__ = [
+    "get_params", "calculateB_linear", "calculateB_unlinked",
+    "main",
+    "__version__",
+]

bvalcalc-0.6.2/Bvalcalc/__main__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .cli import main
+# Run the command-line entry point only when this module is executed as a
+# script (e.g. through `python -m`), never as a side effect of importing it.
+if __name__ == "__main__":
+    main()

bvalcalc-0.6.2/Bvalcalc/cli.py ADDED Viewed

@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+import os
+import sys
+import time
+import argparse
+from Bvalcalc.utils.parseArgs import parse_args, parseGenomeArgs, parseRegionArgs, parseGeneArgs, parseSiteArgs, parseBmapArgs
+from Bvalcalc.core.plotB import plotB
+from Bvalcalc.core.deprecated.plotB_figures import plotB_figures
+from Bvalcalc.core.deprecated.plotB_figures_200kb import plotB_figures_200kb
+from Bvalcalc.utils.generateParams import SPECIES, generateParams, check_generate_params_args
+from Bvalcalc.core.positionsBstats import positionsBstats
+from Bvalcalc.core.plotChromB import plotChromB
+__version__ = "0.6.2"
+def main():
+    start_time = time.time()
+    check_generate_params_args() # Unique error message for --generate_params to print species names
+    parser = parse_args(__version__)
+    known_args, remaining_args = parser.parse_known_args()
+    if known_args.generate_params is not None: # if --generate_params
+        print(f"Retrieving params from template...")
+        generateParams(known_args.generate_params, known_args.dir)
+        return
+    if known_args.Bmap is not None: # if --Bmap
+        args = parseBmapArgs(remaining_args)
+        flat_b, flat_chrom = positionsBstats(args, known_args.Bmap)
+        if args.plot_distribution:
+            plotChromB(flat_b, flat_chrom, args.plot_distribution, args.quiet)
+        return
+    print(f"= Calculating relative diversity (B) for all neutral sites across the genome. = = =")
+    if known_args.genome: # Run genome Bcalc
+        args = parseGenomeArgs(remaining_args)
+        os.environ["BCALC_POP_PARAMS"] = args.pop_params  # Save params to global
+        from Bvalcalc.core.genomeBcalc import genomeBcalc
+        genomeBcalc(args)
+    elif known_args.region: # Run region Bcalc
+        args = parseRegionArgs(remaining_args)
+        os.environ["BCALC_POP_PARAMS"] = args.pop_params  # Save params to global
+        from Bvalcalc.core.regionBcalc import regionBcalc
+        output_data, block_ranges, rec_rate_per_chunk_in_region, chunk_size = regionBcalc(args, known_args.region)
+        if getattr(args, 'plot_output', True):
+            plotB(b_values_input=output_data, caller="chromosome", output_path=args.plot_output, quiet=args.quiet, gene_ranges=block_ranges, neutral_only=args.neutral_only, rec_rates=rec_rate_per_chunk_in_region, chunk_size=chunk_size)
+    elif known_args.gene: # Run gene Bcalc
+        args = parseGeneArgs(remaining_args)
+        os.environ["BCALC_POP_PARAMS"] = args.pop_params  # Save params to global
+        from Bvalcalc.core.geneBcalc import geneBcalc
+        output_data = geneBcalc(args) # Capture the output from geneBcalc
+        if getattr(args, 'plot_output', False): # If the --plot_output flag was provided, call plotB with geneBcalc's output.
+            plotB(b_values_input=output_data, caller="gene", output_path=args.plot_output, quiet=args.quiet)
+    elif known_args.site: # Run single site Bcalc
+        args = parseSiteArgs(remaining_args)
+        os.environ["BCALC_POP_PARAMS"] = args.pop_params  # Save params to global
+        from Bvalcalc.core.siteBcalc import siteBcalc
+        siteBcalc(args)
+    print(f"= B value calculated in {time.time() - start_time:.2f} seconds. = = =")
+if __name__ == "__main__":
+    main()

bvalcalc-0.6.2/Bvalcalc/core/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""
+Core calculation modules for bvalcalc.
+"""
+from .genomeBcalc import genomeBcalc
+from .regionBcalc import regionBcalc
+from .geneBcalc import geneBcalc
+from .siteBcalc import siteBcalc
+from .plotB import plotB
+from .calculateB import calculateB_linear, calculateB_recmap, calculateB_unlinked
+# Names re-exported by the `core` subpackage: the four calculation entry
+# points, the plotting helper and the three calculateB_* functions.
+__all__ = [
+    "genomeBcalc",
+    "regionBcalc",
+    "geneBcalc",
+    "siteBcalc",
+    "plotB",
+    "calculateB_linear",
+    "calculateB_recmap",
+    "calculateB_unlinked"
+]

bvalcalc-0.6.2/Bvalcalc/core/calculateB.py ADDED Viewed

@@ -0,0 +1,351 @@
+import numpy as np
+from Bvalcalc.utils.dfe_helper import get_DFE_params
+from scipy.optimize import root_scalar
+from scipy.integrate import trapezoid
+_params_cache: dict | None = None
+_cache_args: tuple[str | None, bool, bool] | None = None
+def get_params(
+    params_path: str | None = None,
+    gamma_dfe: bool = False,
+    constant_dfe: bool = False,
+):
+    """
+    Load the parameter dictionary by delegating to ``get_DFE_params``.
+
+    The three arguments are forwarded unchanged. The result is rebuilt on
+    every call: caching keyed on (params_path, gamma_dfe, constant_dfe) is
+    currently disabled (see the commented-out lines below). The most recent
+    result is still stored in the module-level ``_params_cache`` and returned.
+    """
+    global _params_cache#, _cache_args # COMMENTED OUT CACHING FOR API USAGE, CAN RE-IMPLEMENT FOR CLI IF IT SLOWS IT DOWN
+    # key = (params_path, gamma_dfe, constant_dfe)
+    # if _cache_args != key:
+    _params_cache = get_DFE_params(params_path, gamma_dfe, constant_dfe)
+    # _cache_args = key
+    return _params_cache
+def calculateB_linear(distance_to_element: int, length_of_element: int, params: dict | None = None):
+    """
+    Calculate B due to purifying selection acting on a linked selected element of arbitrary length, assuming a constant crossover and gene conversion rate (analytical solution).
+    Parameters
+    ----------
+    distance_to_element: int
+        Distance (bp) from the neutral site to the nearest edge of the selected element.
+    length_of_element: int
+        Length (bp) of the selected element.
+    params : dict
+        Required parameters from ``get_params()``, only kept as default (None) when being called by CLI,
+        in which case parameters are sourced from the params file directly.
+    Returns
+    -------
+    The result of ``np.where(length_of_element == 0, 1.0, B)``: B is forced
+    to 1.0 wherever the element has zero length.
+    """
+    # Divisions by zero / invalid values are silenced here; the a == b case is
+    # patched inside calculate_exponent and zero-length elements by np.where below.
+    with np.errstate(divide='ignore', invalid='ignore'):
+        if params is None:
+            params = get_params()
+        r, u, g, k, t1, t1half, t2, t3, t4, f1, f2, f3, f0, t_constant = params["r"], params["u"], params["g"], params["k"], params["t1"], params["t1half"], params["t2"], params["t3"], params["t4"], params["f1"], params["f2"], params["f3"], params["f0"], params["t_constant"]
+        C = (1.0 - np.exp(-2.0 * r * distance_to_element)) / 2.0 # cM
+        U = length_of_element * u
+        # NOTE(review): if g is negative neither branch runs and a, b stay unassigned.
+        if g == 0:
+            a = C # RECOMBINATION IN Y
+            b = C + (r * length_of_element) # RECOMBINATION IN X
+        elif g > 0:
+            # NOTE(review): get_a_b_with_GC re-reads parameters through get_params()
+            # rather than using the `params` dict received here.
+            a, b = get_a_b_with_GC(C, distance_to_element, length_of_element)
+        if t_constant: #If --constant_dfe is active
+            E_constant = calculate_exponent(t_constant, t_constant, U, a, b)
+            B = np.exp(-1.0 * E_constant)
+            return np.where(length_of_element == 0, 1.0, B)
+        # One exponent per DFE interval; the f1 class is only integrated from t1half upwards.
+        E_f1 = calculate_exponent(t1half, t2, U, a, b)
+        E_f2 = calculate_exponent(t2, t3, U, a, b)
+        E_f3 = calculate_exponent(t3, t4, U, a, b)
+        # The f0 class and the part of f1 below t1half are given a zero exponent;
+        # the remaining f1 mass is weighted by (t2 - t1half) / (t2 - t1).
+        E_bar = ( # Sum over the DFE
+            f0 * 0.0
+            + f1 * ((t1half - t1) / (t2 - t1)) * 0.0
+            + f1 * ((t2 - t1half) / (t2 - t1)) * E_f1
+            + f2 * E_f2
+            + f3 * E_f3)
+        B = np.exp(-1.0 * E_bar)
+    return np.where(length_of_element == 0, 1.0, B)
+def calculateB_recmap(distance_to_element, length_of_element,
+                      rec_distances = None, rec_lengths = None,
+                      gc_distances = None, gc_lengths = None, params = None):
+    """
+    Calculate the B value WITH REC MAP for a single functional element at the focal site,
+    summing over the DFE while consolidating the intermediate calculations.
+
+    When ``rec_distances`` is given, ``rec_distances`` / ``rec_lengths`` replace
+    the physical distance / length in the recombination terms. When
+    ``gc_distances`` is given, the gene conversion rate ``g`` is rescaled by
+    (gc_lengths + gc_distances) / (length_of_element + distance_to_element).
+    ``params`` defaults to ``get_params()`` when None.
+
+    Returns ``np.where(length_of_element == 0, 1.0, B)``, i.e. B is forced to
+    1.0 wherever the element has zero length.
+    """
+    with np.errstate(divide='ignore', invalid='ignore'):
+        if params is None:
+            params = get_params()
+        r, u, g, k, t1, t1half, t2, t3, t4, f1, f2, f3, f0, t_constant = params["r"], params["u"], params["g"], params["k"], params["t1"], params["t1half"], params["t2"], params["t3"], params["t4"], params["f1"], params["f2"], params["f3"], params["f0"], params["t_constant"]
+        # rec_distances is the length of the element * rec rate in each spanned region.
+        # NOTE(review): rec_lengths is used whenever rec_distances is given; it is
+        # not checked for None separately.
+        if rec_distances is not None:
+            rec_adjusted_length_of_element = rec_lengths
+            rec_adjusted_distance_to_element = rec_distances
+        else:
+            rec_adjusted_length_of_element = length_of_element
+            rec_adjusted_distance_to_element = distance_to_element
+        if gc_distances is not None:
+            local_g = (gc_lengths + gc_distances)/(length_of_element + distance_to_element) * g
+        else:
+            local_g = g
+        C = (1.0 - np.exp(-2.0 * r * rec_adjusted_distance_to_element)) / 2.0 # cM
+        U = length_of_element * u
+        # The branch is chosen on the global g, not on local_g.
+        # NOTE(review): if g is negative neither branch runs and a, b stay unassigned.
+        if g == 0:
+            a = C
+            b = C + r * rec_adjusted_length_of_element # cM
+        elif g > 0:
+             a, b = get_a_b_with_GC_andMaps(C, y=distance_to_element, l=length_of_element,
+                                            rec_l=rec_adjusted_length_of_element, local_g = local_g)
+        if t_constant: #If --constant_dfe is active
+            E_constant = calculate_exponent(t_constant, t_constant, U, a, b)
+            B = np.exp(-1.0 * E_constant)
+            return np.where(length_of_element == 0, 1.0, B)
+        # One exponent per DFE interval; the f1 class is only integrated from t1half upwards.
+        E_f1 = calculate_exponent(t1half, t2, U, a, b)
+        E_f2 = calculate_exponent(t2, t3, U, a, b)
+        E_f3 = calculate_exponent(t3, t4, U, a, b)
+        E_bar = ( # Sum over the DFE
+            f0 * 0.0
+            + f1 * ((t1half - t1) / (t2 - t1)) * 0.0
+            + f1 * ((t2 - t1half) / (t2 - t1)) * E_f1
+            + f2 * E_f2
+            + f3 * E_f3)
+        B = np.exp(-1.0 * E_bar)
+    return np.where(length_of_element == 0, 1.0, B)
+def calculateB_unlinked(unlinked_L: int, params: dict | None = None):
+    """
+    Calculate B due to purifying selection at unlinked sites (numerical integration over DFE).
+    Parameters
+    ----------
+    unlinked_L : float
+        Cumulative count of selected sites in unlinked regions.
+    params : dict
+        Required parameters from ``get_params()``, only kept as default (None) when being called by CLI,
+        in which case parameters are sourced from the params file directly.
+    """
+    if params is None:
+        params = get_params()
+    u, t1, t1half, t2, t3, t4, f0, f1, f2, f3, t_constant = params["u"], params["t1"], params["t1half"], params["t2"], params["t3"], params["t4"], params["f0"], params["f1"], params["f2"], params["f3"], params["t_constant"]
+    if t_constant: #If --constant_dfe is active
+        unlinked_B  = np.exp(-8 * u * 1.0 * unlinked_L * (t_constant/(1 + t_constant)**2))
+        return unlinked_B
+    f1_above_cutoff = f1 * ((t1half - t1) / (t2 - t1))
+    sum_f1 = (f1_above_cutoff / (t2 - t1half)) * (np.log((1 + t2) /(1 + t1half)) + (1 / (1 + t2)) - (1 / (1 + t1half)))
+    sum_f2 = (f2 / (t3 - t2)) * (np.log((1 + t3) /(1 + t2)) + (1 / (1 + t3)) - (1 / (1 + t2)))
+    sum_f3 = (f3 / (t4 - t3)) * (np.log((1 + t4) /(1 + t3)) + (1 / (1 + t4)) - (1 / (1 + t3)))
+    unlinked_B  = np.exp(-8 * u * 1.0 * unlinked_L * (sum_f1 + sum_f2 + sum_f3))
+    return unlinked_B
+##
+## Helper functions
+def calculate_exponent(t_start, t_end, U, a, b):
+    """
+    Helper to calculate the exponent using "a" and "b"
+
+    ``t_start == t_end`` selects the single-value (--constant_dfe) expression;
+    otherwise the expression for the interval [t_start, t_end] is used. Both
+    divide by (a - b), so positions where ``a`` is close to ``b`` are
+    overwritten afterwards with a separate closed-form limit.
+    Returns 0 for empty ``U``, otherwise the exponent ``E``.
+    """
+    a, b, U = np.asarray(a), np.asarray(b), np.asarray(U)
+    if U.size == 0: return 0 # Empty input: nothing to compute
+    if t_end == t_start: # If --constant_dfe
+        E = (U / (a - b)) * (
+            a / (a + (1 - a) * t_start) -
+            b / (b + (1 - b) * t_start)
+        )
+    else: # Using discretized DFE (f0,f1,f2,f3 or --gamma_dfe)
+        E1 = ((U * a)
+                / ((1 - a) * (a - b) * (t_end - t_start))) * np.log((a + (t_end * (1 - a)))
+                / (a + (t_start * (1 - a))))
+        E2 = -1.0 * ((U * b)
+                / ((1 - b) * (a - b) * (t_end - t_start))) * np.log((b + ((1 - b) * t_end))
+                / (b + ((1 - b) * t_start)))
+        E = np.asarray(E1 + E2)
+    rec_0_mask = np.isclose(a, b)  # Get mask for where recombination rate = 0 within the gene
+    if rec_0_mask.any():
+        if a.ndim == 0: # a is a 0-d (scalar) array: compute the limit once
+            limit_factor = (1 / ((t_end - t_start)*(1-a)**2)) * ( # Calculate exponent with 0 recombination between gene and site, avoiding limits
+                np.log((a + (1 - a) * t_end)
+                       / (a + (1 - a) * t_start))
+                + a / (a + (1 - a) * t_end)
+                - a / (a + (1 - a) * t_start))
+            # The expression above divides by (t_end - t_start); for a single t it is replaced here.
+            if t_start == t_end: limit_factor = t_start / (a + (1 - a) * t_start)**2 # If --constant_dfe
+            # Broadcast scalar limit_factor to all masked positions
+            E[rec_0_mask] = U[rec_0_mask] * limit_factor  # Get corresponding U for the numerator and plug back into E array to replace nan's
+        else: # a is an array: compute the limit for each masked element
+            ae = a[rec_0_mask]  # array of a_i where a_i ≈ b_i
+            limit_factor = (1 / ((t_end - t_start)*(1-ae)**2)) * ( # Calculate exponent with 0 recombination between gene and site, avoiding limits
+                np.log((ae + (1 - ae) * t_end)
+                       / (ae + (1 - ae) * t_start))
+                + ae / (ae + (1 - ae) * t_end)
+                - ae / (ae + (1 - ae) * t_start))
+            if t_start == t_end: limit_factor = t_start / (ae + (1 - ae) * t_start)**2 # If --constant_dfe
+            # Pair each masked position of E with the U value at the same position.
+            # NOTE(review): the original author flagged this assignment as a fix for a
+            # "far gene" issue that may need reverting - confirm it is still wanted.
+            E[rec_0_mask] = U[rec_0_mask] * limit_factor
+    return E
+def get_a_b_with_GC(C, y, l):
+        """
+        Return the pair (a, b) used by calculate_exponent when g > 0.
+
+        Called by calculateB_linear with ``y`` = distance to the element and
+        ``l`` = length of the element; ``C`` is the crossover term computed there.
+        ``r``, ``g`` and ``k`` come from ``get_params()``.
+        NOTE(review): parameters are always re-read through ``get_params()``
+        with its default arguments, so a ``params`` dict handed to the caller
+        is not used here - confirm this is intended for API usage.
+        NOTE(review): ``k`` is presumably the gene conversion tract length
+        (the inline comments speak of a "tract") - confirm against get_DFE_params.
+        """
+        # Divide/invalid warnings are silenced because np.where evaluates both
+        # alternatives for every element, including the one that is discarded.
+        with np.errstate(divide='ignore', invalid='ignore'):
+            params = get_params()
+            r, u, g, k, t1, t1half, t2, t3, t4, f1, f2, f3, f0 = params["r"], params["u"], params["g"], params["k"], params["t1"], params["t1half"], params["t2"], params["t3"], params["t4"], params["f1"], params["f2"], params["f3"], params["f0"]
+            proportion_nogc_a = np.where(k < y + l, # When GC includes neutral site, this is proportion of the gene it includes
+                                        np.maximum((0.5*(k-y)/l), 0),
+                                        1-(y + l)/(2 * k)
+                                        )
+            proportion_nogc_b = np.where(k < y + l, # When GC includes gene site, this is probability the tract includes neutral site of interest
+                                    1/(2*k) * np.maximum(k-y+1,0) * np.maximum(k - y, 0) / l, # When overshooting not possible
+                                    (k - y - 0.5 * l) / k) # When overshooting possible
+        a = np.where(k < y,
+            C + (2 * g * k), # Probability of GC on neutral site, where overlap with element not possible
+            C + (2 * g * (y) + # When overlap possible this is probability gc is in neutral but doesn't include any of element
+                g * (k - y) * # Probability gc is in neutral and includes some element (remaining probability from above)
+                (1 - proportion_nogc_a) # Proportion of gene that gc breaks linkage with when it includes some element
+        ))
+        b = C + (r * l) + (2 * g * k) * (1 - (1-proportion_nogc_a)*proportion_nogc_b) #* prop k out
+        return a, b
+def get_a_b_with_GC_andMaps(C, y, l, rec_l, local_g):
+        """
+        Variant of get_a_b_with_GC for use with recombination / gene conversion maps.
+
+        Called by calculateB_recmap with ``y`` = distance to the element,
+        ``l`` = length of the element, ``rec_l`` = the map-adjusted element
+        length used in the crossover term of ``b``, and ``local_g`` = the
+        locally rescaled gene conversion rate used in place of ``g``.
+        ``r`` and ``k`` come from ``get_params()``.
+        NOTE(review): parameters are always re-read through ``get_params()``
+        with its default arguments, so a ``params`` dict handed to the caller
+        is not used here - confirm this is intended for API usage.
+        Returns the pair (a, b).
+        """
+        params = get_params()
+        r, u, g, k, t1, t1half, t2, t3, t4, f1, f2, f3, f0 = params["r"], params["u"], params["g"], params["k"], params["t1"], params["t1half"], params["t2"], params["t3"], params["t4"], params["f1"], params["f2"], params["f3"], params["f0"]
+        # Divide/invalid warnings are silenced because np.where evaluates both
+        # alternatives for every element, including the one that is discarded.
+        with np.errstate(divide='ignore', invalid='ignore'):
+            proportion_nogc_a = np.where(k < y + l, # When GC includes neutral site, this is proportion of the gene it includes
+                                        np.maximum((0.5*(k-y)/l), 0),
+                                        ((y) * (2 * k - (y + l)))/(2 * k * y)
+                                        )
+            proportion_nogc_b = np.where(k < y + l, # When GC includes gene site, this is probability the tract includes neutral site of interest
+                                    1/(2*k) * np.maximum(k-y+1,0) * np.maximum(k - y, 0) / l,
+                                    (k - y - 0.5 * l) / k)
+        a = np.where(k < y,
+            C + (2 * local_g * k), # Probability of GC on neutral site, where overlap with element not possible
+            C + (2 * local_g * (y) + # When overlap possible this is probability gc is in neutral but doesn't include any of element
+                local_g * (k - y) * # Probability gc is in neutral and includes some element (remaining probability from above)
+                (1 - proportion_nogc_a) # Proportion of gene that gc breaks linkage with when it includes some element
+        ))
+        b = C + (r * rec_l) + (2 * local_g * k) * (1 - (1-proportion_nogc_a)*proportion_nogc_b) #* prop k out
+        return a, b
+def calculateB_hri(distant_B, interfering_L, params: dict | None = None):
+    """
+    Fully vectorized calculation of B' under Hill-Robertson interference.
+
+    ``distant_B`` and ``interfering_L`` are coerced to 1-D float arrays.
+    ``params`` must provide "Nanc", "u", "f1" and "f2"; when None it is loaded
+    with ``get_DFE_params()``.
+    The calculation has two stages: ``solve_eq4_batched`` finds B as a root of
+    ``eq4`` on a fixed grid, then ``eq5_vectorized`` integrates numerically to
+    obtain B'.
+    Returns a single value when ``distant_B`` holds exactly one element,
+    otherwise the array of B' values.
+    """
+    if params is None:
+        params = get_DFE_params()
+    Nanc, u, f1, f2 = params["Nanc"], params["u"], params["f1"], params["f2"]
+    distant_B = np.atleast_1d(distant_B).astype(float)
+    interfering_L = np.atleast_1d(interfering_L).astype(float)
+    # After atleast_1d the shape is never (), so only the (1,) test can match.
+    scalar_input = distant_B.shape == () or distant_B.shape == (1,)
+    N0 = distant_B * Nanc
+    h = 0.5
+    u = 2 * u
+    u1 = f1 * u
+    u2 = f2 * u
+    u_total = u1 + u2
+    # (lo**2 + lo*hi + hi**2) / 3 is the second moment of a uniform variable on
+    # [lo, hi]; here the ranges are [1, 10] and [10, 100].
+    # NOTE(review): presumably the scaled-selection ranges of the f1 and f2 DFE classes - confirm.
+    E_X2_f1 = (1**2 + 1*10 + 10**2) / 3
+    E_X2_f2 = (10**2 + 10*100 + 100**2) / 3
+    t_sq1 = (h**2 * E_X2_f1) / (4 * N0**2)
+    t_sq2 = (h**2 * E_X2_f2) / (4 * N0**2)
+    # Root of the mutation-rate-weighted mean of the two squared terms.
+    t = np.sqrt((u1 * t_sq1 + u2 * t_sq2) / u_total)
+    gamma = 2 * N0 * t
+    U = u_total * interfering_L
+    alpha2 = 2 * N0 * U
+    kappa = 1.0
+    def eq4(B, U, gamma, t):
+        # Residual whose root in B is searched for by solve_eq4_batched.
+        exp_term = np.exp(-gamma * B)
+        num = 0.5 * U * (1 - exp_term)**3
+        denom = t * (1 + kappa * exp_term)**3
+        return -np.log(B) - num / denom
+    def solve_eq4_batched(U, gamma, t, n=500):
+        # Evaluate eq4 on n grid points in [1e-10, 1] for every input row, take the
+        # first positive-to-negative sign change and interpolate linearly across it.
+        Bgrid = np.linspace(1e-10, 1.0, n)[None, :]
+        U = np.asarray(U).reshape(-1, 1)
+        gamma = np.asarray(gamma).reshape(-1, 1)
+        t = np.asarray(t).reshape(-1, 1)
+        fvals = eq4(Bgrid, U, gamma, t)
+        signs = np.sign(fvals)
+        crossing = np.diff(signs, axis=1) < 0
+        # NOTE(review): argmax returns 0 for a row with no crossing at all, so such
+        # a row is silently interpolated between the first two grid points.
+        idx = np.argmax(crossing, axis=1)
+        B_left = Bgrid[0, idx]
+        B_right = Bgrid[0, idx + 1]
+        f_left = fvals[np.arange(len(U)), idx]
+        f_right = fvals[np.arange(len(U)), idx + 1]
+        B_root = B_left - f_left * (B_right - B_left) / (f_right - f_left)
+        return B_root
+    Bval = solve_eq4_batched(U, gamma, t)
+    def eq5_vectorized(B, alpha2, gamma, Tmax=100.0, n_steps=2000):
+        # Numerical integration on n_steps points over [0, Tmax], one row per input.
+        x = np.linspace(0, Tmax, n_steps)[None, :]  # shape (1, n_steps)
+        dx = x[0, 1] - x[0, 0]
+        B = B[:, None]
+        alpha2 = alpha2[:, None]
+        gamma = gamma[:, None]
+        # These f1 / f2 are local and shadow the DFE proportions read from params above.
+        f1 = 1 - np.exp(-gamma * B)
+        f2 = 1 + kappa * np.exp(-gamma * B)
+        A = f1 / f2
+        c = 0.5 * alpha2 / gamma * A**3
+        d = 2 * gamma * B * (f2 / f1)
+        x_broadcasted = np.broadcast_to(x, (B.shape[0], x.shape[1]))
+        gx = np.exp(c * (1 - np.exp(-d * x_broadcasted))**2)
+        # Running trapezoid integral of gx along x, with a leading zero so that it
+        # has one value per grid point.
+        cumI = np.cumsum((gx[:, :-1] + gx[:, 1:]) * 0.5 * dx, axis=1)
+        cumI = np.hstack([np.zeros((gx.shape[0], 1)), cumI])
+        hx = np.exp(-B * cumI)
+        Bprime = B[:, 0] * trapezoid(hx, x[0], axis=1)
+        return Bprime
+    Bprime = eq5_vectorized(Bval, alpha2, gamma)
+    return Bprime[0] if scalar_input else Bprime

bvalcalc-0.6.2/Bvalcalc/core/chromBcalc.py ADDED Viewed

@@ -0,0 +1,194 @@
+from Bvalcalc.core.helpers.process_single_chunk import process_single_chunk
+from Bvalcalc.core.helpers.calc_L_per_chunk import calculate_L_per_chunk
+from Bvalcalc.core.helpers.demography_helpers import get_Bcur
+from Bvalcalc.utils.load_rec_map import load_rec_map
+from Bvalcalc.utils.bin_outputs import bin_outputs
+from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import as_completed
+import numpy as np
+import os
+import sys
+def chromBcalc(args, blockstart, blockend, chromosome, unlinked_B, prior_pos = None, prior_b = None, calc_start=None, calc_end=None, chr_size=None, caller="regionBcalc"):
+    #Shared arguments between genomeBcalc and regionBcalc
+    file_path, chunk_size, precise_chunks, no_hri, quiet, verbose = args.bedgff_path, args.chunk_size, args.precise_chunks, args.no_hri, args.quiet, args.verbose
+    # Auto-adjust chunk size for large datasets (only if user hasn't manually set chunk_size)
+    if args.chunk_size is None:  # If they did not explicitly provide --chunk_size
+        num_blocks = len(blockstart)
+        # Set default chunk size
+        chunk_size = 20000
+        original_chunk_size = chunk_size
+        if num_blocks > 250000:
+            chunk_size = 1000  # Use 1kb chunks for extremely massive datasets
+            if not quiet:
+                print(f"Extremely massive dataset detected ({num_blocks} blocks). Auto-adjusting chunk size from {original_chunk_size} to {chunk_size} bp for memory efficiency. Use --chunk_size to override.")
+        elif num_blocks > 125000:
+            chunk_size = 2000  # Use 2kb chunks for massive datasets
+            if not quiet:
+                print(f"Massive dataset detected ({num_blocks} blocks). Auto-adjusting chunk size from {original_chunk_size} to {chunk_size} bp for memory efficiency. Use --chunk_size to override.")
+        elif num_blocks > 50000:
+            chunk_size = 5000  # Use 5kb chunks for very large datasets
+            if not quiet:
+                print(f"Very large dataset detected ({num_blocks} blocks). Auto-adjusting chunk size from {original_chunk_size} to {chunk_size} bp for memory efficiency. Use --chunk_size to override.")
+        elif num_blocks > 25000:
+            chunk_size = 10000  # Use 10kb chunks for large datasets
+            if not quiet:
+                print(f"Large dataset detected ({num_blocks} blocks). Auto-adjusting chunk size from {original_chunk_size} to {chunk_size} bp for memory efficiency. Use --chunk_size to override.")
+    elif not quiet and num_blocks > 25000:
+        print(f"Large dataset detected ({num_blocks} blocks) but using user-specified chunk size of {chunk_size} bp.")
+    #Arguments specific to regionBcalc
+    if caller == "regionBcalc":
+        calc_start, calc_end = calc_start, calc_end
+        if calc_end > blockend[-1]:
+            chr_size = calc_end
+        else:
+            chr_size = None
+    if not args.quiet:
+        print(f"====== P A R A M E T E R S =========================")
+        print(f"BED/GFF file for regions under selection: {file_path}")
+        if chr_size is not None: print(f"Last position in chromosome {chromosome}: {calc_end}")
+        print(f"Size of chunks to calculate B in per iteration: {chunk_size}bp")
+        print(f"Number of adjacent chunks to calculate B precisely for: {precise_chunks}")
+    if chr_size is not None and chr_size < blockend[-1]:
+        raise ValueError(f"chr_size provided is less than gene position for chromosome {chromosome}")
+    if chr_size is None: # Default chr_size to last value in blockend if not given
+        if len(blockend) == 0 and caller != "regionBcalc":
+            raise ValueError("chr_size was not provided for chromosome: {chromosome} and gene position ends not computed. Check BED/GFF input, and specify chr_size if needed")
+        chr_size = blockend[-1]
+        if calc_end is None and not args.quiet:
+            print(f"No --chr_size provided for chromosome: {chromosome}. Using last position in BED/GFF: {chr_size}")
+    if not quiet: print(f"====== S T A R T I N G ===== C A L C ===============")
+    if calc_start is None and calc_end is None:
+        if not quiet: print(f"Calculating B for entire chromosome, to only calculate for a subregion, use --calc_start and --calc_end")
+    if calc_start is None:
+        calc_start = 1
+    if calc_end is None:
+        calc_end = chr_size
+    chr_start = 1 # Currently hardcoded, can change if needed
+    num_chunks = (chr_size - chr_start + chunk_size - 1) // chunk_size
+    calc_chunk_start = (calc_start - chr_start) // chunk_size
+    calc_chunk_end = (calc_end - chr_start) // chunk_size
+    calc_chunks = np.arange(calc_chunk_start,calc_chunk_end + 1) # Relevant chunks to calculate B for based on calc_start and calc_end
+    b_values = np.ones(chr_size + 2 - chr_start, dtype=np.float64) # Initialize array of B values
+    if prior_pos is not None and prior_b is not None: # If we have prior B map, overwrite those positions' B values
+        idx = np.asarray(prior_pos, dtype=int)
+        calc_mask = (idx >= calc_start) & (idx <= calc_end)
+        idx = idx[calc_mask] # filter to only those within [calc_start, calc_end]
+        bprior = np.asarray(prior_b, dtype=b_values.dtype)[calc_mask]
+        b_values[idx] = bprior
+    lperchunk = calculate_L_per_chunk(chunk_size, blockstart, blockend, chr_start, chr_size) # Cumulative conserved length in each chunk
+    if args.rec_map: # Process recombination map if provided
+        if not quiet: print(f"Using recombination (crossover) map from {args.rec_map}")
+        rec_rate_per_chunk = load_rec_map(args.rec_map, chr_start, chr_size, chunk_size, chromosome)
+    else:
+        rec_rate_per_chunk = None
+    if args.gc_map:
+        if not quiet: print(f"Using gene conversion map from {args.gc_map}")
+        gc_rate_per_chunk = load_rec_map(args.gc_map, chr_start, chr_size, chunk_size, chromosome)
+    else:
+        gc_rate_per_chunk = None
+    if verbose: print(f"====== R E S U L T S == P E R == C H U N K =========")
+    elif not quiet: print(f"To print per-chunk summaries, add --verbose.")
+    import gc
+    BATCH_SIZE = args.chunk_batch_size
+    total_chunks = len(calc_chunks)
+    completed = 0
+    for batch_start in range(0, total_chunks, BATCH_SIZE):
+        batch = calc_chunks[batch_start : batch_start + BATCH_SIZE]
+        with ThreadPoolExecutor() as executor:
+            futures = {
+                executor.submit(process_single_chunk, chunk_idx,
+                                chunk_size, blockstart, blockend, chr_start, chr_size, calc_start,
+                                calc_end, num_chunks, precise_chunks, lperchunk, b_values,
+                                rec_rate_per_chunk, gc_rate_per_chunk, no_hri, quiet, verbose, unlinked_B): chunk_idx
+                for chunk_idx in batch
+            }
+            if not quiet and not verbose:
+                for future in as_completed(futures):
+                    completed += 1
+                    progress = int((completed / total_chunks) * 100)
+                    sys.stdout.write(f"\rProgress ({chromosome}): {progress}% ({completed}/{total_chunks} chunks [{chunk_size}])")
+                    sys.stdout.flush()
+                # After batch is done, cleanup
+                print()  # Move to the next line after progress printing
+                del futures
+                gc.collect()
+    b_values = b_values[calc_start:(calc_end+1)] # Trim b_values array to only calculated region
+    b_values = b_values * unlinked_B
+    # print('Hriii', np.shape(b_values))
+    if not no_hri and rec_rate_per_chunk is not None: # If --no_hri is not active
+        from Bvalcalc.core.helpers.extend_hri_regions_correction import extend_hri_regions_correction
+        hri_extended_starts, hri_extended_ends = extend_hri_regions_correction(b_values, rec_rate_per_chunk, chunk_size, chr_start, calc_start, calc_end, hri_r_threshold = 0.1) # Extend HRI regions until B > B' to avoid sharp decrease in B at the border between normal and HRI regions. See manuscript.
+    else:
+        hri_extended_starts, hri_extended_ends = np.array([], dtype=int), np.array([], dtype=int)
+    if not quiet:
+        print(f"====== F I N I S H E D ===== C A L C ===============")
+        print(f"====== R E S U L T S ====== S U M M A R Y ==========")
+                # Total genic bases within calc_start to calc_end
+        calc_selected_length = 0
+        for start, end in zip(blockstart, blockend):
+            # Find overlap between this block and the calculated region
+            overlap_start = max(start, calc_start)
+            overlap_end = min(end, calc_end)
+            if overlap_start <= overlap_end:
+                calc_selected_length += (overlap_end - overlap_start + 1)
+        print(f"Cumulative length of calculated region under selection: {calc_selected_length}bp "f"({round((calc_selected_length / (calc_end - calc_start + 1)) * 100, 2)}%)")
+        print(f"Cumulative length of chromosome under selection: {int(sum(lperchunk))}bp ({round((sum(lperchunk)/(chr_size - chr_start + 1))*100,2)}%)")
+        print(f"B from unlinked sites for chromosome {chromosome}: {unlinked_B}")
+        if caller == "genomeBcalc": print(f"Mean B of neutral sites across chromosome {chromosome}: {b_values[~np.isnan(b_values)].mean()}")
+        elif caller == "regionBcalc": print(f"Mean B of neutral sites across specified region: {b_values[~np.isnan(b_values)].mean()}")
+        if args.rec_map: # Process recombination map if provided
+            print(f"Calculated using recombination (crossover) map, with rates averaged within {chunk_size}bp chunks")
+        if args.gc_map: # Process gene conversion map if provided
+            print(f"Calculated using gene conversion map, with rates averaged within {chunk_size}bp chunks")
+    block_ranges = np.column_stack((np.repeat(chromosome, blockstart.shape[0]), blockstart, blockend))
+    positions = np.arange(calc_start, calc_end + 1)
+    conserved = np.full_like(positions, "N", dtype="<U1")
+    for start, end in zip(blockstart, blockend): # Mark conserved regions
+        conserved[max(start, calc_start) - calc_start : min(end, calc_end) - calc_start + 1] = "C"
+    if args.pop_change:
+        b_values = get_Bcur(b_values)
+        if not quiet: print("Demographic change applied to B-calculation")
+    binned_b_values, binned_positions = bin_outputs(b_values, positions, args.out_binsize)
+    chrom_col = np.full(binned_positions.shape, chromosome, dtype="<U20")
+    output_data = np.core.records.fromarrays(
+        [chrom_col,binned_positions.astype(int),binned_b_values.astype(float)],
+        names='Chromosome,Start,B',formats='U20,i8,f8')
+    if args.out is not None: # Write to CSVs
+        print(f"Writing B output to file...")
+        from Bvalcalc.utils.write_chrom_B_to_file import write_chrom_B_to_file
+        write_chrom_B_to_file(args.out, output_data, quiet, hri_extended_starts, hri_extended_ends, args.out_binsize, calc_end)
+        print(f"Appended B values to: {os.path.abspath(args.out)}")
+    else:
+        if not args.quiet:
+            print("No output CSV requested; skipping save.")
+    if caller == "regionBcalc":
+        if rec_rate_per_chunk is not None:
+            rec_rate_per_chunk_in_region = rec_rate_per_chunk[calc_start // chunk_size:] # Slice rec_rate_per_chunk from region start onward
+        else: rec_rate_per_chunk_in_region = None
+        return output_data, block_ranges, rec_rate_per_chunk_in_region, chunk_size
+    else: #caller is genomeBcalc
+        return