bvalcalc 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. bvalcalc-0.6.2/Bvalcalc/__init__.py +16 -0
  2. bvalcalc-0.6.2/Bvalcalc/__main__.py +4 -0
  3. bvalcalc-0.6.2/Bvalcalc/cli.py +68 -0
  4. bvalcalc-0.6.2/Bvalcalc/core/__init__.py +20 -0
  5. bvalcalc-0.6.2/Bvalcalc/core/calculateB.py +351 -0
  6. bvalcalc-0.6.2/Bvalcalc/core/chromBcalc.py +194 -0
  7. bvalcalc-0.6.2/Bvalcalc/core/deprecated/Bcalc_stdOut.py +118 -0
  8. bvalcalc-0.6.2/Bvalcalc/core/deprecated/calculate_B_analytically_Eq3_mine_demography.py +80 -0
  9. bvalcalc-0.6.2/Bvalcalc/core/deprecated/findFlankLen.py +23 -0
  10. bvalcalc-0.6.2/Bvalcalc/core/deprecated/old_calculateB.py +35 -0
  11. bvalcalc-0.6.2/Bvalcalc/core/deprecated/plotB_figures.py +273 -0
  12. bvalcalc-0.6.2/Bvalcalc/core/deprecated/plotB_figures_200kb.py +208 -0
  13. bvalcalc-0.6.2/Bvalcalc/core/geneBcalc.py +73 -0
  14. bvalcalc-0.6.2/Bvalcalc/core/genomeBcalc.py +50 -0
  15. bvalcalc-0.6.2/Bvalcalc/core/helpers/__init__.py +19 -0
  16. bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_B_from_chunks.py +89 -0
  17. bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_B_in_genes.py +104 -0
  18. bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_B_in_hri_region.py +39 -0
  19. bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_B_precise_noninterfering.py +170 -0
  20. bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_L_per_chunk.py +65 -0
  21. bvalcalc-0.6.2/Bvalcalc/core/helpers/calc_R_len_dist.py +122 -0
  22. bvalcalc-0.6.2/Bvalcalc/core/helpers/demography_helpers.py +45 -0
  23. bvalcalc-0.6.2/Bvalcalc/core/helpers/extend_hri_regions_correction.py +113 -0
  24. bvalcalc-0.6.2/Bvalcalc/core/helpers/process_single_chunk.py +131 -0
  25. bvalcalc-0.6.2/Bvalcalc/core/plotB.py +174 -0
  26. bvalcalc-0.6.2/Bvalcalc/core/plotChromB.py +47 -0
  27. bvalcalc-0.6.2/Bvalcalc/core/positionsBstats.py +131 -0
  28. bvalcalc-0.6.2/Bvalcalc/core/regionBcalc.py +49 -0
  29. bvalcalc-0.6.2/Bvalcalc/core/siteBcalc.py +19 -0
  30. bvalcalc-0.6.2/Bvalcalc/templates/ArabidopsisParams.py +0 -0
  31. bvalcalc-0.6.2/Bvalcalc/templates/CelegansParams.py +1 -0
  32. bvalcalc-0.6.2/Bvalcalc/templates/DrosophilaParams.py +29 -0
  33. bvalcalc-0.6.2/Bvalcalc/templates/HumanParams.py +28 -0
  34. bvalcalc-0.6.2/Bvalcalc/templates/MouseParams.py +0 -0
  35. bvalcalc-0.6.2/Bvalcalc/templates/PfalciparumParams.py +29 -0
  36. bvalcalc-0.6.2/Bvalcalc/templates/SelfingParams.py +29 -0
  37. bvalcalc-0.6.2/Bvalcalc/utils/__init__.py +26 -0
  38. bvalcalc-0.6.2/Bvalcalc/utils/bin_outputs.py +37 -0
  39. bvalcalc-0.6.2/Bvalcalc/utils/dfe_helper.py +117 -0
  40. bvalcalc-0.6.2/Bvalcalc/utils/generateParams.py +46 -0
  41. bvalcalc-0.6.2/Bvalcalc/utils/load_Bmap.py +39 -0
  42. bvalcalc-0.6.2/Bvalcalc/utils/load_bed_gff.py +94 -0
  43. bvalcalc-0.6.2/Bvalcalc/utils/load_chr_sizes.py +28 -0
  44. bvalcalc-0.6.2/Bvalcalc/utils/load_rec_map.py +81 -0
  45. bvalcalc-0.6.2/Bvalcalc/utils/load_vcf.py +38 -0
  46. bvalcalc-0.6.2/Bvalcalc/utils/parseArgs.py +161 -0
  47. bvalcalc-0.6.2/Bvalcalc/utils/write_chrom_B_to_file.py +67 -0
  48. bvalcalc-0.6.2/Bvalcalc.egg-info/PKG-INFO +714 -0
  49. bvalcalc-0.6.2/Bvalcalc.egg-info/SOURCES.txt +64 -0
  50. bvalcalc-0.6.2/Bvalcalc.egg-info/dependency_links.txt +1 -0
  51. bvalcalc-0.6.2/Bvalcalc.egg-info/entry_points.txt +3 -0
  52. bvalcalc-0.6.2/Bvalcalc.egg-info/requires.txt +11 -0
  53. bvalcalc-0.6.2/Bvalcalc.egg-info/top_level.txt +1 -0
  54. bvalcalc-0.6.2/LICENSE +675 -0
  55. bvalcalc-0.6.2/PKG-INFO +714 -0
  56. bvalcalc-0.6.2/README.md +6 -0
  57. bvalcalc-0.6.2/pyproject.toml +63 -0
  58. bvalcalc-0.6.2/setup.cfg +4 -0
  59. bvalcalc-0.6.2/tests/test_calculateB.py +6 -0
  60. bvalcalc-0.6.2/tests/test_cli.py +271 -0
@@ -0,0 +1,16 @@
1
+ """
2
+ bvalcalc: calculate relative diversity (B) under background selection.
3
+ """
4
+
5
+ __version__ = "0.6.2"
6
+
7
+ # Expose main entry point
8
+ from .cli import main
9
+ from .core.calculateB import calculateB_linear, calculateB_recmap, calculateB_unlinked, get_params
10
+
11
+
12
# Public API of the package. Every calculateB_* function imported above is
# exported, including calculateB_recmap, which was imported but not listed.
__all__ = [
    "get_params",
    "calculateB_linear",
    "calculateB_recmap",
    "calculateB_unlinked",
    "main",
    "__version__",
]
@@ -0,0 +1,4 @@
1
+ from .cli import main
2
+
3
+ if __name__ == "__main__":
4
+ main()
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import sys
4
+ import time
5
+ import argparse
6
+ from Bvalcalc.utils.parseArgs import parse_args, parseGenomeArgs, parseRegionArgs, parseGeneArgs, parseSiteArgs, parseBmapArgs
7
+ from Bvalcalc.core.plotB import plotB
8
+ from Bvalcalc.core.deprecated.plotB_figures import plotB_figures
9
+ from Bvalcalc.core.deprecated.plotB_figures_200kb import plotB_figures_200kb
10
+ from Bvalcalc.utils.generateParams import SPECIES, generateParams, check_generate_params_args
11
+ from Bvalcalc.core.positionsBstats import positionsBstats
12
+ from Bvalcalc.core.plotChromB import plotChromB
13
+
14
+ __version__ = "0.6.2"
15
+
16
def main():
    """
    Command-line entry point.

    Parses the top-level mode flags, then dispatches:

    * ``--generate_params``: copy a parameter template and return.
    * ``--Bmap``: compute statistics from an existing B-map (optionally plotting) and return.
    * ``--genome`` / ``--region`` / ``--gene`` / ``--site``: run the matching calculation.

    For the four calculation modes the population-parameter path is exported through
    the ``BCALC_POP_PARAMS`` environment variable *before* the calculation module is
    imported, so the order "set env var, then import" is kept in every branch.
    """
    start_time = time.time()

    check_generate_params_args()  # Unique error message for --generate_params to print species names
    parser = parse_args(__version__)
    known_args, remaining_args = parser.parse_known_args()

    if known_args.generate_params is not None:  # if --generate_params
        print("Retrieving params from template...")
        generateParams(known_args.generate_params, known_args.dir)
        return

    if known_args.Bmap is not None:  # if --Bmap
        args = parseBmapArgs(remaining_args)
        flat_b, flat_chrom = positionsBstats(args, known_args.Bmap)
        if args.plot_distribution:
            plotChromB(flat_b, flat_chrom, args.plot_distribution, args.quiet)
        return

    print("= Calculating relative diversity (B) for all neutral sites across the genome. = = =")

    if known_args.genome:  # Run genome Bcalc
        args = parseGenomeArgs(remaining_args)
        os.environ["BCALC_POP_PARAMS"] = args.pop_params  # Save params to global
        from Bvalcalc.core.genomeBcalc import genomeBcalc
        genomeBcalc(args)

    elif known_args.region:  # Run region Bcalc
        args = parseRegionArgs(remaining_args)
        os.environ["BCALC_POP_PARAMS"] = args.pop_params  # Save params to global
        from Bvalcalc.core.regionBcalc import regionBcalc
        output_data, block_ranges, rec_rate_per_chunk_in_region, chunk_size = regionBcalc(args, known_args.region)
        # Default is False (as in the gene branch): a missing attribute means
        # "no plot requested", rather than falling through to args.plot_output
        # and raising AttributeError.
        if getattr(args, 'plot_output', False):
            plotB(b_values_input=output_data, caller="chromosome", output_path=args.plot_output, quiet=args.quiet, gene_ranges=block_ranges, neutral_only=args.neutral_only, rec_rates=rec_rate_per_chunk_in_region, chunk_size=chunk_size)

    elif known_args.gene:  # Run gene Bcalc
        args = parseGeneArgs(remaining_args)
        os.environ["BCALC_POP_PARAMS"] = args.pop_params  # Save params to global
        from Bvalcalc.core.geneBcalc import geneBcalc
        output_data = geneBcalc(args)  # Capture the output from geneBcalc
        if getattr(args, 'plot_output', False):  # If the --plot_output flag was provided, call plotB with geneBcalc's output.
            plotB(b_values_input=output_data, caller="gene", output_path=args.plot_output, quiet=args.quiet)

    elif known_args.site:  # Run single site Bcalc
        args = parseSiteArgs(remaining_args)
        os.environ["BCALC_POP_PARAMS"] = args.pop_params  # Save params to global
        from Bvalcalc.core.siteBcalc import siteBcalc
        siteBcalc(args)

    print(f"= B value calculated in {time.time() - start_time:.2f} seconds. = = =")
66
+
67
+ if __name__ == "__main__":
68
+ main()
@@ -0,0 +1,20 @@
1
+ """
2
+ Core calculation modules for bvalcalc.
3
+ """
4
+ from .genomeBcalc import genomeBcalc
5
+ from .regionBcalc import regionBcalc
6
+ from .geneBcalc import geneBcalc
7
+ from .siteBcalc import siteBcalc
8
+ from .plotB import plotB
9
+ from .calculateB import calculateB_linear, calculateB_recmap, calculateB_unlinked
10
+
11
+ __all__ = [
12
+ "genomeBcalc",
13
+ "regionBcalc",
14
+ "geneBcalc",
15
+ "siteBcalc",
16
+ "plotB",
17
+ "calculateB_linear",
18
+ "calculateB_recmap",
19
+ "calculateB_unlinked"
20
+ ]
@@ -0,0 +1,351 @@
1
+ import numpy as np
2
+ from Bvalcalc.utils.dfe_helper import get_DFE_params
3
+ from scipy.optimize import root_scalar
4
+ from scipy.integrate import trapezoid
5
+
6
+ _params_cache: dict | None = None
7
+ _cache_args: tuple[str | None, bool, bool] | None = None
8
+
9
def get_params(
    params_path: str | None = None,
    gamma_dfe: bool = False,
    constant_dfe: bool = False,
):
    """
    Loads DFE parameters from the provided population genetic parameters file.

    Every call rebuilds the parameters with ``get_DFE_params(params_path, gamma_dfe,
    constant_dfe)`` and stores the result in the module-level ``_params_cache`` before
    returning it. The key-based cache check on (params_path, gamma_dfe, constant_dfe)
    is currently commented out, so nothing is reused between calls.
    """
    global _params_cache#, _cache_args # COMMENTED OUT CACHING FOR API USAGE, CAN RE-IMPLEMENT FOR CLI IF IT SLOWS IT DOWN
    # key = (params_path, gamma_dfe, constant_dfe)
    # if _cache_args != key:
    _params_cache = get_DFE_params(params_path, gamma_dfe, constant_dfe)
    # _cache_args = key
    return _params_cache
25
+
26
def calculateB_linear(distance_to_element: int, length_of_element: int, params: dict | None = None):
    """
    Calculate B due to purifying selection acting on a linked selected element of arbitrary
    length, assuming a constant crossover and gene conversion rate (analytical solution).

    Parameters
    ----------
    distance_to_element: int
        Distance (bp) from the neutral site to the nearest edge of the selected element.
    length_of_element: int
        Length (bp) of the selected element.
    params : dict
        Required parameters from ``get_params()``, only kept as default (None) when being
        called by CLI, in which case parameters are sourced from the params file directly.

    Returns
    -------
    ``np.where(length_of_element == 0, 1.0, B)``: B is forced to 1.0 wherever the
    element has zero length.
    """
    param_names = ("r", "u", "g", "k", "t1", "t1half", "t2", "t3", "t4",
                   "f1", "f2", "f3", "f0", "t_constant")

    with np.errstate(divide='ignore', invalid='ignore'):
        if params is None:
            params = get_params()
        (r, u, g, k, t1, t1half, t2, t3, t4,
         f1, f2, f3, f0, t_constant) = [params[name] for name in param_names]

        C = (1.0 - np.exp(-2.0 * r * distance_to_element)) / 2.0  # cM
        U = length_of_element * u

        if g == 0:
            # Crossover only: "a" is recombination to the near edge, "b" to the far edge
            a = C
            b = C + (r * length_of_element)
        elif g > 0:
            # NOTE(review): get_a_b_with_GC reads r/g/k through get_params() instead of
            # the `params` given here - confirm that is intended for API callers.
            a, b = get_a_b_with_GC(C, distance_to_element, length_of_element)

        if t_constant:  # --constant_dfe: a single selection coefficient
            exponent = calculate_exponent(t_constant, t_constant, U, a, b)
        else:
            E_f1 = calculate_exponent(t1half, t2, U, a, b)
            E_f2 = calculate_exponent(t2, t3, U, a, b)
            E_f3 = calculate_exponent(t3, t4, U, a, b)

            f1_width = t2 - t1
            share_below_cutoff = (t1half - t1) / f1_width
            share_above_cutoff = (t2 - t1half) / f1_width

            # Sum over the DFE; f0 and the part of f1 below t1half contribute nothing
            exponent = (
                f0 * 0.0
                + f1 * share_below_cutoff * 0.0
                + f1 * share_above_cutoff * E_f1
                + f2 * E_f2
                + f3 * E_f3)

        B = np.exp(-1.0 * exponent)
        return np.where(length_of_element == 0, 1.0, B)
72
+
73
def calculateB_recmap(distance_to_element, length_of_element,
                      rec_distances = None, rec_lengths = None,
                      gc_distances = None, gc_lengths = None, params = None):
    """
    Calculate the B value WITH REC MAP for a single functional element at the focal site,
    summing over the DFE while consolidating the intermediate calculations.

    When ``rec_distances`` is given, ``rec_distances``/``rec_lengths`` replace the physical
    distance/length in the crossover terms. When ``gc_distances`` is given, the gene
    conversion rate ``g`` is rescaled by
    ``(gc_lengths + gc_distances) / (length_of_element + distance_to_element)``.
    B is forced to 1.0 wherever ``length_of_element`` is 0.
    """
    param_names = ("r", "u", "g", "k", "t1", "t1half", "t2", "t3", "t4",
                   "f1", "f2", "f3", "f0", "t_constant")

    with np.errstate(divide='ignore', invalid='ignore'):
        if params is None:
            params = get_params()
        (r, u, g, k, t1, t1half, t2, t3, t4,
         f1, f2, f3, f0, t_constant) = [params[name] for name in param_names]

        # rec_distances is the length of the element * rec rate in each spanned region.
        use_rec_map = rec_distances is not None
        rec_adjusted_length_of_element = rec_lengths if use_rec_map else length_of_element
        rec_adjusted_distance_to_element = rec_distances if use_rec_map else distance_to_element

        if gc_distances is None:
            local_g = g
        else:
            local_g = (gc_lengths + gc_distances)/(length_of_element + distance_to_element) * g

        C = (1.0 - np.exp(-2.0 * r * rec_adjusted_distance_to_element)) / 2.0  # cM
        U = length_of_element * u

        if g == 0:
            a = C
            b = C + r * rec_adjusted_length_of_element  # cM
        elif g > 0:
            # NOTE(review): get_a_b_with_GC_andMaps reads r/k through get_params() instead
            # of the `params` given here - confirm that is intended for API callers.
            a, b = get_a_b_with_GC_andMaps(C, y=distance_to_element, l=length_of_element,
                                           rec_l=rec_adjusted_length_of_element, local_g = local_g)

        if t_constant:  # --constant_dfe: a single selection coefficient
            exponent = calculate_exponent(t_constant, t_constant, U, a, b)
        else:
            E_f1 = calculate_exponent(t1half, t2, U, a, b)
            E_f2 = calculate_exponent(t2, t3, U, a, b)
            E_f3 = calculate_exponent(t3, t4, U, a, b)

            f1_width = t2 - t1
            share_below_cutoff = (t1half - t1) / f1_width
            share_above_cutoff = (t2 - t1half) / f1_width

            # Sum over the DFE; f0 and the part of f1 below t1half contribute nothing
            exponent = (
                f0 * 0.0
                + f1 * share_below_cutoff * 0.0
                + f1 * share_above_cutoff * E_f1
                + f2 * E_f2
                + f3 * E_f3)

        B = np.exp(-1.0 * exponent)
        return np.where(length_of_element == 0, 1.0, B)
126
+
127
+ def calculateB_unlinked(unlinked_L: int, params: dict | None = None):
128
+ """
129
+ Calculate B due to purifying selection at unlinked sites (numerical integration over DFE).
130
+
131
+ Parameters
132
+ ----------
133
+ unlinked_L : float
134
+ Cumulative count of selected sites in unlinked regions.
135
+ params : dict
136
+ Required parameters from ``get_params()``, only kept as default (None) when being called by CLI,
137
+ in which case parameters are sourced from the params file directly.
138
+ """
139
+ if params is None:
140
+ params = get_params()
141
+
142
+ u, t1, t1half, t2, t3, t4, f0, f1, f2, f3, t_constant = params["u"], params["t1"], params["t1half"], params["t2"], params["t3"], params["t4"], params["f0"], params["f1"], params["f2"], params["f3"], params["t_constant"]
143
+
144
+ if t_constant: #If --constant_dfe is active
145
+
146
+ unlinked_B = np.exp(-8 * u * 1.0 * unlinked_L * (t_constant/(1 + t_constant)**2))
147
+ return unlinked_B
148
+
149
+ f1_above_cutoff = f1 * ((t1half - t1) / (t2 - t1))
150
+
151
+ sum_f1 = (f1_above_cutoff / (t2 - t1half)) * (np.log((1 + t2) /(1 + t1half)) + (1 / (1 + t2)) - (1 / (1 + t1half)))
152
+ sum_f2 = (f2 / (t3 - t2)) * (np.log((1 + t3) /(1 + t2)) + (1 / (1 + t3)) - (1 / (1 + t2)))
153
+ sum_f3 = (f3 / (t4 - t3)) * (np.log((1 + t4) /(1 + t3)) + (1 / (1 + t4)) - (1 / (1 + t3)))
154
+
155
+ unlinked_B = np.exp(-8 * u * 1.0 * unlinked_L * (sum_f1 + sum_f2 + sum_f3))
156
+
157
+ return unlinked_B
158
+
159
+
160
+ ##
161
+
162
+
163
+
164
+ ## Helper functions
165
+
166
def calculate_exponent(t_start, t_end, U, a, b):
    """
    Helper to calculate the exponent using "a" and "b".

    Parameters
    ----------
    t_start, t_end : float
        Bounds of the selection-coefficient class. ``t_start == t_end`` selects the
        constant-DFE formula.
    U : array_like
        Deleterious mutation rate of the element(s).
    a, b : array_like
        Recombination terms for the near and far edge of the element(s).

    Returns
    -------
    ``0`` when ``U`` is empty; otherwise an ndarray with the broadcast shape of
    ``U``, ``a`` and ``b``. Where ``a`` is (numerically) equal to ``b`` the general
    formula is 0/0, so those entries are replaced by the analytical a -> b limit.
    """
    a, b, U = np.asarray(a), np.asarray(b), np.asarray(U)

    if U.size == 0:
        return 0  # If e.g. f1 proportion is 0, no need to calculate exponent

    constant_dfe = t_end == t_start

    # Divisions by (a - b) == 0 are expected and repaired below, so silence them here
    # instead of relying on every caller to have set np.errstate.
    with np.errstate(divide='ignore', invalid='ignore'):
        if constant_dfe:  # If --constant_dfe
            E = (U / (a - b)) * (
                a / (a + (1 - a) * t_start) -
                b / (b + (1 - b) * t_start)
            )
        else:  # Using discretized DFE (f0,f1,f2,f3 or --gamma_dfe)
            E1 = ((U * a)
                  / ((1 - a) * (a - b) * (t_end - t_start))) * np.log((a + (t_end * (1 - a)))
                                                                       / (a + (t_start * (1 - a))))
            E2 = -1.0 * ((U * b)
                         / ((1 - b) * (a - b) * (t_end - t_start))) * np.log((b + ((1 - b) * t_end))
                                                                              / (b + ((1 - b) * t_start)))
            E = E1 + E2

        # Always an ndarray: with all-scalar input the arithmetic above yields a NumPy
        # scalar, which does not support the item assignment used below.
        E = np.asarray(E)

        rec_0_mask = np.isclose(a, b)  # Where recombination rate = 0 within the gene
        if rec_0_mask.any():
            # Broadcast to E's shape so the masked indexing works for any mix of
            # scalar and array U / a / b.
            mask = np.broadcast_to(rec_0_mask, E.shape)
            ae = np.broadcast_to(a, E.shape)[mask]  # a_i where a_i ~= b_i
            Ue = np.broadcast_to(U, E.shape)[mask]

            if constant_dfe:
                limit_factor = t_start / (ae + (1 - ae) * t_start)**2
            else:
                limit_factor = (1 / ((t_end - t_start) * (1 - ae)**2)) * (
                    np.log((ae + (1 - ae) * t_end)
                           / (ae + (1 - ae) * t_start))
                    + ae / (ae + (1 - ae) * t_end)
                    - ae / (ae + (1 - ae) * t_start))

            E[mask] = Ue * limit_factor  # Replace the nan's left by the 0/0 above

    return E
219
+
220
+ def get_a_b_with_GC(C, y, l):
221
+ with np.errstate(divide='ignore', invalid='ignore'):
222
+ params = get_params()
223
+ r, u, g, k, t1, t1half, t2, t3, t4, f1, f2, f3, f0 = params["r"], params["u"], params["g"], params["k"], params["t1"], params["t1half"], params["t2"], params["t3"], params["t4"], params["f1"], params["f2"], params["f3"], params["f0"]
224
+ proportion_nogc_a = np.where(k < y + l, # When GC includes neutral site, this is proportion of the gene it includes
225
+ np.maximum((0.5*(k-y)/l), 0),
226
+ 1-(y + l)/(2 * k)
227
+ )
228
+
229
+ proportion_nogc_b = np.where(k < y + l, # When GC includes gene site, this is probability the tract includes neutral site of interest
230
+ 1/(2*k) * np.maximum(k-y+1,0) * np.maximum(k - y, 0) / l, # When overshooting not possible
231
+ (k - y - 0.5 * l) / k) # When overshooting possible
232
+
233
+
234
+ a = np.where(k < y,
235
+ C + (2 * g * k), # Probability of GC on neutral site, where overlap with element not possible
236
+ C + (2 * g * (y) + # When overlap possible this is probability gc is in neutral but doesn't include any of element
237
+ g * (k - y) * # Probability gc is in neutral and includes some element (remaining probability from above)
238
+ (1 - proportion_nogc_a) # Proportion of gene that gc breaks linkage with when it includes some element
239
+ ))
240
+ b = C + (r * l) + (2 * g * k) * (1 - (1-proportion_nogc_a)*proportion_nogc_b) #* prop k out
241
+
242
+ return a, b
243
+
244
+ def get_a_b_with_GC_andMaps(C, y, l, rec_l, local_g):
245
+ params = get_params()
246
+ r, u, g, k, t1, t1half, t2, t3, t4, f1, f2, f3, f0 = params["r"], params["u"], params["g"], params["k"], params["t1"], params["t1half"], params["t2"], params["t3"], params["t4"], params["f1"], params["f2"], params["f3"], params["f0"]
247
+ with np.errstate(divide='ignore', invalid='ignore'):
248
+ proportion_nogc_a = np.where(k < y + l, # When GC includes neutral site, this is proportion of the gene it includes
249
+ np.maximum((0.5*(k-y)/l), 0),
250
+ ((y) * (2 * k - (y + l)))/(2 * k * y)
251
+ )
252
+
253
+ proportion_nogc_b = np.where(k < y + l, # When GC includes gene site, this is probability the tract includes neutral site of interest
254
+ 1/(2*k) * np.maximum(k-y+1,0) * np.maximum(k - y, 0) / l,
255
+ (k - y - 0.5 * l) / k)
256
+
257
+ a = np.where(k < y,
258
+ C + (2 * local_g * k), # Probability of GC on neutral site, where overlap with element not possible
259
+ C + (2 * local_g * (y) + # When overlap possible this is probability gc is in neutral but doesn't include any of element
260
+ local_g * (k - y) * # Probability gc is in neutral and includes some element (remaining probability from above)
261
+ (1 - proportion_nogc_a) # Proportion of gene that gc breaks linkage with when it includes some element
262
+ ))
263
+ b = C + (r * rec_l) + (2 * local_g * k) * (1 - (1-proportion_nogc_a)*proportion_nogc_b) #* prop k out
264
+
265
+ return a, b
266
+
267
+ def calculateB_hri(distant_B, interfering_L, params: dict | None = None):
268
+ """
269
+ Fully vectorized calculation of B' under Hill-Robertson interference.
270
+ """
271
+ if params is None:
272
+ params = get_DFE_params()
273
+
274
+ Nanc, u, f1, f2 = params["Nanc"], params["u"], params["f1"], params["f2"]
275
+
276
+ distant_B = np.atleast_1d(distant_B).astype(float)
277
+ interfering_L = np.atleast_1d(interfering_L).astype(float)
278
+
279
+ scalar_input = distant_B.shape == () or distant_B.shape == (1,)
280
+
281
+ N0 = distant_B * Nanc
282
+ h = 0.5
283
+ u = 2 * u
284
+ u1 = f1 * u
285
+ u2 = f2 * u
286
+ u_total = u1 + u2
287
+
288
+ E_X2_f1 = (1**2 + 1*10 + 10**2) / 3
289
+ E_X2_f2 = (10**2 + 10*100 + 100**2) / 3
290
+
291
+ t_sq1 = (h**2 * E_X2_f1) / (4 * N0**2)
292
+ t_sq2 = (h**2 * E_X2_f2) / (4 * N0**2)
293
+ t = np.sqrt((u1 * t_sq1 + u2 * t_sq2) / u_total)
294
+
295
+ gamma = 2 * N0 * t
296
+ U = u_total * interfering_L
297
+ alpha2 = 2 * N0 * U
298
+ kappa = 1.0
299
+
300
+ def eq4(B, U, gamma, t):
301
+ exp_term = np.exp(-gamma * B)
302
+ num = 0.5 * U * (1 - exp_term)**3
303
+ denom = t * (1 + kappa * exp_term)**3
304
+ return -np.log(B) - num / denom
305
+
306
+ def solve_eq4_batched(U, gamma, t, n=500):
307
+ Bgrid = np.linspace(1e-10, 1.0, n)[None, :]
308
+ U = np.asarray(U).reshape(-1, 1)
309
+ gamma = np.asarray(gamma).reshape(-1, 1)
310
+ t = np.asarray(t).reshape(-1, 1)
311
+
312
+ fvals = eq4(Bgrid, U, gamma, t)
313
+ signs = np.sign(fvals)
314
+ crossing = np.diff(signs, axis=1) < 0
315
+ idx = np.argmax(crossing, axis=1)
316
+
317
+ B_left = Bgrid[0, idx]
318
+ B_right = Bgrid[0, idx + 1]
319
+ f_left = fvals[np.arange(len(U)), idx]
320
+ f_right = fvals[np.arange(len(U)), idx + 1]
321
+
322
+ B_root = B_left - f_left * (B_right - B_left) / (f_right - f_left)
323
+ return B_root
324
+
325
+ Bval = solve_eq4_batched(U, gamma, t)
326
+
327
+ def eq5_vectorized(B, alpha2, gamma, Tmax=100.0, n_steps=2000):
328
+ x = np.linspace(0, Tmax, n_steps)[None, :] # shape (1, n_steps)
329
+ dx = x[0, 1] - x[0, 0]
330
+
331
+ B = B[:, None]
332
+ alpha2 = alpha2[:, None]
333
+ gamma = gamma[:, None]
334
+
335
+ f1 = 1 - np.exp(-gamma * B)
336
+ f2 = 1 + kappa * np.exp(-gamma * B)
337
+ A = f1 / f2
338
+ c = 0.5 * alpha2 / gamma * A**3
339
+ d = 2 * gamma * B * (f2 / f1)
340
+
341
+ x_broadcasted = np.broadcast_to(x, (B.shape[0], x.shape[1]))
342
+ gx = np.exp(c * (1 - np.exp(-d * x_broadcasted))**2)
343
+ cumI = np.cumsum((gx[:, :-1] + gx[:, 1:]) * 0.5 * dx, axis=1)
344
+ cumI = np.hstack([np.zeros((gx.shape[0], 1)), cumI])
345
+
346
+ hx = np.exp(-B * cumI)
347
+ Bprime = B[:, 0] * trapezoid(hx, x[0], axis=1)
348
+ return Bprime
349
+
350
+ Bprime = eq5_vectorized(Bval, alpha2, gamma)
351
+ return Bprime[0] if scalar_input else Bprime
@@ -0,0 +1,194 @@
1
+ from Bvalcalc.core.helpers.process_single_chunk import process_single_chunk
2
+ from Bvalcalc.core.helpers.calc_L_per_chunk import calculate_L_per_chunk
3
+ from Bvalcalc.core.helpers.demography_helpers import get_Bcur
4
+ from Bvalcalc.utils.load_rec_map import load_rec_map
5
+ from Bvalcalc.utils.bin_outputs import bin_outputs
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from concurrent.futures import as_completed
8
+ import numpy as np
9
+ import os
10
+ import sys
11
+
12
def chromBcalc(args, blockstart, blockend, chromosome, unlinked_B, prior_pos = None, prior_b = None, calc_start=None, calc_end=None, chr_size=None, caller="regionBcalc"):
    """
    Calculate B for every position of one chromosome (or a sub-region of it).

    Parameters
    ----------
    args : parsed CLI namespace. Reads bedgff_path, chunk_size, precise_chunks, no_hri,
        quiet, verbose, rec_map, gc_map, chunk_batch_size, pop_change, out_binsize, out.
    blockstart, blockend : arrays of start/end positions of the selected blocks.
    chromosome : chromosome name, used for messages, map loading and output.
    unlinked_B : B from unlinked sites; the per-site B values are multiplied by it.
    prior_pos, prior_b : optional positions and B values from a prior B-map; positions
        inside [calc_start, calc_end] are pre-filled with these values.
    calc_start, calc_end : optional bounds of the region to calculate
        (default 1 and chr_size).
    chr_size : chromosome length. For ``caller == "regionBcalc"`` the passed value is
        ignored and replaced by ``calc_end`` (if it lies beyond the last block or there
        are no blocks) or the last block end.
    caller : "regionBcalc" or "genomeBcalc".

    Returns
    -------
    For "regionBcalc": ``(output_data, block_ranges, rec_rate_per_chunk_in_region,
    chunk_size)``. For any other caller: ``None``.

    Raises
    ------
    ValueError
        If chr_size is smaller than the last block end, or if chr_size cannot be
        determined because there are no blocks.
    Exception
        Any exception raised by a chunk worker is re-raised here.
    """
    import gc

    # Shared arguments between genomeBcalc and regionBcalc
    file_path, chunk_size, precise_chunks, no_hri, quiet, verbose = args.bedgff_path, args.chunk_size, args.precise_chunks, args.no_hri, args.quiet, args.verbose

    # Computed up front: it is needed by both branches below (it used to be assigned
    # only in the auto-adjust branch but read in the user-specified branch as well).
    num_blocks = len(blockstart)
    has_blocks = len(blockend) > 0

    # Auto-adjust chunk size for large datasets (only if user hasn't manually set chunk_size)
    if args.chunk_size is None:  # If they did not explicitly provide --chunk_size
        chunk_size = 20000  # Default chunk size
        original_chunk_size = chunk_size
        auto_tiers = (  # (more than this many blocks, chunk size to use, label)
            (250000, 1000, "Extremely massive"),
            (125000, 2000, "Massive"),
            (50000, 5000, "Very large"),
            (25000, 10000, "Large"),
        )
        for min_blocks, tier_chunk_size, label in auto_tiers:
            if num_blocks > min_blocks:
                chunk_size = tier_chunk_size
                if not quiet:
                    print(f"{label} dataset detected ({num_blocks} blocks). Auto-adjusting chunk size from {original_chunk_size} to {chunk_size} bp for memory efficiency. Use --chunk_size to override.")
                break
    elif not quiet and num_blocks > 25000:
        print(f"Large dataset detected ({num_blocks} blocks) but using user-specified chunk size of {chunk_size} bp.")

    # Arguments specific to regionBcalc
    if caller == "regionBcalc":
        # With no blocks there is no "last block end" to compare against, so the
        # region end is the only available chromosome size.
        if not has_blocks or calc_end > blockend[-1]:
            chr_size = calc_end
        else:
            chr_size = None

    if not quiet:
        print(f"====== P A R A M E T E R S =========================")
        print(f"BED/GFF file for regions under selection: {file_path}")
        # Print chr_size itself (calc_end may be None for genomeBcalc)
        if chr_size is not None: print(f"Last position in chromosome {chromosome}: {chr_size}")
        print(f"Size of chunks to calculate B in per iteration: {chunk_size}bp")
        print(f"Number of adjacent chunks to calculate B precisely for: {precise_chunks}")

    if chr_size is not None and has_blocks and chr_size < blockend[-1]:
        raise ValueError(f"chr_size provided is less than gene position for chromosome {chromosome}")
    if chr_size is None:  # Default chr_size to last value in blockend if not given
        if not has_blocks:
            raise ValueError(f"chr_size was not provided for chromosome: {chromosome} and gene position ends not computed. Check BED/GFF input, and specify chr_size if needed")
        chr_size = blockend[-1]
        if calc_end is None and not quiet:
            print(f"No --chr_size provided for chromosome: {chromosome}. Using last position in BED/GFF: {chr_size}")

    if not quiet: print(f"====== S T A R T I N G ===== C A L C ===============")
    if calc_start is None and calc_end is None:
        if not quiet: print(f"Calculating B for entire chromosome, to only calculate for a subregion, use --calc_start and --calc_end")
    if calc_start is None:
        calc_start = 1
    if calc_end is None:
        calc_end = chr_size

    chr_start = 1  # Currently hardcoded, can change if needed
    num_chunks = (chr_size - chr_start + chunk_size - 1) // chunk_size

    calc_chunk_start = (calc_start - chr_start) // chunk_size
    calc_chunk_end = (calc_end - chr_start) // chunk_size
    calc_chunks = np.arange(calc_chunk_start, calc_chunk_end + 1)  # Relevant chunks to calculate B for based on calc_start and calc_end

    b_values = np.ones(chr_size + 2 - chr_start, dtype=np.float64)  # Initialize array of B values
    if prior_pos is not None and prior_b is not None:  # If we have prior B map, overwrite those positions' B values
        idx = np.asarray(prior_pos, dtype=int)
        calc_mask = (idx >= calc_start) & (idx <= calc_end)
        idx = idx[calc_mask]  # filter to only those within [calc_start, calc_end]
        bprior = np.asarray(prior_b, dtype=b_values.dtype)[calc_mask]
        b_values[idx] = bprior

    lperchunk = calculate_L_per_chunk(chunk_size, blockstart, blockend, chr_start, chr_size)  # Cumulative conserved length in each chunk

    if args.rec_map:  # Process recombination map if provided
        if not quiet: print(f"Using recombination (crossover) map from {args.rec_map}")
        rec_rate_per_chunk = load_rec_map(args.rec_map, chr_start, chr_size, chunk_size, chromosome)
    else:
        rec_rate_per_chunk = None

    if args.gc_map:
        if not quiet: print(f"Using gene conversion map from {args.gc_map}")
        gc_rate_per_chunk = load_rec_map(args.gc_map, chr_start, chr_size, chunk_size, chromosome)
    else:
        gc_rate_per_chunk = None

    if verbose: print(f"====== R E S U L T S == P E R == C H U N K =========")
    elif not quiet: print(f"To print per-chunk summaries, add --verbose.")

    BATCH_SIZE = args.chunk_batch_size
    total_chunks = len(calc_chunks)
    completed = 0
    show_progress = not quiet and not verbose

    for batch_start in range(0, total_chunks, BATCH_SIZE):
        batch = calc_chunks[batch_start : batch_start + BATCH_SIZE]
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(process_single_chunk, chunk_idx,
                                chunk_size, blockstart, blockend, chr_start, chr_size, calc_start,
                                calc_end, num_chunks, precise_chunks, lperchunk, b_values,
                                rec_rate_per_chunk, gc_rate_per_chunk, no_hri, quiet, verbose, unlinked_B): chunk_idx
                for chunk_idx in batch
            }
            for future in as_completed(futures):
                # Surface worker failures. Without this an exception in a chunk is
                # dropped and that chunk silently keeps its initial B values.
                future.result()
                if show_progress:
                    completed += 1
                    progress = int((completed / total_chunks) * 100)
                    sys.stdout.write(f"\rProgress ({chromosome}): {progress}% ({completed}/{total_chunks} chunks [{chunk_size}])")
                    sys.stdout.flush()
        # After batch is done, cleanup
        if not quiet:
            print()  # Move to the next line after progress printing (nothing is printed under --quiet)
        del futures
        gc.collect()

    b_values = b_values[calc_start:(calc_end+1)]  # Trim b_values array to only calculated region
    b_values = b_values * unlinked_B

    if not no_hri and rec_rate_per_chunk is not None:  # If --no_hri is not active
        from Bvalcalc.core.helpers.extend_hri_regions_correction import extend_hri_regions_correction
        hri_extended_starts, hri_extended_ends = extend_hri_regions_correction(b_values, rec_rate_per_chunk, chunk_size, chr_start, calc_start, calc_end, hri_r_threshold = 0.1)  # Extend HRI regions until B > B' to avoid sharp decrease in B at the border between normal and HRI regions. See manuscript.
    else:
        hri_extended_starts, hri_extended_ends = np.array([], dtype=int), np.array([], dtype=int)

    if not quiet:
        print(f"====== F I N I S H E D ===== C A L C ===============")
        print(f"====== R E S U L T S ====== S U M M A R Y ==========")
        # Total genic bases within calc_start to calc_end: per block, the length of its
        # overlap with the calculated region (0 when they do not overlap)
        calc_selected_length = sum(
            max(0, min(end, calc_end) - max(start, calc_start) + 1)
            for start, end in zip(blockstart, blockend))
        print(f"Cumulative length of calculated region under selection: {calc_selected_length}bp "f"({round((calc_selected_length / (calc_end - calc_start + 1)) * 100, 2)}%)")
        print(f"Cumulative length of chromosome under selection: {int(sum(lperchunk))}bp ({round((sum(lperchunk)/(chr_size - chr_start + 1))*100,2)}%)")
        print(f"B from unlinked sites for chromosome {chromosome}: {unlinked_B}")
        if caller == "genomeBcalc": print(f"Mean B of neutral sites across chromosome {chromosome}: {b_values[~np.isnan(b_values)].mean()}")
        elif caller == "regionBcalc": print(f"Mean B of neutral sites across specified region: {b_values[~np.isnan(b_values)].mean()}")
        if args.rec_map:  # Recombination map was used
            print(f"Calculated using recombination (crossover) map, with rates averaged within {chunk_size}bp chunks")
        if args.gc_map:  # Gene conversion map was used
            print(f"Calculated using gene conversion map, with rates averaged within {chunk_size}bp chunks")

    block_ranges = np.column_stack((np.repeat(chromosome, blockstart.shape[0]), blockstart, blockend))

    positions = np.arange(calc_start, calc_end + 1)

    if args.pop_change:
        b_values = get_Bcur(b_values)
        if not quiet: print("Demographic change applied to B-calculation")

    binned_b_values, binned_positions = bin_outputs(b_values, positions, args.out_binsize)
    chrom_col = np.full(binned_positions.shape, chromosome, dtype="<U20")

    # np.rec is the public home of fromarrays; the np.core.records path is deprecated
    # in NumPy 2.
    output_data = np.rec.fromarrays(
        [chrom_col, binned_positions.astype(int), binned_b_values.astype(float)],
        names='Chromosome,Start,B', formats='U20,i8,f8')

    if args.out is not None:  # Write to CSVs
        print(f"Writing B output to file...")
        from Bvalcalc.utils.write_chrom_B_to_file import write_chrom_B_to_file
        write_chrom_B_to_file(args.out, output_data, quiet, hri_extended_starts, hri_extended_ends, args.out_binsize, calc_end)
        print(f"Appended B values to: {os.path.abspath(args.out)}")
    else:
        if not quiet:
            print("No output CSV requested; skipping save.")

    if caller == "regionBcalc":
        if rec_rate_per_chunk is not None:
            # NOTE(review): chunks are indexed as (pos - chr_start) // chunk_size above
            # (calc_chunk_start), but this slice uses calc_start // chunk_size, which
            # differs when calc_start is a multiple of chunk_size - confirm against
            # what plotB expects.
            rec_rate_per_chunk_in_region = rec_rate_per_chunk[calc_start // chunk_size:]  # Slice rec_rate_per_chunk from region start onward
        else:
            rec_rate_per_chunk_in_region = None
        return output_data, block_ranges, rec_rate_per_chunk_in_region, chunk_size
    else:  # caller is genomeBcalc
        return