pycmplot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pycmplot/annotation.py ADDED
@@ -0,0 +1,368 @@
1
+ """
2
+ pycmplot.annotation
3
+ ===================
4
+ Nearest-gene annotation and locus summary table generation.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import bisect
10
+ import logging
11
+ from typing import Optional
12
+
13
+ import natsort
14
+ import numpy as np
15
+ import pandas as pd
16
+
17
+ from pycmplot.constants import BIOTYPE_WEIGHTS
18
+ from pycmplot.resources import ResourceConfig, default_resources
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Internal: gene dictionary builder
25
+ # ---------------------------------------------------------------------------
26
+
27
+ def _build_genes_dict(genes_df: pd.DataFrame) -> dict:
28
+ """Build a chromosome-keyed interval dict with sorted start positions.
29
+
30
+ Parameters
31
+ ----------
32
+ genes_df:
33
+ DataFrame with columns ``CHR``, ``START``, ``END``, ``STRAND``, ``GENE``.
34
+
35
+ Returns
36
+ -------
37
+ dict keyed by chromosome string; each value is
38
+ ``{"intervals": [...], "starts": [...]}``.
39
+ """
40
+ genes_df = genes_df.sort_values(["CHR", "START"])
41
+ genes_dict: dict = {}
42
+
43
+ for chrom, group in genes_df.groupby("CHR"):
44
+ intervals = list(
45
+ zip(
46
+ group["START"].astype(int),
47
+ group["END"].astype(int),
48
+ group["STRAND"],
49
+ group["GENE"],
50
+ )
51
+ )
52
+ starts = [g[0] for g in intervals]
53
+ genes_dict[str(chrom)] = {"intervals": intervals, "starts": starts}
54
+
55
+ return genes_dict
56
+
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Internal: strand-aware variant annotation
60
+ # ---------------------------------------------------------------------------
61
+
62
+ def _annotate_variant(
63
+ chrom: str,
64
+ pos: int,
65
+ genes_dict: dict,
66
+ window: int = 500_000,
67
+ promoter_window: int = 2_000,
68
+ ) -> dict:
69
+ """Return strand-aware nearest-gene annotation for a single variant.
70
+
71
+ Returns a dict with keys:
72
+ ``genic``, ``nearest_upstream_gene``, ``upstream_distance``,
73
+ ``nearest_downstream_gene``, ``downstream_distance``,
74
+ ``promoter_upstream_flag``, ``gene_density``.
75
+ """
76
+ _empty = {
77
+ "genic": False,
78
+ "nearest_upstream_gene": None,
79
+ "upstream_distance": None,
80
+ "nearest_downstream_gene": None,
81
+ "downstream_distance": None,
82
+ "promoter_upstream_flag": False,
83
+ "bidirectional_promoter_flag": False,
84
+ "gene_density": 0,
85
+ }
86
+
87
+ if chrom not in genes_dict:
88
+ return _empty
89
+
90
+ chrom_data = genes_dict[chrom]
91
+ genes = chrom_data["intervals"]
92
+ starts = chrom_data["starts"]
93
+
94
+ left_bound = pos - window
95
+ right_bound = pos + window
96
+
97
+ i = bisect.bisect_left(starts, left_bound)
98
+
99
+ gene_density = 0
100
+ nearest_upstream: Optional[str] = None
101
+ nearest_downstream: Optional[str] = None
102
+ min_up_dist = float("inf")
103
+ min_down_dist = float("inf")
104
+ promoter_upstream_flag = False
105
+
106
+ while i < len(genes):
107
+ start, end, strand, gene = genes[i]
108
+
109
+ if start > right_bound:
110
+ break
111
+
112
+ if end >= left_bound:
113
+ gene_density += 1
114
+
115
+ if start <= pos <= end:
116
+ return {
117
+ "genic": True,
118
+ "nearest_upstream_gene": gene,
119
+ "upstream_distance": 0,
120
+ "nearest_downstream_gene": None,
121
+ "downstream_distance": None,
122
+ "promoter_upstream_flag": False,
123
+ "gene_density": gene_density,
124
+ }
125
+
126
+ tss = start if strand == "+" else end
127
+ distance = abs(pos - tss)
128
+
129
+ if distance <= window:
130
+ if strand == "+":
131
+ is_upstream = pos < tss
132
+ in_promoter = (tss - promoter_window) <= pos < tss
133
+ else:
134
+ is_upstream = pos > tss
135
+ in_promoter = tss < pos <= (tss + promoter_window)
136
+
137
+ if is_upstream:
138
+ if distance < min_up_dist:
139
+ min_up_dist = distance
140
+ nearest_upstream = gene
141
+ if in_promoter:
142
+ promoter_upstream_flag = True
143
+ else:
144
+ if distance < min_down_dist:
145
+ min_down_dist = distance
146
+ nearest_downstream = gene
147
+
148
+ i += 1
149
+
150
+ return {
151
+ "genic": False,
152
+ "nearest_upstream_gene": nearest_upstream,
153
+ "upstream_distance": min_up_dist if nearest_upstream else None,
154
+ "nearest_downstream_gene": nearest_downstream,
155
+ "downstream_distance": min_down_dist if nearest_downstream else None,
156
+ "promoter_upstream_flag": promoter_upstream_flag,
157
+ "gene_density": gene_density,
158
+ }
159
+
160
+
161
+ # ---------------------------------------------------------------------------
162
+ # Internal: prioritisation scorer
163
+ # ---------------------------------------------------------------------------
164
+
165
+ def _annotate_and_prioritize_variant(
166
+ chrom: str,
167
+ pos: int,
168
+ genes_df: pd.DataFrame,
169
+ lead_snps_df: pd.DataFrame,
170
+ window: int = 500_000,
171
+ promoter_window: int = 2_000,
172
+ biotype_weights: Optional[dict] = None,
173
+ ) -> Optional[dict]:
174
+ if biotype_weights is None:
175
+ biotype_weights = BIOTYPE_WEIGHTS
176
+
177
+ genes_df = genes_df.copy()
178
+ genes_df["TSS"] = np.where(
179
+ genes_df["STRAND"] == "+",
180
+ genes_df["START"],
181
+ genes_df["END"],
182
+ )
183
+
184
+ chr_genes = genes_df[genes_df["CHR"] == chrom]
185
+ if chr_genes.empty:
186
+ return None
187
+
188
+ candidates = chr_genes[
189
+ (chr_genes["START"] <= pos + window) & (chr_genes["END"] >= pos - window)
190
+ ].copy()
191
+
192
+ if candidates.empty:
193
+ return None
194
+
195
+ gene_density = len(candidates)
196
+
197
+ candidates["distance"] = np.where(
198
+ (pos >= candidates["START"]) & (pos <= candidates["END"]),
199
+ 0,
200
+ np.minimum(
201
+ abs(pos - candidates["START"]),
202
+ abs(pos - candidates["END"]),
203
+ ),
204
+ )
205
+
206
+ candidates["genic"] = (pos >= candidates["START"]) & (pos <= candidates["END"])
207
+
208
+ candidates["promoter_flag"] = (
209
+ (candidates["STRAND"] == "+")
210
+ & (pos >= candidates["TSS"] - promoter_window)
211
+ & (pos <= candidates["TSS"])
212
+ ) | (
213
+ (candidates["STRAND"] == "-")
214
+ & (pos <= candidates["TSS"] + promoter_window)
215
+ & (pos >= candidates["TSS"])
216
+ )
217
+
218
+ candidates["distance_score"] = 1 / np.log10(candidates["distance"] + 10)
219
+ candidates["biotype_weight"] = candidates["BIOTYPE"].map(
220
+ lambda x: biotype_weights.get(x, 0)
221
+ )
222
+ candidates["promoter_bonus"] = candidates["promoter_flag"].astype(int) * 0.5
223
+ candidates["priority_score"] = (
224
+ candidates["genic"].astype(int) * 2
225
+ + candidates["promoter_flag"].astype(int) * 1
226
+ + candidates["biotype_weight"] * 2 * candidates["distance_score"]
227
+ )
228
+
229
+ candidates = candidates.sort_values("priority_score", ascending=False)
230
+
231
+ if candidates.empty:
232
+ return {
233
+ "top_gene": None, "biotype": None, "priority_score": None,
234
+ "distance": None, "promoter_flag": None, "distance_score": None,
235
+ "biotype_weight": None, "promoter_bonus": None, "gene_density": None,
236
+ }
237
+
238
+ if candidates["genic"].any():
239
+ top = candidates.iloc[0]
240
+ return {
241
+ "top_gene": top["GENE"],
242
+ "biotype": top["BIOTYPE"],
243
+ "priority_score": top["priority_score"],
244
+ "distance": top["distance"],
245
+ "promoter_flag": top["promoter_flag"],
246
+ "distance_score": top["distance_score"],
247
+ "biotype_weight": top["biotype_weight"],
248
+ "promoter_bonus": top["promoter_bonus"],
249
+ "gene_density": gene_density,
250
+ }
251
+ else:
252
+ top2 = candidates.head(2)
253
+ return {
254
+ "top_gene": "-".join(top2["GENE"]),
255
+ "biotype": "intergenic",
256
+ "priority_score": None,
257
+ "distance": "-".join(map(str, top2["distance"])),
258
+ "promoter_flag": None,
259
+ "distance_score": None,
260
+ "biotype_weight": None,
261
+ "promoter_bonus": None,
262
+ "gene_density": None,
263
+ }
264
+
265
+
266
+ # ---------------------------------------------------------------------------
267
+ # Internal: clumping
268
+ # ---------------------------------------------------------------------------
269
+
270
+ def _clump_by_distance(df: pd.DataFrame, window_kb: int = 500) -> pd.DataFrame:
271
+ window = window_kb * 1000
272
+ clumped: list[pd.Series] = []
273
+
274
+ for _chrom, group in df.groupby("CHR"):
275
+ if "logP" in df.columns:
276
+ group = group.sort_values("logP", ascending=False)
277
+ else:
278
+ group = group.sort_values("P", ascending=True)
279
+
280
+ kept_positions: list[int] = []
281
+ for _, row in group.iterrows():
282
+ if all(abs(row["POS"] - p) > window for p in kept_positions):
283
+ clumped.append(row)
284
+ kept_positions.append(row["POS"])
285
+
286
+ return pd.DataFrame(clumped).sort_values(
287
+ ["CHR", "POS"], key=natsort.natsort_keygen()
288
+ )
289
+
290
+
291
+ # ---------------------------------------------------------------------------
292
+ # Public API
293
+ # ---------------------------------------------------------------------------
294
+
295
def get_hits_summary_table(
    leads_df: pd.DataFrame,
    window_kb: int = 500,
    table_out: Optional[str] = None,
    resources: Optional[ResourceConfig] = None,
) -> pd.DataFrame:
    """Annotate lead SNPs with nearest genes and write a summary table.

    Parameters
    ----------
    leads_df:
        DataFrame of lead SNPs (output of :func:`~pycmplot.stats.get_lead_snps`).
        Must contain columns ``CHR``, ``POS``, ``P``, ``BUILD``.
    window_kb:
        Window in kb around each lead SNP to search for genes (default 500 kb).
        The same window is used to clump the annotated table.
    table_out:
        If provided, write the clumped table to this TSV file path.
    resources:
        :class:`~pycmplot.resources.ResourceConfig` instance.  Defaults to
        ``default_resources``.

    Returns
    -------
    pd.DataFrame
        Clumped locus summary table with gene annotations.  An empty
        ``leads_df`` yields an empty copy of itself.

    Notes
    -----
    The hg19 gene table is used only when ``leads_df`` has no ``OLD_POS``
    column and every ``BUILD`` value is ``"hg19"``; in all other cases the
    hg38 table is used (presumably ``OLD_POS`` marks lifted-over positions --
    confirm against the caller).
    """
    if leads_df.empty:
        # Nothing to annotate: building a frame from zero records and sorting
        # it by CHR/POS would raise KeyError, so short-circuit here.
        empty_table = leads_df.copy()
        if table_out is not None:
            empty_table.to_csv(table_out, index=False, sep="\t", na_rep="None")
        return empty_table

    if resources is None:
        resources = default_resources

    # Choose gene info file based on build
    only_hg19 = set(leads_df["BUILD"]) == {"hg19"}
    if "OLD_POS" not in leads_df.columns and only_hg19:
        geneinfo_path = resources.require("geneinfo_hg19")
    else:
        geneinfo_path = resources.require("geneinfo_hg38")

    logger.info("Loading gene info from: %s", geneinfo_path)
    geneinfo = pd.read_csv(geneinfo_path, header=0, sep="\t")
    genes_dict = _build_genes_dict(geneinfo)

    window = window_kb * 1_000
    records: list[dict] = []

    logger.info("Annotating lead variants and generating hits summary table ...")
    for _, row in leads_df.iterrows():
        annotation = _annotate_variant(
            chrom=row["CHR"],
            pos=row["POS"],
            genes_dict=genes_dict,
            window=window,
        )
        prioritized = _annotate_and_prioritize_variant(
            chrom=row["CHR"],
            pos=row["POS"],
            genes_df=geneinfo,
            lead_snps_df=leads_df,
            window=window,
        )

        # Later dicts win on key clashes: prioritisation fields override the
        # nearest-gene annotation, which overrides the original row.
        records.append(
            {
                **row.to_dict(),
                **(annotation if annotation is not None else {}),
                **(prioritized if prioritized is not None else {}),
            }
        )

    locus_table = pd.DataFrame(records).sort_values(
        ["CHR", "POS"], key=natsort.natsort_keygen()
    )
    clumped_table = _clump_by_distance(locus_table, window_kb=window_kb)

    if table_out is not None:
        # Write the same (clumped) table that is returned, as documented.
        clumped_table.to_csv(table_out, index=False, sep="\t", na_rep="None")
        logger.info("Locus summary written to: %s", table_out)

    return clumped_table
pycmplot/cli.py ADDED
@@ -0,0 +1,229 @@
1
+ """
2
+ pycmplot.cli
3
+ ============
4
+ Command-line argument definitions.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ from pathlib import Path
11
+
12
# Banner used as the parser description (shown at the top of ``--help``).
DESCMSG = """
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
| PACKAGE FOR CIRCULAR AND LINEAR MANHATTAN PLOTTING |
| Kevin Esoh, 2026 |
| kesohku1@jh.edu |
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
"""


def get_arguments(
    descmsg: str = DESCMSG,
    argv: list[str] | None = None,
) -> argparse.Namespace:
    """Parse and return command-line arguments.

    Parameters
    ----------
    descmsg:
        Text used as the parser description.
    argv:
        Argument strings to parse.  ``None`` (the default) parses
        ``sys.argv[1:]``; passing a list lets callers and tests drive the
        parser without touching ``sys.argv``.

    Returns
    -------
    argparse.Namespace
        The parsed options.  ``argparse`` exits the process on invalid input
        or when ``-h/--help`` is given.
    """

    parser = argparse.ArgumentParser(
        prog="pycmplot",
        description=descmsg,
        formatter_class=argparse.RawTextHelpFormatter,
        # The help flag is registered manually below so that it is listed in
        # the "Optional" group.
        add_help=False,
    )

    req = parser.add_argument_group("Required")
    opt = parser.add_argument_group("Optional")
    cio = parser.add_argument_group("Circular Only")
    lio = parser.add_argument_group("Linear Only")

    # ------------------------------------------------------------------
    # Required
    # ------------------------------------------------------------------
    req.add_argument(
        "-s", "--sum_stats",
        help="Comma-separated list of GWAS summary stats files (e.g. file1.txt.gz,file2.tsv).",
        required=True, type=str, metavar="str",
    )
    req.add_argument(
        "-l", "--labels",
        help=(
            "Comma-separated track labels, same order as --sum_stats.\n"
            "E.g. HbF,MCV,MCH"
        ),
        required=True, type=str, metavar="str",
    )
    req.add_argument(
        "-b", "--build_column", required=True, type=str, metavar="str",
        help="Genome build column name (containing hg18/hg19/hg38).",
    )

    # ------------------------------------------------------------------
    # Optional
    # ------------------------------------------------------------------
    opt.add_argument(
        "-m", "--mode",
        help="Plot mode: lm (linear Manhattan) or cm (circular Manhattan). Default: lm.",
        choices=["lm", "cm"], default="lm", type=str,
    )
    opt.add_argument(
        "-chr", "--chrom_column", type=str, metavar="str",
        help="Chromosome column name in sumstats (e.g. CHR).",
    )
    opt.add_argument(
        "-pos", "--pos_column", type=str, metavar="str",
        help="Position column name (e.g. BP).",
    )
    opt.add_argument(
        "-snp", "--snp_column", type=str, metavar="str",
        help="SNP ID column name (e.g. ID).",
    )
    opt.add_argument(
        "-p", "--pval_column", type=str, metavar="str",
        help="P-value column name (e.g. P).",
    )
    opt.add_argument(
        "-d", "--delim",
        choices=["space", "tab", "comma", "colon", "semi-colon"],
        type=str, metavar="str",
        help="File delimiter (autodetected if omitted).",
    )
    opt.add_argument(
        "--logp", action="store_true",
        help="Plot −log₁₀(p) instead of raw p-values.",
    )
    opt.add_argument(
        "-qq", "--qq_plot", action="store_true",
        help="Also generate a QQ-plot.",
    )
    opt.add_argument(
        "-tp", "--trim_pval", type=float, metavar="float",
        help="Trim variants with p > this value before plotting.",
    )
    # For the three threshold flags below: omitted -> None; given without a
    # value -> ``const``; given with a value -> that value.
    opt.add_argument(
        "-sig", "--signif_threshold",
        default=None, const=5e-8, nargs="?", type=float, metavar="float",
        help="Genome-wide significance threshold (default: 5e-8).",
    )
    opt.add_argument(
        "-sigl", "--signif_line",
        default=None, const=5e-8, nargs="?", type=float, metavar="float",
        help="Value for genome-wide significance line if different from `-sig` (default: 5e-8).",
    )
    opt.add_argument(
        "-sug", "--suggest_threshold",
        default=None, const=1e-5, nargs="?", type=float, metavar="float",
        help="Suggestive significance threshold (default: 1e-5).",
    )
    opt.add_argument(
        "-a", "--annotate",
        choices=["SNP", "GENE"], nargs="?",
        default="SNP", const="SNP", type=str,
        help="Annotate significant loci by SNP ID or nearest gene.",
    )
    opt.add_argument(
        "-p_size", "--point_size", default=6, type=float, metavar="float",
        help="Size of each point of scatter plot (default: 6).",
    )
    opt.add_argument(
        "-a_size", "--annotation_size", default=6, type=float, metavar="float",
        help="Annotation label font size (default: 6).",
    )
    opt.add_argument(
        "-hl", "--highlight", action="store_true",
        help="Highlight significant loci.",
    )
    opt.add_argument(
        "-ht", "--highlight_thresh", default=5e-8, type=float, metavar="float",
        help="P-value threshold for highlighting (default: 5e-8).",
    )
    opt.add_argument(
        "-hl_line", "--highlight_line", action="store_true",
        help="Draw vertical lines through highlighted positions.",
    )
    opt.add_argument(
        "--colors", default="steelblue,silver", type=str, metavar="str",
        help="Two comma-separated alternating chromosome colours (default: steelblue,silver).",
    )
    opt.add_argument(
        "-st", "--sort_track",
        choices=["chrom_len", "label"], nargs="?",
        const="chrom_len", default=None, type=str,
        help="Sort tracks by chromosome count or label.",
    )
    opt.add_argument(
        "-plt", "--plot_title", default="MyCMplot", type=str, metavar="str",
        help="Plot plot_title / output file stem.",
    )
    opt.add_argument(
        "-pts", "--plot_title_size", default=8, type=float, metavar="float",
        help="Plot plot_title font size (default: 8).",
    )
    opt.add_argument(
        "-od", "--output_dir", default=".", type=Path, metavar="path",
        help="Output directory (default: current directory).",
    )
    opt.add_argument(
        "-of", "--output_format",
        choices=["png", "pdf", "svg", "jpg"],
        default="png", type=str, metavar="str",
        help="Output image format (default: png).",
    )
    opt.add_argument(
        "--dpi", default=300, type=int, metavar="int",
        help="Output resolution in DPI (default: 300).",
    )
    opt.add_argument(
        "-f", "--force", action="store_true",
        help="Overwrite existing output files.",
    )

    # circular only
    cio.add_argument(
        "--pad", default=1, type=int, metavar="int",
        help="Space between circular tracks (default: 1).",
    )
    cio.add_argument(
        "-cl_size", "--chrom_label_size", default=6, type=float, metavar="float",
        help="Chromosome label font size (default: 6).",
    )
    cio.add_argument(
        "-cl_side", "--chrom_label_side", choices=["inside", "outside"],
        nargs="?", default="inside", const="inside", type=str,
        help="Chromosome label placement (default: inside).",
    )
    cio.add_argument(
        "-tl_size", "--track_label_size", default=6, type=float, metavar="float",
        help="Track label font size (default: 6).",
    )
    cio.add_argument(
        "-tl_orient", "--track_label_orientation",
        choices=["vertical", "horizontal"], nargs="?",
        default="vertical", const="vertical", type=str,
        help="Track label orientation (default: vertical).",
    )
    cio.add_argument(
        "--r_min", default=20, type=int, metavar="int",
        help="Inner radius proportion (circular mode, default: 20).",
    )
    cio.add_argument(
        "--r_max", default=100, type=int, metavar="int",
        help="Outer radius (circular mode, default: 100).",
    )

    # linear only
    lio.add_argument(
        "-th", "--track_heights", type=str, metavar="str",
        help="Comma-separated relative track heights (e.g. 2,2,1.5).",
    )
    lio.add_argument(
        "-cs", "--chr_spacing", default=9e6, type=float, metavar="float",
        help="Spacing between chromosomes. Useful to reduce chromosome overlap (default: 9e6 or 9000000).",
    )
    lio.add_argument(
        "-t_space", "--track_spacing", default=0.10, type=float, metavar="float",
        help="Space between linear tracks (default: 0.10).",
    )

    opt.add_argument(
        "-h", "--help", action="help",
        help="Show this help message and exit.",
    )

    return parser.parse_args(argv)
pycmplot/constants.py ADDED
@@ -0,0 +1,66 @@
1
+ """
2
+ pycmplot.constants
3
+ ==================
4
+ Genome-level constants shared across modules.
5
+ """
6
+
7
+ # ---------------------------------------------------------------------------
8
+ # hg38 chromosome lengths (GRCh38)
9
+ # ---------------------------------------------------------------------------
10
# Per-chromosome lengths in base pairs, keyed by "chr"-prefixed name
# (autosomes 1-22, then X and Y).
# NOTE(review): several entries (e.g. chr1, chr2, chr14, chr16, chr22) are
# larger than the GRCh38 primary-assembly lengths (chr1 = 248,956,422) --
# confirm the source of these values before relying on them as exact.
hg38_chr_lengths: dict[str, int] = {
    "chr1": 249_698_942,
    "chr2": 242_508_799,
    "chr3": 198_450_956,
    "chr4": 190_424_264,
    "chr5": 181_630_948,
    "chr6": 170_805_979,
    "chr7": 159_345_973,
    "chr8": 145_138_636,
    "chr9": 138_688_728,
    "chr10": 133_797_422,
    "chr11": 135_186_938,
    "chr12": 133_275_309,
    "chr13": 114_364_328,
    "chr14": 108_136_338,
    "chr15": 102_439_437,
    "chr16": 92_211_104,
    "chr17": 83_836_422,
    "chr18": 80_373_285,
    "chr19": 58_617_616,
    "chr20": 64_444_167,
    "chr21": 46_709_983,
    "chr22": 51_857_516,
    "chrX": 156_040_895,
    "chrY": 57_264_655,
}
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Gene biotype weights used for nearest-gene prioritisation
39
+ # ---------------------------------------------------------------------------
40
# Weight per gene biotype, keyed by the biotype label.  Higher values mean a
# gene of that biotype is favoured more strongly.
BIOTYPE_WEIGHTS: dict[str, float] = {
    # coding genes
    "gene": 1.0,
    "protein_coding": 1.0,
    # non-coding RNA classes
    "miRNA": 0.75,
    "lncRNA": 0.7,
    "ncRNA": 0.7,
    "lincRNA": 0.7,
    "ribozyme": 0.7,
    "snRNA": 0.65,
    "snoRNA": 0.65,
    "scaRNA": 0.65,
    "vault_RNA": 0.6,
    "antisense": 0.3,
    "rRNA": 0.55,
    "processed_transcript": 0.5,
    # pseudogene classes
    "transcribed_processed_pseudogene": 0.45,
    "transcribed_unitary_pseudogene": 0.4,
    "transcribed_unprocessed_pseudogene": 0.35,
    "processed_pseudogene": 0.3,
    "pseudogene": 0.2,
    "unprocessed_pseudogene": 0.2,
}
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Standard chromosome order (autosomes + sex + MT)
65
+ # ---------------------------------------------------------------------------
66
# Chromosome labels without a "chr" prefix: "1".."22", then "X", "Y", "MT".
CHROM_ORDER: list[str] = [*map(str, range(1, 23)), "X", "Y", "MT"]