rwe 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rwe/__init__.py ADDED
File without changes
File without changes
rwe/clients/aou.py ADDED
@@ -0,0 +1,211 @@
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ from tqdm import tqdm
5
+ import multiprocessing as mp
6
+ from scipy import stats
7
+ from docx import Document
8
+ from docx.shared import Inches
9
+
10
+ import rwe.utils.helpers as uth
11
+ from rwe.plots.variant_info import create_variant_frequency_plots
12
+ from rwe.plots.demographics import demographics_plot
13
+ from rwe.plots.clinical import manhattan
14
+ from rwe.plots.measurements import plot_measurements
15
+ from rwe.plots.surveys import plot_survey_questions
16
+
17
+ ############### Variant Info and Demographics ###############
18
def get_individual_info(bucket, chrm, gene, zygosity="hetz"):
    """
    Load the participant table and the gene's LoF carrier table, and flag carriers.

    Reads ``person.csv.gz`` and ``lof_{zygosity}.csv.gz`` from beneath ``bucket``.
    Two columns are added to the participant frame:

    - ``cases``: True when ``person_id`` appears in the carrier table's
      ``{zygosity}_samples`` column.
    - ``group``: ``"Cases"`` or ``"Controls"``, derived from ``cases``.

    Returns
    -------
    tuple
        ``(person_df, variant_df)``.
    """
    person_path = f"{bucket}/data/rwe_info/raw/person.csv.gz"
    variant_path = f"{bucket}/data/rwe_info/genes/chr{chrm}/{gene}/lof_{zygosity}.csv.gz"

    person_df = pd.read_csv(person_path)
    variant_df = pd.read_csv(variant_path)

    carrier_ids = variant_df[f"{zygosity}_samples"]
    person_df["cases"] = person_df.person_id.isin(carrier_ids)
    person_df["group"] = np.where(person_df["cases"], "Cases", "Controls")
    return person_df, variant_df
24
+
25
def generate_aou_variant_info_demographics_report(doc: Document, chrm: str, gene: str, zygosity: str = "hetz") -> Document:
    """
    Append the variant-information and demographics figures to ``doc``.

    Data is loaded through ``get_individual_info`` using the configured
    ``BUCKET``. When either table is empty, a notice paragraph and a page
    break are added instead of figures.

    Returns the same ``doc`` object that was passed in.
    """
    from rwe.parsers.aou.config import BUCKET

    person_df, variant_df = get_individual_info(BUCKET, chrm, gene, zygosity)
    if person_df.empty or variant_df.empty:
        doc.add_paragraph("No variant or individual level data provided to generate figures.")
        doc.add_page_break()
        return doc

    # (plot function, input frame, temp-file basename, caption), in document order.
    figure_specs = (
        (
            create_variant_frequency_plots,
            variant_df,
            "variant_information",
            "Figure 1. Variant information summary (carrier counts by consequence; most/least frequent variants).",
        ),
        (
            demographics_plot,
            person_df,
            "demographics",
            "Figure 2. Demographics of cases vs controls (age, ancestry, sex at birth, ethnicity).",
        ),
    )
    for make_figure, frame, basename, caption in figure_specs:
        figure, _axes = make_figure(frame)
        image_path = uth._save_fig_to_tmp(figure, basename=basename, dpi=300)
        doc.add_paragraph()  # spacing
        doc.add_picture(image_path, width=Inches(6.5))
        uth._add_caption(doc, caption)
    return doc
45
+
46
+
47
+ ############### Clinical Records ###############
48
def clean_aou_phewas(phewas_file):
    """
    Read a tab-separated AoU PheWAS result file and keep the usable rows.

    A row is kept when its ``ancestry`` equals ``"all"`` and its ``converged``
    value compares equal to ``True``.
    """
    results = pd.read_csv(phewas_file, sep="\t")
    pooled_ancestry = results.ancestry == "all"
    did_converge = results.converged == True  # noqa: E712 - elementwise comparison
    return results.loc[pooled_ancestry & did_converge]
52
+
53
def get_aou_manhattan(df, gene):
    """
    Draw the AoU PheWAS Manhattan plot for ``gene``.

    Passes the AoU column names to ``manhattan`` and returns ``(fig, plot_df)``;
    the axes object that ``manhattan`` also returns is discarded.
    """
    column_roles = {
        "p_col": "p_value",
        "category_col": "phecode_category",
        "label_col": "phecode_string",
        "odds_ratio_col": "odds_ratio",
    }
    fig, _ax, plot_df = manhattan(
        df,
        sig_p=2.8e-4,
        top_k_labels=5,
        title=f"{gene} PheWAS AoU",
        **column_roles,
    )
    return fig, plot_df
70
+
71
def generate_aou_clinical_report(doc, chrm, gene, zygosity):
    """
    Append the AoU PheWAS figure and result tables for ``gene`` to ``doc``.

    When ``uth._gcs_size`` reports a size that is not greater than zero for
    the expected result file, only a "no results" paragraph is added.
    Otherwise the Manhattan plot is inserted, followed by a table of the ten
    smallest p-values and, if any exist, a table of the ten smallest p-values
    among rows with an odds ratio below 1.

    ``zygosity`` is accepted for signature parity but is not used here.
    Returns the same ``doc`` object.
    """
    from rwe.parsers.aou.config import BUCKET

    phewas_file = f"{BUCKET}/data/phewas/results/{chrm}/{gene}_phewas.csv"
    has_results = uth._gcs_size(phewas_file, BUCKET) > 0
    if not has_results:
        doc.add_paragraph(f"No AoU PheWAS results found for {gene}.")
        doc.add_paragraph()  # spacing
        return doc

    cleaned = clean_aou_phewas(phewas_file)
    fig, plot_df = get_aou_manhattan(cleaned, gene)
    fig_path = uth._save_fig_to_tmp(fig, basename="aou_phewas", dpi=300)
    doc.add_paragraph()  # spacing
    doc.add_picture(fig_path, width=Inches(6.5))
    uth._add_caption(doc, f"Figure X. {gene} PheWAS results from All of Us.")
    doc.add_paragraph()  # spacing

    source_cols = ['phecode_string', 'phecode_category', 'odds_ratio', 'p_value']
    display_cols = ['Phecode', 'Category', 'OR', 'P-value']
    col_widths = (2.75, 1.75, 0.75, 0.75)

    # Table 1: the ten rows with the smallest p-values.
    top_hits = plot_df.nsmallest(10, 'p_value')[source_cols].copy()
    top_hits.columns = list(display_cols)
    uth._add_table_to_doc(
        doc,
        top_hits,
        f"Table X. Top 10 significant associations for {gene}",
        list(display_cols),
        [Inches(w) for w in col_widths],
    )

    # Table 2: the ten smallest p-values among rows with odds ratio < 1.
    protective = plot_df[plot_df['odds_ratio'] < 1].nsmallest(10, 'p_value')
    if len(protective) > 0:
        protective_display = protective[source_cols].copy()
        protective_display.columns = list(display_cols)
        uth._add_table_to_doc(
            doc,
            protective_display,
            f"Table X. Top 10 protective associations for {gene}",
            list(display_cols),
            [Inches(w) for w in col_widths],
        )

    doc.add_paragraph()  # spacing
    return doc
107
+
108
+
109
+ ############### Labs and Measurements ###############
110
def remove_outliers_iqr(df, column, multiplier=5):
    """
    Mask outliers of ``df[column]`` using IQR fences on the log1p scale.

    The column is cast to float and transformed with ``log1p``. A value is
    kept when its transform lies within
    ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]`` of the transformed
    values. No rows are dropped: the result is an array with one entry per
    row, holding the original value where kept and ``pd.NA`` elsewhere.
    """
    original = df[column]
    logged = np.log1p(original.astype(float))
    q1 = logged.quantile(0.25)
    q3 = logged.quantile(0.75)
    iqr = q3 - q1
    within_fences = logged.between(q1 - multiplier * iqr, q3 + multiplier * iqr)
    return np.where(within_fences, original, pd.NA)
121
+
122
def clean_measurement(df, col="median_value", multiplier=5):
    """
    Return the rows of ``df`` whose ``col`` is non-negative and not an outlier.

    Rows with ``col < 0`` are removed, the remaining values are passed through
    ``remove_outliers_iqr`` and rows it masked are dropped. The caller's frame
    is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        Name of the value column to clean.
    multiplier : float
        IQR multiplier forwarded to ``remove_outliers_iqr``.
    """
    # Filter first, then copy: the assignment below then targets an independent
    # frame instead of a slice (avoids chained-assignment warnings), and only
    # the surviving rows are copied.
    kept = df[df[col] >= 0].copy()
    kept[col] = remove_outliers_iqr(kept, col, multiplier)
    return kept.dropna(subset=[col])
127
+
128
def compare(df, measurement):
    """
    Compare ``median_value`` between cases and controls for one measurement.

    Rows for ``measurement`` are split on the boolean ``cases`` column and
    de-duplicated per ``(person_id, measurement_concept_id)``. Only the
    control rows are passed through ``clean_measurement``; case rows are used
    as they are. A two-sided Mann-Whitney U test is run when both groups have
    at least 20 numeric values.

    Returns
    -------
    tuple
        Always 11 fields, in the column order used by ``run_parallel``:
        ``(measurement, u_stat, p_mwu, n_cases, n_ctrls, q1_case, median_case,
        q3_case, q1_ctrl, median_ctrl, q3_ctrl)``. When either group has fewer
        than 20 values, the statistic, p-value and all six quartiles are NaN.
    """
    dedup_keys = ["person_id", "measurement_concept_id"]
    mdf = df.loc[df.measurement == measurement].copy()
    ctrls = mdf.loc[mdf.cases == False].drop_duplicates(dedup_keys)  # noqa: E712
    cases = mdf.loc[mdf.cases == True].drop_duplicates(dedup_keys)  # noqa: E712
    ctrls = clean_measurement(ctrls)

    x = pd.to_numeric(cases["median_value"], errors="coerce").dropna().to_numpy()
    y = pd.to_numeric(ctrls["median_value"], errors="coerce").dropna().to_numpy()
    n_cases, n_ctrls = int(len(cases)), int(len(ctrls))

    # Too few observations for a meaningful test: keep the row shape identical
    # to the normal return so results can be assembled into one DataFrame.
    if x.size < 20 or y.size < 20:
        return (measurement, np.nan, np.nan, n_cases, n_ctrls) + (np.nan,) * 6

    caq1, caq2, caq3 = pd.Series(x).quantile([0.25, 0.5, 0.75])
    ctq1, ctq2, ctq3 = pd.Series(y).quantile([0.25, 0.5, 0.75])
    u_stat, p_mwu = stats.mannwhitneyu(x, y, alternative="two-sided")
    return (
        measurement, float(u_stat), float(p_mwu), n_cases, n_ctrls,
        caq1, caq2, caq3, ctq1, ctq2, ctq3,
    )
142
+
143
# Module-level slot for the measurements frame; set by _init_worker and read
# by _worker_compare.
_G_DF = None


def _init_worker(df):
    """Store ``df`` in the module-level ``_G_DF`` (used as the pool initializer)."""
    global _G_DF
    _G_DF = df
148
+
149
def _worker_compare(measurement):
    """Run ``compare`` for one measurement against the frame held in ``_G_DF``."""
    shared_frame = _G_DF  # set once per process by the pool initializer
    return compare(shared_frame, measurement)
152
+
153
def run_parallel(df, measurements=None, n_jobs=None, chunksize=10):
    """
    Run ``compare`` for many measurements in a multiprocessing pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed to every worker through the pool initializer.
    measurements : list, optional
        Measurement names to process; defaults to the sorted unique non-null
        values of ``df["measurement"]``.
    n_jobs : int, optional
        Number of worker processes; defaults to ``cpu_count() - 1`` (at least 1).
    chunksize : int
        Chunk size passed to ``imap_unordered``.

    Returns
    -------
    pandas.DataFrame
        One row per measurement, in completion order (unordered map).
    """
    if measurements is None:
        measurements = sorted(df["measurement"].dropna().unique().tolist())
    if n_jobs is None:
        n_jobs = max(1, mp.cpu_count() - 1)

    result_columns = [
        "measurement", "u_stat", "p_mwu", "n_cases", "n_ctrls",
        "q1_case", "median_case", "q3_case",
        "q1_ctrl", "median_ctrl", "q3_ctrl",
    ]
    with mp.Pool(processes=n_jobs, initializer=_init_worker, initargs=(df,)) as pool:
        lazy_rows = pool.imap_unordered(_worker_compare, measurements, chunksize=chunksize)
        progress = tqdm(lazy_rows, total=len(measurements), desc="Processing measurements")
        rows = list(progress)

    return pd.DataFrame(rows, columns=result_columns)
170
+
171
def generate_aou_labs_measurements_report(doc: Document, chrm: str, gene: str, zygosity: str) -> Document:
    """
    Append one measurements figure per configured measurement group to ``doc``.

    Loads the numerical measurements and the gene's carrier table from the
    configured ``BUCKET``, flags carriers in a ``cases`` column, runs the
    case/control comparison for every measurement and then plots each entry
    of ``MEASUREMENT_GROUPS``.

    Returns the same ``doc`` object.
    """
    from rwe.parsers.aou.config import MEASUREMENT_GROUPS, BUCKET

    measurements_path = f"{BUCKET}/data/rwe_info/processed/selected_numerical_measurements.parquet"
    carriers_path = f"{BUCKET}/data/rwe_info/genes/chr{chrm}/{gene}/lof_{zygosity}.csv.gz"

    numerical_measurements_df = pd.read_parquet(measurements_path)
    variant_df = pd.read_csv(carriers_path)
    carrier_ids = variant_df[f"{zygosity}_samples"]
    numerical_measurements_df["cases"] = numerical_measurements_df.person_id.isin(carrier_ids)

    measurement_names = numerical_measurements_df["measurement"].unique().tolist()
    res_df = run_parallel(
        numerical_measurements_df,
        measurements=measurement_names,
        n_jobs=None,
        chunksize=20,
    )
    # TODO: add most significant measurements table to doc
    for group_name, group_members in MEASUREMENT_GROUPS.items():
        # Only the figure is used; the other two return values are ignored.
        figure, _second, _third = plot_measurements(
            numerical_measurements_df,
            group_members,
            multiplier=5,
            col="median_value",
            res_df=res_df,
        )
        fig_path = uth._save_fig_to_tmp(figure, basename=f"aou_measurements_{group_name}", dpi=300)
        doc.add_paragraph()  # spacing
        doc.add_picture(fig_path, width=Inches(6.5))
        uth._add_caption(doc, f"Figure X. {gene} pLoF carrier {group_name} results from All of Us.")
        doc.add_paragraph()  # spacing
    return doc
187
+
188
+ ############### Surveys ###############
189
def _strip_row_prefix(values, prefixes):
    """Return ``values`` with each row's own ``"<prefix>: "`` lead-in removed."""
    stripped = []
    for value, prefix in zip(values, prefixes):
        if isinstance(value, str) and isinstance(prefix, str):
            lead_in = prefix + ": "
            if value.startswith(lead_in):
                value = value[len(lead_in):]
        # Non-string entries (e.g. missing values) pass through untouched.
        stripped.append(value)
    return stripped


def clean_aou_surveys(df, survey_col="survey", question_col="question", answer_col="answer_category", zygosity="hetz"):
    """
    Strip redundant row-specific prefixes from survey questions and answers.

    For every row, a leading ``"<survey>: "`` is removed from the question and
    a leading ``"<question>: "`` (using the already-stripped question) is
    removed from the answer. Prefixes are matched literally, so punctuation in
    survey or question text cannot be misread as regex syntax.

    ``Series.str.replace`` accepts only a single pattern, not a Series of
    per-row patterns, so the stripping is done row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    survey_col, question_col, answer_col : str
        Column names for the survey title, question text and answer text.
    zygosity : str
        Accepted for signature compatibility; not used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the question and answer columns cleaned.
    """
    df = df.copy()
    df[question_col] = _strip_row_prefix(df[question_col], df[survey_col])
    df[answer_col] = _strip_row_prefix(df[answer_col], df[question_col])
    return df
194
+
195
def generate_aou_survey_report(doc: Document, chrm: str, gene: str, zygosity: str) -> Document:
    """
    Append the survey-question figure for ``gene`` carriers to ``doc``.

    Loads the selected survey answers and the gene's carrier table from the
    configured ``BUCKET``, flags carriers in a ``cases`` column, cleans the
    question/answer text and inserts the plot with its caption.

    Returns the same ``doc`` object.
    """
    from rwe.parsers.aou.config import BUCKET

    surveys_path = f"{BUCKET}/data/rwe_info/processed/selected_surveys.parquet"
    carriers_path = f"{BUCKET}/data/rwe_info/genes/chr{chrm}/{gene}/lof_{zygosity}.csv.gz"

    survey_df = pd.read_parquet(surveys_path)
    variant_df = pd.read_csv(carriers_path)
    carrier_ids = variant_df[f"{zygosity}_samples"]
    survey_df["cases"] = survey_df.person_id.isin(carrier_ids)
    survey_df = clean_aou_surveys(survey_df, zygosity=zygosity)

    figure, _ax = plot_survey_questions(survey_df)
    fig_path = uth._save_fig_to_tmp(figure, basename="aou_surveys", dpi=300)
    doc.add_paragraph()  # spacing
    doc.add_picture(fig_path, width=Inches(6.5))
    uth._add_caption(doc, f"Figure X. {gene} pLoF carrier survey results from All of Us.")
    doc.add_paragraph()  # spacing
    return doc
208
+
209
if __name__ == "__main__":
    # Script stub: builds an empty document and does nothing further with it.
    from docx import Document

    doc = Document()
rwe/clients/azn.py ADDED
@@ -0,0 +1,186 @@
1
+ import os
2
+ import shutil
3
+ import zipfile
4
+ from pathlib import Path
5
+ from playwright.sync_api import Playwright, sync_playwright
6
+ from rwe.plots.clinical import manhattan
7
+ import pandas as pd
8
+
9
+ import rwe.utils.helpers as uth
10
+ from docx.shared import Inches
11
+
12
def extract_only_csv_rename(zip_path: str, gene: str) -> str:
    """
    Extract the main CSV from ``zip_path`` and store it as ``{gene}_azphewas.csv``.

    The archive is unpacked into a private scratch directory created next to
    the zip, so only files that came out of the archive are considered and
    only they are cleaned up. Other files that already live in the zip's
    directory (other exports, cached tables, reports) are left alone. If the
    archive holds several CSVs, the largest one is kept. On success the zip
    itself is deleted.

    Parameters
    ----------
    zip_path : str
        Path to the downloaded archive.
    gene : str
        Gene symbol used to name the resulting CSV.

    Returns
    -------
    str
        Path of the final CSV, located in the zip's parent directory.

    Raises
    ------
    FileNotFoundError
        If the archive contains no CSV file.
    """
    import tempfile  # local import: only this function needs it

    zip_path = Path(zip_path)
    gene_dir = zip_path.parent
    target_csv = gene_dir / f"{gene}_azphewas.csv"

    # The scratch directory lives inside gene_dir so the final move stays on
    # one filesystem.
    with tempfile.TemporaryDirectory(dir=gene_dir, prefix=".azphewas_") as scratch:
        scratch_dir = Path(scratch)
        with zipfile.ZipFile(zip_path, "r") as archive:
            archive.extractall(scratch_dir)

        csvs = [p for p in scratch_dir.rglob("*.csv") if p.is_file()]
        if not csvs:
            raise FileNotFoundError(f"No CSV found after extracting {zip_path}")

        # If multiple CSVs, pick the largest (usually the main table export).
        chosen = max(csvs, key=lambda p: p.stat().st_size)
        # Path.replace overwrites an existing target.
        chosen.replace(target_csv)

    zip_path.unlink()
    return str(target_csv)
50
+
51
+
52
def run(playwright: Playwright, gene: str, save_path: str) -> None:
    """
    Download the azphewas.com table export for ``gene`` into ``save_path``.

    Opens the gene view in headless Chromium, opens the export dialog,
    unticks the plot export, saves the downloaded zip under its suggested
    filename and reduces it to ``{gene}_azphewas.csv`` via
    ``extract_only_csv_rename``.

    The browser context and the browser are closed even when a step fails.
    """
    browser = playwright.chromium.launch(headless=True)
    try:
        context = browser.new_context(accept_downloads=True)
        try:
            page = context.new_page()

            url = f"https://azphewas.com/geneView/6319c068-fd59-46d8-85ee-82d82482eb14/{gene}/glr/binary"
            page.goto(url, wait_until="networkidle")

            export_control = page.locator('[data-testid="export-settings-control"] button:has-text("Export")')
            export_control.click()

            inner_export_btn = page.get_by_test_id("export_button")
            inner_export_btn.wait_for(state="visible")

            plot_checkbox = page.locator('[data-testid="plot-export-type-toggler"] input[type="checkbox"]')
            plot_checkbox.set_checked(False, force=True)
            # Short pause after toggling the checkbox before triggering the export.
            page.wait_for_timeout(300)

            os.makedirs(save_path, exist_ok=True)

            with page.expect_download(timeout=120_000) as dl_info:
                inner_export_btn.click(force=True)

            download = dl_info.value
            out_path = os.path.join(save_path, download.suggested_filename)
            download.save_as(out_path)
            print("Saved zip:", out_path)

            final_csv = extract_only_csv_rename(out_path, gene=gene)
            print("Final CSV:", final_csv)
        finally:
            context.close()
    finally:
        browser.close()
89
+
90
def clean_azn_phewas(phewas_file):
    """
    Read an AstraZeneca PheWAS CSV and keep the ``"ptv"`` collapsing-model rows.

    Returns an independent copy of the filtered rows, so callers can add or
    overwrite columns without triggering pandas chained-assignment warnings.
    """
    df = pd.read_csv(phewas_file, sep=",")
    return df.loc[df["Collapsing model"] == "ptv"].copy()
94
+
95
def get_azn_manhattan(gene, phewas_file):
    """
    Draw the AstraZeneca PheWAS Manhattan plot for ``gene``.

    The cleaned table gets two label tweaks before plotting: phenotype names
    keep only the text after the last ``#``, and chapter names are replaced by
    short labels (anything not in the lookup becomes ``"Other"``).

    Returns ``(fig, plot_df)``; the axes object from ``manhattan`` is discarded.
    """
    chapter_labels = {
        "Chapter I Certain infectious and parasitic diseases": "Infectious",
        "Chapter II Neoplasms": "Neoplasms",
        "Chapter III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism": "Blood/Immune",
        "Chapter IV Endocrine nutritional and metabolic diseases": "Endocrine/Metabolic",
        "Chapter V Mental and behavioural disorders": "Mental/Behavioral",
        "Chapter VI Diseases of the nervous system": "Nervous System",
        "Chapter VII Diseases of the eye and adnexa": "Eye",
        "Chapter VIII Diseases of the ear and mastoid process": "Ear/Mastoid",
        "Chapter IX Diseases of the circulatory system": "Circulatory",
        "Chapter X Diseases of the respiratory system": "Respiratory",
        "Chapter XI Diseases of the digestive system": "Digestive",
        "Chapter XII Diseases of the skin and subcutaneous tissue": "Skin",
        "Chapter XIII Diseases of the musculoskeletal system and connective tissue": "Musculoskeletal",
        "Chapter XIV Diseases of the genitourinary system": "Genitourinary",
        "Chapter XV Pregnancy childbirth and the puerperium": "Pregnancy/Childbirth",
        "Chapter XVII Congenital malformations deformations and chromosomal abnormalities": "Congenital",
        "Chapter XVIII Symptoms signs and abnormal clinical and laboratory findings not elsewhere classified": "Symptoms/Findings",
        "Chapter XXI Factors influencing health status and contact with health services": "Health Services",
    }
    label_column = "Phenotype"
    category_column = "Phenotypic category"

    table = clean_azn_phewas(phewas_file)
    full_names = table[label_column].astype(str)
    table[label_column] = full_names.str.rsplit("#", n=1).str[-1]
    table[category_column] = table[category_column].map(chapter_labels).fillna("Other")

    fig, _ax, plot_df = manhattan(
        table,
        p_col="P value",
        category_col=category_column,
        label_col=label_column,
        odds_ratio_col="Odds ratio",
        sig_p=2.8e-4,
        top_k_labels=5,
        title=f"{gene} PheWAS AstraZeneca",
    )
    return fig, plot_df
136
+
137
def generate_azn_clinical_report(doc, gene, phewas_dir, phewas_filename=""):
    """
    Append the AstraZeneca PheWAS figure and result tables for ``gene`` to ``doc``.

    The result file defaults to ``{gene}_azphewas.csv`` inside ``phewas_dir``.
    If it is absent, a download is attempted through Playwright. If the file
    still does not exist afterwards, only a "no results" paragraph is added.

    Returns the same ``doc`` object.
    """
    filename = phewas_filename if phewas_filename else f"{gene}_azphewas.csv"
    phewas_file = os.path.join(phewas_dir, filename)

    if not os.path.exists(phewas_file):
        with sync_playwright() as playwright:
            run(playwright, gene=gene, save_path=phewas_dir)

    if not os.path.exists(phewas_file):
        doc.add_paragraph(f"No AstraZeneca PheWAS results found for {gene}.")
        doc.add_paragraph()  # spacing
        return doc

    fig, plot_df = get_azn_manhattan(gene, phewas_file)
    fig_path = uth._save_fig_to_tmp(fig, basename="azn_phewas", dpi=300)
    doc.add_paragraph()  # spacing
    doc.add_picture(fig_path, width=Inches(6.5))
    uth._add_caption(doc, f"Figure X. {gene} PheWAS results from AstraZeneca.")
    doc.add_paragraph()  # spacing

    source_cols = ['Phenotype', 'Phenotypic category', 'Odds ratio', 'P value']
    display_cols = ['Phenotype', 'Category', 'OR', 'P-value']
    col_widths = (2.75, 1.75, 0.75, 0.75)

    # Table 1: the ten rows with the smallest p-values.
    top_hits = plot_df.nsmallest(10, 'P value')[source_cols].copy()
    top_hits.columns = list(display_cols)
    uth._add_table_to_doc(
        doc,
        top_hits,
        f"Table X. Top 10 significant associations for {gene}",
        list(display_cols),
        [Inches(w) for w in col_widths],
    )

    # Table 2: the ten smallest p-values among rows with odds ratio < 1.
    protective = plot_df[plot_df['Odds ratio'] < 1].nsmallest(10, 'P value')
    if len(protective) > 0:
        protective_display = protective[source_cols].copy()
        protective_display.columns = list(display_cols)
        uth._add_table_to_doc(
            doc,
            protective_display,
            f"Table X. Top 10 protective associations for {gene}",
            list(display_cols),
            [Inches(w) for w in col_widths],
        )

    doc.add_paragraph()  # spacing
    return doc
177
+
178
+
179
if __name__ == "__main__":
    # Manual run: build the AstraZeneca section for one gene and save it.
    from docx import Document

    gene = "INHBE"
    phewas_dir = "/home/dbanerjee/deepro/rwe/data/test/"
    doc = generate_azn_clinical_report(Document(), gene, phewas_dir)
    doc.save("/home/dbanerjee/deepro/rwe/data/test/INHBE_azn_report.docx")
186
+
@@ -0,0 +1,137 @@
1
+ import os
2
+ import numpy as np
3
+ from playwright.sync_api import Playwright, sync_playwright
4
+ import random
5
+ import pandas as pd
6
+ from docx.shared import Inches, Pt
7
+
8
+ from rwe.plots.clinical import manhattan
9
+ import rwe.utils.helpers as uth
10
+ from rwe.clients.hgnc import get_gene_info
11
+
12
def run(playwright: Playwright, gene: str, gene_id: str, save_path: str) -> None:
    """
    Download the GeneBass pLoF burden PheWAS table for one gene as CSV.

    Parameters
    ----------
    playwright : Playwright
        Active Playwright handle.
    gene : str
        Used only to name the output file, ``{gene}_genebass.csv``.
    gene_id : str
        Identifier placed in the GeneBass gene URL.
    save_path : str
        Directory that receives the CSV (created if missing).

    The browser context and the browser are closed even when a step fails.
    """
    browser = playwright.chromium.launch(headless=True)
    try:
        context = browser.new_context(accept_downloads=True)
        try:
            page = context.new_page()
            query_url = f"https://app.genebass.org/gene/{gene_id}?burdenSet=pLoF&phewasOpts=1&resultLayout=full"
            page.goto(query_url)
            page.click("label:has-text(\"Burden\")")
            # Randomized short pause between UI actions.
            page.wait_for_timeout(random.uniform(100, 1000))

            with page.expect_download() as download_info:
                page.click("text=Export data to CSV")
            download = download_info.value

            download_path = os.path.join(save_path, f"{gene}_genebass.csv")
            os.makedirs(os.path.dirname(download_path), exist_ok=True)
            download.save_as(path=download_path)
            page.wait_for_timeout(random.uniform(100, 1000))
        finally:
            context.close()
    finally:
        browser.close()
35
+
36
def clean_genebass_phewas(phewas_file):
    """
    Read a GeneBass PheWAS CSV and keep the ``"icd_first_occurrence"`` traits.

    Returns an independent copy of the filtered rows, so callers can add or
    overwrite columns without triggering pandas chained-assignment warnings.
    """
    df = pd.read_csv(phewas_file, sep=",")
    return df.loc[df["Trait type"] == "icd_first_occurrence"].copy()
40
+
41
def get_genebass_manhattan(gene, phewas_file):
    """
    Draw the GeneBass PheWAS Manhattan plot for ``gene``.

    Preparation applied to the cleaned table before plotting:

    - burden p-values are coerced to numbers; rows whose p-value is missing,
      non-positive or non-finite are dropped;
    - missing categories become ``"Unknown"``;
    - the ``"Health-related outcomes > First occurrences > "`` prefix is
      removed from categories, and known long names are replaced by short
      labels (unmapped categories keep their text).

    Returns ``(fig, plot_df)``; the axes object from ``manhattan`` is discarded.
    """
    df = clean_genebass_phewas(phewas_file)
    name_col = "Description"
    p_col = "P-Value (Burden)"
    beta_col = "Beta"
    cat_col = "Category"

    p_values = pd.to_numeric(df[p_col], errors="coerce")
    usable = p_values.notna() & (p_values > 0) & np.isfinite(p_values)
    # Copy so the column assignments below act on an independent frame.
    df = df.loc[usable].copy()
    df[p_col] = p_values.loc[usable]

    prefix = "Health-related outcomes > First occurrences > "
    # Fill missing values BEFORE casting to str: astype(str) turns NaN into the
    # literal "nan", after which fillna has nothing left to fill.
    df[cat_col] = (
        df[cat_col]
        .fillna("Unknown")
        .astype(str)
        .str.replace(prefix, "", regex=False)
    )
    GENEBASS_CATEGORY_SHORT = {
        "Blood, blood-forming organs and certain immune disorders": "Blood/Immune",
        "Certain conditions originating in the perinatal period": "Perinatal",
        "Certain infectious and parasitic diseases": "Infectious",
        "Circulatory system disorders": "Circulatory",
        "Congenital disruptions and chromosomal abnormalities": "Congenital",
        "Digestive system disorders": "Digestive",
        "Ear and mastoid process disorders": "Ear/Mastoid",
        "Endocrine, nutritional and metabolic diseases": "Endocrine/Metabolic",
        "Eye and adnexa disorders": "Eye",
        "Genitourinary system disorders": "Genitourinary",
        "Mental and behavioural disorders": "Mental/Behavioral",
        "Musculoskeletal system and connective tissue disorders": "Musculoskeletal",
        "Nervous system disorders": "Nervous System",
        "Pregnancy, childbirth and the puerperium": "Pregnancy/Childbirth",
        "Respiratory system disorders": "Respiratory",
        "Skin and subcutaneous tissue disorders": "Skin",
    }
    df[cat_col] = df[cat_col].map(GENEBASS_CATEGORY_SHORT).fillna(df[cat_col])

    fig, _ax, plot_df = manhattan(
        df,
        p_col=p_col,
        category_col=cat_col,
        label_col=name_col,
        beta_col=beta_col,
        sig_p=2.8e-4,
        top_k_labels=5,
        title=f"{gene} PheWAS GeneBass",
    )
    return fig, plot_df
85
+
86
def generate_genebass_clinical_report(doc, gene, phewas_dir, phewas_filename=""):
    """
    Append the GeneBass PheWAS figure and result tables for ``gene`` to ``doc``.

    The result file defaults to ``{gene}_genebass.csv`` inside ``phewas_dir``.
    If it is absent, the gene is resolved through the HGNC table (stored in
    ``phewas_dir``) and a download is attempted through Playwright. If the
    file still does not exist afterwards, only a "no results" paragraph is
    added.

    Raises
    ------
    ValueError
        Propagated from ``get_gene_info`` when ``gene`` is unknown to HGNC.

    Returns the same ``doc`` object.
    """
    if not phewas_filename:
        phewas_filename = f"{gene}_genebass.csv"
    phewas_file = os.path.join(phewas_dir, phewas_filename)

    if not os.path.exists(phewas_file):
        hgnc_path = os.path.join(phewas_dir, "gene_with_protein_product.txt")
        approved_symbol, ensembl_id, entrez_id = get_gene_info(gene, hgnc_path)
        with sync_playwright() as playwright:
            # Name the download after the caller's symbol, not the approved
            # one: the lookup below uses ``{gene}_genebass.csv``, so an alias
            # symbol would otherwise never find its own fresh download.
            run(playwright, gene, ensembl_id, save_path=phewas_dir)

    if not os.path.exists(phewas_file):
        doc.add_paragraph(f"No GeneBass PheWAS results found for {gene}.")
        doc.add_paragraph()  # spacing
        return doc

    fig, plot_df = get_genebass_manhattan(gene, phewas_file)
    fig_path = uth._save_fig_to_tmp(fig, basename="genebass_phewas", dpi=300)
    doc.add_paragraph()  # spacing
    doc.add_picture(fig_path, width=Inches(6))
    uth._add_caption(doc, f"Figure X. {gene} PheWAS results from GeneBass.")
    doc.add_paragraph()  # spacing

    columns = ['Description', 'Category', 'Beta', 'P-Value (Burden)']
    display_columns = ['Description', 'Category', 'Beta', 'P-value']
    col_widths = (2.75, 2.0, 0.5, 0.75)

    # Table 1: the ten rows with the smallest burden p-values.
    display_df = plot_df.nsmallest(10, 'P-Value (Burden)')[columns].copy()
    display_df.columns = list(display_columns)
    uth._add_table_to_doc(
        doc,
        display_df,
        f"Table X. Top 10 significant associations for {gene}",
        list(display_columns),
        [Inches(w) for w in col_widths],
    )

    # Table 2: the ten smallest p-values among rows with beta < 0.
    negative_beta = plot_df[plot_df['Beta'] < 0].nsmallest(10, 'P-Value (Burden)')
    if len(negative_beta) > 0:
        display_df_neg = negative_beta[columns].copy()
        display_df_neg.columns = list(display_columns)
        uth._add_table_to_doc(
            doc,
            display_df_neg,
            f"Table X. Top 10 protective associations for {gene}",
            list(display_columns),
            [Inches(w) for w in col_widths],
        )

    doc.add_paragraph()  # spacing
    return doc
129
+
130
if __name__ == "__main__":
    # Manual run: build the GeneBass section for one gene and save it.
    from docx import Document

    gene = "INHBE"
    phewas_dir = "/home/dbanerjee/deepro/rwe/data/test/"
    doc = generate_genebass_clinical_report(Document(), gene, phewas_dir)
    doc.save("/home/dbanerjee/deepro/rwe/data/test/INHBE_genebass_report.docx")
137
+
rwe/clients/hgnc.py ADDED
@@ -0,0 +1,52 @@
1
+ import pandas as pd
2
+ import os
3
+ import requests
4
+
5
def load_data(hgnc_path):
    """
    Load the HGNC protein-coding gene table, downloading it on first use.

    If ``hgnc_path`` does not exist, the table is fetched from the public
    HGNC download bucket and cached at that path (the parent directory is
    created when missing). The file is then read as tab-separated text with
    every column kept as ``str``.

    Raises
    ------
    requests.RequestException
        If the download fails, times out or returns an HTTP error status.
    """
    if not os.path.exists(hgnc_path):
        url = 'https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/locus_types/gene_with_protein_product.txt'
        # A timeout keeps report generation from hanging on a stalled connection.
        resp = requests.get(url, timeout=60)
        resp.raise_for_status()
        os.makedirs(os.path.dirname(hgnc_path) or ".", exist_ok=True)
        with open(hgnc_path, 'wb') as f:
            f.write(resp.content)

    return pd.read_csv(hgnc_path, sep='\t', dtype=str)
18
+
19
+
20
def build_gene_symbol_map(hgnc_df):
    """
    Map all approved, alias, and previous symbols to the current HGNC symbol.

    Approved symbols always map to themselves. An alias or previous symbol of
    one gene that happens to equal another gene's approved symbol never
    overrides that approved entry. When two genes share the same alias, the
    later row in ``hgnc_df`` wins.

    Returns
    -------
    tuple
        ``(symbol_map, ensembl_map, entrez_map)`` where ``symbol_map`` maps any
        known symbol to the approved one, and the other two map approved
        symbols to the ``ensembl_gene_id`` and ``entrez_id`` columns.
    """
    approved_map = {}
    alias_map = {}
    alias_cols = [c for c in ('alias_symbol', 'prev_symbol') if c in hgnc_df.columns]

    for _, row in hgnc_df.iterrows():
        approved = row.get('symbol')
        if pd.isna(approved):
            continue
        approved_map[approved] = approved
        for col in alias_cols:
            if pd.notna(row[col]):
                # Multiple symbols are '|'-separated within one field.
                for alias in row[col].split('|'):
                    alias = alias.strip()
                    if alias:
                        alias_map[alias] = approved

    # Approved entries are merged last so they take precedence over aliases.
    symbol_map = {**alias_map, **approved_map}
    ensembl_map = dict(zip(hgnc_df['symbol'], hgnc_df['ensembl_gene_id']))
    entrez_map = dict(zip(hgnc_df['symbol'], hgnc_df['entrez_id']))
    return symbol_map, ensembl_map, entrez_map
37
+
38
def get_gene_info(gene, hgnc_path):
    """
    Resolve ``gene`` to its approved HGNC symbol and identifiers.

    Returns ``(approved_symbol, ensembl_id, entrez_id)``; either identifier is
    ``None`` when the approved symbol has no entry in the corresponding map.

    Raises
    ------
    ValueError
        If ``gene`` is not a known approved, alias or previous symbol.
    """
    hgnc_table = load_data(hgnc_path)
    symbol_map, ensembl_map, entrez_map = build_gene_symbol_map(hgnc_table)

    approved_symbol = symbol_map.get(gene)
    if not approved_symbol:
        raise ValueError(f"Gene symbol '{gene}' not found in HGNC data.")

    return (
        approved_symbol,
        ensembl_map.get(approved_symbol),
        entrez_map.get(approved_symbol),
    )
47
+
48
if __name__ == "__main__":
    # Manual check: resolve one gene and print its identifiers.
    hgnc_path = "/home/dbanerjee/deepro/rwe/data/hgnc/gene_with_protein_product.txt"
    gene_info = get_gene_info("ANGPTL3", hgnc_path)
    approved_symbol, ensembl_id, entrez_id = gene_info
    print(f"Approved symbol: {approved_symbol}, Ensembl ID: {ensembl_id}, Entrez ID: {entrez_id}")
52
+
rwe/generate_report.py ADDED
@@ -0,0 +1,66 @@
1
+ import os
2
+ import pandas as pd
3
+
4
+ import matplotlib
5
+ matplotlib.use("Agg") # important for batch/report generation (no display)
6
+
7
+ import rwe.utils.report as utr
8
+
9
+
10
+
11
+ # ---------- Main user-facing function ----------
12
+
13
def generate_rwe_report(
    gene: str,
    chrm: str,
    zygosity: str = "hetz",
    out_docx_path: str = "",
    logo_path: str = "",
    report_date: str = "",
):
    """
    Build the DOCX report for one gene and save it.

    Sections are produced in this order by ``rwe.utils.report``: title and
    contents, variant information and demographics, clinical records, labs
    and measurements, survey information, homozygous loss-of-function
    carriers.

    Parameters
    ----------
    gene : str
    chrm : str
    zygosity : str
        Forwarded to every section builder.
    out_docx_path : str
        Destination file; defaults to ``{gene}_RWE_report.docx`` in the
        current directory. Its directory is created if missing and is also
        handed to the section builders as the project directory.
    logo_path : optional str (path to a logo PNG/JPG)
    report_date : optional str (e.g. "January 26th, 2026")

    Returns
    -------
    str
        The path the document was saved to.
    """
    proj_dir = os.path.dirname(out_docx_path) or "."
    os.makedirs(proj_dir, exist_ok=True)

    # --- Title and Contents --- #
    doc = utr.generate_title_and_contents(gene, logo_path=logo_path, report_date=report_date)

    # --- Variant information and demographics --- #
    doc = utr.generate_variant_information_and_demographics(doc, chrm, gene, zygosity)

    # --- Remaining sections share one call signature; run them in order --- #
    section_builders = (
        "generate_clinical_records",
        "generate_labs_and_measurements",
        "generate_survey_information",
        "generate_homozygous_lof_carriers",
    )
    for builder_name in section_builders:
        build_section = getattr(utr, builder_name)
        doc = build_section(doc, chrm, gene, zygosity, proj_dir)

    # Save
    out_docx_path = out_docx_path or f"{gene}_RWE_report.docx"
    doc.save(out_docx_path)
    return out_docx_path
63
+
64
if __name__ == "__main__":
    # Placeholder entry point: nothing runs beyond this import.
    import pandas as pd
66
+
File without changes
File without changes