rwe 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rwe/__init__.py +0 -0
- rwe/assets/Arrowhead_Pharmaceuticals_logo.png +0 -0
- rwe/clients/__init__.py +0 -0
- rwe/clients/aou.py +211 -0
- rwe/clients/azn.py +186 -0
- rwe/clients/genebass.py +137 -0
- rwe/clients/hgnc.py +52 -0
- rwe/generate_report.py +66 -0
- rwe/parsers/__init__.py +0 -0
- rwe/parsers/aou/__init__.py +0 -0
- rwe/parsers/aou/collect.py +399 -0
- rwe/parsers/aou/config.py +618 -0
- rwe/parsers/aou/process.py +74 -0
- rwe/plots/__init__.py +0 -0
- rwe/plots/clinical.py +214 -0
- rwe/plots/demographics.py +205 -0
- rwe/plots/measurements.py +89 -0
- rwe/plots/surveys.py +193 -0
- rwe/plots/variant_info.py +163 -0
- rwe/utils/__init__.py +0 -0
- rwe/utils/helpers.py +140 -0
- rwe/utils/report.py +137 -0
- rwe-0.0.1.dist-info/METADATA +182 -0
- rwe-0.0.1.dist-info/RECORD +27 -0
- rwe-0.0.1.dist-info/WHEEL +5 -0
- rwe-0.0.1.dist-info/licenses/LICENSE +21 -0
- rwe-0.0.1.dist-info/top_level.txt +1 -0
rwe/__init__.py
ADDED
|
File without changes
|
|
Binary file
|
rwe/clients/__init__.py
ADDED
|
File without changes
|
rwe/clients/aou.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
import multiprocessing as mp
|
|
6
|
+
from scipy import stats
|
|
7
|
+
from docx import Document
|
|
8
|
+
from docx.shared import Inches
|
|
9
|
+
|
|
10
|
+
import rwe.utils.helpers as uth
|
|
11
|
+
from rwe.plots.variant_info import create_variant_frequency_plots
|
|
12
|
+
from rwe.plots.demographics import demographics_plot
|
|
13
|
+
from rwe.plots.clinical import manhattan
|
|
14
|
+
from rwe.plots.measurements import plot_measurements
|
|
15
|
+
from rwe.plots.surveys import plot_survey_questions
|
|
16
|
+
|
|
17
|
+
############### Variant Info and Demographics ###############
|
|
18
|
+
def get_individual_info(bucket, chrm, gene, zygosity="hetz"):
    """Load the person table and a gene's pLoF carrier table, labelling carriers.

    Parameters
    ----------
    bucket : str
        Root location that holds the ``data/rwe_info`` tree.
    chrm : str
        Chromosome label used in the ``chr{chrm}`` path component.
    gene : str
        Gene directory name under the chromosome directory.
    zygosity : str
        Selects the ``lof_{zygosity}.csv.gz`` file and its
        ``{zygosity}_samples`` column.

    Returns
    -------
    tuple
        ``(person_df, variant_df)``. ``person_df`` gains a boolean ``cases``
        column (person_id present among the carrier samples) and a ``group``
        column holding ``"Cases"`` or ``"Controls"``.
    """
    people = pd.read_csv(f"{bucket}/data/rwe_info/raw/person.csv.gz")
    carriers = pd.read_csv(f"{bucket}/data/rwe_info/genes/chr{chrm}/{gene}/lof_{zygosity}.csv.gz")

    carrier_ids = carriers[f"{zygosity}_samples"]
    people["cases"] = people["person_id"].isin(carrier_ids)
    people["group"] = np.where(people["cases"], "Cases", "Controls")
    return people, carriers
|
|
24
|
+
|
|
25
|
+
def generate_aou_variant_info_demographics_report(doc: Document, chrm: str, gene: str, zygosity: str ="hetz") -> Document:
    """Append the variant-information and demographics figures to ``doc``.

    Loads the person and carrier tables via ``get_individual_info`` using the
    configured ``BUCKET``. When either table is empty, a notice paragraph and
    a page break are added instead of figures.

    Returns the same ``doc`` object that was passed in.
    """
    from rwe.parsers.aou.config import BUCKET

    person_df, variant_df = get_individual_info(BUCKET, chrm, gene, zygosity)
    if person_df.empty or variant_df.empty:
        doc.add_paragraph("No variant or individual level data provided to generate figures.")
        doc.add_page_break()
        return doc

    # (figure builder, input table, temp-file basename, caption), in document order.
    figure_specs = (
        (
            create_variant_frequency_plots,
            variant_df,
            "variant_information",
            "Figure 1. Variant information summary (carrier counts by consequence; most/least frequent variants).",
        ),
        (
            demographics_plot,
            person_df,
            "demographics",
            "Figure 2. Demographics of cases vs controls (age, ancestry, sex at birth, ethnicity).",
        ),
    )
    for build_figure, table, basename, caption in figure_specs:
        figure, _axes = build_figure(table)
        image_path = uth._save_fig_to_tmp(figure, basename=basename, dpi=300)
        doc.add_paragraph()  # spacing
        doc.add_picture(image_path, width=Inches(6.5))
        uth._add_caption(doc, caption)
    return doc
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
############### Clinical Records ###############
|
|
48
|
+
def clean_aou_phewas(phewas_file):
    """Read a tab-separated PheWAS table and keep converged all-ancestry rows.

    Rows are retained when ``ancestry == "all"`` and ``converged`` is True.
    """
    results = pd.read_csv(phewas_file, sep="\t")
    all_ancestry = results["ancestry"] == "all"
    did_converge = results["converged"] == True  # noqa: E712 - element-wise comparison
    return results.loc[all_ancestry & did_converge]
|
|
52
|
+
|
|
53
|
+
def get_aou_manhattan(df, gene):
    """Draw the All of Us PheWAS Manhattan plot for ``gene``.

    Delegates to ``manhattan`` with the AoU column names, a significance
    threshold of 2.8e-4 and labels on the top 5 hits.

    Returns ``(fig, plot_df)``; the axes object returned by ``manhattan`` is
    discarded.
    """
    aou_columns = {
        "p_col": "p_value",
        "category_col": "phecode_category",
        "label_col": "phecode_string",
        "odds_ratio_col": "odds_ratio",
    }
    fig, _ax, plot_df = manhattan(
        df,
        sig_p=2.8e-4,
        top_k_labels=5,
        title=f"{gene} PheWAS AoU",
        **aou_columns,
    )
    return fig, plot_df
|
|
70
|
+
|
|
71
|
+
def generate_aou_clinical_report(doc, chrm, gene, zygosity):
    """Append the All of Us PheWAS figure and top-hit tables to ``doc``.

    The results file path is built from the configured ``BUCKET``. If
    ``uth._gcs_size`` does not report a positive size for it, only a
    "no results" paragraph is written. ``zygosity`` is accepted but not used
    in this function.

    Returns the same ``doc`` object that was passed in.
    """
    from rwe.parsers.aou.config import BUCKET

    phewas_file = f"{BUCKET}/data/phewas/results/{chrm}/{gene}_phewas.csv"
    if not (uth._gcs_size(phewas_file, BUCKET) > 0):
        doc.add_paragraph(f"No AoU PheWAS results found for {gene}.")
        doc.add_paragraph()  # spacing
        return doc

    fig, plot_df = get_aou_manhattan(clean_aou_phewas(phewas_file), gene)
    fig_path = uth._save_fig_to_tmp(fig, basename="aou_phewas", dpi=300)
    doc.add_paragraph()  # spacing
    doc.add_picture(fig_path, width=Inches(6.5))
    uth._add_caption(doc, f"Figure X. {gene} PheWAS results from All of Us.")
    doc.add_paragraph()  # spacing

    source_columns = ['phecode_string', 'phecode_category', 'odds_ratio', 'p_value']

    def _write_table(rows, title):
        # Project the hits onto the display columns and hand them to the table helper.
        table_df = rows[source_columns].copy()
        table_df.columns = ['Phecode', 'Category', 'OR', 'P-value']
        uth._add_table_to_doc(
            doc,
            table_df,
            title,
            ['Phecode', 'Category', 'OR', 'P-value'],
            [Inches(width) for width in (2.75, 1.75, 0.75, 0.75)],
        )

    # Table 1: the 10 smallest p-values overall.
    _write_table(
        plot_df.nsmallest(10, 'p_value'),
        f"Table X. Top 10 significant associations for {gene}",
    )

    # Table 2: the 10 smallest p-values among protective hits (odds ratio < 1).
    protective_hits = plot_df[plot_df['odds_ratio'] < 1].nsmallest(10, 'p_value')
    if len(protective_hits) > 0:
        _write_table(
            protective_hits,
            f"Table X. Top 10 protective associations for {gene}",
        )

    doc.add_paragraph()  # spacing
    return doc
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
############### Labs and Measurements ###############
|
|
110
|
+
def remove_outliers_iqr(df, column, multiplier=5):
    """
    Mask outliers in ``df[column]`` using an IQR fence computed on log1p values.

    The column is cast to float and transformed with ``log1p``; values whose
    transformed value lies outside ``[Q1 - multiplier*IQR, Q3 + multiplier*IQR]``
    are replaced by ``pd.NA``. The result is an array with one entry per input
    row (no rows are dropped here) holding the original, untransformed values.
    """
    raw_values = df[column]
    logged = np.log1p(raw_values.astype(float))
    first_quartile, third_quartile = logged.quantile(0.25), logged.quantile(0.75)
    spread = third_quartile - first_quartile
    within_fence = logged.between(
        first_quartile - multiplier * spread,
        third_quartile + multiplier * spread,
    )
    return np.where(within_fence, raw_values, pd.NA)
|
|
121
|
+
|
|
122
|
+
def clean_measurement(df, col="median_value", multiplier=5):
    """Drop negative and outlying values of ``col`` and return the remaining rows.

    Rows with ``col < 0`` (or missing) are removed first; the remaining values
    are passed through ``remove_outliers_iqr`` and rows it masked are dropped.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        Column holding the measurement value.
    multiplier : float
        IQR multiplier forwarded to ``remove_outliers_iqr``.
    """
    # Filter first, then take an owned copy: assigning a column on a filtered
    # slice is what triggers pandas' SettingWithCopyWarning, and copying the
    # full input before filtering duplicates rows that are discarded anyway.
    kept = df.loc[df[col] >= 0].copy()
    kept[col] = remove_outliers_iqr(kept, col, multiplier)
    return kept.dropna(subset=[col])
|
|
127
|
+
|
|
128
|
+
def compare(df, measurement):
    """Compare one measurement between cases and controls with a Mann-Whitney U test.

    Rows of ``df`` whose ``measurement`` equals ``measurement`` are split on
    the boolean ``cases`` column and de-duplicated on
    (``person_id``, ``measurement_concept_id``). Controls are additionally
    passed through ``clean_measurement``; cases are used as-is.

    Returns
    -------
    tuple
        Always 11 elements, in the column order used by ``run_parallel``:
        ``(measurement, u_stat, p_mwu, n_cases, n_ctrls, q1_case, median_case,
        q3_case, q1_ctrl, median_ctrl, q3_ctrl)``. When either group has fewer
        than 20 numeric values, the statistic, p-value and all six quantiles
        are NaN.
    """
    dedup_keys = ["person_id", "measurement_concept_id"]
    mdf = df.loc[df.measurement == measurement].copy()
    ctrls = mdf.loc[mdf.cases == False].drop_duplicates(dedup_keys)  # noqa: E712
    cases = mdf.loc[mdf.cases == True].drop_duplicates(dedup_keys)  # noqa: E712
    ctrls = clean_measurement(ctrls)

    x = pd.to_numeric(cases["median_value"], errors="coerce").dropna().to_numpy()
    y = pd.to_numeric(ctrls["median_value"], errors="coerce").dropna().to_numpy()
    n_cases, n_ctrls = int(len(cases)), int(len(ctrls))

    # Too few observations in either group for a meaningful test. The
    # placeholder row must have the same width as a full result, otherwise
    # the frame assembled by run_parallel is ragged or fails to build.
    if x.size < 20 or y.size < 20:
        return (measurement, np.nan, np.nan, n_cases, n_ctrls) + (np.nan,) * 6

    caq1, caq2, caq3 = pd.Series(x).quantile([0.25, 0.5, 0.75])
    ctq1, ctq2, ctq3 = pd.Series(y).quantile([0.25, 0.5, 0.75])
    u_stat, p_mwu = stats.mannwhitneyu(x, y, alternative="two-sided")
    return (
        measurement,
        float(u_stat),
        float(p_mwu),
        n_cases,
        n_ctrls,
        caq1,
        caq2,
        caq3,
        ctq1,
        ctq2,
        ctq3,
    )
|
|
142
|
+
|
|
143
|
+
# Per-process reference to the measurements table; set by ``_init_worker``.
_G_DF = None


def _init_worker(df):
    """Pool initializer: store ``df`` in this process's module-level ``_G_DF``."""
    globals()["_G_DF"] = df


def _worker_compare(measurement):
    """Run ``compare`` for one measurement against the table stored in ``_G_DF``."""
    shared_frame = _G_DF  # set once per process by _init_worker
    return compare(shared_frame, measurement)
|
|
152
|
+
|
|
153
|
+
def run_parallel(df, measurements=None, n_jobs=None, chunksize=10):
    """Run ``compare`` for many measurements in a multiprocessing pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Table handed to every worker through the pool initializer.
    measurements : list, optional
        Measurement names to test; defaults to the sorted unique non-null
        values of ``df["measurement"]``.
    n_jobs : int, optional
        Number of worker processes; defaults to ``cpu_count() - 1`` (at least 1).
    chunksize : int
        Chunk size forwarded to ``imap_unordered``.

    Returns
    -------
    pandas.DataFrame
        One row per measurement, in completion order (unordered).
    """
    result_columns = [
        "measurement", "u_stat", "p_mwu", "n_cases", "n_ctrls",
        "q1_case", "median_case", "q3_case",
        "q1_ctrl", "median_ctrl", "q3_ctrl",
    ]
    names = measurements
    if names is None:
        names = sorted(df["measurement"].dropna().unique().tolist())
    workers = n_jobs
    if workers is None:
        workers = max(1, mp.cpu_count() - 1)

    with mp.Pool(processes=workers, initializer=_init_worker, initargs=(df,)) as pool:
        finished = pool.imap_unordered(_worker_compare, names, chunksize=chunksize)
        progress = tqdm(finished, total=len(names), desc="Processing measurements")
        rows = [row for row in progress]

    return pd.DataFrame(rows, columns=result_columns)
|
|
170
|
+
|
|
171
|
+
def generate_aou_labs_measurements_report(doc: Document, chrm: str, gene: str, zygosity: str) -> Document:
    """Append one measurement figure per configured measurement group to ``doc``.

    Reads the processed numerical-measurements table and the gene's carrier
    table from the configured ``BUCKET``, flags carriers in a ``cases``
    column, runs ``run_parallel`` over every measurement, then plots each
    entry of ``MEASUREMENT_GROUPS``.

    Returns the same ``doc`` object that was passed in.
    """
    from rwe.parsers.aou.config import MEASUREMENT_GROUPS, BUCKET

    measurements_df = pd.read_parquet(f"{BUCKET}/data/rwe_info/processed/selected_numerical_measurements.parquet")
    carriers_df = pd.read_csv(f"{BUCKET}/data/rwe_info/genes/chr{chrm}/{gene}/lof_{zygosity}.csv.gz")
    measurements_df["cases"] = measurements_df["person_id"].isin(carriers_df[f"{zygosity}_samples"])

    all_measurements = measurements_df["measurement"].unique().tolist()
    res_df = run_parallel(measurements_df, measurements=all_measurements, n_jobs=None, chunksize=20)

    # TODO: add most significant measurements table to doc
    for group_name, group_members in MEASUREMENT_GROUPS.items():
        figure, _axes, _plotted = plot_measurements(
            measurements_df, group_members, multiplier=5, col="median_value", res_df=res_df
        )
        image_path = uth._save_fig_to_tmp(figure, basename=f"aou_measurements_{group_name}", dpi=300)
        doc.add_paragraph()  # spacing
        doc.add_picture(image_path, width=Inches(6.5))
        uth._add_caption(doc, f"Figure X. {gene} pLoF carrier {group_name} results from All of Us.")
        doc.add_paragraph()  # spacing
    return doc
|
|
187
|
+
|
|
188
|
+
############### Surveys ###############
|
|
189
|
+
def clean_aou_surveys(df, survey_col="survey", question_col="question", answer_col="answer_category", zygosity="hetz"):
    """Strip redundant ``"<survey>: "`` / ``"<question>: "`` prefixes from survey text.

    For each row, a leading ``"<survey>: "`` is removed from the question,
    and then a leading ``"<question>: "`` (using the already-cleaned question)
    is removed from the answer. Prefixes are matched literally, so survey or
    question text containing regex metacharacters such as ``?`` or ``(`` is
    handled correctly. Non-string cells (e.g. NaN) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey table; it is copied, not modified.
    survey_col, question_col, answer_col : str
        Column names for survey, question and answer text.
    zygosity : str
        Accepted for interface compatibility; not used here.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the question and answer columns cleaned.
    """

    def _strip_prefix(text, prefix):
        # Remove "<prefix>: " from the start of text when both are strings.
        if isinstance(text, str) and isinstance(prefix, str):
            lead = prefix + ": "
            if text.startswith(lead):
                return text[len(lead):]
        return text

    df = df.copy()
    # Series.str.replace only accepts a single pattern, so the per-row prefix
    # has to be applied element-wise.
    df[question_col] = [
        _strip_prefix(question, survey)
        for question, survey in zip(df[question_col], df[survey_col])
    ]
    df[answer_col] = [
        _strip_prefix(answer, question)
        for answer, question in zip(df[answer_col], df[question_col])
    ]
    return df
|
|
194
|
+
|
|
195
|
+
def generate_aou_survey_report(doc: Document, chrm: str, gene: str, zygosity: str) -> Document:
    """Append the survey-answers figure for ``gene`` carriers to ``doc``.

    Reads the processed survey table and the gene's carrier table from the
    configured ``BUCKET``, flags carriers in a ``cases`` column, cleans the
    text with ``clean_aou_surveys`` and plots with ``plot_survey_questions``.

    Returns the same ``doc`` object that was passed in.
    """
    from rwe.parsers.aou.config import BUCKET

    surveys = pd.read_parquet(f"{BUCKET}/data/rwe_info/processed/selected_surveys.parquet")
    carriers = pd.read_csv(f"{BUCKET}/data/rwe_info/genes/chr{chrm}/{gene}/lof_{zygosity}.csv.gz")
    surveys["cases"] = surveys["person_id"].isin(carriers[f"{zygosity}_samples"])
    surveys = clean_aou_surveys(surveys, zygosity=zygosity)

    figure, _axes = plot_survey_questions(surveys)
    image_path = uth._save_fig_to_tmp(figure, basename="aou_surveys", dpi=300)

    doc.add_paragraph()  # spacing
    doc.add_picture(image_path, width=Inches(6.5))
    uth._add_caption(doc, f"Figure X. {gene} pLoF carrier survey results from All of Us.")
    doc.add_paragraph()  # spacing
    return doc
|
|
208
|
+
|
|
209
|
+
if __name__ == "__main__":
|
|
210
|
+
from docx import Document
|
|
211
|
+
doc = Document()
|
rwe/clients/azn.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import zipfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from playwright.sync_api import Playwright, sync_playwright
|
|
6
|
+
from rwe.plots.clinical import manhattan
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
import rwe.utils.helpers as uth
|
|
10
|
+
from docx.shared import Inches
|
|
11
|
+
|
|
12
|
+
def extract_only_csv_rename(zip_path: str, gene: str) -> str:
    """
    Extract the main CSV from ``zip_path`` and store it next to the zip as
    ``{gene}_azphewas.csv``.

    The archive is unpacked into a scratch directory created inside the zip's
    parent directory, so only files that came out of the archive are
    considered and nothing else in the parent directory is touched. If the
    archive holds several CSVs, the largest is kept. The scratch directory
    and the zip itself are removed afterwards; an existing target CSV is
    overwritten.

    Returns the final CSV path as a string.

    Raises
    ------
    FileNotFoundError
        If the archive contains no ``*.csv`` file (the zip is left in place).
    """
    import tempfile  # local import: only this function needs it

    zip_path = Path(zip_path)
    gene_dir = zip_path.parent
    target_csv = gene_dir / f"{gene}_azphewas.csv"

    # Scratch dir lives inside gene_dir so the final rename stays on one filesystem.
    with tempfile.TemporaryDirectory(dir=gene_dir, prefix=".azphewas_") as scratch:
        scratch_dir = Path(scratch)
        with zipfile.ZipFile(zip_path, "r") as z:
            z.extractall(scratch_dir)

        csvs = [p for p in scratch_dir.rglob("*.csv") if p.is_file()]
        if not csvs:
            raise FileNotFoundError(f"No CSV found after extracting {zip_path}")

        # If multiple CSVs, pick the largest (usually the main table export).
        chosen = max(csvs, key=lambda p: p.stat().st_size)

        # Path.replace overwrites an existing target in one step.
        chosen.replace(target_csv)

    zip_path.unlink()
    return str(target_csv)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def run(playwright: Playwright, gene: str, save_path: str) -> None:
    """Download the AZ PheWAS export for ``gene`` and unpack it into ``save_path``.

    Opens the gene view in headless Chromium, opens the export dialog,
    unchecks the plot export toggle, triggers the download, saves the archive
    under its suggested filename in ``save_path`` and passes it to
    ``extract_only_csv_rename``. The browser context and browser are closed
    even if a step fails.

    Parameters
    ----------
    playwright : Playwright
        Active sync Playwright instance.
    gene : str
        Gene symbol used in the URL and in the final CSV name.
    save_path : str
        Directory for the download; created if missing.
    """
    browser = playwright.chromium.launch(headless=True)
    try:
        context = browser.new_context(accept_downloads=True)
        try:
            page = context.new_page()

            url = f"https://azphewas.com/geneView/6319c068-fd59-46d8-85ee-82d82482eb14/{gene}/glr/binary"
            page.goto(url, wait_until="networkidle")

            export_control = page.locator('[data-testid="export-settings-control"] button:has-text("Export")')
            export_control.click()

            inner_export_btn = page.get_by_test_id("export_button")
            inner_export_btn.wait_for(state="visible")

            plot_checkbox = page.locator('[data-testid="plot-export-type-toggler"] input[type="checkbox"]')
            plot_checkbox.set_checked(False, force=True)
            page.wait_for_timeout(300)

            os.makedirs(save_path, exist_ok=True)

            with page.expect_download(timeout=120_000) as dl_info:
                inner_export_btn.click(force=True)

            download = dl_info.value
            out_path = os.path.join(save_path, download.suggested_filename)
            download.save_as(out_path)
            print("Saved zip:", out_path)

            final_csv = extract_only_csv_rename(out_path, gene=gene)
            print("Final CSV:", final_csv)
        finally:
            context.close()
    finally:
        browser.close()
|
|
89
|
+
|
|
90
|
+
def clean_azn_phewas(phewas_file):
    """Read the comma-separated AZ PheWAS export and keep the ``ptv`` collapsing model rows."""
    table = pd.read_csv(phewas_file, sep=",")
    is_ptv = table["Collapsing model"] == "ptv"
    return table.loc[is_ptv]
|
|
94
|
+
|
|
95
|
+
def get_azn_manhattan(gene, phewas_file):
    """Draw the AstraZeneca PheWAS Manhattan plot for ``gene``.

    Loads the ``ptv`` rows via ``clean_azn_phewas``, keeps only the text after
    the last ``#`` in each phenotype name, shortens the ICD chapter names
    (chapters not in the lookup become ``"Other"``) and delegates to
    ``manhattan`` with a 2.8e-4 threshold and labels on the top 5 hits.

    Returns ``(fig, plot_df)``; the axes object returned by ``manhattan`` is
    discarded.
    """
    # Work on an owned copy: clean_azn_phewas returns a filtered slice, and
    # assigning columns on a slice triggers pandas' SettingWithCopyWarning.
    df = clean_azn_phewas(phewas_file).copy()
    name_col = "Phenotype"
    p_col = "P value"
    odds_ratio_col = "Odds ratio"
    cat_col = "Phenotypic category"
    AZN_CHAPTER_SHORT = {
        "Chapter I Certain infectious and parasitic diseases": "Infectious",
        "Chapter II Neoplasms": "Neoplasms",
        "Chapter III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism": "Blood/Immune",
        "Chapter IV Endocrine nutritional and metabolic diseases": "Endocrine/Metabolic",
        "Chapter V Mental and behavioural disorders": "Mental/Behavioral",
        "Chapter VI Diseases of the nervous system": "Nervous System",
        "Chapter VII Diseases of the eye and adnexa": "Eye",
        "Chapter VIII Diseases of the ear and mastoid process": "Ear/Mastoid",
        "Chapter IX Diseases of the circulatory system": "Circulatory",
        "Chapter X Diseases of the respiratory system": "Respiratory",
        "Chapter XI Diseases of the digestive system": "Digestive",
        "Chapter XII Diseases of the skin and subcutaneous tissue": "Skin",
        "Chapter XIII Diseases of the musculoskeletal system and connective tissue": "Musculoskeletal",
        "Chapter XIV Diseases of the genitourinary system": "Genitourinary",
        "Chapter XV Pregnancy childbirth and the puerperium": "Pregnancy/Childbirth",
        "Chapter XVII Congenital malformations deformations and chromosomal abnormalities": "Congenital",
        "Chapter XVIII Symptoms signs and abnormal clinical and laboratory findings not elsewhere classified": "Symptoms/Findings",
        "Chapter XXI Factors influencing health status and contact with health services": "Health Services",
    }

    # Keep only the text after the last '#' in the phenotype name.
    df[name_col] = df[name_col].astype(str).str.rsplit("#", n=1).str[-1]
    df[cat_col] = df[cat_col].map(AZN_CHAPTER_SHORT).fillna("Other")

    fig, _ax, plot_df = manhattan(
        df,
        p_col=p_col,
        category_col=cat_col,
        label_col=name_col,
        odds_ratio_col=odds_ratio_col,
        sig_p=2.8e-4,
        top_k_labels=5,
        title=f"{gene} PheWAS AstraZeneca",
    )
    return fig, plot_df
|
|
136
|
+
|
|
137
|
+
def generate_azn_clinical_report(doc, gene, phewas_dir, phewas_filename=""):
    """Append the AstraZeneca PheWAS figure and top-hit tables to ``doc``.

    Looks for ``phewas_filename`` (default ``{gene}_azphewas.csv``) inside
    ``phewas_dir``. If it is missing, a Playwright download is attempted via
    ``run``. If the file still does not exist afterwards, a "no results"
    paragraph is written instead.

    Returns the same ``doc`` object that was passed in.
    """
    phewas_file = os.path.join(phewas_dir, phewas_filename or f"{gene}_azphewas.csv")

    if not os.path.exists(phewas_file):
        with sync_playwright() as playwright:
            run(playwright, gene=gene, save_path=phewas_dir)

    if not os.path.exists(phewas_file):
        doc.add_paragraph(f"No AstraZeneca PheWAS results found for {gene}.")
        doc.add_paragraph()  # spacing
        return doc

    fig, plot_df = get_azn_manhattan(gene, phewas_file)
    fig_path = uth._save_fig_to_tmp(fig, basename="azn_phewas", dpi=300)
    doc.add_paragraph()  # spacing
    doc.add_picture(fig_path, width=Inches(6.5))
    uth._add_caption(doc, f"Figure X. {gene} PheWAS results from AstraZeneca.")
    doc.add_paragraph()  # spacing

    source_columns = ['Phenotype', 'Phenotypic category', 'Odds ratio', 'P value']

    def _write_table(rows, title):
        # Project the hits onto the display columns and hand them to the table helper.
        table_df = rows[source_columns].copy()
        table_df.columns = ['Phenotype', 'Category', 'OR', 'P-value']
        uth._add_table_to_doc(
            doc,
            table_df,
            title,
            ['Phenotype', 'Category', 'OR', 'P-value'],
            [Inches(width) for width in (2.75, 1.75, 0.75, 0.75)],
        )

    # Table 1: the 10 smallest p-values overall.
    _write_table(
        plot_df.nsmallest(10, 'P value'),
        f"Table X. Top 10 significant associations for {gene}",
    )

    # Table 2: the 10 smallest p-values among protective hits (odds ratio < 1).
    protective_hits = plot_df[plot_df['Odds ratio'] < 1].nsmallest(10, 'P value')
    if len(protective_hits) > 0:
        _write_table(
            protective_hits,
            f"Table X. Top 10 protective associations for {gene}",
        )

    doc.add_paragraph()  # spacing
    return doc
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
if __name__ == "__main__":
|
|
180
|
+
from docx import Document
|
|
181
|
+
doc = Document()
|
|
182
|
+
gene = "INHBE"
|
|
183
|
+
phewas_dir = "/home/dbanerjee/deepro/rwe/data/test/"
|
|
184
|
+
doc = generate_azn_clinical_report(doc, gene, phewas_dir)
|
|
185
|
+
doc.save("/home/dbanerjee/deepro/rwe/data/test/INHBE_azn_report.docx")
|
|
186
|
+
|
rwe/clients/genebass.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import numpy as np
|
|
3
|
+
from playwright.sync_api import Playwright, sync_playwright
|
|
4
|
+
import random
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from docx.shared import Inches, Pt
|
|
7
|
+
|
|
8
|
+
from rwe.plots.clinical import manhattan
|
|
9
|
+
import rwe.utils.helpers as uth
|
|
10
|
+
from rwe.clients.hgnc import get_gene_info
|
|
11
|
+
|
|
12
|
+
def run(playwright: Playwright, gene:str, gene_id:str, save_path:str) -> None:
    """Download the GeneBass pLoF burden PheWAS CSV for one gene.

    Opens the gene page for ``gene_id`` in headless Chromium, selects the
    "Burden" option, clicks "Export data to CSV" and saves the download as
    ``{gene}_genebass.csv`` inside ``save_path``. The browser context and
    browser are closed even if a step fails.

    Parameters
    ----------
    playwright : Playwright
        Active sync Playwright instance.
    gene : str
        Label used only for the output filename.
    gene_id : str
        Identifier placed in the GeneBass gene URL.
    save_path : str
        Output directory; created if missing.
    """
    browser = playwright.chromium.launch(headless=True)
    try:
        context = browser.new_context(accept_downloads=True)
        try:
            page = context.new_page()
            query_id = "https://app.genebass.org/gene/" + gene_id +"?burdenSet=pLoF&phewasOpts=1&resultLayout=full"
            page.goto(query_id)
            page.click("label:has-text(\"Burden\")")
            # Short randomized pauses around the export click.
            page.wait_for_timeout(random.uniform(100, 1000))

            with page.expect_download() as download_info:
                page.click("text=Export data to CSV")
            download = download_info.value

            download_path = os.path.join(save_path, f"{gene}_genebass.csv")
            # dirname is "" when save_path is empty; makedirs("") would raise.
            os.makedirs(os.path.dirname(download_path) or ".", exist_ok=True)
            download.save_as(path=download_path)
            page.wait_for_timeout(random.uniform(100, 1000))
        finally:
            context.close()
    finally:
        browser.close()
|
|
35
|
+
|
|
36
|
+
def clean_genebass_phewas(phewas_file):
    """Read the comma-separated GeneBass export and keep ``icd_first_occurrence`` traits."""
    table = pd.read_csv(phewas_file, sep=",")
    first_occurrence = table["Trait type"] == "icd_first_occurrence"
    return table.loc[first_occurrence]
|
|
40
|
+
|
|
41
|
+
def get_genebass_manhattan(gene, phewas_file):
    """Draw the GeneBass PheWAS Manhattan plot for ``gene``.

    Loads first-occurrence traits via ``clean_genebass_phewas``, keeps rows
    with a finite, strictly positive burden p-value, strips the common
    category prefix, shortens known category names (unknown ones are kept
    as-is, missing ones become ``"Unknown"``) and delegates to ``manhattan``
    using the ``Beta`` column for effect direction.

    Returns ``(fig, plot_df)``; the axes object returned by ``manhattan`` is
    discarded.
    """
    name_col = "Description"
    p_col = "P-Value (Burden)"
    beta_col = "Beta"
    cat_col = "Category"

    # Own the data before mutating it: the loader returns a filtered slice.
    df = clean_genebass_phewas(phewas_file).copy()
    df[p_col] = pd.to_numeric(df[p_col], errors="coerce")
    df = df[df[p_col].notna() & (df[p_col] > 0) & np.isfinite(df[p_col])].copy()

    prefix = "Health-related outcomes > First occurrences > "
    # Fill missing categories BEFORE casting to str; after astype(str) a
    # missing value is the literal text "nan" and fillna no longer applies.
    df[cat_col] = (df[cat_col].fillna("Unknown")
                   .astype(str)
                   .str.replace(prefix, "", regex=False))
    GENEBASS_CATEGORY_SHORT = {
        "Blood, blood-forming organs and certain immune disorders": "Blood/Immune",
        "Certain conditions originating in the perinatal period": "Perinatal",
        "Certain infectious and parasitic diseases": "Infectious",
        "Circulatory system disorders": "Circulatory",
        "Congenital disruptions and chromosomal abnormalities": "Congenital",
        "Digestive system disorders": "Digestive",
        "Ear and mastoid process disorders": "Ear/Mastoid",
        "Endocrine, nutritional and metabolic diseases": "Endocrine/Metabolic",
        "Eye and adnexa disorders": "Eye",
        "Genitourinary system disorders": "Genitourinary",
        "Mental and behavioural disorders": "Mental/Behavioral",
        "Musculoskeletal system and connective tissue disorders": "Musculoskeletal",
        "Nervous system disorders": "Nervous System",
        "Pregnancy, childbirth and the puerperium": "Pregnancy/Childbirth",
        "Respiratory system disorders": "Respiratory",
        "Skin and subcutaneous tissue disorders": "Skin",
    }
    # Categories without a short name keep their (prefix-stripped) original text.
    df[cat_col] = df[cat_col].map(GENEBASS_CATEGORY_SHORT).fillna(df[cat_col])
    fig, _ax, plot_df = manhattan(
        df,
        p_col=p_col,
        category_col=cat_col,
        label_col=name_col,
        beta_col=beta_col,
        sig_p=2.8e-4,
        top_k_labels=5,
        title=f"{gene} PheWAS GeneBass",
    )
    return fig, plot_df
|
|
85
|
+
|
|
86
|
+
def generate_genebass_clinical_report(doc, gene, phewas_dir, phewas_filename=""):
    """Append the GeneBass PheWAS figure and top-hit tables to ``doc``.

    Looks for ``phewas_filename`` (default ``{gene}_genebass.csv``) inside
    ``phewas_dir``. If it is missing, the gene is resolved through
    ``get_gene_info`` and a Playwright download is attempted via ``run``. The
    download is then moved to the expected path if it landed under a
    different name (``run`` names the file after the approved symbol, which
    differs from ``gene`` when ``gene`` is an alias). If no file is available
    afterwards, a "no results" paragraph is written.

    Returns the same ``doc`` object that was passed in.

    Raises
    ------
    ValueError
        Propagated from ``get_gene_info`` when ``gene`` is not a known symbol.
    """
    if not phewas_filename:
        phewas_filename = f"{gene}_genebass.csv"
    phewas_file = os.path.join(phewas_dir, phewas_filename)

    if not os.path.exists(phewas_file):
        hgnc_path = os.path.join(phewas_dir, "gene_with_protein_product.txt")
        approved_symbol, ensembl_id, entrez_id = get_gene_info(gene, hgnc_path)
        # Without an Ensembl ID there is no gene URL to open; skip the
        # download and fall through to the "no results" branch.
        if isinstance(ensembl_id, str) and ensembl_id:
            with sync_playwright() as playwright:
                run(playwright, approved_symbol, ensembl_id, save_path=phewas_dir)
            downloaded = os.path.join(phewas_dir, f"{approved_symbol}_genebass.csv")
            if downloaded != phewas_file and os.path.exists(downloaded):
                os.replace(downloaded, phewas_file)

    if os.path.exists(phewas_file):
        fig, plot_df = get_genebass_manhattan(gene, phewas_file)
        fig_path = uth._save_fig_to_tmp(fig, basename="genebass_phewas", dpi=300)
        doc.add_paragraph()  # spacing
        doc.add_picture(fig_path, width=Inches(6))
        uth._add_caption(doc, f"Figure X. {gene} PheWAS results from GeneBass.")
        doc.add_paragraph()  # spacing

        # Table 1: the 10 smallest burden p-values overall.
        top_significant = plot_df.nsmallest(10, 'P-Value (Burden)')
        columns = ['Description', 'Category', 'Beta', 'P-Value (Burden)']
        # Rename columns for display
        display_df = top_significant[columns].copy()
        display_df.columns = ['Description', 'Category', 'Beta', 'P-value']
        uth._add_table_to_doc(doc, display_df, f"Table X. Top 10 significant associations for {gene}",
                              ['Description', 'Category', 'Beta', 'P-value'], list(map(Inches, [2.75, 2.0, 0.5, 0.75]))
                              )

        # Table 2: the 10 smallest burden p-values among negative-beta hits.
        negative_beta = plot_df[plot_df['Beta'] < 0].nsmallest(10, 'P-Value (Burden)')
        if len(negative_beta) > 0:
            display_df_neg = negative_beta[columns].copy()
            display_df_neg.columns = ['Description', 'Category', 'Beta', 'P-value']
            uth._add_table_to_doc(doc, display_df_neg, f"Table X. Top 10 protective associations for {gene}",
                                  ['Description', 'Category', 'Beta', 'P-value'], list(map(Inches, [2.75, 2.0, 0.5, 0.75]))
                                  )

        doc.add_paragraph()  # spacing
    else:
        doc.add_paragraph(f"No GeneBass PheWAS results found for {gene}.")
        doc.add_paragraph()  # spacing
    return doc
|
|
129
|
+
|
|
130
|
+
if __name__ == "__main__":
|
|
131
|
+
from docx import Document
|
|
132
|
+
doc = Document()
|
|
133
|
+
gene = "INHBE"
|
|
134
|
+
phewas_dir = "/home/dbanerjee/deepro/rwe/data/test/"
|
|
135
|
+
doc = generate_genebass_clinical_report(doc, gene, phewas_dir)
|
|
136
|
+
doc.save("/home/dbanerjee/deepro/rwe/data/test/INHBE_genebass_report.docx")
|
|
137
|
+
|
rwe/clients/hgnc.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import os
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
def load_data(hgnc_path):
    """Load the HGNC protein-coding gene table, downloading it on first use.

    If ``hgnc_path`` does not exist, the table is fetched from the public
    HGNC download bucket and written to ``hgnc_path`` (creating its parent
    directory when needed) before being read.

    Parameters
    ----------
    hgnc_path : str
        Local path of the tab-separated HGNC file, used as a cache.

    Returns
    -------
    pandas.DataFrame
        The table with every column read as ``str``.

    Raises
    ------
    requests.HTTPError
        If the download returns an error status.
    """
    if not os.path.exists(hgnc_path):
        url = 'https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/locus_types/gene_with_protein_product.txt'
        # A timeout keeps report generation from hanging on a stalled connection.
        resp = requests.get(url, timeout=60)
        resp.raise_for_status()
        parent_dir = os.path.dirname(hgnc_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(hgnc_path, 'wb') as f:
            f.write(resp.content)

    return pd.read_csv(hgnc_path, sep='\t', dtype=str)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def build_gene_symbol_map(hgnc_df):
    """
    Map all approved, alias, and previous symbols to the current HGNC symbol.

    Approved symbols always map to themselves: an alias or previous symbol of
    one gene never overrides another gene's approved symbol, regardless of
    row order. When the same alias is listed for several genes, the last row
    listing it wins. Rows without an approved symbol are ignored for the
    symbol map.

    Returns
    -------
    tuple
        ``(symbol_map, ensembl_map, entrez_map)`` where ``symbol_map`` maps
        any known symbol to the approved symbol, and the other two map the
        ``symbol`` column to ``ensembl_gene_id`` and ``entrez_id``.
    """
    approved_map = {}
    alias_map = {}
    for _, row in hgnc_df.iterrows():
        approved = row.get('symbol')
        if pd.isna(approved):
            continue
        approved_map[approved] = approved
        for col in ['alias_symbol', 'prev_symbol']:
            if col in row and pd.notna(row[col]):
                # Multiple symbols are packed into one '|'-separated cell.
                for alias in row[col].split('|'):
                    alias_map[alias] = approved

    # Approved entries are merged last so they win over colliding aliases.
    symbol_map = {**alias_map, **approved_map}
    ensembl_map = dict(zip(hgnc_df['symbol'], hgnc_df['ensembl_gene_id']))
    entrez_map = dict(zip(hgnc_df['symbol'], hgnc_df['entrez_id']))
    return symbol_map, ensembl_map, entrez_map
|
|
37
|
+
|
|
38
|
+
def get_gene_info(gene, hgnc_path):
    """Resolve ``gene`` to its approved HGNC symbol, Ensembl ID and Entrez ID.

    The HGNC table is loaded from (or downloaded to) ``hgnc_path`` by
    ``load_data`` and indexed with ``build_gene_symbol_map``.

    Returns
    -------
    tuple
        ``(approved_symbol, ensembl_id, entrez_id)``; either ID is ``None``
        when the approved symbol has no entry in the corresponding map.

    Raises
    ------
    ValueError
        If ``gene`` does not resolve to an approved symbol.
    """
    symbol_map, ensembl_map, entrez_map = build_gene_symbol_map(load_data(hgnc_path))

    approved_symbol = symbol_map.get(gene)
    if not approved_symbol:
        raise ValueError(f"Gene symbol '{gene}' not found in HGNC data.")

    return (
        approved_symbol,
        ensembl_map.get(approved_symbol),
        entrez_map.get(approved_symbol),
    )
|
|
47
|
+
|
|
48
|
+
if __name__ == "__main__":
|
|
49
|
+
hgnc_path = "/home/dbanerjee/deepro/rwe/data/hgnc/gene_with_protein_product.txt"
|
|
50
|
+
approved_symbol, ensembl_id, entrez_id = get_gene_info("ANGPTL3", hgnc_path)
|
|
51
|
+
print(f"Approved symbol: {approved_symbol}, Ensembl ID: {ensembl_id}, Entrez ID: {entrez_id}")
|
|
52
|
+
|
rwe/generate_report.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
import matplotlib
|
|
5
|
+
matplotlib.use("Agg") # important for batch/report generation (no display)
|
|
6
|
+
|
|
7
|
+
import rwe.utils.report as utr
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ---------- Main user-facing function ----------
|
|
12
|
+
|
|
13
|
+
def generate_rwe_report(
    gene: str,
    chrm: str,
    zygosity: str = "hetz",
    out_docx_path: str = "",
    logo_path: str = "",
    report_date: str = "",
):
    """
    Build the RWE DOCX report for one gene and save it.

    The document is assembled in this order by helpers from
    ``rwe.utils.report``: title and contents, variant information and
    demographics, clinical records, labs and measurements, survey
    information, homozygous loss-of-function carriers.

    Parameters
    ----------
    gene : str
    chrm : str
    zygosity : str
        Forwarded to every section builder (default "hetz").
    out_docx_path : str
        Output file; defaults to "{gene}_RWE_report.docx" in the current
        directory. Its parent directory is created if missing and is passed
        to the section builders as the project directory.
    logo_path : optional str (path to Arrowhead logo PNG/JPG)
    report_date : optional str (e.g. "January 26th, 2026")

    Returns
    -------
    str
        The path the document was saved to.
    """
    target_path = out_docx_path if out_docx_path else f"{gene}_RWE_report.docx"
    proj_dir = os.path.dirname(out_docx_path) or "."
    os.makedirs(proj_dir, exist_ok=True)

    # --- Title and Contents, then variant information and demographics --- #
    doc = utr.generate_title_and_contents(gene, logo_path=logo_path, report_date=report_date)
    doc = utr.generate_variant_information_and_demographics(doc, chrm, gene, zygosity)

    # --- Remaining sections share one call signature; order is document order --- #
    section_builders = (
        utr.generate_clinical_records,
        utr.generate_labs_and_measurements,
        utr.generate_survey_information,
        utr.generate_homozygous_lof_carriers,
    )
    for build_section in section_builders:
        doc = build_section(doc, chrm, gene, zygosity, proj_dir)

    # Save
    doc.save(target_path)
    return target_path
|
|
63
|
+
|
|
64
|
+
if __name__ == "__main__":
|
|
65
|
+
import pandas as pd
|
|
66
|
+
|
rwe/parsers/__init__.py
ADDED
|
File without changes
|
|
File without changes
|