rwe 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rwe/clients/aou.py CHANGED
@@ -45,9 +45,13 @@ def generate_aou_variant_info_demographics_report(doc: Document, chrm: str, gene
45
45
 
46
46
 
47
47
  ############### Clinical Records ###############
48
- def clean_aou_phewas(phewas_file):
49
- df = pd.read_csv(phewas_file, sep="\t")
50
- df = df.loc[(df.ancestry == "all")&(df.converged==True)]
48
+ def clean_aou_phewas(phewas_file, version="1.2"):
49
+ if version == "1.2":
50
+ df = pd.read_csv(phewas_file)
51
+ df = df.loc[df.converged==True]
52
+ elif version =="X":
53
+ df = pd.read_csv(phewas_file, sep="\t")
54
+ df = df.loc[(df.ancestry == "all")&(df.converged==True)]
51
55
  return df
52
56
 
53
57
  def get_aou_manhattan(df, gene):
@@ -70,7 +74,7 @@ def get_aou_manhattan(df, gene):
70
74
 
71
75
  def generate_aou_clinical_report(doc, chrm, gene, zygosity):
72
76
  from rwe.parsers.aou.config import BUCKET
73
- phewas_file = f"{BUCKET}/data/phewas/results/{chrm}/{gene}_phewas.csv"
77
+ phewas_file = f"{BUCKET}/data/phewas/results/chr{chrm}/{gene}_phewas.csv"
74
78
  if uth._gcs_size(phewas_file, BUCKET) > 0:
75
79
  df = clean_aou_phewas(phewas_file)
76
80
  fig, plot_df = get_aou_manhattan(df, gene)
@@ -107,29 +111,10 @@ def generate_aou_clinical_report(doc, chrm, gene, zygosity):
107
111
 
108
112
 
109
113
  ############### Labs and Measurements ###############
110
- def remove_outliers_iqr(df, column, multiplier=5):
111
- """
112
- Remove outliers from a specified column in a pandas DataFrame using the IQR method.
113
- """
114
- value = np.log1p(df[column].astype(float))
115
- Q1 = value.quantile(0.25)
116
- Q3 = value.quantile(0.75)
117
- IQR = Q3 - Q1
118
- lower_bound = Q1 - multiplier * IQR
119
- upper_bound = Q3 + multiplier * IQR
120
- return np.where(value.between(lower_bound, upper_bound), df[column], pd.NA)
121
-
122
- def clean_measurement(df, col="median_value", multiplier=5):
123
- df = df.copy()
124
- df = df[df[col] >= 0]
125
- df[col] = remove_outliers_iqr(df, col, multiplier)
126
- return df.dropna(subset=[col])
127
-
128
114
  def compare(df, measurement):
129
115
  mdf = df.loc[df.measurement==measurement].copy()
130
116
  ctrls = mdf.loc[mdf.cases==False].drop_duplicates(["person_id", "measurement_concept_id"])
131
117
  cases = mdf.loc[mdf.cases==True].drop_duplicates(["person_id", "measurement_concept_id"])
132
- ctrls = clean_measurement(ctrls)
133
118
  x = pd.to_numeric(cases["median_value"], errors="coerce").dropna().to_numpy()
134
119
  y = pd.to_numeric(ctrls["median_value"], errors="coerce").dropna().to_numpy()
135
120
  caq1, caq2, caq3 = pd.Series(x).quantile([0.25, 0.5, 0.75])
@@ -177,7 +162,7 @@ def generate_aou_labs_measurements_report(doc: Document, chrm: str, gene: str, z
177
162
  res_df = run_parallel(numerical_measurements_df, measurements=measurements, n_jobs=None, chunksize=20)
178
163
  # TODO: add most significant measurements table to doc
179
164
  for k,v in MEASUREMENT_GROUPS.items():
180
- f,a,p = plot_measurements(numerical_measurements_df, v, multiplier=5, col="median_value", res_df=res_df)
165
+ f, a = plot_measurements(numerical_measurements_df, v, col="median_value", res_df=res_df)
181
166
  fig_path = uth._save_fig_to_tmp(f, basename=f"aou_measurements_{k}", dpi=300)
182
167
  doc.add_paragraph() # spacing
183
168
  doc.add_picture(fig_path, width=Inches(6.5))
@@ -188,8 +173,8 @@ def generate_aou_labs_measurements_report(doc: Document, chrm: str, gene: str, z
188
173
  ############### Surveys ###############
189
174
  def clean_aou_surveys(df, survey_col="survey", question_col="question", answer_col="answer_category", zygosity="hetz"):
190
175
  df = df.copy()
191
- df[question_col] = df[question_col].str.replace("^" + df[survey_col] + ": ", "", regex=True)
192
- df[answer_col] = df[answer_col].str.replace("^" + df[question_col] + ": ", "", regex=True)
176
+ df[question_col] = df[question_col].str.replace(r"^.*?:\s*", "", regex=True, n=1)
177
+ df[answer_col] = df[answer_col].str.replace(r"^.*?:\s*", "", regex=True, n=1)
193
178
  return df
194
179
 
195
180
  def generate_aou_survey_report(doc: Document, chrm: str, gene: str, zygosity: str) -> Document:
rwe/parsers/aou/config.py CHANGED
@@ -61,7 +61,7 @@ PLAUSIBLE = {
61
61
 
62
62
  # --- Liver / protein balance (AoU typical units: albumin g/L, protein g/L, enzymes U/L, bilirubin mg/dL OR umol/L; range is loose) ---
63
63
  "Albumin [Mass/volume] in Serum or Plasma": (15, 60), # g/L
64
- "Protein [Mass/volume] in Serum or Plasma": (40, 100), # g/L
64
+ "Protein [Mass/volume] in Serum or Plasma": (3, 12), # g/dL
65
65
  "Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma": (0, 1000), # U/L
66
66
  "Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma": (0, 1000), # U/L
67
67
  "Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma": (10, 2000), # U/L
@@ -7,9 +7,29 @@ from scipy import stats
7
7
  import multiprocessing as mp
8
8
  from scipy.stats import ks_2samp
9
9
 
10
+ def remove_outliers_iqr(df, column, multiplier=5):
11
+ """
12
+ Remove outliers from a specified column in a pandas DataFrame using the IQR method.
13
+ """
14
+ value = np.log1p(df[column].astype(float))
15
+ Q1 = value.quantile(0.25)
16
+ Q3 = value.quantile(0.75)
17
+ IQR = Q3 - Q1
18
+ lower_bound = Q1 - multiplier * IQR
19
+ upper_bound = Q3 + multiplier * IQR
20
+ return np.where(value.between(lower_bound, upper_bound), df[column], pd.NA)
21
+
22
+ def remove_outliers(df, col="median_value", multiplier=5):
23
+ df = df.copy()
24
+ df = df[df[col] >= 0]
25
+ df[col] = remove_outliers_iqr(df, col, multiplier)
26
+ return df.dropna(subset=[col])
10
27
 
11
28
  def clean_measurements_helper(g):
29
+ from rwe.parsers.aou.config import PLAUSIBLE, UNIT_DROPS, UNIT_CONVERSIONS
12
30
  m = g["measurement"].iat[0]
31
+ g["median_value"] = pd.to_numeric(g["median_value"], errors="coerce")
32
+ g = g.dropna(subset=["median_value"])
13
33
  # 1) drop units (manual)
14
34
  drops = UNIT_DROPS.get(m, set())
15
35
  if drops:
@@ -25,15 +45,17 @@ def clean_measurements_helper(g):
25
45
  # 3) plausible range filter
26
46
  lo, hi = PLAUSIBLE[m]
27
47
  g = g[g["median_value"].between(lo, hi)].copy()
28
-
29
- # 4) remove units present in < MIN_UNIT_N samples
48
+ # 4) IQR outlier removal (pooled; before KS)
49
+ g = remove_outliers(g, col="median_value", multiplier=5)
50
+ if g.empty:
51
+ return g
52
+ # 5) remove units present in < MIN_UNIT_N samples
30
53
  unit_counts = g["unit"].value_counts()
31
54
  keep_units = unit_counts[unit_counts >= 5].index
32
55
  g = g[g["unit"].isin(keep_units)].copy()
33
56
  if g.empty or g["unit"].nunique() == 1:
34
57
  return g
35
-
36
- # 5) dissimilar distributions via KS test vs most common unit
58
+ # 6) dissimilar distributions via KS test vs most common unit
37
59
  ref_unit = g["unit"].value_counts().idxmax()
38
60
  ref = g.loc[g["unit"].eq(ref_unit), "median_value"].astype(float).dropna()
39
61
 
@@ -48,7 +70,7 @@ def clean_measurements_helper(g):
48
70
  return g[g["unit"].isin(keep)].copy()
49
71
 
50
72
  def clean_measurements():
51
- from rwe.parsers.aou.config import BUCKET, CDR, GOOGLE_PROJECT
73
+ from rwe.parsers.aou.config import BUCKET, CDR, GOOGLE_PROJECT, PLAUSIBLE
52
74
  numerical_measurements_df = pd.read_parquet(f"{BUCKET}/data/rwe_info/raw/numerical_measurements.parquet")
53
75
  selected_nm_df = numerical_measurements_df.loc[
54
76
  numerical_measurements_df.measurement.isin(PLAUSIBLE.keys())
@@ -71,4 +93,3 @@ def clean_surveys():
71
93
  selected_survey_df = survey_df.loc[survey_df.question.isin(questions)].copy()
72
94
  selected_survey_df.to_parquet(f"{BUCKET}/data/rwe_info/processed/selected_surveys.parquet")
73
95
  return
74
-
rwe/plots/demographics.py CHANGED
@@ -121,7 +121,7 @@ def demographics_plot(person_df,
121
121
  top_n_sex=2,
122
122
  top_n_ethnicity=2,
123
123
  top_n_ancestry=None,
124
- figsize=(14, 7),
124
+ figsize=(4, 5),
125
125
  palette=None,
126
126
  share_legend=True,
127
127
  savepath=None,
rwe/plots/measurements.py CHANGED
@@ -27,14 +27,26 @@ def fmt_p(p):
27
27
  return f"P={p:.2e}" if p < 0.001 else f"P={p:.3f}"
28
28
 
29
29
  def plot_measurements(
30
- df_long, measurements, multiplier=5, col="median_value",
31
- res_df=None, res_name_col="measurement", res_p_col="p_mwu"
32
- ):
30
+ df_long,
31
+ measurements,
32
+ col="median_value",
33
+ res_df=None,
34
+ res_name_col="measurement",
35
+ res_p_col="p_mwu",
36
+ ncols=3,
37
+ figsize=(6, 4),
38
+ palette=None
39
+ ):
33
40
  d = df_long[df_long["measurement"].isin(measurements)].copy()
34
41
  d[col] = pd.to_numeric(d[col], errors="coerce")
35
42
  d = d.dropna(subset=[col])
36
43
  d["group"] = np.where(d["cases"], "Cases", "Controls")
37
44
 
45
+ if palette is None:
46
+ palette = {
47
+ "Controls": "#2F6690", # BLUE
48
+ "Cases": "#D1495B", # RED
49
+ }
38
50
  # star map
39
51
  p_map, star_map = {}, {}
40
52
  if res_df is not None:
@@ -43,25 +55,22 @@ def plot_measurements(
43
55
  p_map = dict(zip(tmp[res_name_col], tmp[res_p_col]))
44
56
  star_map = {k: p_to_stars(v) for k, v in p_map.items()}
45
57
 
46
- parts = []
47
- for m in measurements:
48
- sub = d[d["measurement"] == m].copy()
49
- sub = apply_plausible_range(sub, m)
50
- ctrl = clean_measurement(sub[sub["group"] == "Controls"], col=col, multiplier=multiplier)
51
- case = sub[sub["group"] == "Cases"].copy() # don't IQR-clean cases
52
- parts.append(pd.concat([ctrl, case], ignore_index=True))
53
- p = pd.concat(parts, ignore_index=True)
54
-
55
- fig, axes = plt.subplots(2, 3, figsize=(4, 3))
56
- axes = axes.flatten()
58
+ # Layout: dynamic grid
59
+ n = len(measurements)
60
+ nrows = int(np.ceil(n / ncols))
61
+ fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
62
+ axes = np.array(axes).ravel()
57
63
 
58
64
  for i, m in enumerate(measurements):
59
65
  ax = axes[i]
60
- sub = p[p["measurement"] == m].copy()
61
- sub[col] = pd.to_numeric(sub[col], errors="coerce")
62
- sub = sub.dropna(subset=[col])
63
- sns.violinplot(data=sub, x="group", y=col, order=["Controls", "Cases"],
64
- inner="quartile", cut=0, linewidth=1, ax=ax)
66
+ sub = d[d["measurement"] == m].copy()
67
+
68
+ sns.violinplot(
69
+ data=sub, x="group", y=col, order=["Controls", "Cases"],
70
+ hue="group", hue_order=["Controls", "Cases"], legend=False,
71
+ palette=palette, inner="quartile", cut=0, linewidth=1, ax=ax
72
+ )
73
+
65
74
  ax.set_title(textwrap.fill(m, width=20, max_lines=3))
66
75
  ax.set_xlabel("")
67
76
  ax.set_ylabel("")
@@ -75,15 +84,20 @@ def plot_measurements(
75
84
  ymin = np.nanmin(sub[col].values)
76
85
  span = (ymax - ymin) if ymax > ymin else 1.0
77
86
 
78
- y_line = ymax + 0.07 * span
79
- y_star = y_line + 0.02 * span
80
- y_p = y_star - 0.03 * span # right below stars
81
-
87
+ # put annotation above violins
88
+ y_txt = ymax + 0.08 * span
89
+
90
+ # x=0.5 is between the two categories (0 and 1)
91
+ if stars:
92
+ ax.text(0.5, y_txt + 0.03 * span, stars, ha="center", va="bottom")
82
93
  if not pd.isna(pval):
83
- ax.text(0.5, y_p, fmt_p(pval), ha="center", va="top")
94
+ ax.text(0.5, y_txt, fmt_p(pval), ha="center", va="bottom")
95
+
96
+ # give headroom so text doesn't clip
97
+ ax.set_ylim(ymin, y_txt + 0.12 * span)
84
98
 
85
- for j in range(len(vitals), 6):
99
+ for j in range(n, len(axes)):
86
100
  axes[j].axis("off")
87
101
 
88
102
  plt.tight_layout()
89
- return fig, axes, p
103
+ return fig, axes
rwe/plots/surveys.py CHANGED
@@ -84,7 +84,7 @@ def percent_bar_with_n(
84
84
  palette=palette
85
85
  )
86
86
 
87
- ax.set_title(title or f"{cat_col} (% within group)")
87
+ ax.set_title(title or f"{cat_col} (% within group)", pad=20)
88
88
  ax.set_xlabel("")
89
89
  ax.set_ylabel("Percent (%)")
90
90
  ax.tick_params(axis="x", rotation=25)
@@ -158,6 +158,7 @@ def plot_survey_questions(
158
158
 
159
159
  fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
160
160
  axes = axes.ravel()
161
+ handles = labels = None
161
162
 
162
163
  for i, ((svy, q), sub) in enumerate(groups):
163
164
  percent_bar_with_n(
rwe/plots/variant_info.py CHANGED
@@ -48,7 +48,7 @@ def create_variant_frequency_plots(
48
48
  bar_color="#D1495B",
49
49
  bar_fontsize=6,
50
50
  table_fontsize=6,
51
- bar_height_ratio=1.5, # relative height of bar panel vs table panel
51
+ bar_height_ratio=0.25, # relative height of bar panel vs table panel
52
52
  ):
53
53
  df = variant_df.copy()
54
54
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rwe
3
- Version: 0.0.1
3
+ Version: 0.0.3
4
4
  Summary: Real World Evidence utilities and reporting
5
5
  Author: Deepro Banerjee
6
6
  License: MIT License
@@ -32,7 +32,7 @@ License-File: LICENSE
32
32
  Requires-Dist: pandas
33
33
  Requires-Dist: numpy
34
34
  Requires-Dist: matplotlib
35
- Requires-Dist: seaborn>=0.12
35
+ Requires-Dist: seaborn>=0.13
36
36
  Requires-Dist: python-docx>=1.1.0
37
37
  Requires-Dist: tqdm
38
38
  Requires-Dist: requests
@@ -2,26 +2,26 @@ rwe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  rwe/generate_report.py,sha256=-NbQJ4H-NYdDzziWMoi5PUWyhZ7p8iVC8UXYUP1v_xY,1833
3
3
  rwe/assets/Arrowhead_Pharmaceuticals_logo.png,sha256=C2mAn6GG4gArds4sBnqM5LcxhuZgujXjwGiZxIuxbLo,51053
4
4
  rwe/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- rwe/clients/aou.py,sha256=7nPg2JU1LtkQ5AClqQMpeqjWh3kBrI63RmYOFcfVqxc,9949
5
+ rwe/clients/aou.py,sha256=XVvRArzcTTBO0JmiTtWvuiafTuzpupaZJBb1Mtt8QIQ,9387
6
6
  rwe/clients/azn.py,sha256=6zwEv0IAgBA-FcI2glGerA9M6pzdKQx7M5M4HZI8HQs,7541
7
7
  rwe/clients/genebass.py,sha256=OkF0_J4lmiujWlffsBlAM3uKxKOCzi7_3f_ehju_tPE,6103
8
8
  rwe/clients/hgnc.py,sha256=ZabNiMgzwu4rCVCptbgUb38qHsL9cHlsQX7s6mZA8Nk,2015
9
9
  rwe/parsers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  rwe/parsers/aou/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  rwe/parsers/aou/collect.py,sha256=3j9S0vg3s5u-lJYhP9gdJA_SJVUTUld-i8M1mS-qg-0,13610
12
- rwe/parsers/aou/config.py,sha256=_BdQohVgSeLNIkoLBQ4XIgo2ucep6NDU8k0vGKgONj8,27018
13
- rwe/parsers/aou/process.py,sha256=T3B4Ir3qHoq7txEO6Bxi1L_qHqjTbWikVLtFHS6QuSM,2705
12
+ rwe/parsers/aou/config.py,sha256=7O8NhrxOAeDJnKYsiQBgufNkxo_O6nRPSrhJ49pteLk,27017
13
+ rwe/parsers/aou/process.py,sha256=q-H12OQdwBZMqWxuDo389m6WCJb9Qsp5ZIwVc2MHqu4,3683
14
14
  rwe/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  rwe/plots/clinical.py,sha256=dBgdrFog-4gWJyJphoObSuoPtS7Q4wOprWyXMcDD6Dw,7738
16
- rwe/plots/demographics.py,sha256=QrDA1QSKG56Iuv9lodGfYBTymBOf0WKm0q6QFzs0SB4,7288
17
- rwe/plots/measurements.py,sha256=gKSWcon9Wzb3_oO4CnBRn3GTbNqjkfILXj6CmY3josQ,3177
18
- rwe/plots/surveys.py,sha256=X8DT2vnnA1S2u98mrzcA-apSOht8BAsCFjMbHQj7pfY,6636
19
- rwe/plots/variant_info.py,sha256=7V9dJWc8XEQU_t7KY1NzldxYbfOPMKRYBiSNIysizB4,5361
16
+ rwe/plots/demographics.py,sha256=TqYY0dxroLAq9YOZHo1-SebacvNZpQjzUSui-nBUKSg,7287
17
+ rwe/plots/measurements.py,sha256=Z7GG4SMXD3ExGGJF6LvmZ_XwvMwGB_6FFk7nxz_8fr8,3286
18
+ rwe/plots/surveys.py,sha256=2SiNBnjLQL2V4jfeMTIj2y2v7y7Wh4kLji2dnEJ0cdo,6672
19
+ rwe/plots/variant_info.py,sha256=KLUnefwmSxcmOn7mkNBcxvJdpNN4i4F9SZ0SZGlNrvk,5362
20
20
  rwe/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  rwe/utils/helpers.py,sha256=t3fmx2OsFx3fvg2uZggagm5LS-PE7J3Yr7Ew3VJUsnA,4336
22
22
  rwe/utils/report.py,sha256=Vyfle1aOoCkqtVeBHAzCdb6hgR0sQZn_qQ4szUw_A3w,4658
23
- rwe-0.0.1.dist-info/licenses/LICENSE,sha256=0PfJPAoyFVWY4L80aEcQyjy4PojxzfqcOenSuXdhgBg,1072
24
- rwe-0.0.1.dist-info/METADATA,sha256=x5Fi7T0sZPjz3wiZyhB8oyS_VU6O5IRYJ4RmOqPyV1w,23601
25
- rwe-0.0.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
26
- rwe-0.0.1.dist-info/top_level.txt,sha256=zwX0M3dfnEfklZqvvBPQXiJykChpiqt219EolC3WFRc,4
27
- rwe-0.0.1.dist-info/RECORD,,
23
+ rwe-0.0.3.dist-info/licenses/LICENSE,sha256=0PfJPAoyFVWY4L80aEcQyjy4PojxzfqcOenSuXdhgBg,1072
24
+ rwe-0.0.3.dist-info/METADATA,sha256=huTbgeLgndyhlIZHX4qOEPQi-jNFCpfRUPF2FM__EZI,23601
25
+ rwe-0.0.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
26
+ rwe-0.0.3.dist-info/top_level.txt,sha256=zwX0M3dfnEfklZqvvBPQXiJykChpiqt219EolC3WFRc,4
27
+ rwe-0.0.3.dist-info/RECORD,,
File without changes