PyPI - rwe - Versions diffs - 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl - Mend

rwe 0.0.1 py3-none-any.whl → 0.0.3 py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (12)

rwe/clients/aou.py +11 -26
rwe/parsers/aou/config.py +1 -1
rwe/parsers/aou/process.py +27 -6
rwe/plots/demographics.py +1 -1
rwe/plots/measurements.py +40 -26
rwe/plots/surveys.py +2 -1
rwe/plots/variant_info.py +1 -1
{rwe-0.0.1.dist-info → rwe-0.0.3.dist-info}/METADATA +2 -2
{rwe-0.0.1.dist-info → rwe-0.0.3.dist-info}/RECORD +12 -12
{rwe-0.0.1.dist-info → rwe-0.0.3.dist-info}/WHEEL +0 -0
{rwe-0.0.1.dist-info → rwe-0.0.3.dist-info}/licenses/LICENSE +0 -0
{rwe-0.0.1.dist-info → rwe-0.0.3.dist-info}/top_level.txt +0 -0

rwe/clients/aou.py CHANGED Viewed

@@ -45,9 +45,13 @@ def generate_aou_variant_info_demographics_report(doc: Document, chrm: str, gene
 ############### Clinical Records ###############
-def clean_aou_phewas(phewas_file):
-    df = pd.read_csv(phewas_file, sep="\t")
-    df = df.loc[(df.ancestry == "all")&(df.converged==True)]
+def clean_aou_phewas(phewas_file, version="1.2"):
+    if version == "1.2":
+        df = pd.read_csv(phewas_file)
+        df = df.loc[df.converged==True]
+    elif version =="X":
+        df = pd.read_csv(phewas_file, sep="\t")
+        df = df.loc[(df.ancestry == "all")&(df.converged==True)]
     return df
 def get_aou_manhattan(df, gene):
@@ -70,7 +74,7 @@ def get_aou_manhattan(df, gene):
 def generate_aou_clinical_report(doc, chrm, gene, zygosity):
     from rwe.parsers.aou.config import BUCKET
-    phewas_file = f"{BUCKET}/data/phewas/results/{chrm}/{gene}_phewas.csv"
+    phewas_file = f"{BUCKET}/data/phewas/results/chr{chrm}/{gene}_phewas.csv"
     if uth._gcs_size(phewas_file, BUCKET) > 0:
         df = clean_aou_phewas(phewas_file)
         fig, plot_df = get_aou_manhattan(df, gene)
@@ -107,29 +111,10 @@ def generate_aou_clinical_report(doc, chrm, gene, zygosity):
 ############### Labs and Measurements ###############
-def remove_outliers_iqr(df, column, multiplier=5):
-    """
-    Remove outliers from a specified column in a pandas DataFrame using the IQR method.
-    """
-    value = np.log1p(df[column].astype(float))
-    Q1 = value.quantile(0.25)
-    Q3 = value.quantile(0.75)
-    IQR = Q3 - Q1
-    lower_bound = Q1 - multiplier * IQR
-    upper_bound = Q3 + multiplier * IQR
-    return np.where(value.between(lower_bound, upper_bound), df[column], pd.NA)
-def clean_measurement(df, col="median_value", multiplier=5):
-    df =  df.copy()
-    df = df[df[col] >= 0]
-    df[col] = remove_outliers_iqr(df, col, multiplier)
-    return df.dropna(subset=[col])
 def compare(df, measurement):
     mdf = df.loc[df.measurement==measurement].copy()
     ctrls = mdf.loc[mdf.cases==False].drop_duplicates(["person_id", "measurement_concept_id"])
     cases = mdf.loc[mdf.cases==True].drop_duplicates(["person_id", "measurement_concept_id"])
-    ctrls = clean_measurement(ctrls)
     x = pd.to_numeric(cases["median_value"], errors="coerce").dropna().to_numpy()
     y = pd.to_numeric(ctrls["median_value"], errors="coerce").dropna().to_numpy()
     caq1, caq2, caq3 = pd.Series(x).quantile([0.25, 0.5, 0.75])
@@ -177,7 +162,7 @@ def generate_aou_labs_measurements_report(doc: Document, chrm: str, gene: str, z
     res_df = run_parallel(numerical_measurements_df, measurements=measurements, n_jobs=None, chunksize=20)
     # TODO: add most significant measurements table to doc
     for k,v in MEASUREMENT_GROUPS.items():
-        f,a,p = plot_measurements(numerical_measurements_df, v, multiplier=5, col="median_value", res_df=res_df)
+        f, a = plot_measurements(numerical_measurements_df, v, col="median_value", res_df=res_df)
         fig_path = uth._save_fig_to_tmp(f, basename=f"aou_measurements_{k}", dpi=300)
         doc.add_paragraph()  # spacing
         doc.add_picture(fig_path, width=Inches(6.5))
@@ -188,8 +173,8 @@ def generate_aou_labs_measurements_report(doc: Document, chrm: str, gene: str, z
 ############### Surveys ###############
 def clean_aou_surveys(df, survey_col="survey", question_col="question", answer_col="answer_category", zygosity="hetz"):
     df = df.copy()
-    df[question_col] = df[question_col].str.replace("^" + df[survey_col] + ": ", "", regex=True)
-    df[answer_col] = df[answer_col].str.replace("^" + df[question_col] + ": ", "", regex=True)
+    df[question_col] = df[question_col].str.replace(r"^.*?:\s*", "", regex=True, n=1)
+    df[answer_col] = df[answer_col].str.replace(r"^.*?:\s*", "", regex=True, n=1)
     return df
 def generate_aou_survey_report(doc: Document, chrm: str, gene: str, zygosity: str) -> Document:

rwe/parsers/aou/config.py CHANGED Viewed

@@ -61,7 +61,7 @@ PLAUSIBLE = {
     # --- Liver / protein balance (AoU typical units: albumin g/L, protein g/L, enzymes U/L, bilirubin mg/dL OR umol/L; range is loose) ---
     "Albumin [Mass/volume] in Serum or Plasma": (15, 60),                   # g/L
-    "Protein [Mass/volume] in Serum or Plasma": (40, 100),                  # g/L
+    "Protein [Mass/volume] in Serum or Plasma": (3, 12),                  # g/dL
     "Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma": (0, 1000),     # U/L
     "Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma": (0, 1000),   # U/L
     "Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma": (10, 2000),        # U/L

rwe/parsers/aou/process.py CHANGED Viewed

@@ -7,9 +7,29 @@ from scipy import stats
 import multiprocessing as mp
 from scipy.stats import ks_2samp
+def remove_outliers_iqr(df, column, multiplier=5):
+    """
+    Remove outliers from a specified column in a pandas DataFrame using the IQR method.
+    """
+    value = np.log1p(df[column].astype(float))
+    Q1 = value.quantile(0.25)
+    Q3 = value.quantile(0.75)
+    IQR = Q3 - Q1
+    lower_bound = Q1 - multiplier * IQR
+    upper_bound = Q3 + multiplier * IQR
+    return np.where(value.between(lower_bound, upper_bound), df[column], pd.NA)
+def remove_outliers(df, col="median_value", multiplier=5):
+    df =  df.copy()
+    df = df[df[col] >= 0]
+    df[col] = remove_outliers_iqr(df, col, multiplier)
+    return df.dropna(subset=[col])
 def clean_measurements_helper(g):
+    from rwe.parsers.aou.config import PLAUSIBLE, UNIT_DROPS, UNIT_CONVERSIONS
     m = g["measurement"].iat[0]
+    g["median_value"] = pd.to_numeric(g["median_value"], errors="coerce")
+    g = g.dropna(subset=["median_value"])
     # 1) drop units (manual)
     drops = UNIT_DROPS.get(m, set())
     if drops:
@@ -25,15 +45,17 @@ def clean_measurements_helper(g):
     # 3) plausible range filter
     lo, hi = PLAUSIBLE[m]
     g = g[g["median_value"].between(lo, hi)].copy()
-    # 4) remove units present in < MIN_UNIT_N samples
+    # 4) IQR outlier removal (pooled; before KS)
+    g = remove_outliers(g, col="median_value", multiplier=5)
+    if g.empty:
+        return g
+    # 5) remove units present in < MIN_UNIT_N samples
     unit_counts = g["unit"].value_counts()
     keep_units = unit_counts[unit_counts >= 5].index
     g = g[g["unit"].isin(keep_units)].copy()
     if g.empty or g["unit"].nunique() == 1:
         return g
-    # 5) dissimilar distributions via KS test vs most common unit
+    # 6) dissimilar distributions via KS test vs most common unit
     ref_unit = g["unit"].value_counts().idxmax()
     ref = g.loc[g["unit"].eq(ref_unit), "median_value"].astype(float).dropna()
@@ -48,7 +70,7 @@ def clean_measurements_helper(g):
     return g[g["unit"].isin(keep)].copy()
 def clean_measurements():
-    from rwe.parsers.aou.config import BUCKET, CDR, GOOGLE_PROJECT
+    from rwe.parsers.aou.config import BUCKET, CDR, GOOGLE_PROJECT, PLAUSIBLE
     numerical_measurements_df = pd.read_parquet(f"{BUCKET}/data/rwe_info/raw/numerical_measurements.parquet")
     selected_nm_df = numerical_measurements_df.loc[
     numerical_measurements_df.measurement.isin(PLAUSIBLE.keys())
@@ -71,4 +93,3 @@ def clean_surveys():
     selected_survey_df = survey_df.loc[survey_df.question.isin(questions)].copy()
     selected_survey_df.to_parquet(f"{BUCKET}/data/rwe_info/processed/selected_surveys.parquet")
     return

rwe/plots/demographics.py CHANGED Viewed

@@ -121,7 +121,7 @@ def demographics_plot(person_df,
                         top_n_sex=2,
                         top_n_ethnicity=2,
                         top_n_ancestry=None,
-                        figsize=(14, 7),
+                        figsize=(4, 5),
                         palette=None,
                         share_legend=True,
                         savepath=None,

rwe/plots/measurements.py CHANGED Viewed

@@ -27,14 +27,26 @@ def fmt_p(p):
     return f"P={p:.2e}" if p < 0.001 else f"P={p:.3f}"
 def plot_measurements(
-    df_long, measurements, multiplier=5, col="median_value",
-    res_df=None, res_name_col="measurement", res_p_col="p_mwu"
-):
+    df_long,
+    measurements,
+    col="median_value",
+    res_df=None,
+    res_name_col="measurement",
+    res_p_col="p_mwu",
+    ncols=3,
+    figsize=(6, 4),
+    palette=None
+    ):
     d = df_long[df_long["measurement"].isin(measurements)].copy()
     d[col] = pd.to_numeric(d[col], errors="coerce")
     d = d.dropna(subset=[col])
     d["group"] = np.where(d["cases"], "Cases", "Controls")
+    if palette is None:
+        palette = {
+            "Controls": "#2F6690",  # BLUE
+            "Cases":    "#D1495B",  # RED
+        }
     # star map
     p_map, star_map = {}, {}
     if res_df is not None:
@@ -43,25 +55,22 @@ def plot_measurements(
         p_map = dict(zip(tmp[res_name_col], tmp[res_p_col]))
         star_map = {k: p_to_stars(v) for k, v in p_map.items()}
-    parts = []
-    for m in measurements:
-        sub = d[d["measurement"] == m].copy()
-        sub = apply_plausible_range(sub, m)
-        ctrl = clean_measurement(sub[sub["group"] == "Controls"], col=col, multiplier=multiplier)
-        case = sub[sub["group"] == "Cases"].copy()  # don't IQR-clean cases
-        parts.append(pd.concat([ctrl, case], ignore_index=True))
-    p = pd.concat(parts, ignore_index=True)
-    fig, axes = plt.subplots(2, 3, figsize=(4, 3))
-    axes = axes.flatten()
+    # Layout: dynamic grid
+    n = len(measurements)
+    nrows = int(np.ceil(n / ncols))
+    fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
+    axes = np.array(axes).ravel()
     for i, m in enumerate(measurements):
         ax = axes[i]
-        sub = p[p["measurement"] == m].copy()
-        sub[col] = pd.to_numeric(sub[col], errors="coerce")
-        sub = sub.dropna(subset=[col])
-        sns.violinplot(data=sub, x="group", y=col, order=["Controls", "Cases"],
-                       inner="quartile", cut=0, linewidth=1, ax=ax)
+        sub = d[d["measurement"] == m].copy()
+        sns.violinplot(
+            data=sub, x="group", y=col, order=["Controls", "Cases"],
+            hue="group", hue_order=["Controls", "Cases"], legend=False,
+            palette=palette, inner="quartile", cut=0, linewidth=1, ax=ax
+            )
         ax.set_title(textwrap.fill(m, width=20,  max_lines=3))
         ax.set_xlabel("")
         ax.set_ylabel("")
@@ -75,15 +84,20 @@ def plot_measurements(
             ymin = np.nanmin(sub[col].values)
             span = (ymax - ymin) if ymax > ymin else 1.0
-            y_line = ymax + 0.07 * span
-            y_star = y_line + 0.02 * span
-            y_p = y_star - 0.03 * span  # right below stars
+            # put annotation above violins
+            y_txt = ymax + 0.08 * span
+            # x=0.5 is between the two categories (0 and 1)
+            if stars:
+                ax.text(0.5, y_txt + 0.03 * span, stars, ha="center", va="bottom")
             if not pd.isna(pval):
-                ax.text(0.5, y_p, fmt_p(pval), ha="center", va="top")
+                ax.text(0.5, y_txt, fmt_p(pval), ha="center", va="bottom")
+            # give headroom so text doesn't clip
+            ax.set_ylim(ymin, y_txt + 0.12 * span)
-    for j in range(len(vitals), 6):
+    for j in range(n, len(axes)):
         axes[j].axis("off")
     plt.tight_layout()
-    return fig, axes, p
+    return fig, axes

rwe/plots/surveys.py CHANGED Viewed

@@ -84,7 +84,7 @@ def percent_bar_with_n(
         palette=palette
     )
-    ax.set_title(title or f"{cat_col} (% within group)")
+    ax.set_title(title or f"{cat_col} (% within group)", pad=20)
     ax.set_xlabel("")
     ax.set_ylabel("Percent (%)")
     ax.tick_params(axis="x", rotation=25)
@@ -158,6 +158,7 @@ def plot_survey_questions(
     fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
     axes = axes.ravel()
+    handles = labels = None
     for i, ((svy, q), sub) in enumerate(groups):
         percent_bar_with_n(

rwe/plots/variant_info.py CHANGED Viewed

@@ -48,7 +48,7 @@ def create_variant_frequency_plots(
     bar_color="#D1495B",
     bar_fontsize=6,
     table_fontsize=6,
-    bar_height_ratio=1.5,            # relative height of bar panel vs table panel
+    bar_height_ratio=0.25,            # relative height of bar panel vs table panel
 ):
     df = variant_df.copy()

{rwe-0.0.1.dist-info → rwe-0.0.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rwe
-Version: 0.0.1
+Version: 0.0.3
 Summary: Real World Evidence utilities and reporting
 Author: Deepro Banerjee
 License: MIT License
@@ -32,7 +32,7 @@ License-File: LICENSE
 Requires-Dist: pandas
 Requires-Dist: numpy
 Requires-Dist: matplotlib
-Requires-Dist: seaborn>=0.12
+Requires-Dist: seaborn>=0.13
 Requires-Dist: python-docx>=1.1.0
 Requires-Dist: tqdm
 Requires-Dist: requests

{rwe-0.0.1.dist-info → rwe-0.0.3.dist-info}/RECORD RENAMED Viewed

@@ -2,26 +2,26 @@ rwe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rwe/generate_report.py,sha256=-NbQJ4H-NYdDzziWMoi5PUWyhZ7p8iVC8UXYUP1v_xY,1833
 rwe/assets/Arrowhead_Pharmaceuticals_logo.png,sha256=C2mAn6GG4gArds4sBnqM5LcxhuZgujXjwGiZxIuxbLo,51053
 rwe/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rwe/clients/aou.py,sha256=7nPg2JU1LtkQ5AClqQMpeqjWh3kBrI63RmYOFcfVqxc,9949
+rwe/clients/aou.py,sha256=XVvRArzcTTBO0JmiTtWvuiafTuzpupaZJBb1Mtt8QIQ,9387
 rwe/clients/azn.py,sha256=6zwEv0IAgBA-FcI2glGerA9M6pzdKQx7M5M4HZI8HQs,7541
 rwe/clients/genebass.py,sha256=OkF0_J4lmiujWlffsBlAM3uKxKOCzi7_3f_ehju_tPE,6103
 rwe/clients/hgnc.py,sha256=ZabNiMgzwu4rCVCptbgUb38qHsL9cHlsQX7s6mZA8Nk,2015
 rwe/parsers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rwe/parsers/aou/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rwe/parsers/aou/collect.py,sha256=3j9S0vg3s5u-lJYhP9gdJA_SJVUTUld-i8M1mS-qg-0,13610
-rwe/parsers/aou/config.py,sha256=_BdQohVgSeLNIkoLBQ4XIgo2ucep6NDU8k0vGKgONj8,27018
-rwe/parsers/aou/process.py,sha256=T3B4Ir3qHoq7txEO6Bxi1L_qHqjTbWikVLtFHS6QuSM,2705
+rwe/parsers/aou/config.py,sha256=7O8NhrxOAeDJnKYsiQBgufNkxo_O6nRPSrhJ49pteLk,27017
+rwe/parsers/aou/process.py,sha256=q-H12OQdwBZMqWxuDo389m6WCJb9Qsp5ZIwVc2MHqu4,3683
 rwe/plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rwe/plots/clinical.py,sha256=dBgdrFog-4gWJyJphoObSuoPtS7Q4wOprWyXMcDD6Dw,7738
-rwe/plots/demographics.py,sha256=QrDA1QSKG56Iuv9lodGfYBTymBOf0WKm0q6QFzs0SB4,7288
-rwe/plots/measurements.py,sha256=gKSWcon9Wzb3_oO4CnBRn3GTbNqjkfILXj6CmY3josQ,3177
-rwe/plots/surveys.py,sha256=X8DT2vnnA1S2u98mrzcA-apSOht8BAsCFjMbHQj7pfY,6636
-rwe/plots/variant_info.py,sha256=7V9dJWc8XEQU_t7KY1NzldxYbfOPMKRYBiSNIysizB4,5361
+rwe/plots/demographics.py,sha256=TqYY0dxroLAq9YOZHo1-SebacvNZpQjzUSui-nBUKSg,7287
+rwe/plots/measurements.py,sha256=Z7GG4SMXD3ExGGJF6LvmZ_XwvMwGB_6FFk7nxz_8fr8,3286
+rwe/plots/surveys.py,sha256=2SiNBnjLQL2V4jfeMTIj2y2v7y7Wh4kLji2dnEJ0cdo,6672
+rwe/plots/variant_info.py,sha256=KLUnefwmSxcmOn7mkNBcxvJdpNN4i4F9SZ0SZGlNrvk,5362
 rwe/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rwe/utils/helpers.py,sha256=t3fmx2OsFx3fvg2uZggagm5LS-PE7J3Yr7Ew3VJUsnA,4336
 rwe/utils/report.py,sha256=Vyfle1aOoCkqtVeBHAzCdb6hgR0sQZn_qQ4szUw_A3w,4658
-rwe-0.0.1.dist-info/licenses/LICENSE,sha256=0PfJPAoyFVWY4L80aEcQyjy4PojxzfqcOenSuXdhgBg,1072
-rwe-0.0.1.dist-info/METADATA,sha256=x5Fi7T0sZPjz3wiZyhB8oyS_VU6O5IRYJ4RmOqPyV1w,23601
-rwe-0.0.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-rwe-0.0.1.dist-info/top_level.txt,sha256=zwX0M3dfnEfklZqvvBPQXiJykChpiqt219EolC3WFRc,4
-rwe-0.0.1.dist-info/RECORD,,
+rwe-0.0.3.dist-info/licenses/LICENSE,sha256=0PfJPAoyFVWY4L80aEcQyjy4PojxzfqcOenSuXdhgBg,1072
+rwe-0.0.3.dist-info/METADATA,sha256=huTbgeLgndyhlIZHX4qOEPQi-jNFCpfRUPF2FM__EZI,23601
+rwe-0.0.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+rwe-0.0.3.dist-info/top_level.txt,sha256=zwX0M3dfnEfklZqvvBPQXiJykChpiqt219EolC3WFRc,4
+rwe-0.0.3.dist-info/RECORD,,

{rwe-0.0.1.dist-info → rwe-0.0.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{rwe-0.0.1.dist-info → rwe-0.0.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{rwe-0.0.1.dist-info → rwe-0.0.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

rwe 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

rwe 0.0.1 py3-none-any.whl → 0.0.3 py3-none-any.whl