PyPI - clinicedc - Versions diffs - 2.0.11__py3-none-any.whl → 2.0.13__py3-none-any.whl - Mend

clinicedc 2.0.11__py3-none-any.whl → 2.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of clinicedc might be problematic. Click here for more details.

Files changed (137) hide show

edc_analytics/custom_tables/ogtt.py ADDED Viewed

@@ -0,0 +1,95 @@
+import pandas as pd
+from edc_constants.constants import FEMALE, MALE
+from ..constants import MEAN_95CI, N_ONLY, N_WITH_COL_PROP, N_WITH_ROW_PROP
+from ..row import RowDefinition, RowDefinitions
+from ..table import Table
+class OgttTable(Table):
+    def __init__(self, main_df: pd.DataFrame = None):
+        super().__init__(
+            colname="ogtt",
+            main_df=main_df,
+            title="OGTT (mmol/L) categories",
+        )
+    @property
+    def row_definitions(self) -> RowDefinitions:
+        df_tmp = self.main_df.copy()
+        row_defs = RowDefinitions(reverse_rows=False)
+        row0 = RowDefinition(
+            title=self.title,
+            label=self.default_sublabel,
+            condition=(df_tmp["gender"].notna()),
+            columns={FEMALE: (N_ONLY, 2), MALE: (N_ONLY, 2), "All": (N_ONLY, 2)},
+            drop=False,
+        )
+        row_defs.add(row0)
+        columns = {
+            FEMALE: (N_WITH_COL_PROP, 2),
+            MALE: (N_WITH_COL_PROP, 2),
+            "All": (N_WITH_ROW_PROP, 2),
+        }
+        row_defs.add(
+            RowDefinition(
+                colname=self.colname,
+                label="Not fasted",
+                condition=(self.main_df["fasting_ogtt_hrs"] < 8.0),
+                columns=columns,
+                drop=True,
+            )
+        )
+        row_defs.add(
+            RowDefinition(
+                colname=self.colname,
+                label="<7.8",
+                condition=(self.main_df[self.colname] < 7.8),
+                columns=columns,
+                drop=False,
+            )
+        )
+        row_defs.add(
+            RowDefinition(
+                colname=self.colname,
+                label="7.8-11.1",
+                condition=(self.main_df[self.colname] >= 7.8)
+                & (self.main_df[self.colname] < 11.1),
+                columns=columns,
+                drop=False,
+            )
+        )
+        row_defs.add(
+            RowDefinition(
+                colname=self.colname,
+                label="11.1 and above",
+                condition=(self.main_df[self.colname] >= 11.1),
+                columns=columns,
+                drop=False,
+            )
+        )
+        row_defs.add(
+            RowDefinition(
+                colname=self.colname,
+                label="not measured",
+                condition=(self.main_df[self.colname].isna()),
+                columns=columns,
+                drop=False,
+            )
+        )
+        columns = {
+            FEMALE: (MEAN_95CI, 2),
+            MALE: (MEAN_95CI, 2),
+            "All": (MEAN_95CI, 2),
+        }
+        row_defs.add(
+            RowDefinition(
+                colname=self.colname,
+                label="Mean (95% CI)",
+                condition=(self.main_df[self.colname].notna()),
+                columns=columns,
+                drop=False,
+            )
+        )
+        return row_defs

edc_analytics/custom_tables/waist.py ADDED Viewed

@@ -0,0 +1,105 @@
+import pandas as pd
+from edc_constants.constants import FEMALE, MALE
+from ..constants import (
+    MEDIAN_IQR,
+    MEDIAN_RANGE,
+    N_ONLY,
+    N_WITH_COL_PROP,
+    N_WITH_ROW_PROP,
+)
+from ..row import RowDefinition, RowDefinitions
+from ..table import Table
+class WaistCircumferenceTable(Table):
+    def __init__(self, main_df: pd.DataFrame = None):
+        super().__init__(
+            colname="waist_circumference",
+            main_df=main_df,
+            title="Waist circumference (cm)",
+        )
+    @property
+    def row_definitions(self) -> RowDefinitions:
+        df_tmp = self.main_df.copy()
+        row_defs = RowDefinitions(reverse_rows=False)
+        row0 = RowDefinition(
+            title=self.title,
+            label=self.default_sublabel,
+            condition=(df_tmp["gender"].notna()),
+            columns={FEMALE: (N_ONLY, 2), MALE: (N_ONLY, 2), "All": (N_ONLY, 2)},
+            drop=False,
+        )
+        row_defs.add(row0)
+        columns = {
+            FEMALE: (N_WITH_COL_PROP, 2),
+            MALE: (N_WITH_COL_PROP, 2),
+            "All": (N_WITH_ROW_PROP, 2),
+        }
+        cond_lt_102 = (
+            (self.main_df[self.colname] < 102.0) & (self.main_df["gender"] == "Male")
+        ) | ((self.main_df[self.colname] < 88.0) & (self.main_df["gender"] == "Female"))
+        row_defs.add(
+            RowDefinition(
+                colname=self.colname,
+                label="Women<88 / Men<102",
+                condition=cond_lt_102,
+                columns=columns,
+                drop=False,
+            )
+        )
+        cond_gte_102 = (
+            (self.main_df[self.colname] >= 102.0) & (self.main_df["gender"] == "Male")
+        ) | ((self.main_df[self.colname] >= 88.0) & (self.main_df["gender"] == "Female"))
+        row_defs.add(
+            RowDefinition(
+                colname=self.colname,
+                label="Women>=88 / Men>=102",
+                condition=cond_gte_102,
+                columns=columns,
+                drop=False,
+            )
+        )
+        cond_gte_missing = self.main_df[self.colname].isna()
+        row_defs.add(
+            RowDefinition(
+                colname=self.colname,
+                label="not measured",
+                condition=cond_gte_missing,
+                columns=columns,
+                drop=False,
+            )
+        )
+        columns = {
+            FEMALE: (MEDIAN_RANGE, 2),
+            MALE: (MEDIAN_RANGE, 2),
+            "All": (MEDIAN_RANGE, 2),
+        }
+        row_defs.add(
+            RowDefinition(
+                colname=self.colname,
+                label="Median (range)",
+                condition=(self.main_df[self.colname].notna()),
+                columns=columns,
+            )
+        )
+        columns = {
+            FEMALE: (MEDIAN_IQR, 2),
+            MALE: (MEDIAN_IQR, 2),
+            "All": (MEDIAN_IQR, 2),
+        }
+        row_defs.add(
+            RowDefinition(
+                colname=self.colname,
+                label="Median (IQR)",
+                condition=(self.main_df[self.colname].notna()),
+                columns=columns,
+            )
+        )
+        return row_defs

edc_analytics/data.py ADDED Viewed

@@ -0,0 +1,36 @@
+from datetime import datetime
+from pathlib import Path
+import pandas as pd
+class Data:
+    def __init__(
+        self,
+        label: str,
+        table_df: pd.DataFrame,
+        data_df: pd.DataFrame,
+        filename_prefix: str,
+        folder: str | None = None,
+    ):
+        self.label = label
+        self.table_df = table_df
+        self.data_df = data_df
+        self.filename_prefix = filename_prefix
+        self.folder = folder or "~/"
+    def __repr__(self):
+        return f"Data({self.label}) <obs={len(self.data_df)}>"
+    def to_csv(
+        self, folder: str | None = None, filename: str | None = None, cols: int | None = None
+    ):
+        folder = folder or self.folder
+        cols = cols or 5
+        datestamp = datetime.now().strftime("%Y%m%d%H%M")
+        filename = filename or f"{self.filename_prefix}_table_{self.label}_{datestamp}.csv"
+        path = Path(folder) / filename
+        self.table_df.iloc[:, :cols].to_csv(
+            path_or_buf=path, encoding="utf-8", index=0, sep="|"
+        )

edc_analytics/row/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .row_definition import RowDefinition
+from .row_definitions import RowDefinitions
+from .row_statistics import RowStatistics
+from .row_statistics_with_gender import RowStatisticsWithGender

edc_analytics/row/row_definition.py ADDED Viewed

@@ -0,0 +1,43 @@
+import pandas as pd
+from ..constants import N_WITH_ROW_PROP, STATISTICS
+from ..styler import StylerError
+class RowDefinition:
+    def __init__(
+        self,
+        title: str | None = None,
+        label: str = None,
+        colname: str | None = None,
+        condition: pd.Series = None,
+        columns: dict[str, tuple[str, int]] = None,
+        drop: bool | None = None,
+    ):
+        """
+        A row definition is passed by Table to the RowStatistics
+        class.
+        :param title:
+        :param label:
+        :param colname:
+        :param condition:
+        :param columns: dictionary of label: (statistic, places)=
+            {
+                FEMALE: (N_WITH_ROW_PROP, 2),
+                MALE: (N_WITH_ROW_PROP, 2),
+                "All": (N_ONLY, 2),
+            }
+        :param drop: drops rows from the source dataframe once used by
+          the row definition.
+        """
+        self.title = title or ""
+        self.label = label
+        self.colname = colname
+        self.condition = condition  # condition to filter DF
+        self.drop = False if drop is None else drop  # drop index of previous row numerator
+        self.columns = columns or {"All": (N_WITH_ROW_PROP, 2)}
+        for col, style_info in self.columns.items():
+            style, _ = style_info
+            if style not in STATISTICS:
+                raise StylerError(f"Unknown statistic. Got `{style}` for column `{col}`.")

edc_analytics/row/row_definitions.py ADDED Viewed

@@ -0,0 +1,32 @@
+from typing import Iterable
+from .row_definition import RowDefinition
+from .row_statistics import RowStatistics
+from .row_statistics_with_gender import RowStatisticsWithGender
+class RowDefinitions:
+    """Collection of RowDefinitions"""
+    def __init__(
+        self,
+        colname: str = None,
+        row_statistics_cls: RowStatistics | RowStatisticsWithGender = None,
+        reverse_rows: bool = False,
+    ):
+        self.definitions: list[RowDefinition] = []
+        self.row_statistics_cls = row_statistics_cls
+        self.colname = colname
+        self.reverse_rows = reverse_rows
+    def add(self, row_definition: RowDefinition):
+        self.definitions.append(row_definition)
+    def extend(self, row_definition: list[RowDefinition]):
+        self.definitions.extend(row_definition)
+    def reverse(self):
+        self.definitions.reverse()
+    def __iter__(self) -> Iterable[RowDefinition]:
+        return iter(self.definitions)

edc_analytics/row/row_statistics.py ADDED Viewed

@@ -0,0 +1,88 @@
+import numpy as np
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+from ..constants import COUNT_COLUMN, N_ONLY
+from ..styler import Styler
+class RowStatistics:
+    """A class that calculates descriptive statistics for an
+    indictor.
+    """
+    def __init__(
+        self,
+        colname: str = None,
+        df_numerator: pd.DataFrame = None,
+        df_denominator: pd.DataFrame = None,
+        df_all: pd.DataFrame = None,
+        coltotal: float | int | None = None,
+        style: str | None = None,
+        places: int | None = None,
+    ):
+        self.places = 2 if places is None else places
+        self.style = style or N_ONLY
+        # counts (6 columns)
+        self.count = 0.0 if df_numerator.empty else len(df_numerator)
+        self.total = len(df_all)
+        self.coltotal = coltotal or len(df_denominator)
+        self.rowtotal = self.count  # rowtotal or len(df_denominator)
+        self.colprop = self.count / self.coltotal if self.count else 0.0
+        self.rowprop = self.count / self.total if self.count else 0.0
+        # numeric stats (9 columns)
+        if colname and not df_numerator.empty and is_numeric_dtype(df_numerator[colname]):
+            stats = df_numerator[colname].describe()
+            self.mean = stats.loc["mean"]
+            self.sd = stats.loc["std"]
+            self.min = stats.loc["min"]
+            self.max = stats.loc["max"]
+            self.q25, self.q50, self.q75 = df_numerator[colname].quantile([0.25, 0.50, 0.75])
+            stats = df_numerator[colname].agg(["mean", "sem"])
+            self.ci95l = stats.loc["mean"] - 1.96 * stats.loc["sem"]
+            self.ci95h = stats.loc["mean"] + 1.96 * stats.loc["sem"]
+        else:
+            (
+                self.mean,
+                self.sd,
+                self.min,
+                self.max,
+                self.q25,
+                self.q50,
+                self.q75,
+                self.ci95l,
+                self.ci95h,
+            ) = [np.nan] * 9
+    def values_list(self) -> list:
+        return list(self.as_dict().values())
+    def labels(self) -> list:
+        return list(self.as_dict().keys())
+    def as_dict(self):
+        return {
+            COUNT_COLUMN: self.count,
+            "coltotal": self.coltotal,
+            "rowtotal": self.rowtotal,
+            "total": self.total,
+            "colprop": self.colprop,
+            "rowprop": self.rowprop,
+            "mean": self.mean,
+            "sd": self.sd,
+            "min": self.min,
+            "max": self.max,
+            "q25": self.q25,
+            "q50": self.q50,
+            "q75": self.q75,
+            "ci95l": self.ci95l,
+            "ci95h": self.ci95h,
+        }
+    def formatted_cell(self) -> str:
+        return Styler(style=self.style, statistics=self, places=self.places).value
+    def row(self):
+        return [self.formatted_cell()] + self.values_list()

edc_analytics/row/row_statistics_with_gender.py ADDED Viewed

@@ -0,0 +1,115 @@
+import pandas as pd
+from edc_constants.constants import FEMALE, MALE
+from .row_statistics import RowStatistics
+class RowStatisticsError(Exception):
+    pass
+class RowStatisticsFemale(RowStatistics):
+    def __init__(
+        self,
+        df_numerator: pd.DataFrame = None,
+        df_denominator: pd.DataFrame = None,
+        **kwargs,
+    ):
+        df_numerator = df_numerator.loc[df_numerator["gender"] == FEMALE]
+        super().__init__(
+            df_numerator=df_numerator,
+            df_denominator=df_denominator,
+            **kwargs,
+        )
+class RowStatisticsMale(RowStatistics):
+    def __init__(
+        self,
+        df_numerator: pd.DataFrame = None,
+        df_denominator: pd.DataFrame = None,
+        **kwargs,
+    ):
+        df_numerator = df_numerator.loc[df_numerator["gender"] == MALE]
+        super().__init__(
+            df_numerator=df_numerator,
+            df_denominator=df_denominator,
+            **kwargs,
+        )
+class RowStatisticsWithGender(RowStatistics):
+    def __init__(
+        self,
+        columns: dict[str, tuple[str, int]] = None,
+        df_all: pd.DataFrame = None,
+        coltotal: float | int | None = None,
+        **kwargs,
+    ):
+        """
+        custom row for displaying with gender columns: F, M, All
+        :param colname:
+        :param df_numerator:
+        :param df_denominator:
+        :param df_all:
+        :param columns: dict of {col: (style name, places)} where col
+               is "F", "M" or "All"
+        Note: the default df["gender"] is "M" or "F".
+        """
+        female_style, female_places = columns[FEMALE]
+        male_style, male_places = columns[MALE]
+        all_style, all_places = columns["All"]
+        super().__init__(
+            places=all_places,
+            style=all_style,
+            df_all=df_all,
+            coltotal=coltotal,
+            **kwargs,
+        )
+        self.m = RowStatisticsMale(
+            places=male_places,
+            style=male_style,
+            coltotal=len(df_all[df_all["gender"] == MALE]),
+            df_all=df_all,
+            **kwargs,
+        )
+        self.f = RowStatisticsFemale(
+            places=female_places,
+            style=female_style,
+            coltotal=len(df_all[df_all["gender"] == FEMALE]),
+            df_all=df_all,
+            **kwargs,
+        )
+    def values_list(self, style: str | None = None, places: int | None = None) -> list:
+        values_list = super().values_list()
+        return (
+            list(self.formatted_cells().values())
+            + self.f.values_list()
+            + self.m.values_list()
+            + values_list
+        )
+    def labels(self) -> list[str]:
+        labels = super().labels()
+        return (
+            list(self.formatted_cells().keys())
+            + [f"f{x}" for x in self.f.labels()]
+            + [f"m{x}" for x in self.m.labels()]
+            + labels
+        )
+    def row(self):
+        return [self.formatted_cells()] + self.values_list()
+    def formatted_cells(self) -> dict:
+        formatted_cell = super().formatted_cell()
+        return dict(
+            F=self.f.formatted_cell(),
+            M=self.m.formatted_cell(),
+            All=formatted_cell,
+        )

edc_analytics/stata/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .get_stata_labels_from_model import get_stata_labels_from_model

edc_analytics/stata/get_stata_labels_from_model.py ADDED Viewed

@@ -0,0 +1,44 @@
+import re
+import pandas as pd
+from bs4 import BeautifulSoup
+from django.apps import apps as django_apps
+def strip_html(text: str) -> str:
+    if pd.isna(text):
+        return text
+    if bool(re.search(r"<[^>]+>", text)):
+        return BeautifulSoup(text, "html.parser").get_text()
+    return text
+# def get_stata_labels_from_model(df: pd.DataFrame, model: str, suffix: str) -> dict[str:str]:
+#     """Generate STATA labels"""
+#     labels = {}
+#     _, model_name = model.split(".")
+#     model_cls = django_apps.get_model(model)
+#     for fld in model_cls._meta.get_fields():
+#         if f"{fld.name}_{suffix}" in df.columns:
+#             labels.update({f"{fld.name}_{suffix}": strip_html(str(fld.verbose_name)[:80])})
+#     return labels
+def get_stata_labels_from_model(
+    df: pd.DataFrame, model: str, suffix: str | None = None
+) -> dict[str:str]:
+    """Generate STATA labels"""
+    labels = {}
+    _, model_name = model.split(".")
+    model_cls = django_apps.get_model(model)
+    for fld in model_cls._meta.get_fields():
+        if suffix:
+            if f"{fld.name}_{suffix}" in df.columns:
+                labels.update({f"{fld.name}_{suffix}": strip_html(str(fld.verbose_name)[:80])})
+        else:
+            if f"{fld.name}_{suffix}" in df.columns:
+                try:
+                    labels.update({fld.name: strip_html(str(fld.verbose_name)[:80])})
+                except AttributeError:
+                    pass
+    return labels

edc_analytics/styler.py ADDED Viewed

@@ -0,0 +1,93 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from .constants import (
+    MEAN_95CI,
+    MEAN_RANGE,
+    MEAN_SD,
+    MEDIAN_IQR,
+    MEDIAN_RANGE,
+    N_MEAN,
+    N_ONLY,
+    N_WITH_COL_PROP,
+    N_WITH_ROW_PROP,
+    STATISTICS,
+)
+if TYPE_CHECKING:
+    from .row import RowStatistics
+class StylerError(Exception):
+    pass
+class Styler:
+    """A class to format statistics per the format label given."""
+    def __init__(
+        self,
+        style: str = None,
+        statistics: RowStatistics = None,
+        places: int | None = None,
+    ):
+        self.style = style
+        self.row = statistics
+        self.places = places if places is not None else 2
+        if style not in STATISTICS:
+            raise StylerError(f"Unknown style. Got `{style}`.")
+    @property
+    def value(self):
+        """Make sure values are numerics first!
+        For example, when preparing the dataframe convert values to
+        numerics:
+            df[cols] = df[cols].apply(pd.to_numeric)
+        """
+        col_value = "no style"
+        if self.style == N_WITH_ROW_PROP:
+            col_value = (
+                f"{self.row.count} ("
+                f"{round(self.row.rowprop * 100, self.places):.{self.places}f}%)"
+            )
+        elif self.style == N_ONLY:
+            col_value = f"{self.row.count}"
+        elif self.style == N_WITH_COL_PROP:
+            col_value = (
+                f"{self.row.count:.{self.places}f} "
+                f"({round(self.row.colprop * 100, self.places):.{self.places}f}%)"
+            )
+        elif self.style == N_MEAN:
+            col_value = f"{round(self.row.mean, self.places):.{self.places}f}"
+        elif self.style == MEDIAN_IQR:
+            col_value = (
+                f"{round(self.row.q50, self.places):.{self.places}f} "
+                f"({round(self.row.q25, self.places)},"
+                f"{round(self.row.q75, self.places):.{self.places}f})"
+            )
+        elif self.style == MEDIAN_RANGE:
+            col_value = (
+                f"{round(self.row.q50, self.places):.{self.places}f} "
+                f"({round(self.row.min, self.places):.{self.places}f}, "
+                f"{round(self.row.max, self.places):.{self.places}f})"
+            )
+        elif self.style == MEAN_RANGE:
+            col_value = (
+                f"{round(self.row.mean, self.places):.{self.places}f} "
+                f"({round(self.row.min, self.places):.{self.places}f}, "
+                f"{round(self.row.max, self.places):.{self.places}f})"
+            )
+        elif self.style == MEAN_SD:
+            col_value = (
+                f"{round(self.row.mean, self.places):.{self.places}f} "
+                f"({round(self.row.sd, self.places):.{self.places}f})"
+            )
+        elif self.style == MEAN_95CI:
+            col_value = (
+                f"{round(self.row.mean, self.places):.{self.places}f} "
+                f"({round(self.row.ci95l, self.places):.{self.places}f}, "
+                f"{round(self.row.ci95h, self.places):.{self.places}f})"
+            )
+        return col_value

clinicedc 2.0.11__py3-none-any.whl → 2.0.13__py3-none-any.whl

Potentially problematic release.

clinicedc 2.0.11__py3-none-any.whl → 2.0.13__py3-none-any.whl