PyPI - ltc-code - Versions diffs - 0.1.3__tar.gz → 0.1.4__tar.gz - Mend

ltc-code 0.1.3tar.gz → 0.1.4tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

{ltc_code-0.1.3 → ltc_code-0.1.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ltc-code
-Version: 0.1.3
+Version: 0.1.4
 Summary: Add your description here
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown

{ltc_code-0.1.3 → ltc_code-0.1.4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "ltc-code"
-version = "0.1.3"
+version = "0.1.4"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.9"

{ltc_code-0.1.3 → ltc_code-0.1.4}/src/ltc_code/may27.py RENAMED Viewed

@@ -1414,3 +1414,467 @@ def lookup_sid_cepr(
     if not is_lazy:
         result = result.collect()
     return result
+####################################################################################
+# LOOK UP KEYS NEW
+####################################################################################
+def _build_lookup(
+    census: pl.DataFrame,
+    *,
+    fname_expr: pl.Expr,
+    lname_expr: pl.Expr,
+    dob_col: str,
+    label: str,
+) -> pl.DataFrame:
+    """
+    Build a deterministic SID lookup table.
+
+    Output schema:
+        _fname_key
+        _lname_key
+        _dob_key
+        sid_cepr
+
+    Ambiguous keys are removed: a (first name, last name, dob) key is kept
+    only when it maps to exactly one distinct ``sid_cepr``.
+
+    Args:
+        census: Frame holding a ``sid_cepr`` column plus whatever columns
+            ``fname_expr``, ``lname_expr`` and ``dob_col`` refer to.
+        fname_expr: Expression producing the first-name key.
+        lname_expr: Expression producing the last-name key.
+        dob_col: Name of the column used as the date-of-birth key.
+        label: Tag used only in the progress message printed at the end.
+
+    Returns:
+        One row per unambiguous key, with its single ``sid_cepr``.
+    """
+    key_cols = ["_fname_key", "_lname_key", "_dob_key"]
+
+    lookup = (
+        census
+        .select(
+            [
+                fname_expr.alias("_fname_key"),
+                lname_expr.alias("_lname_key"),
+                pl.col(dob_col).alias("_dob_key"),
+                pl.col("sid_cepr"),
+            ]
+        )
+        # Rows with any missing key part (or no SID) cannot take part in a match.
+        .drop_nulls([*key_cols, "sid_cepr"])
+        .group_by(key_cols)
+        .agg(
+            pl.col("sid_cepr").unique().alias("_sids")
+        )
+        .with_columns(
+            pl.col("_sids").list.len().alias("_sid_count")
+        )
+        # Keep only keys that identify exactly one SID.
+        .filter(
+            pl.col("_sid_count") == 1
+        )
+        .select(
+            [
+                *key_cols,
+                pl.col("_sids").list.first().alias("sid_cepr"),
+            ]
+        )
+    )
+    print(f"built lookup: {label}")
+    return lookup
+def build_census_lookups(
+    *,
+    cmo_name: str,
+) -> dict[str, pl.DataFrame]:
+    """
+    Load the yearly census student files once and build all SID lookups.
+
+    Scans ``census_student_<year>.csv`` under ``CENSUS_STUDENTS`` for the
+    years 1994 through 2022, keeps the rows whose mapped CMO name equals
+    ``cmo_name``, materializes the combined frame a single time and derives
+    four lookup tables from it via ``_build_lookup``.
+
+    Args:
+        cmo_name: CMO name to keep, compared against ``cmo_code`` after it is
+            passed through ``mappings.CMO_CODE_TO_NAME``.
+
+    Returns:
+        Dict with the keys ``"exact"``, ``"mname"``, ``"suffix"`` and
+        ``"dob_imp"``, each holding the lookup for that matching stage.
+    """
+    # NOTE(review): the fallback module name "mapppings" (three p's) looks
+    # like a typo, but it is kept as-is in case a module with that name
+    # really exists -- confirm and drop the fallback if it does not.
+    try:
+        import mappings
+    except ImportError:
+        import mapppings as mappings
+
+    annual_frames = []
+    for year in range(1994, 2023):
+        path = CENSUS_STUDENTS / f"census_student_{year}.csv"
+        annual = (
+            pl.scan_csv(
+                path,
+                # No schema inference and no date parsing: values are taken
+                # as read from the file and normalized by the clean_* helpers.
+                infer_schema=False,
+                null_values=[],
+                try_parse_dates=False,
+                ignore_errors=False,
+            )
+            .select(
+                [
+                    "cmo_code",
+                    "sid_cepr",
+                    "fname_clean",
+                    "lname_clean",
+                    "mname_clean",
+                    "suff_clean",
+                    "birthdate_clean",
+                    "birthdate_imp",
+                ]
+            )
+            .rename(
+                {
+                    "fname_clean": "fname",
+                    "lname_clean": "lname",
+                    "mname_clean": "mname",
+                    "suff_clean": "suffix",
+                    "birthdate_clean": "dob",
+                    "birthdate_imp": "dob_imp",
+                }
+            )
+            .with_columns(
+                pl.col("cmo_code")
+                .replace(mappings.CMO_CODE_TO_NAME)
+                .alias("cmo_name")
+            )
+            .filter(
+                pl.col("cmo_name") == cmo_name
+            )
+            .with_columns(
+                *clean_name("fname"),
+                *clean_name("lname"),
+                *clean_other_name("mname"),
+                *clean_other_name("suffix"),
+                *clean_dob(col="dob"),
+                *clean_dob(col="dob_imp"),
+            )
+            # Replace the raw date columns with their cleaned counterparts.
+            .drop(
+                [
+                    "dob",
+                    "dob_imp",
+                ]
+            )
+            .rename(
+                {
+                    "dob_clean": "dob",
+                    "dob_imp_clean": "dob_imp",
+                }
+            )
+        )
+        annual_frames.append(annual)
+
+    #
+    # MATERIALIZE ONCE
+    #
+    census = (
+        pl.concat(
+            annual_frames,
+            how="vertical_relaxed",
+        )
+        .collect()
+    )
+    print(f"census rows: {len(census):,}")
+
+    #
+    # BUILD LOOKUPS ONCE
+    #
+    # NOTE(review): the name keys below use "fname"/"lname"/"mname"/"suffix"
+    # (the census columns as renamed above), not the outputs of the clean_*
+    # calls, whereas lookup_sid_cepr keys the left side on "<col>_clean".
+    # Presumably the census columns are already clean -- verify that both
+    # sides are normalized the same way.
+    lookup_exact = _build_lookup(
+        census,
+        fname_expr=pl.col("fname"),
+        lname_expr=pl.col("lname"),
+        dob_col="dob",
+        label="exact",
+    )
+    lookup_mname = _build_lookup(
+        census,
+        fname_expr=pl.concat_str(
+            [
+                pl.col("fname"),
+                pl.col("mname"),
+            ],
+            separator=" ",
+        ),
+        lname_expr=pl.col("lname"),
+        dob_col="dob",
+        label="mname",
+    )
+    lookup_suffix = _build_lookup(
+        census,
+        fname_expr=pl.col("fname"),
+        lname_expr=pl.concat_str(
+            [
+                pl.col("lname"),
+                pl.col("suffix"),
+            ],
+            separator=" ",
+        ),
+        dob_col="dob",
+        label="suffix",
+    )
+    lookup_dob_imp = _build_lookup(
+        census,
+        fname_expr=pl.col("fname"),
+        lname_expr=pl.col("lname"),
+        dob_col="dob_imp",
+        label="dob_imp",
+    )
+    return {
+        "exact": lookup_exact,
+        "mname": lookup_mname,
+        "suffix": lookup_suffix,
+        "dob_imp": lookup_dob_imp,
+    }
+def _run_match_stage(
+    unmatched: pl.DataFrame,
+    *,
+    lookup: pl.DataFrame,
+    fname_expr: pl.Expr,
+    lname_expr: pl.Expr,
+    dob_expr: pl.Expr,
+    label: str,
+) -> tuple[pl.DataFrame, pl.DataFrame]:
+    """
+    Run one matching stage against a lookup table.
+
+    Temporary key columns are built from the given expressions, left-joined
+    to ``lookup`` and then dropped again. Rows that received a ``sid_cepr``
+    are returned as matched; the rest are returned without the ``sid_cepr``
+    column so they can be fed into the next stage.
+
+    Args:
+        unmatched: Rows still waiting for a match.
+        lookup: Table with ``_fname_key``, ``_lname_key``, ``_dob_key`` and
+            ``sid_cepr`` columns, at most one row per key (enforced by the
+            join's ``validate="m:1"``).
+        fname_expr: Expression producing the first-name key on the left side.
+        lname_expr: Expression producing the last-name key on the left side.
+        dob_expr: Expression producing the date-of-birth key on the left side.
+        label: Tag used only in the progress message.
+
+    Returns:
+        ``(matched, still_unmatched)``.
+    """
+    key_cols = ["_fname_key", "_lname_key", "_dob_key"]
+    before = len(unmatched)
+
+    stage = (
+        unmatched
+        .with_columns(
+            [
+                fname_expr.alias("_fname_key"),
+                lname_expr.alias("_lname_key"),
+                dob_expr.alias("_dob_key"),
+            ]
+        )
+        .join(
+            lookup,
+            on=key_cols,
+            how="left",
+            validate="m:1",
+        )
+        .drop(key_cols)
+    )
+
+    matched = stage.filter(pl.col("sid_cepr").is_not_null())
+    # Drop the all-null sid_cepr so the next stage's join can add it afresh.
+    unmatched = (
+        stage
+        .filter(pl.col("sid_cepr").is_null())
+        .drop("sid_cepr")
+    )
+
+    added = len(matched)
+    print(
+        f"{label}: matched {added:,}/{before:,}"
+    )
+    return matched, unmatched
+def lookup_sid_cepr(
+    frame: Frame,
+    *,
+    cols: Mapping[str, str],
+    lookups: dict[str, pl.DataFrame],
+) -> Frame:
+    """
+    Attach ``sid_cepr`` to ``frame`` using a cascade of matching stages.
+
+    Stages run in order, each one only on the rows the previous stages left
+    unmatched:
+
+    1. exact   -- first name, last name, dob
+    2. mname   -- first name + middle name, last name, dob
+                  (only when ``"mname"`` is in ``cols``)
+    3. suffix  -- first name, last name + suffix, dob
+                  (only when ``"suffix"`` is in ``cols``)
+    4. dob_imp -- same left-side keys as stage 1, matched against the
+                  ``"dob_imp"`` lookup
+
+    Args:
+        frame: Input data; a LazyFrame is collected before matching.
+        cols: Maps the logical names ``"fname"``, ``"lname"``, ``"dob"`` and
+            optionally ``"mname"`` / ``"suffix"`` to column names in ``frame``.
+        lookups: Lookup tables keyed ``"exact"``, ``"mname"``, ``"suffix"``
+            and ``"dob_imp"`` (as returned by ``build_census_lookups``).
+
+    Returns:
+        The matched rows of every stage followed by the rows that never
+        matched (``sid_cepr`` null), so the input row order is not preserved.
+        The result is lazy when the input was lazy, eager otherwise.
+    """
+    is_lazy = isinstance(frame, pl.LazyFrame)
+    current = frame.collect() if is_lazy else frame
+
+    #
+    # CLEAN LEFT SIDE
+    #
+    current = current.with_columns(
+        *clean_name(cols["fname"]),
+        *clean_name(cols["lname"]),
+        *clean_dob(col=cols["dob"]),
+    )
+
+    fname_clean = pl.col(f"{cols['fname']}_clean")
+    lname_clean = pl.col(f"{cols['lname']}_clean")
+    dob_clean = pl.col(f"{cols['dob']}_clean")
+
+    matched_frames = []
+    unmatched = current
+
+    #
+    # STAGE 1
+    # EXACT
+    #
+    matched, unmatched = _run_match_stage(
+        unmatched,
+        lookup=lookups["exact"],
+        fname_expr=fname_clean,
+        lname_expr=lname_clean,
+        dob_expr=dob_clean,
+        label="exact",
+    )
+    matched_frames.append(matched)
+
+    #
+    # STAGE 2
+    # MNAME
+    #
+    if "mname" in cols:
+        unmatched = unmatched.with_columns(
+            *clean_other_name(cols["mname"])
+        )
+        matched, unmatched = _run_match_stage(
+            unmatched,
+            lookup=lookups["mname"],
+            fname_expr=pl.concat_str(
+                [
+                    fname_clean,
+                    pl.col(f"{cols['mname']}_clean"),
+                ],
+                separator=" ",
+            ),
+            lname_expr=lname_clean,
+            dob_expr=dob_clean,
+            label="mname",
+        )
+        matched_frames.append(matched)
+
+    #
+    # STAGE 3
+    # SUFFIX
+    #
+    if "suffix" in cols:
+        unmatched = unmatched.with_columns(
+            *clean_other_name(cols["suffix"])
+        )
+        matched, unmatched = _run_match_stage(
+            unmatched,
+            lookup=lookups["suffix"],
+            fname_expr=fname_clean,
+            lname_expr=pl.concat_str(
+                [
+                    lname_clean,
+                    pl.col(f"{cols['suffix']}_clean"),
+                ],
+                separator=" ",
+            ),
+            dob_expr=dob_clean,
+            label="suffix",
+        )
+        matched_frames.append(matched)
+
+    #
+    # STAGE 4
+    # DOB IMP
+    #
+    # Same left-side keys as stage 1; only the lookup differs.
+    matched, unmatched = _run_match_stage(
+        unmatched,
+        lookup=lookups["dob_imp"],
+        fname_expr=fname_clean,
+        lname_expr=lname_clean,
+        dob_expr=dob_clean,
+        label="dob_imp",
+    )
+    matched_frames.append(matched)
+
+    #
+    # FINAL
+    #
+    # Diagonal concat: stage frames can differ in columns (the *_clean
+    # columns of later stages, and sid_cepr missing from the unmatched rest).
+    result = pl.concat(
+        matched_frames + [unmatched],
+        how="diagonal_relaxed",
+    )
+    print(
+        f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
+    )
+
+    # Hand back the same kind of frame the caller passed in.
+    return result.lazy() if is_lazy else result
+#
+# EXAMPLE USAGE
+#
+# NOTE(review): this runs at import time and relies on a frame named `df`
+# that is not defined in the visible part of the file -- confirm it exists
+# earlier in the module.
+lookups = build_census_lookups(cmo_name="Aspire")
+
+result = df.pipe(
+    lookup_sid_cepr,
+    cols={
+        "fname": "fname",
+        "lname": "lname",
+        "mname": "mname",
+        "suffix": "suffix",
+        "dob": "dob",
+    },
+    lookups=lookups,
+)

{ltc_code-0.1.3 → ltc_code-0.1.4}/README.md RENAMED Viewed

File without changes

{ltc_code-0.1.3 → ltc_code-0.1.4}/src/ltc_code/__init__.py RENAMED Viewed

File without changes

{ltc_code-0.1.3 → ltc_code-0.1.4}/src/ltc_code/polars_dates.py RENAMED Viewed

File without changes

ltc-code 0.1.3__tar.gz → 0.1.4__tar.gz

ltc-code 0.1.3tar.gz → 0.1.4tar.gz