PyPI - ltc-code - Versions diffs - 0.1.3__tar.gz → 0.1.5__tar.gz - Mend

ltc-code 0.1.3.tar.gz → 0.1.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

{ltc_code-0.1.3 → ltc_code-0.1.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ltc-code
-Version: 0.1.3
+Version: 0.1.5
 Summary: Add your description here
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown

{ltc_code-0.1.3 → ltc_code-0.1.5}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "ltc-code"
-version = "0.1.3"
+version = "0.1.5"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.9"

{ltc_code-0.1.3 → ltc_code-0.1.5}/src/ltc_code/may27.py RENAMED Viewed

@@ -1414,3 +1414,459 @@ def lookup_sid_cepr(
     if not is_lazy:
         result = result.collect()
     return result
+####################################################################################
+# LOOK UP KEYS NEW
+####################################################################################
+def _build_lookup(
+    census: pl.DataFrame,
+    *,
+    fname_expr: pl.Expr,
+    lname_expr: pl.Expr,
+    dob_col: str,
+    label: str,
+) -> pl.DataFrame:
+    """
+    Build a deterministic SID lookup table.
+    Output schema:
+        _fname_key
+        _lname_key
+        _dob_key
+        sid_cepr
+    Ambiguous keys are removed.
+    """
+    lookup = (
+        census
+        .select(
+            [
+                fname_expr.alias("_fname_key"),
+                lname_expr.alias("_lname_key"),
+                pl.col(dob_col).alias("_dob_key"),
+                pl.col("sid_cepr"),
+            ]
+        )
+        .drop_nulls(
+            [
+                "_fname_key",
+                "_lname_key",
+                "_dob_key",
+                "sid_cepr",
+            ]
+        )
+        .group_by(
+            [
+                "_fname_key",
+                "_lname_key",
+                "_dob_key",
+            ]
+        )
+        .agg(
+            pl.col("sid_cepr").unique().alias("_sids")
+        )
+        .with_columns(
+            pl.col("_sids").list.len().alias("_sid_count")
+        )
+        .filter(
+            pl.col("_sid_count") == 1
+        )
+        .select(
+            [
+                "_fname_key",
+                "_lname_key",
+                "_dob_key",
+                pl.col("_sids").list.first().alias("sid_cepr"),
+            ]
+        )
+    )
+    print(f"built lookup: {label}")
+    return lookup
+def build_census_lookups(
+    *,
+    cmo_name: str,
+) -> dict[str, pl.DataFrame]:
+    try:
+        import mappings
+    except ImportError:
+        import mapppings as mappings
+    annual_frames = []
+    for year in range(1994, 2023):
+        path = CENSUS_STUDENTS / f"census_student_{year}.csv"
+        annual = (
+            pl.scan_csv(
+                path,
+                infer_schema=False,
+                null_values=[],
+                try_parse_dates=False,
+                ignore_errors=False,
+            )
+            .select(
+                [
+                    "cmo_code",
+                    "sid_cepr",
+                    "fname_clean",
+                    "lname_clean",
+                    "mname_clean",
+                    "suff_clean",
+                    "birthdate_clean",
+                    "birthdate_imp",
+                ]
+            )
+            .rename(
+                {
+                    "fname_clean": "fname",
+                    "lname_clean": "lname",
+                    "mname_clean": "mname",
+                    "suff_clean": "suffix",
+                    "birthdate_clean": "dob",
+                    "birthdate_imp": "dob_imp",
+                }
+            )
+            .with_columns(
+                pl.col("cmo_code")
+                .replace(mappings.CMO_CODE_TO_NAME)
+                .alias("cmo_name")
+            )
+            .filter(
+                pl.col("cmo_name") == cmo_name
+            )
+            .with_columns(
+                *clean_name("fname"),
+                *clean_name("lname"),
+                *clean_other_name("mname"),
+                *clean_other_name("suffix"),
+                *clean_dob(col="dob"),
+                *clean_dob(col="dob_imp"),
+            )
+            .drop(
+                [
+                    "dob",
+                    "dob_imp",
+                ]
+            )
+            .rename(
+                {
+                    "dob_clean": "dob",
+                    "dob_imp_clean": "dob_imp",
+                }
+            )
+        )
+        annual_frames.append(annual)
+    #
+    # MATERIALIZE ONCE
+    #
+    census = (
+        pl.concat(
+            annual_frames,
+            how="vertical_relaxed",
+        )
+        .collect()
+    )
+    print(f"census rows: {len(census):,}")
+    #
+    # BUILD LOOKUPS ONCE
+    #
+    lookup_exact = _build_lookup(
+        census,
+        fname_expr=pl.col("fname"),
+        lname_expr=pl.col("lname"),
+        dob_col="dob",
+        label="exact",
+    )
+    lookup_mname = _build_lookup(
+        census,
+        fname_expr=pl.concat_str(
+            [
+                pl.col("fname"),
+                pl.col("mname"),
+            ],
+            separator=" ",
+        ),
+        lname_expr=pl.col("lname"),
+        dob_col="dob",
+        label="mname",
+    )
+    lookup_suffix = _build_lookup(
+        census,
+        fname_expr=pl.col("fname"),
+        lname_expr=pl.concat_str(
+            [
+                pl.col("lname"),
+                pl.col("suffix"),
+            ],
+            separator=" ",
+        ),
+        dob_col="dob",
+        label="suffix",
+    )
+    lookup_dob_imp = _build_lookup(
+        census,
+        fname_expr=pl.col("fname"),
+        lname_expr=pl.col("lname"),
+        dob_col="dob_imp",
+        label="dob_imp",
+    )
+    return {
+        "exact": lookup_exact,
+        "mname": lookup_mname,
+        "suffix": lookup_suffix,
+        "dob_imp": lookup_dob_imp,
+    }
+def _run_match_stage(
+    unmatched: pl.DataFrame,
+    *,
+    lookup: pl.DataFrame,
+    fname_expr: pl.Expr,
+    lname_expr: pl.Expr,
+    dob_expr: pl.Expr,
+    label: str,
+) -> tuple[pl.DataFrame, pl.DataFrame]:
+    before = len(unmatched)
+    stage = (
+        unmatched
+        .with_columns(
+            [
+                fname_expr.alias("_fname_key"),
+                lname_expr.alias("_lname_key"),
+                dob_expr.alias("_dob_key"),
+            ]
+        )
+        .join(
+            lookup,
+            on=[
+                "_fname_key",
+                "_lname_key",
+                "_dob_key",
+            ],
+            how="left",
+            validate="m:1",
+        )
+        .drop(
+            [
+                "_fname_key",
+                "_lname_key",
+                "_dob_key",
+            ]
+        )
+    )
+    matched = (
+        stage
+        .filter(
+            pl.col("sid_cepr").is_not_null()
+        )
+    )
+    unmatched = (
+        stage
+        .filter(
+            pl.col("sid_cepr").is_null()
+        )
+        .drop("sid_cepr")
+    )
+    added = len(matched)
+    print(
+        f"{label}: matched {added:,}/{before:,}"
+    )
+    return matched, unmatched
+def lookup_sid_cepr(
+    frame: Frame,
+    *,
+    cols: Mapping[str, str],
+    lookups: dict[str, pl.DataFrame],
+) -> Frame:
+    is_lazy = isinstance(frame, pl.LazyFrame)
+    current = (
+        frame.collect()
+        if is_lazy
+        else frame
+    )
+    #
+    # CLEAN LEFT SIDE
+    #
+    current = (
+        current
+        .with_columns(
+            *clean_name(cols["fname"]),
+            *clean_name(cols["lname"]),
+            *clean_dob(col=cols["dob"]),
+        )
+    )
+    matched_frames = []
+    unmatched = current
+    #
+    # STAGE 1
+    # EXACT
+    #
+    matched, unmatched = _run_match_stage(
+        unmatched,
+        lookup=lookups["exact"],
+        fname_expr=pl.col(f"{cols['fname']}_clean"),
+        lname_expr=pl.col(f"{cols['lname']}_clean"),
+        dob_expr=pl.col(f"{cols['dob']}_clean"),
+        label="exact",
+    )
+    matched_frames.append(matched)
+    #
+    # STAGE 2
+    # MNAME
+    #
+    if "mname" in cols:
+        unmatched = (
+            unmatched
+            .with_columns(
+                *clean_other_name(cols["mname"])
+            )
+        )
+        matched, unmatched = _run_match_stage(
+            unmatched,
+            lookup=lookups["mname"],
+            fname_expr=pl.concat_str(
+                [
+                    pl.col(f"{cols['fname']}_clean"),
+                    pl.col(f"{cols['mname']}_clean"),
+                ],
+                separator=" ",
+            ),
+            lname_expr=pl.col(f"{cols['lname']}_clean"),
+            dob_expr=pl.col(f"{cols['dob']}_clean"),
+            label="mname",
+        )
+        matched_frames.append(matched)
+    #
+    # STAGE 3
+    # SUFFIX
+    #
+    if "suffix" in cols:
+        unmatched = (
+            unmatched
+            .with_columns(
+                *clean_other_name(cols["suffix"])
+            )
+        )
+        matched, unmatched = _run_match_stage(
+            unmatched,
+            lookup=lookups["suffix"],
+            fname_expr=pl.col(f"{cols['fname']}_clean"),
+            lname_expr=pl.concat_str(
+                [
+                    pl.col(f"{cols['lname']}_clean"),
+                    pl.col(f"{cols['suffix']}_clean"),
+                ],
+                separator=" ",
+            ),
+            dob_expr=pl.col(f"{cols['dob']}_clean"),
+            label="suffix",
+        )
+        matched_frames.append(matched)
+    #
+    # STAGE 4
+    # DOB IMP
+    #
+    matched, unmatched = _run_match_stage(
+        unmatched,
+        lookup=lookups["dob_imp"],
+        fname_expr=pl.col(f"{cols['fname']}_clean"),
+        lname_expr=pl.col(f"{cols['lname']}_clean"),
+        dob_expr=pl.col(f"{cols['dob']}_clean"),
+        label="dob_imp",
+    )
+    matched_frames.append(matched)
+    #
+    # FINAL
+    #
+    result = pl.concat(
+        matched_frames + [unmatched],
+        how="diagonal_relaxed",
+    )
+    print(
+        f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
+    )
+    return result
+# EXAMPLE USAGE
+# NOTE(review): these statements sit at module level, so they run on import; `df` is not defined in the visible part of this module --
+# confirm it exists before this point, or move the example behind a script guard.
+lookups = build_census_lookups(
+cmo_name="Aspire",
+)
+result = (
+df
+.pipe(
+lookup_sid_cepr,
+cols={
+"fname": "fname",
+"lname": "lname",
+"mname": "mname",
+"suffix": "suffix",
+"dob": "dob",
+},
+lookups=lookups,
+)
+)

{ltc_code-0.1.3 → ltc_code-0.1.5}/README.md RENAMED Viewed

File without changes

{ltc_code-0.1.3 → ltc_code-0.1.5}/src/ltc_code/__init__.py RENAMED Viewed

File without changes

{ltc_code-0.1.3 → ltc_code-0.1.5}/src/ltc_code/polars_dates.py RENAMED Viewed

File without changes

ltc-code 0.1.3__tar.gz → 0.1.5__tar.gz

ltc-code 0.1.3.tar.gz → 0.1.5.tar.gz