PyPI - ltc-code - Versions diffs - 0.1.4__tar.gz → 0.1.6__tar.gz - Mend

ltc-code 0.1.4__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

{ltc_code-0.1.4 → ltc_code-0.1.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ltc-code
-Version: 0.1.4
+Version: 0.1.6
 Summary: Add your description here
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown

{ltc_code-0.1.4 → ltc_code-0.1.6}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "ltc-code"
-version = "0.1.4"
+version = "0.1.6"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.9"

{ltc_code-0.1.4 → ltc_code-0.1.6}/src/ltc_code/may27.py RENAMED Viewed

@@ -1428,431 +1428,492 @@ def lookup_sid_cepr(
 def _build_lookup(
-census: pl.DataFrame,
-*,
-fname_expr: pl.Expr,
-lname_expr: pl.Expr,
-dob_col: str,
-label: str,
+    census: pl.DataFrame,
+    *,
+    fname_expr: pl.Expr,
+    lname_expr: pl.Expr,
+    dob_col: str,
+    label: str,
 ) -> pl.DataFrame:
-"""
-Build a deterministic SID lookup table.
-```
-Output schema:
-    _fname_key
-    _lname_key
-    _dob_key
-    sid_cepr
-Ambiguous keys are removed.
-"""
-lookup = (
-    census
-    .select(
-        [
-            fname_expr.alias("_fname_key"),
-            lname_expr.alias("_lname_key"),
-            pl.col(dob_col).alias("_dob_key"),
-            pl.col("sid_cepr"),
-        ]
-    )
-    .drop_nulls(
-        [
-            "_fname_key",
-            "_lname_key",
-            "_dob_key",
-            "sid_cepr",
-        ]
-    )
-    .group_by(
-        [
-            "_fname_key",
-            "_lname_key",
-            "_dob_key",
-        ]
-    )
-    .agg(
-        pl.col("sid_cepr").unique().alias("_sids")
-    )
-    .with_columns(
-        pl.col("_sids").list.len().alias("_sid_count")
-    )
-    .filter(
-        pl.col("_sid_count") == 1
-    )
-    .select(
-        [
-            "_fname_key",
-            "_lname_key",
-            "_dob_key",
-            pl.col("_sids").list.first().alias("sid_cepr"),
-        ]
-    )
-)
-print(f"built lookup: {label}")
-return lookup
-```
-def build_census_lookups(
-*,
-cmo_name: str,
-) -> dict[str, pl.DataFrame]:
-```
-try:
-    import mappings
-except ImportError:
-    import mapppings as mappings
-annual_frames = []
+    """
+    Build a deterministic SID lookup table.
-for year in range(1994, 2023):
+    Output schema:
+        _fname_key
+        _lname_key
+        _dob_key
+        sid_cepr
-    path = CENSUS_STUDENTS / f"census_student_{year}.csv"
+    Ambiguous keys are removed.
+    """
-    annual = (
-        pl.scan_csv(
-            path,
-            infer_schema=False,
-            null_values=[],
-            try_parse_dates=False,
-            ignore_errors=False,
-        )
+    lookup = (
+        census
         .select(
             [
-                "cmo_code",
+                fname_expr.alias("_fname_key"),
+                lname_expr.alias("_lname_key"),
+                pl.col(dob_col).alias("_dob_key"),
+                pl.col("sid_cepr"),
+            ]
+        )
+        .drop_nulls(
+            [
+                "_fname_key",
+                "_lname_key",
+                "_dob_key",
                 "sid_cepr",
-                "fname_clean",
-                "lname_clean",
-                "mname_clean",
-                "suff_clean",
-                "birthdate_clean",
-                "birthdate_imp",
             ]
         )
-        .rename(
-            {
-                "fname_clean": "fname",
-                "lname_clean": "lname",
-                "mname_clean": "mname",
-                "suff_clean": "suffix",
-                "birthdate_clean": "dob",
-                "birthdate_imp": "dob_imp",
-            }
+        .group_by(
+            [
+                "_fname_key",
+                "_lname_key",
+                "_dob_key",
+            ]
+        )
+        .agg(
+            pl.col("sid_cepr").unique().alias("_sids")
         )
         .with_columns(
-            pl.col("cmo_code")
-            .replace(mappings.CMO_CODE_TO_NAME)
-            .alias("cmo_name")
+            pl.col("_sids").list.len().alias("_sid_count")
         )
         .filter(
-            pl.col("cmo_name") == cmo_name
+            pl.col("_sid_count") == 1
         )
-        .with_columns(
-            *clean_name("fname"),
-            *clean_name("lname"),
-            *clean_other_name("mname"),
-            *clean_other_name("suffix"),
-            *clean_dob(col="dob"),
-            *clean_dob(col="dob_imp"),
-        )
-        .drop(
+        .select(
             [
-                "dob",
-                "dob_imp",
+                "_fname_key",
+                "_lname_key",
+                "_dob_key",
+                pl.col("_sids").list.first().alias("sid_cepr"),
             ]
         )
-        .rename(
-            {
-                "dob_clean": "dob",
-                "dob_imp_clean": "dob_imp",
-            }
-        )
     )
-    annual_frames.append(annual)
+    print(f"built lookup: {label}")
-#
-# MATERIALIZE ONCE
-#
+    return lookup
+def build_census_lookups(
+    *,
+    cmo_name: str,
+) -> dict[str, pl.DataFrame]:
+    try:
+        import mappings
+    except ImportError:
+        import mapppings as mappings
+    annual_frames = []
+    for year in range(1994, 2023):
+        path = CENSUS_STUDENTS / f"census_student_{year}.csv"
+        annual = (
+            pl.scan_csv(
+                path,
+                infer_schema=False,
+                null_values=[],
+                try_parse_dates=False,
+                ignore_errors=False,
+            )
+            .select(
+                [
+                    "cmo_code",
+                    "sid_cepr",
+                    "fname_clean",
+                    "lname_clean",
+                    "mname_clean",
+                    "suff_clean",
+                    "birthdate_clean",
+                    "birthdate_imp",
+                ]
+            )
+            .rename(
+                {
+                    "fname_clean": "fname",
+                    "lname_clean": "lname",
+                    "mname_clean": "mname",
+                    "suff_clean": "suffix",
+                    "birthdate_clean": "dob",
+                    "birthdate_imp": "dob_imp",
+                }
+            )
+            .with_columns(
+                pl.col("cmo_code")
+                .replace(mappings.CMO_CODE_TO_NAME)
+                .alias("cmo_name")
+            )
+            .filter(
+                pl.col("cmo_name") == cmo_name
+            )
+            .with_columns(
+                *clean_name("fname"),
+                *clean_name("lname"),
+                *clean_other_name("mname"),
+                *clean_other_name("suffix"),
+                *clean_dob(col="dob"),
+                *clean_dob(col="dob_imp"),
+            )
+            .drop(
+                [
+                    "dob",
+                    "dob_imp",
+                ]
+            )
+            .rename(
+                {
+                    "dob_clean": "dob",
+                    "dob_imp_clean": "dob_imp",
+                }
+            )
+        )
+        annual_frames.append(annual)
-census = (
-    pl.concat(
-        annual_frames,
-        how="vertical_relaxed",
+    #
+    # MATERIALIZE ONCE
+    #
+    census = (
+        pl.concat(
+            annual_frames,
+            how="vertical_relaxed",
+        )
+        .collect()
     )
-    .collect()
-)
-print(f"census rows: {len(census):,}")
+    print(f"census rows: {len(census):,}")
-#
-# BUILD LOOKUPS ONCE
-#
+    #
+    # BUILD LOOKUPS ONCE
+    #
-lookup_exact = _build_lookup(
-    census,
-    fname_expr=pl.col("fname"),
-    lname_expr=pl.col("lname"),
-    dob_col="dob",
-    label="exact",
-)
+    lookup_exact = _build_lookup(
+        census,
+        fname_expr=pl.col("fname"),
+        lname_expr=pl.col("lname"),
+        dob_col="dob",
+        label="exact",
+    )
-lookup_mname = _build_lookup(
-    census,
-    fname_expr=pl.concat_str(
-        [
-            pl.col("fname"),
-            pl.col("mname"),
-        ],
-        separator=" ",
-    ),
-    lname_expr=pl.col("lname"),
-    dob_col="dob",
-    label="mname",
-)
+    lookup_mname = _build_lookup(
+        census,
+        fname_expr=pl.concat_str(
+            [
+                pl.col("fname"),
+                pl.col("mname"),
+            ],
+            separator=" ",
+        ),
+        lname_expr=pl.col("lname"),
+        dob_col="dob",
+        label="mname",
+    )
-lookup_suffix = _build_lookup(
-    census,
-    fname_expr=pl.col("fname"),
-    lname_expr=pl.concat_str(
-        [
-            pl.col("lname"),
-            pl.col("suffix"),
-        ],
-        separator=" ",
-    ),
-    dob_col="dob",
-    label="suffix",
-)
+    lookup_suffix = _build_lookup(
+        census,
+        fname_expr=pl.col("fname"),
+        lname_expr=pl.concat_str(
+            [
+                pl.col("lname"),
+                pl.col("suffix"),
+            ],
+            separator=" ",
+        ),
+        dob_col="dob",
+        label="suffix",
+    )
-lookup_dob_imp = _build_lookup(
-    census,
-    fname_expr=pl.col("fname"),
-    lname_expr=pl.col("lname"),
-    dob_col="dob_imp",
-    label="dob_imp",
-)
+    lookup_dob_imp = _build_lookup(
+        census,
+        fname_expr=pl.col("fname"),
+        lname_expr=pl.col("lname"),
+        dob_col="dob_imp",
+        label="dob_imp",
+    )
-return {
-    "exact": lookup_exact,
-    "mname": lookup_mname,
-    "suffix": lookup_suffix,
-    "dob_imp": lookup_dob_imp,
-}
-```
+    return {
+        "exact": lookup_exact,
+        "mname": lookup_mname,
+        "suffix": lookup_suffix,
+        "dob_imp": lookup_dob_imp,
+    }
 def _run_match_stage(
-unmatched: pl.DataFrame,
-*,
-lookup: pl.DataFrame,
-fname_expr: pl.Expr,
-lname_expr: pl.Expr,
-dob_expr: pl.Expr,
-label: str,
+    unmatched: pl.DataFrame,
+    *,
+    lookup: pl.DataFrame,
+    fname_expr: pl.Expr,
+    lname_expr: pl.Expr,
+    dob_expr: pl.Expr,
+    label: str,
 ) -> tuple[pl.DataFrame, pl.DataFrame]:
-```
-before = len(unmatched)
+    before = len(unmatched)
-stage = (
-    unmatched
-    .with_columns(
-        [
-            fname_expr.alias("_fname_key"),
-            lname_expr.alias("_lname_key"),
-            dob_expr.alias("_dob_key"),
-        ]
-    )
-    .join(
-        lookup,
-        on=[
-            "_fname_key",
-            "_lname_key",
-            "_dob_key",
-        ],
-        how="left",
-        validate="m:1",
-    )
-    .drop(
-        [
-            "_fname_key",
-            "_lname_key",
-            "_dob_key",
-        ]
+    stage = (
+        unmatched
+        .with_columns(
+            [
+                fname_expr.alias("_fname_key"),
+                lname_expr.alias("_lname_key"),
+                dob_expr.alias("_dob_key"),
+            ]
+        )
+        .join(
+            lookup,
+            on=[
+                "_fname_key",
+                "_lname_key",
+                "_dob_key",
+            ],
+            how="left",
+            validate="m:1",
+        )
+        .drop(
+            [
+                "_fname_key",
+                "_lname_key",
+                "_dob_key",
+            ]
+        )
     )
-)
-matched = (
-    stage
-    .filter(
-        pl.col("sid_cepr").is_not_null()
+    matched = (
+        stage
+        .filter(
+            pl.col("sid_cepr").is_not_null()
+        )
     )
-)
-unmatched = (
-    stage
-    .filter(
-        pl.col("sid_cepr").is_null()
+    unmatched = (
+        stage
+        .filter(
+            pl.col("sid_cepr").is_null()
+        )
+        .drop("sid_cepr")
     )
-    .drop("sid_cepr")
-)
-added = len(matched)
+    added = len(matched)
-print(
-    f"{label}: matched {added:,}/{before:,}"
-)
+    print(
+        f"{label}: matched {added:,}/{before:,}"
+    )
-return matched, unmatched
-```
+    return matched, unmatched
 def lookup_sid_cepr(
-frame: Frame,
-*,
-cols: Mapping[str, str],
-lookups: dict[str, pl.DataFrame],
+    frame: Frame,
+    *,
+    cols: Mapping[str, str],
+    lookups: dict[str, pl.DataFrame],
 ) -> Frame:
-```
-is_lazy = isinstance(frame, pl.LazyFrame)
-current = (
-    frame.collect()
-    if is_lazy
-    else frame
-)
-#
-# CLEAN LEFT SIDE
-#
+    is_lazy = isinstance(frame, pl.LazyFrame)
+    current = frame.collect() if is_lazy else frame
-current = (
-    current
-    .with_columns(
+    current = current.with_columns(
         *clean_name(cols["fname"]),
         *clean_name(cols["lname"]),
         *clean_dob(col=cols["dob"]),
     )
-)
-matched_frames = []
-unmatched = current
-#
-# STAGE 1
-# EXACT
-#
-matched, unmatched = _run_match_stage(
-    unmatched,
-    lookup=lookups["exact"],
-    fname_expr=pl.col(f"{cols['fname']}_clean"),
-    lname_expr=pl.col(f"{cols['lname']}_clean"),
-    dob_expr=pl.col(f"{cols['dob']}_clean"),
-    label="exact",
-)
-matched_frames.append(matched)
-#
-# STAGE 2
-# MNAME
-#
-if "mname" in cols:
-    unmatched = (
-        unmatched
-        .with_columns(
-            *clean_other_name(cols["mname"])
-        )
+    current = current.with_columns(
+        _parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
     )
+    matched_frames = []
+    unmatched = current
+    # exact
     matched, unmatched = _run_match_stage(
         unmatched,
-        lookup=lookups["mname"],
-        fname_expr=pl.concat_str(
-            [
-                pl.col(f"{cols['fname']}_clean"),
-                pl.col(f"{cols['mname']}_clean"),
-            ],
-            separator=" ",
-        ),
+        lookup=lookups["exact"],
+        fname_expr=pl.col(f"{cols['fname']}_clean"),
         lname_expr=pl.col(f"{cols['lname']}_clean"),
         dob_expr=pl.col(f"{cols['dob']}_clean"),
-        label="mname",
+        label="exact",
     )
     matched_frames.append(matched)
-#
-# STAGE 3
-# SUFFIX
-#
-if "suffix" in cols:
-    unmatched = (
-        unmatched
-        .with_columns(
-            *clean_other_name(cols["suffix"])
+    # middle-name variants
+    if "mname" in cols:
+        unmatched = unmatched.with_columns(
+            *clean_other_name(cols["mname"])
         )
-    )
-    matched, unmatched = _run_match_stage(
-        unmatched,
-        lookup=lookups["suffix"],
-        fname_expr=pl.col(f"{cols['fname']}_clean"),
-        lname_expr=pl.concat_str(
-            [
+        mname_stages = [
+            (
+                "left exact -> right fname + mname",
+                lookups["mname"],
+                pl.col(f"{cols['fname']}_clean"),
                 pl.col(f"{cols['lname']}_clean"),
-                pl.col(f"{cols['suffix']}_clean"),
-            ],
-            separator=" ",
-        ),
-        dob_expr=pl.col(f"{cols['dob']}_clean"),
-        label="suffix",
-    )
+            ),
+            (
+                "left exact -> right mname + lname",
+                lookups["mname_lname"],
+                pl.col(f"{cols['fname']}_clean"),
+                pl.col(f"{cols['lname']}_clean"),
+            ),
+            (
+                "left exact -> right fname + mname no space",
+                lookups["mname_nospace"],
+                pl.col(f"{cols['fname']}_clean"),
+                pl.col(f"{cols['lname']}_clean"),
+            ),
+            (
+                "left fname + mname -> right exact",
+                lookups["exact"],
+                pl.concat_str(
+                    [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
+                    separator=" ",
+                ),
+                pl.col(f"{cols['lname']}_clean"),
+            ),
+            (
+                "left mname + lname -> right exact",
+                lookups["exact"],
+                pl.col(f"{cols['fname']}_clean"),
+                pl.concat_str(
+                    [pl.col(f"{cols['mname']}_clean"), pl.col(f"{cols['lname']}_clean")],
+                    separator=" ",
+                ),
+            ),
+            (
+                "left fname + mname no space -> right exact",
+                lookups["exact"],
+                pl.concat_str(
+                    [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
+                    separator="",
+                ),
+                pl.col(f"{cols['lname']}_clean"),
+            ),
+        ]
-    matched_frames.append(matched)
+        for label, lookup, fname_expr, lname_expr in mname_stages:
+            matched, unmatched = _run_match_stage(
+                unmatched,
+                lookup=lookup,
+                fname_expr=fname_expr,
+                lname_expr=lname_expr,
+                dob_expr=pl.col(f"{cols['dob']}_clean"),
+                label=label,
+            )
+            matched_frames.append(matched)
-#
-# STAGE 4
-# DOB IMP
-#
+    # suffix variants
+    if "suffix" in cols:
+        unmatched = unmatched.with_columns(
+            *clean_other_name(cols["suffix"])
+        )
-matched, unmatched = _run_match_stage(
-    unmatched,
-    lookup=lookups["dob_imp"],
-    fname_expr=pl.col(f"{cols['fname']}_clean"),
-    lname_expr=pl.col(f"{cols['lname']}_clean"),
-    dob_expr=pl.col(f"{cols['dob']}_clean"),
-    label="dob_imp",
-)
+        suffix_stages = [
+            (
+                "left exact -> right lname + suffix",
+                lookups["suffix"],
+                pl.col(f"{cols['fname']}_clean"),
+                pl.col(f"{cols['lname']}_clean"),
+            ),
+            (
+                "left exact -> right fname + suffix",
+                lookups["suffix_fname"],
+                pl.col(f"{cols['fname']}_clean"),
+                pl.col(f"{cols['lname']}_clean"),
+            ),
+            (
+                "left exact -> right fname + suffix no space",
+                lookups["suffix_fname_nospace"],
+                pl.col(f"{cols['fname']}_clean"),
+                pl.col(f"{cols['lname']}_clean"),
+            ),
+            (
+                "left exact -> right lname + suffix no space",
+                lookups["suffix_lname_nospace"],
+                pl.col(f"{cols['fname']}_clean"),
+                pl.col(f"{cols['lname']}_clean"),
+            ),
+            (
+                "left fname + suffix -> right exact",
+                lookups["exact"],
+                pl.concat_str(
+                    [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
+                    separator=" ",
+                ),
+                pl.col(f"{cols['lname']}_clean"),
+            ),
+            (
+                "left fname + suffix no space -> right exact",
+                lookups["exact"],
+                pl.concat_str(
+                    [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
+                    separator="",
+                ),
+                pl.col(f"{cols['lname']}_clean"),
+            ),
+            (
+                "left lname + suffix -> right exact",
+                lookups["exact"],
+                pl.col(f"{cols['fname']}_clean"),
+                pl.concat_str(
+                    [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
+                    separator=" ",
+                ),
+            ),
+            (
+                "left lname + suffix no space -> right exact",
+                lookups["exact"],
+                pl.col(f"{cols['fname']}_clean"),
+                pl.concat_str(
+                    [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
+                    separator="",
+                ),
+            ),
+        ]
-matched_frames.append(matched)
+        for label, lookup, fname_expr, lname_expr in suffix_stages:
+            matched, unmatched = _run_match_stage(
+                unmatched,
+                lookup=lookup,
+                fname_expr=fname_expr,
+                lname_expr=lname_expr,
+                dob_expr=pl.col(f"{cols['dob']}_clean"),
+                label=label,
+            )
+            matched_frames.append(matched)
+    # dob_imp variants
+    dob_imp_stages = [
+        "dob_imp",
+        "dob_imp_minus_1",
+        "dob_imp_plus_1",
+        "dob_imp_minus_2",
+        "dob_imp_plus_2",
+    ]
-#
-# FINAL
-#
+    for key in dob_imp_stages:
+        matched, unmatched = _run_match_stage(
+            unmatched,
+            lookup=lookups[key],
+            fname_expr=pl.col(f"{cols['fname']}_clean"),
+            lname_expr=pl.col(f"{cols['lname']}_clean"),
+            dob_expr=pl.col(f"{cols['dob']}_clean"),
+            label=key,
+        )
+        matched_frames.append(matched)
-result = pl.concat(
-    matched_frames + [unmatched],
-    how="diagonal_relaxed",
-)
+    result = pl.concat(
+        matched_frames + [unmatched],
+        how="diagonal_relaxed",
+    )
-print(
-    f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
-)
+    print(
+        f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
+    )
-return result
-```
+    return result
 #

{ltc_code-0.1.4 → ltc_code-0.1.6}/README.md RENAMED Viewed

File without changes

{ltc_code-0.1.4 → ltc_code-0.1.6}/src/ltc_code/__init__.py RENAMED Viewed

File without changes

{ltc_code-0.1.4 → ltc_code-0.1.6}/src/ltc_code/polars_dates.py RENAMED Viewed

File without changes

ltc-code 0.1.4__tar.gz → 0.1.6__tar.gz

ltc-code 0.1.4__tar.gz → 0.1.6__tar.gz