PyPI - ltc-code - Versions diffs - 0.1.7__tar.gz → 0.1.8__tar.gz - Mend

ltc-code 0.1.7__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

{ltc_code-0.1.7 → ltc_code-0.1.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ltc-code
-Version: 0.1.7
+Version: 0.1.8
 Summary: Add your description here
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown

{ltc_code-0.1.7 → ltc_code-0.1.8}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "ltc-code"
-version = "0.1.7"
+version = "0.1.8"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.9"

{ltc_code-0.1.7 → ltc_code-0.1.8}/src/ltc_code/may27.py RENAMED Viewed

@@ -1424,9 +1424,6 @@ def lookup_sid_cepr(
 ####################################################################################
 def _parse_dob_expr(col: str) -> pl.Expr:
     return pl.coalesce(
         [
@@ -1437,6 +1434,14 @@ def _parse_dob_expr(col: str) -> pl.Expr:
     )
+def _first_word_expr(col: str) -> pl.Expr:
+    return pl.col(col).cast(pl.String).str.split(" ").list.first()
+def _second_word_expr(col: str) -> pl.Expr:
+    return pl.col(col).cast(pl.String).str.split(" ").list.get(1, null_on_oob=True)
 def _build_lookup(
     census: pl.DataFrame,
     *,
@@ -1456,13 +1461,11 @@ def _build_lookup(
         .drop_nulls(["_fname_key", "_lname_key", "_dob_key", "sid_cepr"])
         .group_by(["_fname_key", "_lname_key", "_dob_key"])
         .agg(pl.col("sid_cepr").unique().alias("_sids"))
-        .with_columns(pl.col("_sids").list.len().alias("_sid_count"))
-        .filter(pl.col("_sid_count") == 1)
         .select(
             "_fname_key",
             "_lname_key",
             "_dob_key",
-            pl.col("_sids").list.first().alias("sid_cepr"),
+            pl.col("_sids").list.sort().list.first().alias("sid_cepr"),
         )
     )
@@ -1603,6 +1606,30 @@ def build_census_lookups(*, cmo_name: str) -> dict[str, pl.DataFrame]:
         label="right lname + suffix no space",
     )
+    lookups["fname_first_word"] = _build_lookup(
+        census,
+        fname_expr=_first_word_expr("fname"),
+        lname_expr=pl.col("lname"),
+        dob_col="dob",
+        label="right fname first word",
+    )
+    lookups["lname_first_word"] = _build_lookup(
+        census,
+        fname_expr=pl.col("fname"),
+        lname_expr=_first_word_expr("lname"),
+        dob_col="dob",
+        label="right lname first word",
+    )
+    lookups["lname_second_word"] = _build_lookup(
+        census,
+        fname_expr=pl.col("fname"),
+        lname_expr=_second_word_expr("lname"),
+        dob_col="dob",
+        label="right lname second word",
+    )
     lookups["dob_imp"] = _build_lookup(
         census,
         fname_expr=pl.col("fname"),
@@ -1659,7 +1686,6 @@ def _run_match_stage(
     unmatched = stage.filter(pl.col("sid_cepr").is_null()).drop("sid_cepr")
     print(f"{label}: matched {len(matched):,}/{before:,}")
     return matched, unmatched
@@ -1710,6 +1736,9 @@ def lookup_sid_cepr(
         ("left exact -> right fname + suffix", lookups["suffix_fname"]),
         ("left exact -> right fname + suffix no space", lookups["suffix_fname_nospace"]),
         ("left exact -> right lname + suffix no space", lookups["suffix_lname_nospace"]),
+        ("left exact -> right fname first word", lookups["fname_first_word"]),
+        ("left exact -> right lname first word", lookups["lname_first_word"]),
+        ("left exact -> right lname second word", lookups["lname_second_word"]),
     ]:
         matched, unmatched = _run_match_stage(
             unmatched,
@@ -1721,6 +1750,33 @@ def lookup_sid_cepr(
         )
         matched_frames.append(matched)
+    for label, fname_expr, lname_expr in [
+        (
+            "left fname first word -> right exact",
+            _first_word_expr(f"{cols['fname']}_clean"),
+            pl.col(f"{cols['lname']}_clean"),
+        ),
+        (
+            "left lname first word -> right exact",
+            pl.col(f"{cols['fname']}_clean"),
+            _first_word_expr(f"{cols['lname']}_clean"),
+        ),
+        (
+            "left lname second word -> right exact",
+            pl.col(f"{cols['fname']}_clean"),
+            _second_word_expr(f"{cols['lname']}_clean"),
+        ),
+    ]:
+        matched, unmatched = _run_match_stage(
+            unmatched,
+            lookup=lookups["exact"],
+            fname_expr=fname_expr,
+            lname_expr=lname_expr,
+            dob_expr=pl.col(f"{cols['dob']}_clean"),
+            label=label,
+        )
+        matched_frames.append(matched)
     if "mname" in cols:
         unmatched = unmatched.with_columns(*clean_other_name(cols["mname"]))
@@ -1838,7 +1894,6 @@ def lookup_sid_cepr(
     print(f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}")
     return result
-#
 # EXAMPLE USAGE

{ltc_code-0.1.7 → ltc_code-0.1.8}/README.md RENAMED Viewed

File without changes

{ltc_code-0.1.7 → ltc_code-0.1.8}/src/ltc_code/__init__.py RENAMED Viewed

File without changes

{ltc_code-0.1.7 → ltc_code-0.1.8}/src/ltc_code/polars_dates.py RENAMED Viewed

File without changes

ltc-code 0.1.7__tar.gz → 0.1.8__tar.gz

ltc-code 0.1.7__tar.gz → 0.1.8__tar.gz