ltc-code 0.1.7__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ltc_code-0.1.7 → ltc_code-0.1.8}/PKG-INFO +1 -1
- {ltc_code-0.1.7 → ltc_code-0.1.8}/pyproject.toml +1 -1
- {ltc_code-0.1.7 → ltc_code-0.1.8}/src/ltc_code/may27.py +63 -8
- {ltc_code-0.1.7 → ltc_code-0.1.8}/README.md +0 -0
- {ltc_code-0.1.7 → ltc_code-0.1.8}/src/ltc_code/__init__.py +0 -0
- {ltc_code-0.1.7 → ltc_code-0.1.8}/src/ltc_code/polars_dates.py +0 -0
|
@@ -1424,9 +1424,6 @@ def lookup_sid_cepr(
|
|
|
1424
1424
|
####################################################################################
|
|
1425
1425
|
|
|
1426
1426
|
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
1427
|
def _parse_dob_expr(col: str) -> pl.Expr:
|
|
1431
1428
|
return pl.coalesce(
|
|
1432
1429
|
[
|
|
@@ -1437,6 +1434,14 @@ def _parse_dob_expr(col: str) -> pl.Expr:
|
|
|
1437
1434
|
)
|
|
1438
1435
|
|
|
1439
1436
|
|
|
1437
|
+
def _first_word_expr(col: str) -> pl.Expr:
|
|
1438
|
+
return pl.col(col).cast(pl.String).str.split(" ").list.first()
|
|
1439
|
+
|
|
1440
|
+
|
|
1441
|
+
def _second_word_expr(col: str) -> pl.Expr:
|
|
1442
|
+
return pl.col(col).cast(pl.String).str.split(" ").list.get(1, null_on_oob=True)
|
|
1443
|
+
|
|
1444
|
+
|
|
1440
1445
|
def _build_lookup(
|
|
1441
1446
|
census: pl.DataFrame,
|
|
1442
1447
|
*,
|
|
@@ -1456,13 +1461,11 @@ def _build_lookup(
|
|
|
1456
1461
|
.drop_nulls(["_fname_key", "_lname_key", "_dob_key", "sid_cepr"])
|
|
1457
1462
|
.group_by(["_fname_key", "_lname_key", "_dob_key"])
|
|
1458
1463
|
.agg(pl.col("sid_cepr").unique().alias("_sids"))
|
|
1459
|
-
.with_columns(pl.col("_sids").list.len().alias("_sid_count"))
|
|
1460
|
-
.filter(pl.col("_sid_count") == 1)
|
|
1461
1464
|
.select(
|
|
1462
1465
|
"_fname_key",
|
|
1463
1466
|
"_lname_key",
|
|
1464
1467
|
"_dob_key",
|
|
1465
|
-
pl.col("_sids").list.first().alias("sid_cepr"),
|
|
1468
|
+
pl.col("_sids").list.sort().list.first().alias("sid_cepr"),
|
|
1466
1469
|
)
|
|
1467
1470
|
)
|
|
1468
1471
|
|
|
@@ -1603,6 +1606,30 @@ def build_census_lookups(*, cmo_name: str) -> dict[str, pl.DataFrame]:
|
|
|
1603
1606
|
label="right lname + suffix no space",
|
|
1604
1607
|
)
|
|
1605
1608
|
|
|
1609
|
+
lookups["fname_first_word"] = _build_lookup(
|
|
1610
|
+
census,
|
|
1611
|
+
fname_expr=_first_word_expr("fname"),
|
|
1612
|
+
lname_expr=pl.col("lname"),
|
|
1613
|
+
dob_col="dob",
|
|
1614
|
+
label="right fname first word",
|
|
1615
|
+
)
|
|
1616
|
+
|
|
1617
|
+
lookups["lname_first_word"] = _build_lookup(
|
|
1618
|
+
census,
|
|
1619
|
+
fname_expr=pl.col("fname"),
|
|
1620
|
+
lname_expr=_first_word_expr("lname"),
|
|
1621
|
+
dob_col="dob",
|
|
1622
|
+
label="right lname first word",
|
|
1623
|
+
)
|
|
1624
|
+
|
|
1625
|
+
lookups["lname_second_word"] = _build_lookup(
|
|
1626
|
+
census,
|
|
1627
|
+
fname_expr=pl.col("fname"),
|
|
1628
|
+
lname_expr=_second_word_expr("lname"),
|
|
1629
|
+
dob_col="dob",
|
|
1630
|
+
label="right lname second word",
|
|
1631
|
+
)
|
|
1632
|
+
|
|
1606
1633
|
lookups["dob_imp"] = _build_lookup(
|
|
1607
1634
|
census,
|
|
1608
1635
|
fname_expr=pl.col("fname"),
|
|
@@ -1659,7 +1686,6 @@ def _run_match_stage(
|
|
|
1659
1686
|
unmatched = stage.filter(pl.col("sid_cepr").is_null()).drop("sid_cepr")
|
|
1660
1687
|
|
|
1661
1688
|
print(f"{label}: matched {len(matched):,}/{before:,}")
|
|
1662
|
-
|
|
1663
1689
|
return matched, unmatched
|
|
1664
1690
|
|
|
1665
1691
|
|
|
@@ -1710,6 +1736,9 @@ def lookup_sid_cepr(
|
|
|
1710
1736
|
("left exact -> right fname + suffix", lookups["suffix_fname"]),
|
|
1711
1737
|
("left exact -> right fname + suffix no space", lookups["suffix_fname_nospace"]),
|
|
1712
1738
|
("left exact -> right lname + suffix no space", lookups["suffix_lname_nospace"]),
|
|
1739
|
+
("left exact -> right fname first word", lookups["fname_first_word"]),
|
|
1740
|
+
("left exact -> right lname first word", lookups["lname_first_word"]),
|
|
1741
|
+
("left exact -> right lname second word", lookups["lname_second_word"]),
|
|
1713
1742
|
]:
|
|
1714
1743
|
matched, unmatched = _run_match_stage(
|
|
1715
1744
|
unmatched,
|
|
@@ -1721,6 +1750,33 @@ def lookup_sid_cepr(
|
|
|
1721
1750
|
)
|
|
1722
1751
|
matched_frames.append(matched)
|
|
1723
1752
|
|
|
1753
|
+
for label, fname_expr, lname_expr in [
|
|
1754
|
+
(
|
|
1755
|
+
"left fname first word -> right exact",
|
|
1756
|
+
_first_word_expr(f"{cols['fname']}_clean"),
|
|
1757
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1758
|
+
),
|
|
1759
|
+
(
|
|
1760
|
+
"left lname first word -> right exact",
|
|
1761
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1762
|
+
_first_word_expr(f"{cols['lname']}_clean"),
|
|
1763
|
+
),
|
|
1764
|
+
(
|
|
1765
|
+
"left lname second word -> right exact",
|
|
1766
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1767
|
+
_second_word_expr(f"{cols['lname']}_clean"),
|
|
1768
|
+
),
|
|
1769
|
+
]:
|
|
1770
|
+
matched, unmatched = _run_match_stage(
|
|
1771
|
+
unmatched,
|
|
1772
|
+
lookup=lookups["exact"],
|
|
1773
|
+
fname_expr=fname_expr,
|
|
1774
|
+
lname_expr=lname_expr,
|
|
1775
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1776
|
+
label=label,
|
|
1777
|
+
)
|
|
1778
|
+
matched_frames.append(matched)
|
|
1779
|
+
|
|
1724
1780
|
if "mname" in cols:
|
|
1725
1781
|
unmatched = unmatched.with_columns(*clean_other_name(cols["mname"]))
|
|
1726
1782
|
|
|
@@ -1838,7 +1894,6 @@ def lookup_sid_cepr(
|
|
|
1838
1894
|
print(f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}")
|
|
1839
1895
|
|
|
1840
1896
|
return result
|
|
1841
|
-
#
|
|
1842
1897
|
|
|
1843
1898
|
# EXAMPLE USAGE
|
|
1844
1899
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|