ltc-code 0.1.6__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ltc_code-0.1.6 → ltc_code-0.1.8}/PKG-INFO +1 -1
- {ltc_code-0.1.6 → ltc_code-0.1.8}/pyproject.toml +1 -1
- {ltc_code-0.1.6 → ltc_code-0.1.8}/src/ltc_code/may27.py +231 -253
- {ltc_code-0.1.6 → ltc_code-0.1.8}/README.md +0 -0
- {ltc_code-0.1.6 → ltc_code-0.1.8}/src/ltc_code/__init__.py +0 -0
- {ltc_code-0.1.6 → ltc_code-0.1.8}/src/ltc_code/polars_dates.py +0 -0
|
@@ -1424,7 +1424,22 @@ def lookup_sid_cepr(
|
|
|
1424
1424
|
####################################################################################
|
|
1425
1425
|
|
|
1426
1426
|
|
|
1427
|
+
def _parse_dob_expr(col: str) -> pl.Expr:
    """Build an expression that parses column *col* into a ``pl.Date``.

    The column is cast to ``pl.String`` and parsed non-strictly against
    three formats, tried in this order: ``%m/%d/%Y``, ``%m/%d/%y`` and
    ``%Y-%m-%d``. ``pl.coalesce`` keeps, per row, the first attempt that
    produced a non-null value.

    Args:
        col: Name of the column holding the date-of-birth values.

    Returns:
        An unaliased polars expression; callers alias it themselves.
    """
    # NOTE(review): the four-digit-year pattern is tried before the
    # two-digit one -- confirm a value such as "01/02/99" cannot be
    # accepted by "%m/%d/%Y" before "%m/%d/%y" gets a chance.
    date_formats = ("%m/%d/%Y", "%m/%d/%y", "%Y-%m-%d")
    as_text = pl.col(col).cast(pl.String)
    attempts = [
        as_text.str.strptime(pl.Date, fmt, strict=False)
        for fmt in date_formats
    ]
    return pl.coalesce(attempts)
|
|
1435
|
+
|
|
1436
|
+
|
|
1437
|
+
def _first_word_expr(col: str) -> pl.Expr:
    """Return an expression for the first space-separated token of *col*.

    The column is cast to ``pl.String``, split on a single space, and the
    first element of the resulting list is taken.

    Args:
        col: Name of the column to tokenize.

    Returns:
        An unaliased polars expression yielding the first token.
    """
    as_text = pl.col(col).cast(pl.String)
    tokens = as_text.str.split(" ")
    return tokens.list.first()
|
|
1439
|
+
|
|
1427
1440
|
|
|
1441
|
+
def _second_word_expr(col: str) -> pl.Expr:
    """Return an expression for the second space-separated token of *col*.

    The column is cast to ``pl.String`` and split on a single space; the
    element at index 1 is taken. ``null_on_oob=True`` is passed so that
    values with fewer than two tokens give null rather than an error.

    Args:
        col: Name of the column to tokenize.

    Returns:
        An unaliased polars expression yielding the second token or null.
    """
    as_text = pl.col(col).cast(pl.String)
    tokens = as_text.str.split(" ")
    return tokens.list.get(1, null_on_oob=True)
|
|
1428
1443
|
|
|
1429
1444
|
|
|
1430
1445
|
def _build_lookup(
|
|
@@ -1435,80 +1450,33 @@ def _build_lookup(
|
|
|
1435
1450
|
dob_col: str,
|
|
1436
1451
|
label: str,
|
|
1437
1452
|
) -> pl.DataFrame:
|
|
1438
|
-
"""
|
|
1439
|
-
Build a deterministic SID lookup table.
|
|
1440
|
-
|
|
1441
|
-
Output schema:
|
|
1442
|
-
_fname_key
|
|
1443
|
-
_lname_key
|
|
1444
|
-
_dob_key
|
|
1445
|
-
sid_cepr
|
|
1446
|
-
|
|
1447
|
-
Ambiguous keys are removed.
|
|
1448
|
-
"""
|
|
1449
|
-
|
|
1450
1453
|
lookup = (
|
|
1451
1454
|
census
|
|
1452
1455
|
.select(
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
pl.col("sid_cepr"),
|
|
1458
|
-
]
|
|
1459
|
-
)
|
|
1460
|
-
.drop_nulls(
|
|
1461
|
-
[
|
|
1462
|
-
"_fname_key",
|
|
1463
|
-
"_lname_key",
|
|
1464
|
-
"_dob_key",
|
|
1465
|
-
"sid_cepr",
|
|
1466
|
-
]
|
|
1467
|
-
)
|
|
1468
|
-
.group_by(
|
|
1469
|
-
[
|
|
1470
|
-
"_fname_key",
|
|
1471
|
-
"_lname_key",
|
|
1472
|
-
"_dob_key",
|
|
1473
|
-
]
|
|
1474
|
-
)
|
|
1475
|
-
.agg(
|
|
1476
|
-
pl.col("sid_cepr").unique().alias("_sids")
|
|
1477
|
-
)
|
|
1478
|
-
.with_columns(
|
|
1479
|
-
pl.col("_sids").list.len().alias("_sid_count")
|
|
1480
|
-
)
|
|
1481
|
-
.filter(
|
|
1482
|
-
pl.col("_sid_count") == 1
|
|
1456
|
+
fname_expr.alias("_fname_key"),
|
|
1457
|
+
lname_expr.alias("_lname_key"),
|
|
1458
|
+
pl.col(dob_col).alias("_dob_key"),
|
|
1459
|
+
pl.col("sid_cepr"),
|
|
1483
1460
|
)
|
|
1461
|
+
.drop_nulls(["_fname_key", "_lname_key", "_dob_key", "sid_cepr"])
|
|
1462
|
+
.group_by(["_fname_key", "_lname_key", "_dob_key"])
|
|
1463
|
+
.agg(pl.col("sid_cepr").unique().alias("_sids"))
|
|
1484
1464
|
.select(
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
pl.col("_sids").list.first().alias("sid_cepr"),
|
|
1490
|
-
]
|
|
1465
|
+
"_fname_key",
|
|
1466
|
+
"_lname_key",
|
|
1467
|
+
"_dob_key",
|
|
1468
|
+
pl.col("_sids").list.sort().list.first().alias("sid_cepr"),
|
|
1491
1469
|
)
|
|
1492
1470
|
)
|
|
1493
1471
|
|
|
1494
|
-
print(f"built lookup: {label}")
|
|
1495
|
-
|
|
1472
|
+
print(f"built lookup: {label} ({len(lookup):,} usable keys)")
|
|
1496
1473
|
return lookup
|
|
1497
1474
|
|
|
1498
|
-
def build_census_lookups(
|
|
1499
|
-
*,
|
|
1500
|
-
cmo_name: str,
|
|
1501
|
-
) -> dict[str, pl.DataFrame]:
|
|
1502
|
-
|
|
1503
|
-
try:
|
|
1504
|
-
import mappings
|
|
1505
|
-
except ImportError:
|
|
1506
|
-
import mapppings as mappings
|
|
1507
1475
|
|
|
1476
|
+
def build_census_lookups(*, cmo_name: str) -> dict[str, pl.DataFrame]:
|
|
1508
1477
|
annual_frames = []
|
|
1509
1478
|
|
|
1510
1479
|
for year in range(1994, 2023):
|
|
1511
|
-
|
|
1512
1480
|
path = CENSUS_STUDENTS / f"census_student_{year}.csv"
|
|
1513
1481
|
|
|
1514
1482
|
annual = (
|
|
@@ -1520,16 +1488,14 @@ def build_census_lookups(
|
|
|
1520
1488
|
ignore_errors=False,
|
|
1521
1489
|
)
|
|
1522
1490
|
.select(
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
"birthdate_imp",
|
|
1532
|
-
]
|
|
1491
|
+
"cmo_code",
|
|
1492
|
+
"sid_cepr",
|
|
1493
|
+
"fname_clean",
|
|
1494
|
+
"lname_clean",
|
|
1495
|
+
"mname_clean",
|
|
1496
|
+
"suff_clean",
|
|
1497
|
+
"birthdate_clean",
|
|
1498
|
+
"birthdate_imp",
|
|
1533
1499
|
)
|
|
1534
1500
|
.rename(
|
|
1535
1501
|
{
|
|
@@ -1542,92 +1508,129 @@ def build_census_lookups(
|
|
|
1542
1508
|
}
|
|
1543
1509
|
)
|
|
1544
1510
|
.with_columns(
|
|
1545
|
-
pl.col("cmo_code")
|
|
1546
|
-
.replace(mappings.CMO_CODE_TO_NAME)
|
|
1547
|
-
.alias("cmo_name")
|
|
1548
|
-
)
|
|
1549
|
-
.filter(
|
|
1550
|
-
pl.col("cmo_name") == cmo_name
|
|
1511
|
+
pl.col("cmo_code").replace(cmo_map).alias("cmo_name")
|
|
1551
1512
|
)
|
|
1513
|
+
.filter(pl.col("cmo_name") == cmo_name)
|
|
1552
1514
|
.with_columns(
|
|
1553
1515
|
*clean_name("fname"),
|
|
1554
1516
|
*clean_name("lname"),
|
|
1555
1517
|
*clean_other_name("mname"),
|
|
1556
1518
|
*clean_other_name("suffix"),
|
|
1557
|
-
*clean_dob(col="dob"),
|
|
1558
|
-
*clean_dob(col="dob_imp"),
|
|
1559
1519
|
)
|
|
1560
|
-
.
|
|
1561
|
-
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
|
|
1520
|
+
.with_columns(*clean_dob(col="dob"))
|
|
1521
|
+
.with_columns(*clean_dob(col="dob_imp"))
|
|
1522
|
+
.with_columns(
|
|
1523
|
+
_parse_dob_expr("dob_clean").alias("dob"),
|
|
1524
|
+
_parse_dob_expr("dob_imp_clean").alias("dob_imp"),
|
|
1565
1525
|
)
|
|
1566
|
-
.
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
|
|
1526
|
+
.select(
|
|
1527
|
+
"sid_cepr",
|
|
1528
|
+
pl.col("fname_clean").alias("fname"),
|
|
1529
|
+
pl.col("lname_clean").alias("lname"),
|
|
1530
|
+
pl.col("mname_clean").alias("mname"),
|
|
1531
|
+
pl.col("suffix_clean").alias("suffix"),
|
|
1532
|
+
"dob",
|
|
1533
|
+
"dob_imp",
|
|
1571
1534
|
)
|
|
1572
1535
|
)
|
|
1573
1536
|
|
|
1574
1537
|
annual_frames.append(annual)
|
|
1575
1538
|
|
|
1576
|
-
|
|
1577
|
-
# MATERIALIZE ONCE
|
|
1578
|
-
#
|
|
1539
|
+
census = pl.concat(annual_frames, how="vertical_relaxed").collect()
|
|
1579
1540
|
|
|
1580
|
-
census
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1541
|
+
print(f"census rows after CMO filter: {len(census):,}")
|
|
1542
|
+
|
|
1543
|
+
lookups = {}
|
|
1544
|
+
|
|
1545
|
+
lookups["exact"] = _build_lookup(
|
|
1546
|
+
census,
|
|
1547
|
+
fname_expr=pl.col("fname"),
|
|
1548
|
+
lname_expr=pl.col("lname"),
|
|
1549
|
+
dob_col="dob",
|
|
1550
|
+
label="exact",
|
|
1586
1551
|
)
|
|
1587
1552
|
|
|
1588
|
-
|
|
1553
|
+
lookups["mname"] = _build_lookup(
|
|
1554
|
+
census,
|
|
1555
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=" "),
|
|
1556
|
+
lname_expr=pl.col("lname"),
|
|
1557
|
+
dob_col="dob",
|
|
1558
|
+
label="right fname + mname",
|
|
1559
|
+
)
|
|
1589
1560
|
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
|
|
1561
|
+
lookups["mname_lname"] = _build_lookup(
|
|
1562
|
+
census,
|
|
1563
|
+
fname_expr=pl.col("fname"),
|
|
1564
|
+
lname_expr=pl.concat_str([pl.col("mname"), pl.col("lname")], separator=" "),
|
|
1565
|
+
dob_col="dob",
|
|
1566
|
+
label="right mname + lname",
|
|
1567
|
+
)
|
|
1593
1568
|
|
|
1594
|
-
|
|
1569
|
+
lookups["mname_nospace"] = _build_lookup(
|
|
1570
|
+
census,
|
|
1571
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=""),
|
|
1572
|
+
lname_expr=pl.col("lname"),
|
|
1573
|
+
dob_col="dob",
|
|
1574
|
+
label="right fname + mname no space",
|
|
1575
|
+
)
|
|
1576
|
+
|
|
1577
|
+
lookups["suffix"] = _build_lookup(
|
|
1595
1578
|
census,
|
|
1596
1579
|
fname_expr=pl.col("fname"),
|
|
1580
|
+
lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=" "),
|
|
1581
|
+
dob_col="dob",
|
|
1582
|
+
label="right lname + suffix",
|
|
1583
|
+
)
|
|
1584
|
+
|
|
1585
|
+
lookups["suffix_fname"] = _build_lookup(
|
|
1586
|
+
census,
|
|
1587
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=" "),
|
|
1597
1588
|
lname_expr=pl.col("lname"),
|
|
1598
1589
|
dob_col="dob",
|
|
1599
|
-
label="
|
|
1590
|
+
label="right fname + suffix",
|
|
1600
1591
|
)
|
|
1601
1592
|
|
|
1602
|
-
|
|
1593
|
+
lookups["suffix_fname_nospace"] = _build_lookup(
|
|
1603
1594
|
census,
|
|
1604
|
-
fname_expr=pl.concat_str(
|
|
1605
|
-
[
|
|
1606
|
-
pl.col("fname"),
|
|
1607
|
-
pl.col("mname"),
|
|
1608
|
-
],
|
|
1609
|
-
separator=" ",
|
|
1610
|
-
),
|
|
1595
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=""),
|
|
1611
1596
|
lname_expr=pl.col("lname"),
|
|
1612
1597
|
dob_col="dob",
|
|
1613
|
-
label="
|
|
1598
|
+
label="right fname + suffix no space",
|
|
1614
1599
|
)
|
|
1615
1600
|
|
|
1616
|
-
|
|
1601
|
+
lookups["suffix_lname_nospace"] = _build_lookup(
|
|
1617
1602
|
census,
|
|
1618
1603
|
fname_expr=pl.col("fname"),
|
|
1619
|
-
lname_expr=pl.concat_str(
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1604
|
+
lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=""),
|
|
1605
|
+
dob_col="dob",
|
|
1606
|
+
label="right lname + suffix no space",
|
|
1607
|
+
)
|
|
1608
|
+
|
|
1609
|
+
lookups["fname_first_word"] = _build_lookup(
|
|
1610
|
+
census,
|
|
1611
|
+
fname_expr=_first_word_expr("fname"),
|
|
1612
|
+
lname_expr=pl.col("lname"),
|
|
1613
|
+
dob_col="dob",
|
|
1614
|
+
label="right fname first word",
|
|
1615
|
+
)
|
|
1616
|
+
|
|
1617
|
+
lookups["lname_first_word"] = _build_lookup(
|
|
1618
|
+
census,
|
|
1619
|
+
fname_expr=pl.col("fname"),
|
|
1620
|
+
lname_expr=_first_word_expr("lname"),
|
|
1626
1621
|
dob_col="dob",
|
|
1627
|
-
label="
|
|
1622
|
+
label="right lname first word",
|
|
1628
1623
|
)
|
|
1629
1624
|
|
|
1630
|
-
|
|
1625
|
+
lookups["lname_second_word"] = _build_lookup(
|
|
1626
|
+
census,
|
|
1627
|
+
fname_expr=pl.col("fname"),
|
|
1628
|
+
lname_expr=_second_word_expr("lname"),
|
|
1629
|
+
dob_col="dob",
|
|
1630
|
+
label="right lname second word",
|
|
1631
|
+
)
|
|
1632
|
+
|
|
1633
|
+
lookups["dob_imp"] = _build_lookup(
|
|
1631
1634
|
census,
|
|
1632
1635
|
fname_expr=pl.col("fname"),
|
|
1633
1636
|
lname_expr=pl.col("lname"),
|
|
@@ -1635,12 +1638,22 @@ def build_census_lookups(
|
|
|
1635
1638
|
label="dob_imp",
|
|
1636
1639
|
)
|
|
1637
1640
|
|
|
1638
|
-
|
|
1639
|
-
"
|
|
1640
|
-
"
|
|
1641
|
-
"
|
|
1642
|
-
"
|
|
1643
|
-
|
|
1641
|
+
for offset, key in [
|
|
1642
|
+
("-1y", "dob_imp_minus_1"),
|
|
1643
|
+
("1y", "dob_imp_plus_1"),
|
|
1644
|
+
("-2y", "dob_imp_minus_2"),
|
|
1645
|
+
("2y", "dob_imp_plus_2"),
|
|
1646
|
+
]:
|
|
1647
|
+
lookups[key] = _build_lookup(
|
|
1648
|
+
census.with_columns(pl.col("dob_imp").dt.offset_by(offset).alias(key)),
|
|
1649
|
+
fname_expr=pl.col("fname"),
|
|
1650
|
+
lname_expr=pl.col("lname"),
|
|
1651
|
+
dob_col=key,
|
|
1652
|
+
label=key,
|
|
1653
|
+
)
|
|
1654
|
+
|
|
1655
|
+
return lookups
|
|
1656
|
+
|
|
1644
1657
|
|
|
1645
1658
|
def _run_match_stage(
|
|
1646
1659
|
unmatched: pl.DataFrame,
|
|
@@ -1651,84 +1664,60 @@ def _run_match_stage(
|
|
|
1651
1664
|
dob_expr: pl.Expr,
|
|
1652
1665
|
label: str,
|
|
1653
1666
|
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
|
1654
|
-
|
|
1655
1667
|
before = len(unmatched)
|
|
1656
1668
|
|
|
1657
1669
|
stage = (
|
|
1658
1670
|
unmatched
|
|
1659
1671
|
.with_columns(
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
dob_expr.alias("_dob_key"),
|
|
1664
|
-
]
|
|
1672
|
+
fname_expr.alias("_fname_key"),
|
|
1673
|
+
lname_expr.alias("_lname_key"),
|
|
1674
|
+
dob_expr.alias("_dob_key"),
|
|
1665
1675
|
)
|
|
1666
1676
|
.join(
|
|
1667
1677
|
lookup,
|
|
1668
|
-
on=[
|
|
1669
|
-
"_fname_key",
|
|
1670
|
-
"_lname_key",
|
|
1671
|
-
"_dob_key",
|
|
1672
|
-
],
|
|
1678
|
+
on=["_fname_key", "_lname_key", "_dob_key"],
|
|
1673
1679
|
how="left",
|
|
1674
1680
|
validate="m:1",
|
|
1675
1681
|
)
|
|
1676
|
-
.drop(
|
|
1677
|
-
[
|
|
1678
|
-
"_fname_key",
|
|
1679
|
-
"_lname_key",
|
|
1680
|
-
"_dob_key",
|
|
1681
|
-
]
|
|
1682
|
-
)
|
|
1683
|
-
)
|
|
1684
|
-
|
|
1685
|
-
matched = (
|
|
1686
|
-
stage
|
|
1687
|
-
.filter(
|
|
1688
|
-
pl.col("sid_cepr").is_not_null()
|
|
1689
|
-
)
|
|
1690
|
-
)
|
|
1691
|
-
|
|
1692
|
-
unmatched = (
|
|
1693
|
-
stage
|
|
1694
|
-
.filter(
|
|
1695
|
-
pl.col("sid_cepr").is_null()
|
|
1696
|
-
)
|
|
1697
|
-
.drop("sid_cepr")
|
|
1682
|
+
.drop(["_fname_key", "_lname_key", "_dob_key"])
|
|
1698
1683
|
)
|
|
1699
1684
|
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
print(
|
|
1703
|
-
f"{label}: matched {added:,}/{before:,}"
|
|
1704
|
-
)
|
|
1685
|
+
matched = stage.filter(pl.col("sid_cepr").is_not_null())
|
|
1686
|
+
unmatched = stage.filter(pl.col("sid_cepr").is_null()).drop("sid_cepr")
|
|
1705
1687
|
|
|
1688
|
+
print(f"{label}: matched {len(matched):,}/{before:,}")
|
|
1706
1689
|
return matched, unmatched
|
|
1707
1690
|
|
|
1691
|
+
|
|
1708
1692
|
def lookup_sid_cepr(
|
|
1709
|
-
frame
|
|
1693
|
+
frame,
|
|
1710
1694
|
*,
|
|
1711
1695
|
cols: Mapping[str, str],
|
|
1712
1696
|
lookups: dict[str, pl.DataFrame],
|
|
1713
|
-
)
|
|
1714
|
-
|
|
1697
|
+
):
|
|
1715
1698
|
is_lazy = isinstance(frame, pl.LazyFrame)
|
|
1716
1699
|
current = frame.collect() if is_lazy else frame
|
|
1717
1700
|
|
|
1701
|
+
input_columns = current.columns
|
|
1702
|
+
|
|
1703
|
+
current = current.with_row_index("_row_id")
|
|
1704
|
+
|
|
1718
1705
|
current = current.with_columns(
|
|
1719
1706
|
*clean_name(cols["fname"]),
|
|
1720
1707
|
*clean_name(cols["lname"]),
|
|
1721
|
-
*clean_dob(col=cols["dob"]),
|
|
1722
1708
|
)
|
|
1723
1709
|
|
|
1724
|
-
current =
|
|
1725
|
-
|
|
1710
|
+
current = (
|
|
1711
|
+
current
|
|
1712
|
+
.with_columns(*clean_dob(col=cols["dob"]))
|
|
1713
|
+
.with_columns(
|
|
1714
|
+
_parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
|
|
1715
|
+
)
|
|
1726
1716
|
)
|
|
1727
1717
|
|
|
1728
1718
|
matched_frames = []
|
|
1729
1719
|
unmatched = current
|
|
1730
1720
|
|
|
1731
|
-
# exact
|
|
1732
1721
|
matched, unmatched = _run_match_stage(
|
|
1733
1722
|
unmatched,
|
|
1734
1723
|
lookup=lookups["exact"],
|
|
@@ -1739,34 +1728,61 @@ def lookup_sid_cepr(
|
|
|
1739
1728
|
)
|
|
1740
1729
|
matched_frames.append(matched)
|
|
1741
1730
|
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1731
|
+
for label, lookup in [
|
|
1732
|
+
("left exact -> right fname + mname", lookups["mname"]),
|
|
1733
|
+
("left exact -> right mname + lname", lookups["mname_lname"]),
|
|
1734
|
+
("left exact -> right fname + mname no space", lookups["mname_nospace"]),
|
|
1735
|
+
("left exact -> right lname + suffix", lookups["suffix"]),
|
|
1736
|
+
("left exact -> right fname + suffix", lookups["suffix_fname"]),
|
|
1737
|
+
("left exact -> right fname + suffix no space", lookups["suffix_fname_nospace"]),
|
|
1738
|
+
("left exact -> right lname + suffix no space", lookups["suffix_lname_nospace"]),
|
|
1739
|
+
("left exact -> right fname first word", lookups["fname_first_word"]),
|
|
1740
|
+
("left exact -> right lname first word", lookups["lname_first_word"]),
|
|
1741
|
+
("left exact -> right lname second word", lookups["lname_second_word"]),
|
|
1742
|
+
]:
|
|
1743
|
+
matched, unmatched = _run_match_stage(
|
|
1744
|
+
unmatched,
|
|
1745
|
+
lookup=lookup,
|
|
1746
|
+
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1747
|
+
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1748
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1749
|
+
label=label,
|
|
1746
1750
|
)
|
|
1751
|
+
matched_frames.append(matched)
|
|
1747
1752
|
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
(
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1753
|
+
for label, fname_expr, lname_expr in [
|
|
1754
|
+
(
|
|
1755
|
+
"left fname first word -> right exact",
|
|
1756
|
+
_first_word_expr(f"{cols['fname']}_clean"),
|
|
1757
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1758
|
+
),
|
|
1759
|
+
(
|
|
1760
|
+
"left lname first word -> right exact",
|
|
1761
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1762
|
+
_first_word_expr(f"{cols['lname']}_clean"),
|
|
1763
|
+
),
|
|
1764
|
+
(
|
|
1765
|
+
"left lname second word -> right exact",
|
|
1766
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1767
|
+
_second_word_expr(f"{cols['lname']}_clean"),
|
|
1768
|
+
),
|
|
1769
|
+
]:
|
|
1770
|
+
matched, unmatched = _run_match_stage(
|
|
1771
|
+
unmatched,
|
|
1772
|
+
lookup=lookups["exact"],
|
|
1773
|
+
fname_expr=fname_expr,
|
|
1774
|
+
lname_expr=lname_expr,
|
|
1775
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1776
|
+
label=label,
|
|
1777
|
+
)
|
|
1778
|
+
matched_frames.append(matched)
|
|
1779
|
+
|
|
1780
|
+
if "mname" in cols:
|
|
1781
|
+
unmatched = unmatched.with_columns(*clean_other_name(cols["mname"]))
|
|
1782
|
+
|
|
1783
|
+
for label, fname_expr, lname_expr in [
|
|
1767
1784
|
(
|
|
1768
1785
|
"left fname + mname -> right exact",
|
|
1769
|
-
lookups["exact"],
|
|
1770
1786
|
pl.concat_str(
|
|
1771
1787
|
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
|
|
1772
1788
|
separator=" ",
|
|
@@ -1775,7 +1791,6 @@ def lookup_sid_cepr(
|
|
|
1775
1791
|
),
|
|
1776
1792
|
(
|
|
1777
1793
|
"left mname + lname -> right exact",
|
|
1778
|
-
lookups["exact"],
|
|
1779
1794
|
pl.col(f"{cols['fname']}_clean"),
|
|
1780
1795
|
pl.concat_str(
|
|
1781
1796
|
[pl.col(f"{cols['mname']}_clean"), pl.col(f"{cols['lname']}_clean")],
|
|
@@ -1784,19 +1799,16 @@ def lookup_sid_cepr(
|
|
|
1784
1799
|
),
|
|
1785
1800
|
(
|
|
1786
1801
|
"left fname + mname no space -> right exact",
|
|
1787
|
-
lookups["exact"],
|
|
1788
1802
|
pl.concat_str(
|
|
1789
1803
|
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
|
|
1790
1804
|
separator="",
|
|
1791
1805
|
),
|
|
1792
1806
|
pl.col(f"{cols['lname']}_clean"),
|
|
1793
1807
|
),
|
|
1794
|
-
]
|
|
1795
|
-
|
|
1796
|
-
for label, lookup, fname_expr, lname_expr in mname_stages:
|
|
1808
|
+
]:
|
|
1797
1809
|
matched, unmatched = _run_match_stage(
|
|
1798
1810
|
unmatched,
|
|
1799
|
-
lookup=
|
|
1811
|
+
lookup=lookups["exact"],
|
|
1800
1812
|
fname_expr=fname_expr,
|
|
1801
1813
|
lname_expr=lname_expr,
|
|
1802
1814
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
@@ -1804,40 +1816,12 @@ def lookup_sid_cepr(
|
|
|
1804
1816
|
)
|
|
1805
1817
|
matched_frames.append(matched)
|
|
1806
1818
|
|
|
1807
|
-
# suffix variants
|
|
1808
1819
|
if "suffix" in cols:
|
|
1809
|
-
unmatched = unmatched.with_columns(
|
|
1810
|
-
*clean_other_name(cols["suffix"])
|
|
1811
|
-
)
|
|
1820
|
+
unmatched = unmatched.with_columns(*clean_other_name(cols["suffix"]))
|
|
1812
1821
|
|
|
1813
|
-
|
|
1814
|
-
(
|
|
1815
|
-
"left exact -> right lname + suffix",
|
|
1816
|
-
lookups["suffix"],
|
|
1817
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1818
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1819
|
-
),
|
|
1820
|
-
(
|
|
1821
|
-
"left exact -> right fname + suffix",
|
|
1822
|
-
lookups["suffix_fname"],
|
|
1823
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1824
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1825
|
-
),
|
|
1826
|
-
(
|
|
1827
|
-
"left exact -> right fname + suffix no space",
|
|
1828
|
-
lookups["suffix_fname_nospace"],
|
|
1829
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1830
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1831
|
-
),
|
|
1832
|
-
(
|
|
1833
|
-
"left exact -> right lname + suffix no space",
|
|
1834
|
-
lookups["suffix_lname_nospace"],
|
|
1835
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1836
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1837
|
-
),
|
|
1822
|
+
for label, fname_expr, lname_expr in [
|
|
1838
1823
|
(
|
|
1839
1824
|
"left fname + suffix -> right exact",
|
|
1840
|
-
lookups["exact"],
|
|
1841
1825
|
pl.concat_str(
|
|
1842
1826
|
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1843
1827
|
separator=" ",
|
|
@@ -1846,7 +1830,6 @@ def lookup_sid_cepr(
|
|
|
1846
1830
|
),
|
|
1847
1831
|
(
|
|
1848
1832
|
"left fname + suffix no space -> right exact",
|
|
1849
|
-
lookups["exact"],
|
|
1850
1833
|
pl.concat_str(
|
|
1851
1834
|
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1852
1835
|
separator="",
|
|
@@ -1855,7 +1838,6 @@ def lookup_sid_cepr(
|
|
|
1855
1838
|
),
|
|
1856
1839
|
(
|
|
1857
1840
|
"left lname + suffix -> right exact",
|
|
1858
|
-
lookups["exact"],
|
|
1859
1841
|
pl.col(f"{cols['fname']}_clean"),
|
|
1860
1842
|
pl.concat_str(
|
|
1861
1843
|
[pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
@@ -1864,19 +1846,16 @@ def lookup_sid_cepr(
|
|
|
1864
1846
|
),
|
|
1865
1847
|
(
|
|
1866
1848
|
"left lname + suffix no space -> right exact",
|
|
1867
|
-
lookups["exact"],
|
|
1868
1849
|
pl.col(f"{cols['fname']}_clean"),
|
|
1869
1850
|
pl.concat_str(
|
|
1870
1851
|
[pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1871
1852
|
separator="",
|
|
1872
1853
|
),
|
|
1873
1854
|
),
|
|
1874
|
-
]
|
|
1875
|
-
|
|
1876
|
-
for label, lookup, fname_expr, lname_expr in suffix_stages:
|
|
1855
|
+
]:
|
|
1877
1856
|
matched, unmatched = _run_match_stage(
|
|
1878
1857
|
unmatched,
|
|
1879
|
-
lookup=
|
|
1858
|
+
lookup=lookups["exact"],
|
|
1880
1859
|
fname_expr=fname_expr,
|
|
1881
1860
|
lname_expr=lname_expr,
|
|
1882
1861
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
@@ -1884,16 +1863,13 @@ def lookup_sid_cepr(
|
|
|
1884
1863
|
)
|
|
1885
1864
|
matched_frames.append(matched)
|
|
1886
1865
|
|
|
1887
|
-
|
|
1888
|
-
dob_imp_stages = [
|
|
1866
|
+
for key in [
|
|
1889
1867
|
"dob_imp",
|
|
1890
1868
|
"dob_imp_minus_1",
|
|
1891
1869
|
"dob_imp_plus_1",
|
|
1892
1870
|
"dob_imp_minus_2",
|
|
1893
1871
|
"dob_imp_plus_2",
|
|
1894
|
-
]
|
|
1895
|
-
|
|
1896
|
-
for key in dob_imp_stages:
|
|
1872
|
+
]:
|
|
1897
1873
|
matched, unmatched = _run_match_stage(
|
|
1898
1874
|
unmatched,
|
|
1899
1875
|
lookup=lookups[key],
|
|
@@ -1904,18 +1880,20 @@ def lookup_sid_cepr(
|
|
|
1904
1880
|
)
|
|
1905
1881
|
matched_frames.append(matched)
|
|
1906
1882
|
|
|
1907
|
-
result =
|
|
1908
|
-
matched_frames + [unmatched],
|
|
1909
|
-
|
|
1883
|
+
result = (
|
|
1884
|
+
pl.concat(matched_frames + [unmatched], how="diagonal_relaxed")
|
|
1885
|
+
.sort("_row_id")
|
|
1886
|
+
.drop("_row_id")
|
|
1910
1887
|
)
|
|
1911
1888
|
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
)
|
|
1889
|
+
if "sid_cepr" not in input_columns:
|
|
1890
|
+
input_columns = input_columns + ["sid_cepr"]
|
|
1915
1891
|
|
|
1916
|
-
|
|
1892
|
+
result = result.select(input_columns)
|
|
1917
1893
|
|
|
1918
|
-
|
|
1894
|
+
print(f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}")
|
|
1895
|
+
|
|
1896
|
+
return result
|
|
1919
1897
|
|
|
1920
1898
|
# EXAMPLE USAGE
|
|
1921
1899
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|