ltc-code 0.1.5__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ltc_code-0.1.5 → ltc_code-0.1.7}/PKG-INFO +1 -1
- {ltc_code-0.1.5 → ltc_code-0.1.7}/pyproject.toml +1 -1
- {ltc_code-0.1.5 → ltc_code-0.1.7}/src/ltc_code/may27.py +247 -255
- {ltc_code-0.1.5 → ltc_code-0.1.7}/README.md +0 -0
- {ltc_code-0.1.5 → ltc_code-0.1.7}/src/ltc_code/__init__.py +0 -0
- {ltc_code-0.1.5 → ltc_code-0.1.7}/src/ltc_code/polars_dates.py +0 -0
|
@@ -1427,6 +1427,16 @@ def lookup_sid_cepr(
|
|
|
1427
1427
|
|
|
1428
1428
|
|
|
1429
1429
|
|
|
1430
|
+
def _parse_dob_expr(col: str) -> pl.Expr:
|
|
1431
|
+
return pl.coalesce(
|
|
1432
|
+
[
|
|
1433
|
+
pl.col(col).cast(pl.String).str.strptime(pl.Date, "%m/%d/%Y", strict=False),
|
|
1434
|
+
pl.col(col).cast(pl.String).str.strptime(pl.Date, "%m/%d/%y", strict=False),
|
|
1435
|
+
pl.col(col).cast(pl.String).str.strptime(pl.Date, "%Y-%m-%d", strict=False),
|
|
1436
|
+
]
|
|
1437
|
+
)
|
|
1438
|
+
|
|
1439
|
+
|
|
1430
1440
|
def _build_lookup(
|
|
1431
1441
|
census: pl.DataFrame,
|
|
1432
1442
|
*,
|
|
@@ -1435,80 +1445,35 @@ def _build_lookup(
|
|
|
1435
1445
|
dob_col: str,
|
|
1436
1446
|
label: str,
|
|
1437
1447
|
) -> pl.DataFrame:
|
|
1438
|
-
"""
|
|
1439
|
-
Build a deterministic SID lookup table.
|
|
1440
|
-
|
|
1441
|
-
Output schema:
|
|
1442
|
-
_fname_key
|
|
1443
|
-
_lname_key
|
|
1444
|
-
_dob_key
|
|
1445
|
-
sid_cepr
|
|
1446
|
-
|
|
1447
|
-
Ambiguous keys are removed.
|
|
1448
|
-
"""
|
|
1449
|
-
|
|
1450
1448
|
lookup = (
|
|
1451
1449
|
census
|
|
1452
1450
|
.select(
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
pl.col("sid_cepr"),
|
|
1458
|
-
]
|
|
1459
|
-
)
|
|
1460
|
-
.drop_nulls(
|
|
1461
|
-
[
|
|
1462
|
-
"_fname_key",
|
|
1463
|
-
"_lname_key",
|
|
1464
|
-
"_dob_key",
|
|
1465
|
-
"sid_cepr",
|
|
1466
|
-
]
|
|
1467
|
-
)
|
|
1468
|
-
.group_by(
|
|
1469
|
-
[
|
|
1470
|
-
"_fname_key",
|
|
1471
|
-
"_lname_key",
|
|
1472
|
-
"_dob_key",
|
|
1473
|
-
]
|
|
1474
|
-
)
|
|
1475
|
-
.agg(
|
|
1476
|
-
pl.col("sid_cepr").unique().alias("_sids")
|
|
1477
|
-
)
|
|
1478
|
-
.with_columns(
|
|
1479
|
-
pl.col("_sids").list.len().alias("_sid_count")
|
|
1480
|
-
)
|
|
1481
|
-
.filter(
|
|
1482
|
-
pl.col("_sid_count") == 1
|
|
1451
|
+
fname_expr.alias("_fname_key"),
|
|
1452
|
+
lname_expr.alias("_lname_key"),
|
|
1453
|
+
pl.col(dob_col).alias("_dob_key"),
|
|
1454
|
+
pl.col("sid_cepr"),
|
|
1483
1455
|
)
|
|
1456
|
+
.drop_nulls(["_fname_key", "_lname_key", "_dob_key", "sid_cepr"])
|
|
1457
|
+
.group_by(["_fname_key", "_lname_key", "_dob_key"])
|
|
1458
|
+
.agg(pl.col("sid_cepr").unique().alias("_sids"))
|
|
1459
|
+
.with_columns(pl.col("_sids").list.len().alias("_sid_count"))
|
|
1460
|
+
.filter(pl.col("_sid_count") == 1)
|
|
1484
1461
|
.select(
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
pl.col("_sids").list.first().alias("sid_cepr"),
|
|
1490
|
-
]
|
|
1462
|
+
"_fname_key",
|
|
1463
|
+
"_lname_key",
|
|
1464
|
+
"_dob_key",
|
|
1465
|
+
pl.col("_sids").list.first().alias("sid_cepr"),
|
|
1491
1466
|
)
|
|
1492
1467
|
)
|
|
1493
1468
|
|
|
1494
|
-
print(f"built lookup: {label}")
|
|
1495
|
-
|
|
1469
|
+
print(f"built lookup: {label} ({len(lookup):,} usable keys)")
|
|
1496
1470
|
return lookup
|
|
1497
1471
|
|
|
1498
|
-
def build_census_lookups(
|
|
1499
|
-
*,
|
|
1500
|
-
cmo_name: str,
|
|
1501
|
-
) -> dict[str, pl.DataFrame]:
|
|
1502
|
-
|
|
1503
|
-
try:
|
|
1504
|
-
import mappings
|
|
1505
|
-
except ImportError:
|
|
1506
|
-
import mapppings as mappings
|
|
1507
1472
|
|
|
1473
|
+
def build_census_lookups(*, cmo_name: str) -> dict[str, pl.DataFrame]:
|
|
1508
1474
|
annual_frames = []
|
|
1509
1475
|
|
|
1510
1476
|
for year in range(1994, 2023):
|
|
1511
|
-
|
|
1512
1477
|
path = CENSUS_STUDENTS / f"census_student_{year}.csv"
|
|
1513
1478
|
|
|
1514
1479
|
annual = (
|
|
@@ -1520,16 +1485,14 @@ def build_census_lookups(
|
|
|
1520
1485
|
ignore_errors=False,
|
|
1521
1486
|
)
|
|
1522
1487
|
.select(
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
"birthdate_imp",
|
|
1532
|
-
]
|
|
1488
|
+
"cmo_code",
|
|
1489
|
+
"sid_cepr",
|
|
1490
|
+
"fname_clean",
|
|
1491
|
+
"lname_clean",
|
|
1492
|
+
"mname_clean",
|
|
1493
|
+
"suff_clean",
|
|
1494
|
+
"birthdate_clean",
|
|
1495
|
+
"birthdate_imp",
|
|
1533
1496
|
)
|
|
1534
1497
|
.rename(
|
|
1535
1498
|
{
|
|
@@ -1542,92 +1505,105 @@ def build_census_lookups(
|
|
|
1542
1505
|
}
|
|
1543
1506
|
)
|
|
1544
1507
|
.with_columns(
|
|
1545
|
-
pl.col("cmo_code")
|
|
1546
|
-
.replace(mappings.CMO_CODE_TO_NAME)
|
|
1547
|
-
.alias("cmo_name")
|
|
1548
|
-
)
|
|
1549
|
-
.filter(
|
|
1550
|
-
pl.col("cmo_name") == cmo_name
|
|
1508
|
+
pl.col("cmo_code").replace(cmo_map).alias("cmo_name")
|
|
1551
1509
|
)
|
|
1510
|
+
.filter(pl.col("cmo_name") == cmo_name)
|
|
1552
1511
|
.with_columns(
|
|
1553
1512
|
*clean_name("fname"),
|
|
1554
1513
|
*clean_name("lname"),
|
|
1555
1514
|
*clean_other_name("mname"),
|
|
1556
1515
|
*clean_other_name("suffix"),
|
|
1557
|
-
*clean_dob(col="dob"),
|
|
1558
|
-
*clean_dob(col="dob_imp"),
|
|
1559
1516
|
)
|
|
1560
|
-
.
|
|
1561
|
-
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
|
|
1517
|
+
.with_columns(*clean_dob(col="dob"))
|
|
1518
|
+
.with_columns(*clean_dob(col="dob_imp"))
|
|
1519
|
+
.with_columns(
|
|
1520
|
+
_parse_dob_expr("dob_clean").alias("dob"),
|
|
1521
|
+
_parse_dob_expr("dob_imp_clean").alias("dob_imp"),
|
|
1565
1522
|
)
|
|
1566
|
-
.
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
|
|
1523
|
+
.select(
|
|
1524
|
+
"sid_cepr",
|
|
1525
|
+
pl.col("fname_clean").alias("fname"),
|
|
1526
|
+
pl.col("lname_clean").alias("lname"),
|
|
1527
|
+
pl.col("mname_clean").alias("mname"),
|
|
1528
|
+
pl.col("suffix_clean").alias("suffix"),
|
|
1529
|
+
"dob",
|
|
1530
|
+
"dob_imp",
|
|
1571
1531
|
)
|
|
1572
1532
|
)
|
|
1573
1533
|
|
|
1574
1534
|
annual_frames.append(annual)
|
|
1575
1535
|
|
|
1576
|
-
|
|
1577
|
-
# MATERIALIZE ONCE
|
|
1578
|
-
#
|
|
1536
|
+
census = pl.concat(annual_frames, how="vertical_relaxed").collect()
|
|
1579
1537
|
|
|
1580
|
-
census
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1538
|
+
print(f"census rows after CMO filter: {len(census):,}")
|
|
1539
|
+
|
|
1540
|
+
lookups = {}
|
|
1541
|
+
|
|
1542
|
+
lookups["exact"] = _build_lookup(
|
|
1543
|
+
census,
|
|
1544
|
+
fname_expr=pl.col("fname"),
|
|
1545
|
+
lname_expr=pl.col("lname"),
|
|
1546
|
+
dob_col="dob",
|
|
1547
|
+
label="exact",
|
|
1548
|
+
)
|
|
1549
|
+
|
|
1550
|
+
lookups["mname"] = _build_lookup(
|
|
1551
|
+
census,
|
|
1552
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=" "),
|
|
1553
|
+
lname_expr=pl.col("lname"),
|
|
1554
|
+
dob_col="dob",
|
|
1555
|
+
label="right fname + mname",
|
|
1586
1556
|
)
|
|
1587
1557
|
|
|
1588
|
-
|
|
1558
|
+
lookups["mname_lname"] = _build_lookup(
|
|
1559
|
+
census,
|
|
1560
|
+
fname_expr=pl.col("fname"),
|
|
1561
|
+
lname_expr=pl.concat_str([pl.col("mname"), pl.col("lname")], separator=" "),
|
|
1562
|
+
dob_col="dob",
|
|
1563
|
+
label="right mname + lname",
|
|
1564
|
+
)
|
|
1589
1565
|
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
|
|
1566
|
+
lookups["mname_nospace"] = _build_lookup(
|
|
1567
|
+
census,
|
|
1568
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=""),
|
|
1569
|
+
lname_expr=pl.col("lname"),
|
|
1570
|
+
dob_col="dob",
|
|
1571
|
+
label="right fname + mname no space",
|
|
1572
|
+
)
|
|
1593
1573
|
|
|
1594
|
-
|
|
1574
|
+
lookups["suffix"] = _build_lookup(
|
|
1595
1575
|
census,
|
|
1596
1576
|
fname_expr=pl.col("fname"),
|
|
1577
|
+
lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=" "),
|
|
1578
|
+
dob_col="dob",
|
|
1579
|
+
label="right lname + suffix",
|
|
1580
|
+
)
|
|
1581
|
+
|
|
1582
|
+
lookups["suffix_fname"] = _build_lookup(
|
|
1583
|
+
census,
|
|
1584
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=" "),
|
|
1597
1585
|
lname_expr=pl.col("lname"),
|
|
1598
1586
|
dob_col="dob",
|
|
1599
|
-
label="
|
|
1587
|
+
label="right fname + suffix",
|
|
1600
1588
|
)
|
|
1601
1589
|
|
|
1602
|
-
|
|
1590
|
+
lookups["suffix_fname_nospace"] = _build_lookup(
|
|
1603
1591
|
census,
|
|
1604
|
-
fname_expr=pl.concat_str(
|
|
1605
|
-
[
|
|
1606
|
-
pl.col("fname"),
|
|
1607
|
-
pl.col("mname"),
|
|
1608
|
-
],
|
|
1609
|
-
separator=" ",
|
|
1610
|
-
),
|
|
1592
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=""),
|
|
1611
1593
|
lname_expr=pl.col("lname"),
|
|
1612
1594
|
dob_col="dob",
|
|
1613
|
-
label="
|
|
1595
|
+
label="right fname + suffix no space",
|
|
1614
1596
|
)
|
|
1615
1597
|
|
|
1616
|
-
|
|
1598
|
+
lookups["suffix_lname_nospace"] = _build_lookup(
|
|
1617
1599
|
census,
|
|
1618
1600
|
fname_expr=pl.col("fname"),
|
|
1619
|
-
lname_expr=pl.concat_str(
|
|
1620
|
-
[
|
|
1621
|
-
pl.col("lname"),
|
|
1622
|
-
pl.col("suffix"),
|
|
1623
|
-
],
|
|
1624
|
-
separator=" ",
|
|
1625
|
-
),
|
|
1601
|
+
lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=""),
|
|
1626
1602
|
dob_col="dob",
|
|
1627
|
-
label="suffix",
|
|
1603
|
+
label="right lname + suffix no space",
|
|
1628
1604
|
)
|
|
1629
1605
|
|
|
1630
|
-
|
|
1606
|
+
lookups["dob_imp"] = _build_lookup(
|
|
1631
1607
|
census,
|
|
1632
1608
|
fname_expr=pl.col("fname"),
|
|
1633
1609
|
lname_expr=pl.col("lname"),
|
|
@@ -1635,12 +1611,22 @@ def build_census_lookups(
|
|
|
1635
1611
|
label="dob_imp",
|
|
1636
1612
|
)
|
|
1637
1613
|
|
|
1638
|
-
|
|
1639
|
-
"
|
|
1640
|
-
"
|
|
1641
|
-
"
|
|
1642
|
-
"
|
|
1643
|
-
|
|
1614
|
+
for offset, key in [
|
|
1615
|
+
("-1y", "dob_imp_minus_1"),
|
|
1616
|
+
("1y", "dob_imp_plus_1"),
|
|
1617
|
+
("-2y", "dob_imp_minus_2"),
|
|
1618
|
+
("2y", "dob_imp_plus_2"),
|
|
1619
|
+
]:
|
|
1620
|
+
lookups[key] = _build_lookup(
|
|
1621
|
+
census.with_columns(pl.col("dob_imp").dt.offset_by(offset).alias(key)),
|
|
1622
|
+
fname_expr=pl.col("fname"),
|
|
1623
|
+
lname_expr=pl.col("lname"),
|
|
1624
|
+
dob_col=key,
|
|
1625
|
+
label=key,
|
|
1626
|
+
)
|
|
1627
|
+
|
|
1628
|
+
return lookups
|
|
1629
|
+
|
|
1644
1630
|
|
|
1645
1631
|
def _run_match_stage(
|
|
1646
1632
|
unmatched: pl.DataFrame,
|
|
@@ -1651,97 +1637,61 @@ def _run_match_stage(
|
|
|
1651
1637
|
dob_expr: pl.Expr,
|
|
1652
1638
|
label: str,
|
|
1653
1639
|
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
|
1654
|
-
|
|
1655
1640
|
before = len(unmatched)
|
|
1656
1641
|
|
|
1657
1642
|
stage = (
|
|
1658
1643
|
unmatched
|
|
1659
1644
|
.with_columns(
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
dob_expr.alias("_dob_key"),
|
|
1664
|
-
]
|
|
1645
|
+
fname_expr.alias("_fname_key"),
|
|
1646
|
+
lname_expr.alias("_lname_key"),
|
|
1647
|
+
dob_expr.alias("_dob_key"),
|
|
1665
1648
|
)
|
|
1666
1649
|
.join(
|
|
1667
1650
|
lookup,
|
|
1668
|
-
on=[
|
|
1669
|
-
"_fname_key",
|
|
1670
|
-
"_lname_key",
|
|
1671
|
-
"_dob_key",
|
|
1672
|
-
],
|
|
1651
|
+
on=["_fname_key", "_lname_key", "_dob_key"],
|
|
1673
1652
|
how="left",
|
|
1674
1653
|
validate="m:1",
|
|
1675
1654
|
)
|
|
1676
|
-
.drop(
|
|
1677
|
-
[
|
|
1678
|
-
"_fname_key",
|
|
1679
|
-
"_lname_key",
|
|
1680
|
-
"_dob_key",
|
|
1681
|
-
]
|
|
1682
|
-
)
|
|
1655
|
+
.drop(["_fname_key", "_lname_key", "_dob_key"])
|
|
1683
1656
|
)
|
|
1684
1657
|
|
|
1685
|
-
matched = (
|
|
1686
|
-
|
|
1687
|
-
.filter(
|
|
1688
|
-
pl.col("sid_cepr").is_not_null()
|
|
1689
|
-
)
|
|
1690
|
-
)
|
|
1691
|
-
|
|
1692
|
-
unmatched = (
|
|
1693
|
-
stage
|
|
1694
|
-
.filter(
|
|
1695
|
-
pl.col("sid_cepr").is_null()
|
|
1696
|
-
)
|
|
1697
|
-
.drop("sid_cepr")
|
|
1698
|
-
)
|
|
1658
|
+
matched = stage.filter(pl.col("sid_cepr").is_not_null())
|
|
1659
|
+
unmatched = stage.filter(pl.col("sid_cepr").is_null()).drop("sid_cepr")
|
|
1699
1660
|
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
print(
|
|
1703
|
-
f"{label}: matched {added:,}/{before:,}"
|
|
1704
|
-
)
|
|
1661
|
+
print(f"{label}: matched {len(matched):,}/{before:,}")
|
|
1705
1662
|
|
|
1706
1663
|
return matched, unmatched
|
|
1707
1664
|
|
|
1665
|
+
|
|
1708
1666
|
def lookup_sid_cepr(
|
|
1709
|
-
frame
|
|
1667
|
+
frame,
|
|
1710
1668
|
*,
|
|
1711
1669
|
cols: Mapping[str, str],
|
|
1712
1670
|
lookups: dict[str, pl.DataFrame],
|
|
1713
|
-
)
|
|
1714
|
-
|
|
1671
|
+
):
|
|
1715
1672
|
is_lazy = isinstance(frame, pl.LazyFrame)
|
|
1673
|
+
current = frame.collect() if is_lazy else frame
|
|
1716
1674
|
|
|
1717
|
-
|
|
1718
|
-
frame.collect()
|
|
1719
|
-
if is_lazy
|
|
1720
|
-
else frame
|
|
1721
|
-
)
|
|
1675
|
+
input_columns = current.columns
|
|
1722
1676
|
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1677
|
+
current = current.with_row_index("_row_id")
|
|
1678
|
+
|
|
1679
|
+
current = current.with_columns(
|
|
1680
|
+
*clean_name(cols["fname"]),
|
|
1681
|
+
*clean_name(cols["lname"]),
|
|
1682
|
+
)
|
|
1726
1683
|
|
|
1727
1684
|
current = (
|
|
1728
1685
|
current
|
|
1686
|
+
.with_columns(*clean_dob(col=cols["dob"]))
|
|
1729
1687
|
.with_columns(
|
|
1730
|
-
|
|
1731
|
-
*clean_name(cols["lname"]),
|
|
1732
|
-
*clean_dob(col=cols["dob"]),
|
|
1688
|
+
_parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
|
|
1733
1689
|
)
|
|
1734
1690
|
)
|
|
1735
1691
|
|
|
1736
1692
|
matched_frames = []
|
|
1737
|
-
|
|
1738
1693
|
unmatched = current
|
|
1739
1694
|
|
|
1740
|
-
#
|
|
1741
|
-
# STAGE 1
|
|
1742
|
-
# EXACT
|
|
1743
|
-
#
|
|
1744
|
-
|
|
1745
1695
|
matched, unmatched = _run_match_stage(
|
|
1746
1696
|
unmatched,
|
|
1747
1697
|
lookup=lookups["exact"],
|
|
@@ -1750,102 +1700,144 @@ def lookup_sid_cepr(
|
|
|
1750
1700
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1751
1701
|
label="exact",
|
|
1752
1702
|
)
|
|
1753
|
-
|
|
1754
1703
|
matched_frames.append(matched)
|
|
1755
1704
|
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
.with_columns(
|
|
1766
|
-
*clean_other_name(cols["mname"])
|
|
1767
|
-
)
|
|
1768
|
-
)
|
|
1769
|
-
|
|
1705
|
+
for label, lookup in [
|
|
1706
|
+
("left exact -> right fname + mname", lookups["mname"]),
|
|
1707
|
+
("left exact -> right mname + lname", lookups["mname_lname"]),
|
|
1708
|
+
("left exact -> right fname + mname no space", lookups["mname_nospace"]),
|
|
1709
|
+
("left exact -> right lname + suffix", lookups["suffix"]),
|
|
1710
|
+
("left exact -> right fname + suffix", lookups["suffix_fname"]),
|
|
1711
|
+
("left exact -> right fname + suffix no space", lookups["suffix_fname_nospace"]),
|
|
1712
|
+
("left exact -> right lname + suffix no space", lookups["suffix_lname_nospace"]),
|
|
1713
|
+
]:
|
|
1770
1714
|
matched, unmatched = _run_match_stage(
|
|
1771
1715
|
unmatched,
|
|
1772
|
-
lookup=
|
|
1773
|
-
fname_expr=pl.
|
|
1774
|
-
[
|
|
1775
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1776
|
-
pl.col(f"{cols['mname']}_clean"),
|
|
1777
|
-
],
|
|
1778
|
-
separator=" ",
|
|
1779
|
-
),
|
|
1716
|
+
lookup=lookup,
|
|
1717
|
+
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1780
1718
|
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1781
1719
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1782
|
-
label=
|
|
1720
|
+
label=label,
|
|
1783
1721
|
)
|
|
1784
|
-
|
|
1785
1722
|
matched_frames.append(matched)
|
|
1786
1723
|
|
|
1787
|
-
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1724
|
+
if "mname" in cols:
|
|
1725
|
+
unmatched = unmatched.with_columns(*clean_other_name(cols["mname"]))
|
|
1726
|
+
|
|
1727
|
+
for label, fname_expr, lname_expr in [
|
|
1728
|
+
(
|
|
1729
|
+
"left fname + mname -> right exact",
|
|
1730
|
+
pl.concat_str(
|
|
1731
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
|
|
1732
|
+
separator=" ",
|
|
1733
|
+
),
|
|
1734
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1735
|
+
),
|
|
1736
|
+
(
|
|
1737
|
+
"left mname + lname -> right exact",
|
|
1738
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1739
|
+
pl.concat_str(
|
|
1740
|
+
[pl.col(f"{cols['mname']}_clean"), pl.col(f"{cols['lname']}_clean")],
|
|
1741
|
+
separator=" ",
|
|
1742
|
+
),
|
|
1743
|
+
),
|
|
1744
|
+
(
|
|
1745
|
+
"left fname + mname no space -> right exact",
|
|
1746
|
+
pl.concat_str(
|
|
1747
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
|
|
1748
|
+
separator="",
|
|
1749
|
+
),
|
|
1750
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1751
|
+
),
|
|
1752
|
+
]:
|
|
1753
|
+
matched, unmatched = _run_match_stage(
|
|
1754
|
+
unmatched,
|
|
1755
|
+
lookup=lookups["exact"],
|
|
1756
|
+
fname_expr=fname_expr,
|
|
1757
|
+
lname_expr=lname_expr,
|
|
1758
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1759
|
+
label=label,
|
|
1760
|
+
)
|
|
1761
|
+
matched_frames.append(matched)
|
|
1791
1762
|
|
|
1792
1763
|
if "suffix" in cols:
|
|
1764
|
+
unmatched = unmatched.with_columns(*clean_other_name(cols["suffix"]))
|
|
1793
1765
|
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1766
|
+
for label, fname_expr, lname_expr in [
|
|
1767
|
+
(
|
|
1768
|
+
"left fname + suffix -> right exact",
|
|
1769
|
+
pl.concat_str(
|
|
1770
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1771
|
+
separator=" ",
|
|
1772
|
+
),
|
|
1773
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1774
|
+
),
|
|
1775
|
+
(
|
|
1776
|
+
"left fname + suffix no space -> right exact",
|
|
1777
|
+
pl.concat_str(
|
|
1778
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1779
|
+
separator="",
|
|
1780
|
+
),
|
|
1781
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1782
|
+
),
|
|
1783
|
+
(
|
|
1784
|
+
"left lname + suffix -> right exact",
|
|
1785
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1786
|
+
pl.concat_str(
|
|
1787
|
+
[pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1788
|
+
separator=" ",
|
|
1789
|
+
),
|
|
1790
|
+
),
|
|
1791
|
+
(
|
|
1792
|
+
"left lname + suffix no space -> right exact",
|
|
1793
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1794
|
+
pl.concat_str(
|
|
1795
|
+
[pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1796
|
+
separator="",
|
|
1797
|
+
),
|
|
1798
|
+
),
|
|
1799
|
+
]:
|
|
1800
|
+
matched, unmatched = _run_match_stage(
|
|
1801
|
+
unmatched,
|
|
1802
|
+
lookup=lookups["exact"],
|
|
1803
|
+
fname_expr=fname_expr,
|
|
1804
|
+
lname_expr=lname_expr,
|
|
1805
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1806
|
+
label=label,
|
|
1798
1807
|
)
|
|
1799
|
-
|
|
1800
|
-
|
|
1808
|
+
matched_frames.append(matched)
|
|
1809
|
+
|
|
1810
|
+
for key in [
|
|
1811
|
+
"dob_imp",
|
|
1812
|
+
"dob_imp_minus_1",
|
|
1813
|
+
"dob_imp_plus_1",
|
|
1814
|
+
"dob_imp_minus_2",
|
|
1815
|
+
"dob_imp_plus_2",
|
|
1816
|
+
]:
|
|
1801
1817
|
matched, unmatched = _run_match_stage(
|
|
1802
1818
|
unmatched,
|
|
1803
|
-
lookup=lookups[
|
|
1819
|
+
lookup=lookups[key],
|
|
1804
1820
|
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1805
|
-
lname_expr=pl.
|
|
1806
|
-
[
|
|
1807
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1808
|
-
pl.col(f"{cols['suffix']}_clean"),
|
|
1809
|
-
],
|
|
1810
|
-
separator=" ",
|
|
1811
|
-
),
|
|
1821
|
+
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1812
1822
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1813
|
-
label=
|
|
1823
|
+
label=key,
|
|
1814
1824
|
)
|
|
1815
|
-
|
|
1816
1825
|
matched_frames.append(matched)
|
|
1817
1826
|
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
matched, unmatched = _run_match_stage(
|
|
1824
|
-
unmatched,
|
|
1825
|
-
lookup=lookups["dob_imp"],
|
|
1826
|
-
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1827
|
-
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1828
|
-
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1829
|
-
label="dob_imp",
|
|
1827
|
+
result = (
|
|
1828
|
+
pl.concat(matched_frames + [unmatched], how="diagonal_relaxed")
|
|
1829
|
+
.sort("_row_id")
|
|
1830
|
+
.drop("_row_id")
|
|
1830
1831
|
)
|
|
1831
1832
|
|
|
1832
|
-
|
|
1833
|
+
if "sid_cepr" not in input_columns:
|
|
1834
|
+
input_columns = input_columns + ["sid_cepr"]
|
|
1833
1835
|
|
|
1834
|
-
|
|
1835
|
-
# FINAL
|
|
1836
|
-
#
|
|
1836
|
+
result = result.select(input_columns)
|
|
1837
1837
|
|
|
1838
|
-
|
|
1839
|
-
matched_frames + [unmatched],
|
|
1840
|
-
how="diagonal_relaxed",
|
|
1841
|
-
)
|
|
1842
|
-
|
|
1843
|
-
print(
|
|
1844
|
-
f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
|
|
1845
|
-
)
|
|
1838
|
+
print(f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}")
|
|
1846
1839
|
|
|
1847
1840
|
return result
|
|
1848
|
-
|
|
1849
1841
|
#
|
|
1850
1842
|
|
|
1851
1843
|
# EXAMPLE USAGE
|
|
File without changes
|
|
File without changes
|
|
File without changes
|