ltc-code 0.1.6__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ltc_code-0.1.6 → ltc_code-0.1.7}/PKG-INFO +1 -1
- {ltc_code-0.1.6 → ltc_code-0.1.7}/pyproject.toml +1 -1
- {ltc_code-0.1.6 → ltc_code-0.1.7}/src/ltc_code/may27.py +174 -251
- {ltc_code-0.1.6 → ltc_code-0.1.7}/README.md +0 -0
- {ltc_code-0.1.6 → ltc_code-0.1.7}/src/ltc_code/__init__.py +0 -0
- {ltc_code-0.1.6 → ltc_code-0.1.7}/src/ltc_code/polars_dates.py +0 -0
|
@@ -1427,6 +1427,16 @@ def lookup_sid_cepr(
|
|
|
1427
1427
|
|
|
1428
1428
|
|
|
1429
1429
|
|
|
1430
|
+
def _parse_dob_expr(col: str) -> pl.Expr:
|
|
1431
|
+
return pl.coalesce(
|
|
1432
|
+
[
|
|
1433
|
+
pl.col(col).cast(pl.String).str.strptime(pl.Date, "%m/%d/%Y", strict=False),
|
|
1434
|
+
pl.col(col).cast(pl.String).str.strptime(pl.Date, "%m/%d/%y", strict=False),
|
|
1435
|
+
pl.col(col).cast(pl.String).str.strptime(pl.Date, "%Y-%m-%d", strict=False),
|
|
1436
|
+
]
|
|
1437
|
+
)
|
|
1438
|
+
|
|
1439
|
+
|
|
1430
1440
|
def _build_lookup(
|
|
1431
1441
|
census: pl.DataFrame,
|
|
1432
1442
|
*,
|
|
@@ -1435,80 +1445,35 @@ def _build_lookup(
|
|
|
1435
1445
|
dob_col: str,
|
|
1436
1446
|
label: str,
|
|
1437
1447
|
) -> pl.DataFrame:
|
|
1438
|
-
"""
|
|
1439
|
-
Build a deterministic SID lookup table.
|
|
1440
|
-
|
|
1441
|
-
Output schema:
|
|
1442
|
-
_fname_key
|
|
1443
|
-
_lname_key
|
|
1444
|
-
_dob_key
|
|
1445
|
-
sid_cepr
|
|
1446
|
-
|
|
1447
|
-
Ambiguous keys are removed.
|
|
1448
|
-
"""
|
|
1449
|
-
|
|
1450
1448
|
lookup = (
|
|
1451
1449
|
census
|
|
1452
1450
|
.select(
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
pl.col("sid_cepr"),
|
|
1458
|
-
]
|
|
1459
|
-
)
|
|
1460
|
-
.drop_nulls(
|
|
1461
|
-
[
|
|
1462
|
-
"_fname_key",
|
|
1463
|
-
"_lname_key",
|
|
1464
|
-
"_dob_key",
|
|
1465
|
-
"sid_cepr",
|
|
1466
|
-
]
|
|
1467
|
-
)
|
|
1468
|
-
.group_by(
|
|
1469
|
-
[
|
|
1470
|
-
"_fname_key",
|
|
1471
|
-
"_lname_key",
|
|
1472
|
-
"_dob_key",
|
|
1473
|
-
]
|
|
1474
|
-
)
|
|
1475
|
-
.agg(
|
|
1476
|
-
pl.col("sid_cepr").unique().alias("_sids")
|
|
1477
|
-
)
|
|
1478
|
-
.with_columns(
|
|
1479
|
-
pl.col("_sids").list.len().alias("_sid_count")
|
|
1480
|
-
)
|
|
1481
|
-
.filter(
|
|
1482
|
-
pl.col("_sid_count") == 1
|
|
1451
|
+
fname_expr.alias("_fname_key"),
|
|
1452
|
+
lname_expr.alias("_lname_key"),
|
|
1453
|
+
pl.col(dob_col).alias("_dob_key"),
|
|
1454
|
+
pl.col("sid_cepr"),
|
|
1483
1455
|
)
|
|
1456
|
+
.drop_nulls(["_fname_key", "_lname_key", "_dob_key", "sid_cepr"])
|
|
1457
|
+
.group_by(["_fname_key", "_lname_key", "_dob_key"])
|
|
1458
|
+
.agg(pl.col("sid_cepr").unique().alias("_sids"))
|
|
1459
|
+
.with_columns(pl.col("_sids").list.len().alias("_sid_count"))
|
|
1460
|
+
.filter(pl.col("_sid_count") == 1)
|
|
1484
1461
|
.select(
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
pl.col("_sids").list.first().alias("sid_cepr"),
|
|
1490
|
-
]
|
|
1462
|
+
"_fname_key",
|
|
1463
|
+
"_lname_key",
|
|
1464
|
+
"_dob_key",
|
|
1465
|
+
pl.col("_sids").list.first().alias("sid_cepr"),
|
|
1491
1466
|
)
|
|
1492
1467
|
)
|
|
1493
1468
|
|
|
1494
|
-
print(f"built lookup: {label}")
|
|
1495
|
-
|
|
1469
|
+
print(f"built lookup: {label} ({len(lookup):,} usable keys)")
|
|
1496
1470
|
return lookup
|
|
1497
1471
|
|
|
1498
|
-
def build_census_lookups(
|
|
1499
|
-
*,
|
|
1500
|
-
cmo_name: str,
|
|
1501
|
-
) -> dict[str, pl.DataFrame]:
|
|
1502
|
-
|
|
1503
|
-
try:
|
|
1504
|
-
import mappings
|
|
1505
|
-
except ImportError:
|
|
1506
|
-
import mapppings as mappings
|
|
1507
1472
|
|
|
1473
|
+
def build_census_lookups(*, cmo_name: str) -> dict[str, pl.DataFrame]:
|
|
1508
1474
|
annual_frames = []
|
|
1509
1475
|
|
|
1510
1476
|
for year in range(1994, 2023):
|
|
1511
|
-
|
|
1512
1477
|
path = CENSUS_STUDENTS / f"census_student_{year}.csv"
|
|
1513
1478
|
|
|
1514
1479
|
annual = (
|
|
@@ -1520,16 +1485,14 @@ def build_census_lookups(
|
|
|
1520
1485
|
ignore_errors=False,
|
|
1521
1486
|
)
|
|
1522
1487
|
.select(
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
"birthdate_imp",
|
|
1532
|
-
]
|
|
1488
|
+
"cmo_code",
|
|
1489
|
+
"sid_cepr",
|
|
1490
|
+
"fname_clean",
|
|
1491
|
+
"lname_clean",
|
|
1492
|
+
"mname_clean",
|
|
1493
|
+
"suff_clean",
|
|
1494
|
+
"birthdate_clean",
|
|
1495
|
+
"birthdate_imp",
|
|
1533
1496
|
)
|
|
1534
1497
|
.rename(
|
|
1535
1498
|
{
|
|
@@ -1542,92 +1505,105 @@ def build_census_lookups(
|
|
|
1542
1505
|
}
|
|
1543
1506
|
)
|
|
1544
1507
|
.with_columns(
|
|
1545
|
-
pl.col("cmo_code")
|
|
1546
|
-
.replace(mappings.CMO_CODE_TO_NAME)
|
|
1547
|
-
.alias("cmo_name")
|
|
1548
|
-
)
|
|
1549
|
-
.filter(
|
|
1550
|
-
pl.col("cmo_name") == cmo_name
|
|
1508
|
+
pl.col("cmo_code").replace(cmo_map).alias("cmo_name")
|
|
1551
1509
|
)
|
|
1510
|
+
.filter(pl.col("cmo_name") == cmo_name)
|
|
1552
1511
|
.with_columns(
|
|
1553
1512
|
*clean_name("fname"),
|
|
1554
1513
|
*clean_name("lname"),
|
|
1555
1514
|
*clean_other_name("mname"),
|
|
1556
1515
|
*clean_other_name("suffix"),
|
|
1557
|
-
*clean_dob(col="dob"),
|
|
1558
|
-
*clean_dob(col="dob_imp"),
|
|
1559
1516
|
)
|
|
1560
|
-
.
|
|
1561
|
-
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
|
|
1517
|
+
.with_columns(*clean_dob(col="dob"))
|
|
1518
|
+
.with_columns(*clean_dob(col="dob_imp"))
|
|
1519
|
+
.with_columns(
|
|
1520
|
+
_parse_dob_expr("dob_clean").alias("dob"),
|
|
1521
|
+
_parse_dob_expr("dob_imp_clean").alias("dob_imp"),
|
|
1565
1522
|
)
|
|
1566
|
-
.
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
|
|
1523
|
+
.select(
|
|
1524
|
+
"sid_cepr",
|
|
1525
|
+
pl.col("fname_clean").alias("fname"),
|
|
1526
|
+
pl.col("lname_clean").alias("lname"),
|
|
1527
|
+
pl.col("mname_clean").alias("mname"),
|
|
1528
|
+
pl.col("suffix_clean").alias("suffix"),
|
|
1529
|
+
"dob",
|
|
1530
|
+
"dob_imp",
|
|
1571
1531
|
)
|
|
1572
1532
|
)
|
|
1573
1533
|
|
|
1574
1534
|
annual_frames.append(annual)
|
|
1575
1535
|
|
|
1576
|
-
|
|
1577
|
-
# MATERIALIZE ONCE
|
|
1578
|
-
#
|
|
1536
|
+
census = pl.concat(annual_frames, how="vertical_relaxed").collect()
|
|
1579
1537
|
|
|
1580
|
-
census
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1538
|
+
print(f"census rows after CMO filter: {len(census):,}")
|
|
1539
|
+
|
|
1540
|
+
lookups = {}
|
|
1541
|
+
|
|
1542
|
+
lookups["exact"] = _build_lookup(
|
|
1543
|
+
census,
|
|
1544
|
+
fname_expr=pl.col("fname"),
|
|
1545
|
+
lname_expr=pl.col("lname"),
|
|
1546
|
+
dob_col="dob",
|
|
1547
|
+
label="exact",
|
|
1548
|
+
)
|
|
1549
|
+
|
|
1550
|
+
lookups["mname"] = _build_lookup(
|
|
1551
|
+
census,
|
|
1552
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=" "),
|
|
1553
|
+
lname_expr=pl.col("lname"),
|
|
1554
|
+
dob_col="dob",
|
|
1555
|
+
label="right fname + mname",
|
|
1586
1556
|
)
|
|
1587
1557
|
|
|
1588
|
-
|
|
1558
|
+
lookups["mname_lname"] = _build_lookup(
|
|
1559
|
+
census,
|
|
1560
|
+
fname_expr=pl.col("fname"),
|
|
1561
|
+
lname_expr=pl.concat_str([pl.col("mname"), pl.col("lname")], separator=" "),
|
|
1562
|
+
dob_col="dob",
|
|
1563
|
+
label="right mname + lname",
|
|
1564
|
+
)
|
|
1589
1565
|
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
|
|
1566
|
+
lookups["mname_nospace"] = _build_lookup(
|
|
1567
|
+
census,
|
|
1568
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=""),
|
|
1569
|
+
lname_expr=pl.col("lname"),
|
|
1570
|
+
dob_col="dob",
|
|
1571
|
+
label="right fname + mname no space",
|
|
1572
|
+
)
|
|
1593
1573
|
|
|
1594
|
-
|
|
1574
|
+
lookups["suffix"] = _build_lookup(
|
|
1595
1575
|
census,
|
|
1596
1576
|
fname_expr=pl.col("fname"),
|
|
1577
|
+
lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=" "),
|
|
1578
|
+
dob_col="dob",
|
|
1579
|
+
label="right lname + suffix",
|
|
1580
|
+
)
|
|
1581
|
+
|
|
1582
|
+
lookups["suffix_fname"] = _build_lookup(
|
|
1583
|
+
census,
|
|
1584
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=" "),
|
|
1597
1585
|
lname_expr=pl.col("lname"),
|
|
1598
1586
|
dob_col="dob",
|
|
1599
|
-
label="
|
|
1587
|
+
label="right fname + suffix",
|
|
1600
1588
|
)
|
|
1601
1589
|
|
|
1602
|
-
|
|
1590
|
+
lookups["suffix_fname_nospace"] = _build_lookup(
|
|
1603
1591
|
census,
|
|
1604
|
-
fname_expr=pl.concat_str(
|
|
1605
|
-
[
|
|
1606
|
-
pl.col("fname"),
|
|
1607
|
-
pl.col("mname"),
|
|
1608
|
-
],
|
|
1609
|
-
separator=" ",
|
|
1610
|
-
),
|
|
1592
|
+
fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=""),
|
|
1611
1593
|
lname_expr=pl.col("lname"),
|
|
1612
1594
|
dob_col="dob",
|
|
1613
|
-
label="
|
|
1595
|
+
label="right fname + suffix no space",
|
|
1614
1596
|
)
|
|
1615
1597
|
|
|
1616
|
-
|
|
1598
|
+
lookups["suffix_lname_nospace"] = _build_lookup(
|
|
1617
1599
|
census,
|
|
1618
1600
|
fname_expr=pl.col("fname"),
|
|
1619
|
-
lname_expr=pl.concat_str(
|
|
1620
|
-
[
|
|
1621
|
-
pl.col("lname"),
|
|
1622
|
-
pl.col("suffix"),
|
|
1623
|
-
],
|
|
1624
|
-
separator=" ",
|
|
1625
|
-
),
|
|
1601
|
+
lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=""),
|
|
1626
1602
|
dob_col="dob",
|
|
1627
|
-
label="suffix",
|
|
1603
|
+
label="right lname + suffix no space",
|
|
1628
1604
|
)
|
|
1629
1605
|
|
|
1630
|
-
|
|
1606
|
+
lookups["dob_imp"] = _build_lookup(
|
|
1631
1607
|
census,
|
|
1632
1608
|
fname_expr=pl.col("fname"),
|
|
1633
1609
|
lname_expr=pl.col("lname"),
|
|
@@ -1635,12 +1611,22 @@ def build_census_lookups(
|
|
|
1635
1611
|
label="dob_imp",
|
|
1636
1612
|
)
|
|
1637
1613
|
|
|
1638
|
-
|
|
1639
|
-
"
|
|
1640
|
-
"
|
|
1641
|
-
"
|
|
1642
|
-
"
|
|
1643
|
-
|
|
1614
|
+
for offset, key in [
|
|
1615
|
+
("-1y", "dob_imp_minus_1"),
|
|
1616
|
+
("1y", "dob_imp_plus_1"),
|
|
1617
|
+
("-2y", "dob_imp_minus_2"),
|
|
1618
|
+
("2y", "dob_imp_plus_2"),
|
|
1619
|
+
]:
|
|
1620
|
+
lookups[key] = _build_lookup(
|
|
1621
|
+
census.with_columns(pl.col("dob_imp").dt.offset_by(offset).alias(key)),
|
|
1622
|
+
fname_expr=pl.col("fname"),
|
|
1623
|
+
lname_expr=pl.col("lname"),
|
|
1624
|
+
dob_col=key,
|
|
1625
|
+
label=key,
|
|
1626
|
+
)
|
|
1627
|
+
|
|
1628
|
+
return lookups
|
|
1629
|
+
|
|
1644
1630
|
|
|
1645
1631
|
def _run_match_stage(
|
|
1646
1632
|
unmatched: pl.DataFrame,
|
|
@@ -1651,84 +1637,61 @@ def _run_match_stage(
|
|
|
1651
1637
|
dob_expr: pl.Expr,
|
|
1652
1638
|
label: str,
|
|
1653
1639
|
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
|
1654
|
-
|
|
1655
1640
|
before = len(unmatched)
|
|
1656
1641
|
|
|
1657
1642
|
stage = (
|
|
1658
1643
|
unmatched
|
|
1659
1644
|
.with_columns(
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
dob_expr.alias("_dob_key"),
|
|
1664
|
-
]
|
|
1645
|
+
fname_expr.alias("_fname_key"),
|
|
1646
|
+
lname_expr.alias("_lname_key"),
|
|
1647
|
+
dob_expr.alias("_dob_key"),
|
|
1665
1648
|
)
|
|
1666
1649
|
.join(
|
|
1667
1650
|
lookup,
|
|
1668
|
-
on=[
|
|
1669
|
-
"_fname_key",
|
|
1670
|
-
"_lname_key",
|
|
1671
|
-
"_dob_key",
|
|
1672
|
-
],
|
|
1651
|
+
on=["_fname_key", "_lname_key", "_dob_key"],
|
|
1673
1652
|
how="left",
|
|
1674
1653
|
validate="m:1",
|
|
1675
1654
|
)
|
|
1676
|
-
.drop(
|
|
1677
|
-
[
|
|
1678
|
-
"_fname_key",
|
|
1679
|
-
"_lname_key",
|
|
1680
|
-
"_dob_key",
|
|
1681
|
-
]
|
|
1682
|
-
)
|
|
1683
|
-
)
|
|
1684
|
-
|
|
1685
|
-
matched = (
|
|
1686
|
-
stage
|
|
1687
|
-
.filter(
|
|
1688
|
-
pl.col("sid_cepr").is_not_null()
|
|
1689
|
-
)
|
|
1690
|
-
)
|
|
1691
|
-
|
|
1692
|
-
unmatched = (
|
|
1693
|
-
stage
|
|
1694
|
-
.filter(
|
|
1695
|
-
pl.col("sid_cepr").is_null()
|
|
1696
|
-
)
|
|
1697
|
-
.drop("sid_cepr")
|
|
1655
|
+
.drop(["_fname_key", "_lname_key", "_dob_key"])
|
|
1698
1656
|
)
|
|
1699
1657
|
|
|
1700
|
-
|
|
1658
|
+
matched = stage.filter(pl.col("sid_cepr").is_not_null())
|
|
1659
|
+
unmatched = stage.filter(pl.col("sid_cepr").is_null()).drop("sid_cepr")
|
|
1701
1660
|
|
|
1702
|
-
print(
|
|
1703
|
-
f"{label}: matched {added:,}/{before:,}"
|
|
1704
|
-
)
|
|
1661
|
+
print(f"{label}: matched {len(matched):,}/{before:,}")
|
|
1705
1662
|
|
|
1706
1663
|
return matched, unmatched
|
|
1707
1664
|
|
|
1665
|
+
|
|
1708
1666
|
def lookup_sid_cepr(
|
|
1709
|
-
frame
|
|
1667
|
+
frame,
|
|
1710
1668
|
*,
|
|
1711
1669
|
cols: Mapping[str, str],
|
|
1712
1670
|
lookups: dict[str, pl.DataFrame],
|
|
1713
|
-
)
|
|
1714
|
-
|
|
1671
|
+
):
|
|
1715
1672
|
is_lazy = isinstance(frame, pl.LazyFrame)
|
|
1716
1673
|
current = frame.collect() if is_lazy else frame
|
|
1717
1674
|
|
|
1675
|
+
input_columns = current.columns
|
|
1676
|
+
|
|
1677
|
+
current = current.with_row_index("_row_id")
|
|
1678
|
+
|
|
1718
1679
|
current = current.with_columns(
|
|
1719
1680
|
*clean_name(cols["fname"]),
|
|
1720
1681
|
*clean_name(cols["lname"]),
|
|
1721
|
-
*clean_dob(col=cols["dob"]),
|
|
1722
1682
|
)
|
|
1723
1683
|
|
|
1724
|
-
current =
|
|
1725
|
-
|
|
1684
|
+
current = (
|
|
1685
|
+
current
|
|
1686
|
+
.with_columns(*clean_dob(col=cols["dob"]))
|
|
1687
|
+
.with_columns(
|
|
1688
|
+
_parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
|
|
1689
|
+
)
|
|
1726
1690
|
)
|
|
1727
1691
|
|
|
1728
1692
|
matched_frames = []
|
|
1729
1693
|
unmatched = current
|
|
1730
1694
|
|
|
1731
|
-
# exact
|
|
1732
1695
|
matched, unmatched = _run_match_stage(
|
|
1733
1696
|
unmatched,
|
|
1734
1697
|
lookup=lookups["exact"],
|
|
@@ -1739,34 +1702,31 @@ def lookup_sid_cepr(
|
|
|
1739
1702
|
)
|
|
1740
1703
|
matched_frames.append(matched)
|
|
1741
1704
|
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1705
|
+
for label, lookup in [
|
|
1706
|
+
("left exact -> right fname + mname", lookups["mname"]),
|
|
1707
|
+
("left exact -> right mname + lname", lookups["mname_lname"]),
|
|
1708
|
+
("left exact -> right fname + mname no space", lookups["mname_nospace"]),
|
|
1709
|
+
("left exact -> right lname + suffix", lookups["suffix"]),
|
|
1710
|
+
("left exact -> right fname + suffix", lookups["suffix_fname"]),
|
|
1711
|
+
("left exact -> right fname + suffix no space", lookups["suffix_fname_nospace"]),
|
|
1712
|
+
("left exact -> right lname + suffix no space", lookups["suffix_lname_nospace"]),
|
|
1713
|
+
]:
|
|
1714
|
+
matched, unmatched = _run_match_stage(
|
|
1715
|
+
unmatched,
|
|
1716
|
+
lookup=lookup,
|
|
1717
|
+
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1718
|
+
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1719
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1720
|
+
label=label,
|
|
1746
1721
|
)
|
|
1722
|
+
matched_frames.append(matched)
|
|
1747
1723
|
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1753
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1754
|
-
),
|
|
1755
|
-
(
|
|
1756
|
-
"left exact -> right mname + lname",
|
|
1757
|
-
lookups["mname_lname"],
|
|
1758
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1759
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1760
|
-
),
|
|
1761
|
-
(
|
|
1762
|
-
"left exact -> right fname + mname no space",
|
|
1763
|
-
lookups["mname_nospace"],
|
|
1764
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1765
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1766
|
-
),
|
|
1724
|
+
if "mname" in cols:
|
|
1725
|
+
unmatched = unmatched.with_columns(*clean_other_name(cols["mname"]))
|
|
1726
|
+
|
|
1727
|
+
for label, fname_expr, lname_expr in [
|
|
1767
1728
|
(
|
|
1768
1729
|
"left fname + mname -> right exact",
|
|
1769
|
-
lookups["exact"],
|
|
1770
1730
|
pl.concat_str(
|
|
1771
1731
|
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
|
|
1772
1732
|
separator=" ",
|
|
@@ -1775,7 +1735,6 @@ def lookup_sid_cepr(
|
|
|
1775
1735
|
),
|
|
1776
1736
|
(
|
|
1777
1737
|
"left mname + lname -> right exact",
|
|
1778
|
-
lookups["exact"],
|
|
1779
1738
|
pl.col(f"{cols['fname']}_clean"),
|
|
1780
1739
|
pl.concat_str(
|
|
1781
1740
|
[pl.col(f"{cols['mname']}_clean"), pl.col(f"{cols['lname']}_clean")],
|
|
@@ -1784,19 +1743,16 @@ def lookup_sid_cepr(
|
|
|
1784
1743
|
),
|
|
1785
1744
|
(
|
|
1786
1745
|
"left fname + mname no space -> right exact",
|
|
1787
|
-
lookups["exact"],
|
|
1788
1746
|
pl.concat_str(
|
|
1789
1747
|
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
|
|
1790
1748
|
separator="",
|
|
1791
1749
|
),
|
|
1792
1750
|
pl.col(f"{cols['lname']}_clean"),
|
|
1793
1751
|
),
|
|
1794
|
-
]
|
|
1795
|
-
|
|
1796
|
-
for label, lookup, fname_expr, lname_expr in mname_stages:
|
|
1752
|
+
]:
|
|
1797
1753
|
matched, unmatched = _run_match_stage(
|
|
1798
1754
|
unmatched,
|
|
1799
|
-
lookup=
|
|
1755
|
+
lookup=lookups["exact"],
|
|
1800
1756
|
fname_expr=fname_expr,
|
|
1801
1757
|
lname_expr=lname_expr,
|
|
1802
1758
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
@@ -1804,40 +1760,12 @@ def lookup_sid_cepr(
|
|
|
1804
1760
|
)
|
|
1805
1761
|
matched_frames.append(matched)
|
|
1806
1762
|
|
|
1807
|
-
# suffix variants
|
|
1808
1763
|
if "suffix" in cols:
|
|
1809
|
-
unmatched = unmatched.with_columns(
|
|
1810
|
-
*clean_other_name(cols["suffix"])
|
|
1811
|
-
)
|
|
1764
|
+
unmatched = unmatched.with_columns(*clean_other_name(cols["suffix"]))
|
|
1812
1765
|
|
|
1813
|
-
|
|
1814
|
-
(
|
|
1815
|
-
"left exact -> right lname + suffix",
|
|
1816
|
-
lookups["suffix"],
|
|
1817
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1818
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1819
|
-
),
|
|
1820
|
-
(
|
|
1821
|
-
"left exact -> right fname + suffix",
|
|
1822
|
-
lookups["suffix_fname"],
|
|
1823
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1824
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1825
|
-
),
|
|
1826
|
-
(
|
|
1827
|
-
"left exact -> right fname + suffix no space",
|
|
1828
|
-
lookups["suffix_fname_nospace"],
|
|
1829
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1830
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1831
|
-
),
|
|
1832
|
-
(
|
|
1833
|
-
"left exact -> right lname + suffix no space",
|
|
1834
|
-
lookups["suffix_lname_nospace"],
|
|
1835
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1836
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1837
|
-
),
|
|
1766
|
+
for label, fname_expr, lname_expr in [
|
|
1838
1767
|
(
|
|
1839
1768
|
"left fname + suffix -> right exact",
|
|
1840
|
-
lookups["exact"],
|
|
1841
1769
|
pl.concat_str(
|
|
1842
1770
|
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1843
1771
|
separator=" ",
|
|
@@ -1846,7 +1774,6 @@ def lookup_sid_cepr(
|
|
|
1846
1774
|
),
|
|
1847
1775
|
(
|
|
1848
1776
|
"left fname + suffix no space -> right exact",
|
|
1849
|
-
lookups["exact"],
|
|
1850
1777
|
pl.concat_str(
|
|
1851
1778
|
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1852
1779
|
separator="",
|
|
@@ -1855,7 +1782,6 @@ def lookup_sid_cepr(
|
|
|
1855
1782
|
),
|
|
1856
1783
|
(
|
|
1857
1784
|
"left lname + suffix -> right exact",
|
|
1858
|
-
lookups["exact"],
|
|
1859
1785
|
pl.col(f"{cols['fname']}_clean"),
|
|
1860
1786
|
pl.concat_str(
|
|
1861
1787
|
[pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
@@ -1864,19 +1790,16 @@ def lookup_sid_cepr(
|
|
|
1864
1790
|
),
|
|
1865
1791
|
(
|
|
1866
1792
|
"left lname + suffix no space -> right exact",
|
|
1867
|
-
lookups["exact"],
|
|
1868
1793
|
pl.col(f"{cols['fname']}_clean"),
|
|
1869
1794
|
pl.concat_str(
|
|
1870
1795
|
[pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1871
1796
|
separator="",
|
|
1872
1797
|
),
|
|
1873
1798
|
),
|
|
1874
|
-
]
|
|
1875
|
-
|
|
1876
|
-
for label, lookup, fname_expr, lname_expr in suffix_stages:
|
|
1799
|
+
]:
|
|
1877
1800
|
matched, unmatched = _run_match_stage(
|
|
1878
1801
|
unmatched,
|
|
1879
|
-
lookup=
|
|
1802
|
+
lookup=lookups["exact"],
|
|
1880
1803
|
fname_expr=fname_expr,
|
|
1881
1804
|
lname_expr=lname_expr,
|
|
1882
1805
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
@@ -1884,16 +1807,13 @@ def lookup_sid_cepr(
|
|
|
1884
1807
|
)
|
|
1885
1808
|
matched_frames.append(matched)
|
|
1886
1809
|
|
|
1887
|
-
|
|
1888
|
-
dob_imp_stages = [
|
|
1810
|
+
for key in [
|
|
1889
1811
|
"dob_imp",
|
|
1890
1812
|
"dob_imp_minus_1",
|
|
1891
1813
|
"dob_imp_plus_1",
|
|
1892
1814
|
"dob_imp_minus_2",
|
|
1893
1815
|
"dob_imp_plus_2",
|
|
1894
|
-
]
|
|
1895
|
-
|
|
1896
|
-
for key in dob_imp_stages:
|
|
1816
|
+
]:
|
|
1897
1817
|
matched, unmatched = _run_match_stage(
|
|
1898
1818
|
unmatched,
|
|
1899
1819
|
lookup=lookups[key],
|
|
@@ -1904,17 +1824,20 @@ def lookup_sid_cepr(
|
|
|
1904
1824
|
)
|
|
1905
1825
|
matched_frames.append(matched)
|
|
1906
1826
|
|
|
1907
|
-
result =
|
|
1908
|
-
matched_frames + [unmatched],
|
|
1909
|
-
|
|
1827
|
+
result = (
|
|
1828
|
+
pl.concat(matched_frames + [unmatched], how="diagonal_relaxed")
|
|
1829
|
+
.sort("_row_id")
|
|
1830
|
+
.drop("_row_id")
|
|
1910
1831
|
)
|
|
1911
1832
|
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
)
|
|
1833
|
+
if "sid_cepr" not in input_columns:
|
|
1834
|
+
input_columns = input_columns + ["sid_cepr"]
|
|
1915
1835
|
|
|
1916
|
-
|
|
1836
|
+
result = result.select(input_columns)
|
|
1917
1837
|
|
|
1838
|
+
print(f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}")
|
|
1839
|
+
|
|
1840
|
+
return result
|
|
1918
1841
|
#
|
|
1919
1842
|
|
|
1920
1843
|
# EXAMPLE USAGE
|
|
File without changes
|
|
File without changes
|
|
File without changes
|