ltc-code 0.1.4__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ltc_code-0.1.4 → ltc_code-0.1.6}/PKG-INFO +1 -1
- {ltc_code-0.1.4 → ltc_code-0.1.6}/pyproject.toml +1 -1
- {ltc_code-0.1.4 → ltc_code-0.1.6}/src/ltc_code/may27.py +414 -353
- {ltc_code-0.1.4 → ltc_code-0.1.6}/README.md +0 -0
- {ltc_code-0.1.4 → ltc_code-0.1.6}/src/ltc_code/__init__.py +0 -0
- {ltc_code-0.1.4 → ltc_code-0.1.6}/src/ltc_code/polars_dates.py +0 -0
|
@@ -1428,431 +1428,492 @@ def lookup_sid_cepr(
|
|
|
1428
1428
|
|
|
1429
1429
|
|
|
1430
1430
|
def _build_lookup(
|
|
1431
|
-
census: pl.DataFrame,
|
|
1432
|
-
*,
|
|
1433
|
-
fname_expr: pl.Expr,
|
|
1434
|
-
lname_expr: pl.Expr,
|
|
1435
|
-
dob_col: str,
|
|
1436
|
-
label: str,
|
|
1431
|
+
census: pl.DataFrame,
|
|
1432
|
+
*,
|
|
1433
|
+
fname_expr: pl.Expr,
|
|
1434
|
+
lname_expr: pl.Expr,
|
|
1435
|
+
dob_col: str,
|
|
1436
|
+
label: str,
|
|
1437
1437
|
) -> pl.DataFrame:
|
|
1438
|
-
"""
|
|
1439
|
-
Build a deterministic SID lookup table.
|
|
1440
|
-
|
|
1441
|
-
```
|
|
1442
|
-
Output schema:
|
|
1443
|
-
_fname_key
|
|
1444
|
-
_lname_key
|
|
1445
|
-
_dob_key
|
|
1446
|
-
sid_cepr
|
|
1447
|
-
|
|
1448
|
-
Ambiguous keys are removed.
|
|
1449
|
-
"""
|
|
1450
|
-
|
|
1451
|
-
lookup = (
|
|
1452
|
-
census
|
|
1453
|
-
.select(
|
|
1454
|
-
[
|
|
1455
|
-
fname_expr.alias("_fname_key"),
|
|
1456
|
-
lname_expr.alias("_lname_key"),
|
|
1457
|
-
pl.col(dob_col).alias("_dob_key"),
|
|
1458
|
-
pl.col("sid_cepr"),
|
|
1459
|
-
]
|
|
1460
|
-
)
|
|
1461
|
-
.drop_nulls(
|
|
1462
|
-
[
|
|
1463
|
-
"_fname_key",
|
|
1464
|
-
"_lname_key",
|
|
1465
|
-
"_dob_key",
|
|
1466
|
-
"sid_cepr",
|
|
1467
|
-
]
|
|
1468
|
-
)
|
|
1469
|
-
.group_by(
|
|
1470
|
-
[
|
|
1471
|
-
"_fname_key",
|
|
1472
|
-
"_lname_key",
|
|
1473
|
-
"_dob_key",
|
|
1474
|
-
]
|
|
1475
|
-
)
|
|
1476
|
-
.agg(
|
|
1477
|
-
pl.col("sid_cepr").unique().alias("_sids")
|
|
1478
|
-
)
|
|
1479
|
-
.with_columns(
|
|
1480
|
-
pl.col("_sids").list.len().alias("_sid_count")
|
|
1481
|
-
)
|
|
1482
|
-
.filter(
|
|
1483
|
-
pl.col("_sid_count") == 1
|
|
1484
|
-
)
|
|
1485
|
-
.select(
|
|
1486
|
-
[
|
|
1487
|
-
"_fname_key",
|
|
1488
|
-
"_lname_key",
|
|
1489
|
-
"_dob_key",
|
|
1490
|
-
pl.col("_sids").list.first().alias("sid_cepr"),
|
|
1491
|
-
]
|
|
1492
|
-
)
|
|
1493
|
-
)
|
|
1494
|
-
|
|
1495
|
-
print(f"built lookup: {label}")
|
|
1496
|
-
|
|
1497
|
-
return lookup
|
|
1498
|
-
```
|
|
1499
|
-
|
|
1500
|
-
def build_census_lookups(
|
|
1501
|
-
*,
|
|
1502
|
-
cmo_name: str,
|
|
1503
|
-
) -> dict[str, pl.DataFrame]:
|
|
1504
|
-
|
|
1505
|
-
```
|
|
1506
|
-
try:
|
|
1507
|
-
import mappings
|
|
1508
|
-
except ImportError:
|
|
1509
|
-
import mapppings as mappings
|
|
1510
|
-
|
|
1511
|
-
annual_frames = []
|
|
1438
|
+
"""
|
|
1439
|
+
Build a deterministic SID lookup table.
|
|
1512
1440
|
|
|
1513
|
-
|
|
1441
|
+
Output schema:
|
|
1442
|
+
_fname_key
|
|
1443
|
+
_lname_key
|
|
1444
|
+
_dob_key
|
|
1445
|
+
sid_cepr
|
|
1514
1446
|
|
|
1515
|
-
|
|
1447
|
+
Ambiguous keys are removed.
|
|
1448
|
+
"""
|
|
1516
1449
|
|
|
1517
|
-
|
|
1518
|
-
|
|
1519
|
-
path,
|
|
1520
|
-
infer_schema=False,
|
|
1521
|
-
null_values=[],
|
|
1522
|
-
try_parse_dates=False,
|
|
1523
|
-
ignore_errors=False,
|
|
1524
|
-
)
|
|
1450
|
+
lookup = (
|
|
1451
|
+
census
|
|
1525
1452
|
.select(
|
|
1526
1453
|
[
|
|
1527
|
-
"
|
|
1454
|
+
fname_expr.alias("_fname_key"),
|
|
1455
|
+
lname_expr.alias("_lname_key"),
|
|
1456
|
+
pl.col(dob_col).alias("_dob_key"),
|
|
1457
|
+
pl.col("sid_cepr"),
|
|
1458
|
+
]
|
|
1459
|
+
)
|
|
1460
|
+
.drop_nulls(
|
|
1461
|
+
[
|
|
1462
|
+
"_fname_key",
|
|
1463
|
+
"_lname_key",
|
|
1464
|
+
"_dob_key",
|
|
1528
1465
|
"sid_cepr",
|
|
1529
|
-
"fname_clean",
|
|
1530
|
-
"lname_clean",
|
|
1531
|
-
"mname_clean",
|
|
1532
|
-
"suff_clean",
|
|
1533
|
-
"birthdate_clean",
|
|
1534
|
-
"birthdate_imp",
|
|
1535
1466
|
]
|
|
1536
1467
|
)
|
|
1537
|
-
.
|
|
1538
|
-
|
|
1539
|
-
"
|
|
1540
|
-
"
|
|
1541
|
-
"
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1468
|
+
.group_by(
|
|
1469
|
+
[
|
|
1470
|
+
"_fname_key",
|
|
1471
|
+
"_lname_key",
|
|
1472
|
+
"_dob_key",
|
|
1473
|
+
]
|
|
1474
|
+
)
|
|
1475
|
+
.agg(
|
|
1476
|
+
pl.col("sid_cepr").unique().alias("_sids")
|
|
1546
1477
|
)
|
|
1547
1478
|
.with_columns(
|
|
1548
|
-
pl.col("
|
|
1549
|
-
.replace(mappings.CMO_CODE_TO_NAME)
|
|
1550
|
-
.alias("cmo_name")
|
|
1479
|
+
pl.col("_sids").list.len().alias("_sid_count")
|
|
1551
1480
|
)
|
|
1552
1481
|
.filter(
|
|
1553
|
-
pl.col("
|
|
1482
|
+
pl.col("_sid_count") == 1
|
|
1554
1483
|
)
|
|
1555
|
-
.
|
|
1556
|
-
*clean_name("fname"),
|
|
1557
|
-
*clean_name("lname"),
|
|
1558
|
-
*clean_other_name("mname"),
|
|
1559
|
-
*clean_other_name("suffix"),
|
|
1560
|
-
*clean_dob(col="dob"),
|
|
1561
|
-
*clean_dob(col="dob_imp"),
|
|
1562
|
-
)
|
|
1563
|
-
.drop(
|
|
1484
|
+
.select(
|
|
1564
1485
|
[
|
|
1565
|
-
"
|
|
1566
|
-
"
|
|
1486
|
+
"_fname_key",
|
|
1487
|
+
"_lname_key",
|
|
1488
|
+
"_dob_key",
|
|
1489
|
+
pl.col("_sids").list.first().alias("sid_cepr"),
|
|
1567
1490
|
]
|
|
1568
1491
|
)
|
|
1569
|
-
.rename(
|
|
1570
|
-
{
|
|
1571
|
-
"dob_clean": "dob",
|
|
1572
|
-
"dob_imp_clean": "dob_imp",
|
|
1573
|
-
}
|
|
1574
|
-
)
|
|
1575
1492
|
)
|
|
1576
1493
|
|
|
1577
|
-
|
|
1494
|
+
print(f"built lookup: {label}")
|
|
1578
1495
|
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
|
|
1496
|
+
return lookup
|
|
1497
|
+
|
|
1498
|
+
def build_census_lookups(
|
|
1499
|
+
*,
|
|
1500
|
+
cmo_name: str,
|
|
1501
|
+
) -> dict[str, pl.DataFrame]:
|
|
1502
|
+
|
|
1503
|
+
try:
|
|
1504
|
+
import mappings
|
|
1505
|
+
except ImportError:
|
|
1506
|
+
import mapppings as mappings
|
|
1507
|
+
|
|
1508
|
+
annual_frames = []
|
|
1509
|
+
|
|
1510
|
+
for year in range(1994, 2023):
|
|
1511
|
+
|
|
1512
|
+
path = CENSUS_STUDENTS / f"census_student_{year}.csv"
|
|
1513
|
+
|
|
1514
|
+
annual = (
|
|
1515
|
+
pl.scan_csv(
|
|
1516
|
+
path,
|
|
1517
|
+
infer_schema=False,
|
|
1518
|
+
null_values=[],
|
|
1519
|
+
try_parse_dates=False,
|
|
1520
|
+
ignore_errors=False,
|
|
1521
|
+
)
|
|
1522
|
+
.select(
|
|
1523
|
+
[
|
|
1524
|
+
"cmo_code",
|
|
1525
|
+
"sid_cepr",
|
|
1526
|
+
"fname_clean",
|
|
1527
|
+
"lname_clean",
|
|
1528
|
+
"mname_clean",
|
|
1529
|
+
"suff_clean",
|
|
1530
|
+
"birthdate_clean",
|
|
1531
|
+
"birthdate_imp",
|
|
1532
|
+
]
|
|
1533
|
+
)
|
|
1534
|
+
.rename(
|
|
1535
|
+
{
|
|
1536
|
+
"fname_clean": "fname",
|
|
1537
|
+
"lname_clean": "lname",
|
|
1538
|
+
"mname_clean": "mname",
|
|
1539
|
+
"suff_clean": "suffix",
|
|
1540
|
+
"birthdate_clean": "dob",
|
|
1541
|
+
"birthdate_imp": "dob_imp",
|
|
1542
|
+
}
|
|
1543
|
+
)
|
|
1544
|
+
.with_columns(
|
|
1545
|
+
pl.col("cmo_code")
|
|
1546
|
+
.replace(mappings.CMO_CODE_TO_NAME)
|
|
1547
|
+
.alias("cmo_name")
|
|
1548
|
+
)
|
|
1549
|
+
.filter(
|
|
1550
|
+
pl.col("cmo_name") == cmo_name
|
|
1551
|
+
)
|
|
1552
|
+
.with_columns(
|
|
1553
|
+
*clean_name("fname"),
|
|
1554
|
+
*clean_name("lname"),
|
|
1555
|
+
*clean_other_name("mname"),
|
|
1556
|
+
*clean_other_name("suffix"),
|
|
1557
|
+
*clean_dob(col="dob"),
|
|
1558
|
+
*clean_dob(col="dob_imp"),
|
|
1559
|
+
)
|
|
1560
|
+
.drop(
|
|
1561
|
+
[
|
|
1562
|
+
"dob",
|
|
1563
|
+
"dob_imp",
|
|
1564
|
+
]
|
|
1565
|
+
)
|
|
1566
|
+
.rename(
|
|
1567
|
+
{
|
|
1568
|
+
"dob_clean": "dob",
|
|
1569
|
+
"dob_imp_clean": "dob_imp",
|
|
1570
|
+
}
|
|
1571
|
+
)
|
|
1572
|
+
)
|
|
1573
|
+
|
|
1574
|
+
annual_frames.append(annual)
|
|
1582
1575
|
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1576
|
+
#
|
|
1577
|
+
# MATERIALIZE ONCE
|
|
1578
|
+
#
|
|
1579
|
+
|
|
1580
|
+
census = (
|
|
1581
|
+
pl.concat(
|
|
1582
|
+
annual_frames,
|
|
1583
|
+
how="vertical_relaxed",
|
|
1584
|
+
)
|
|
1585
|
+
.collect()
|
|
1587
1586
|
)
|
|
1588
|
-
.collect()
|
|
1589
|
-
)
|
|
1590
1587
|
|
|
1591
|
-
print(f"census rows: {len(census):,}")
|
|
1588
|
+
print(f"census rows: {len(census):,}")
|
|
1592
1589
|
|
|
1593
|
-
#
|
|
1594
|
-
# BUILD LOOKUPS ONCE
|
|
1595
|
-
#
|
|
1590
|
+
#
|
|
1591
|
+
# BUILD LOOKUPS ONCE
|
|
1592
|
+
#
|
|
1596
1593
|
|
|
1597
|
-
lookup_exact = _build_lookup(
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
|
|
1602
|
-
|
|
1603
|
-
)
|
|
1594
|
+
lookup_exact = _build_lookup(
|
|
1595
|
+
census,
|
|
1596
|
+
fname_expr=pl.col("fname"),
|
|
1597
|
+
lname_expr=pl.col("lname"),
|
|
1598
|
+
dob_col="dob",
|
|
1599
|
+
label="exact",
|
|
1600
|
+
)
|
|
1604
1601
|
|
|
1605
|
-
lookup_mname = _build_lookup(
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
)
|
|
1602
|
+
lookup_mname = _build_lookup(
|
|
1603
|
+
census,
|
|
1604
|
+
fname_expr=pl.concat_str(
|
|
1605
|
+
[
|
|
1606
|
+
pl.col("fname"),
|
|
1607
|
+
pl.col("mname"),
|
|
1608
|
+
],
|
|
1609
|
+
separator=" ",
|
|
1610
|
+
),
|
|
1611
|
+
lname_expr=pl.col("lname"),
|
|
1612
|
+
dob_col="dob",
|
|
1613
|
+
label="mname",
|
|
1614
|
+
)
|
|
1618
1615
|
|
|
1619
|
-
lookup_suffix = _build_lookup(
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
)
|
|
1616
|
+
lookup_suffix = _build_lookup(
|
|
1617
|
+
census,
|
|
1618
|
+
fname_expr=pl.col("fname"),
|
|
1619
|
+
lname_expr=pl.concat_str(
|
|
1620
|
+
[
|
|
1621
|
+
pl.col("lname"),
|
|
1622
|
+
pl.col("suffix"),
|
|
1623
|
+
],
|
|
1624
|
+
separator=" ",
|
|
1625
|
+
),
|
|
1626
|
+
dob_col="dob",
|
|
1627
|
+
label="suffix",
|
|
1628
|
+
)
|
|
1632
1629
|
|
|
1633
|
-
lookup_dob_imp = _build_lookup(
|
|
1634
|
-
|
|
1635
|
-
|
|
1636
|
-
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
)
|
|
1630
|
+
lookup_dob_imp = _build_lookup(
|
|
1631
|
+
census,
|
|
1632
|
+
fname_expr=pl.col("fname"),
|
|
1633
|
+
lname_expr=pl.col("lname"),
|
|
1634
|
+
dob_col="dob_imp",
|
|
1635
|
+
label="dob_imp",
|
|
1636
|
+
)
|
|
1640
1637
|
|
|
1641
|
-
return {
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
}
|
|
1647
|
-
```
|
|
1638
|
+
return {
|
|
1639
|
+
"exact": lookup_exact,
|
|
1640
|
+
"mname": lookup_mname,
|
|
1641
|
+
"suffix": lookup_suffix,
|
|
1642
|
+
"dob_imp": lookup_dob_imp,
|
|
1643
|
+
}
|
|
1648
1644
|
|
|
1649
1645
|
def _run_match_stage(
|
|
1650
|
-
unmatched: pl.DataFrame,
|
|
1651
|
-
*,
|
|
1652
|
-
lookup: pl.DataFrame,
|
|
1653
|
-
fname_expr: pl.Expr,
|
|
1654
|
-
lname_expr: pl.Expr,
|
|
1655
|
-
dob_expr: pl.Expr,
|
|
1656
|
-
label: str,
|
|
1646
|
+
unmatched: pl.DataFrame,
|
|
1647
|
+
*,
|
|
1648
|
+
lookup: pl.DataFrame,
|
|
1649
|
+
fname_expr: pl.Expr,
|
|
1650
|
+
lname_expr: pl.Expr,
|
|
1651
|
+
dob_expr: pl.Expr,
|
|
1652
|
+
label: str,
|
|
1657
1653
|
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
|
1658
1654
|
|
|
1659
|
-
|
|
1660
|
-
before = len(unmatched)
|
|
1655
|
+
before = len(unmatched)
|
|
1661
1656
|
|
|
1662
|
-
stage = (
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
|
|
1657
|
+
stage = (
|
|
1658
|
+
unmatched
|
|
1659
|
+
.with_columns(
|
|
1660
|
+
[
|
|
1661
|
+
fname_expr.alias("_fname_key"),
|
|
1662
|
+
lname_expr.alias("_lname_key"),
|
|
1663
|
+
dob_expr.alias("_dob_key"),
|
|
1664
|
+
]
|
|
1665
|
+
)
|
|
1666
|
+
.join(
|
|
1667
|
+
lookup,
|
|
1668
|
+
on=[
|
|
1669
|
+
"_fname_key",
|
|
1670
|
+
"_lname_key",
|
|
1671
|
+
"_dob_key",
|
|
1672
|
+
],
|
|
1673
|
+
how="left",
|
|
1674
|
+
validate="m:1",
|
|
1675
|
+
)
|
|
1676
|
+
.drop(
|
|
1677
|
+
[
|
|
1678
|
+
"_fname_key",
|
|
1679
|
+
"_lname_key",
|
|
1680
|
+
"_dob_key",
|
|
1681
|
+
]
|
|
1682
|
+
)
|
|
1687
1683
|
)
|
|
1688
|
-
)
|
|
1689
1684
|
|
|
1690
|
-
matched = (
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
|
|
1685
|
+
matched = (
|
|
1686
|
+
stage
|
|
1687
|
+
.filter(
|
|
1688
|
+
pl.col("sid_cepr").is_not_null()
|
|
1689
|
+
)
|
|
1694
1690
|
)
|
|
1695
|
-
)
|
|
1696
1691
|
|
|
1697
|
-
unmatched = (
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1692
|
+
unmatched = (
|
|
1693
|
+
stage
|
|
1694
|
+
.filter(
|
|
1695
|
+
pl.col("sid_cepr").is_null()
|
|
1696
|
+
)
|
|
1697
|
+
.drop("sid_cepr")
|
|
1701
1698
|
)
|
|
1702
|
-
.drop("sid_cepr")
|
|
1703
|
-
)
|
|
1704
1699
|
|
|
1705
|
-
added = len(matched)
|
|
1700
|
+
added = len(matched)
|
|
1706
1701
|
|
|
1707
|
-
print(
|
|
1708
|
-
|
|
1709
|
-
)
|
|
1702
|
+
print(
|
|
1703
|
+
f"{label}: matched {added:,}/{before:,}"
|
|
1704
|
+
)
|
|
1710
1705
|
|
|
1711
|
-
return matched, unmatched
|
|
1712
|
-
```
|
|
1706
|
+
return matched, unmatched
|
|
1713
1707
|
|
|
1714
1708
|
def lookup_sid_cepr(
|
|
1715
|
-
frame: Frame,
|
|
1716
|
-
*,
|
|
1717
|
-
cols: Mapping[str, str],
|
|
1718
|
-
lookups: dict[str, pl.DataFrame],
|
|
1709
|
+
frame: Frame,
|
|
1710
|
+
*,
|
|
1711
|
+
cols: Mapping[str, str],
|
|
1712
|
+
lookups: dict[str, pl.DataFrame],
|
|
1719
1713
|
) -> Frame:
|
|
1720
1714
|
|
|
1721
|
-
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
current = (
|
|
1725
|
-
frame.collect()
|
|
1726
|
-
if is_lazy
|
|
1727
|
-
else frame
|
|
1728
|
-
)
|
|
1729
|
-
|
|
1730
|
-
#
|
|
1731
|
-
# CLEAN LEFT SIDE
|
|
1732
|
-
#
|
|
1715
|
+
is_lazy = isinstance(frame, pl.LazyFrame)
|
|
1716
|
+
current = frame.collect() if is_lazy else frame
|
|
1733
1717
|
|
|
1734
|
-
current = (
|
|
1735
|
-
current
|
|
1736
|
-
.with_columns(
|
|
1718
|
+
current = current.with_columns(
|
|
1737
1719
|
*clean_name(cols["fname"]),
|
|
1738
1720
|
*clean_name(cols["lname"]),
|
|
1739
1721
|
*clean_dob(col=cols["dob"]),
|
|
1740
1722
|
)
|
|
1741
|
-
)
|
|
1742
1723
|
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
unmatched = current
|
|
1746
|
-
|
|
1747
|
-
#
|
|
1748
|
-
# STAGE 1
|
|
1749
|
-
# EXACT
|
|
1750
|
-
#
|
|
1751
|
-
|
|
1752
|
-
matched, unmatched = _run_match_stage(
|
|
1753
|
-
unmatched,
|
|
1754
|
-
lookup=lookups["exact"],
|
|
1755
|
-
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1756
|
-
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1757
|
-
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1758
|
-
label="exact",
|
|
1759
|
-
)
|
|
1760
|
-
|
|
1761
|
-
matched_frames.append(matched)
|
|
1762
|
-
|
|
1763
|
-
#
|
|
1764
|
-
# STAGE 2
|
|
1765
|
-
# MNAME
|
|
1766
|
-
#
|
|
1767
|
-
|
|
1768
|
-
if "mname" in cols:
|
|
1769
|
-
|
|
1770
|
-
unmatched = (
|
|
1771
|
-
unmatched
|
|
1772
|
-
.with_columns(
|
|
1773
|
-
*clean_other_name(cols["mname"])
|
|
1774
|
-
)
|
|
1724
|
+
current = current.with_columns(
|
|
1725
|
+
_parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
|
|
1775
1726
|
)
|
|
1776
1727
|
|
|
1728
|
+
matched_frames = []
|
|
1729
|
+
unmatched = current
|
|
1730
|
+
|
|
1731
|
+
# exact
|
|
1777
1732
|
matched, unmatched = _run_match_stage(
|
|
1778
1733
|
unmatched,
|
|
1779
|
-
lookup=lookups["
|
|
1780
|
-
fname_expr=pl.
|
|
1781
|
-
[
|
|
1782
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1783
|
-
pl.col(f"{cols['mname']}_clean"),
|
|
1784
|
-
],
|
|
1785
|
-
separator=" ",
|
|
1786
|
-
),
|
|
1734
|
+
lookup=lookups["exact"],
|
|
1735
|
+
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1787
1736
|
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1788
1737
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1789
|
-
label="
|
|
1738
|
+
label="exact",
|
|
1790
1739
|
)
|
|
1791
|
-
|
|
1792
1740
|
matched_frames.append(matched)
|
|
1793
1741
|
|
|
1794
|
-
#
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
if "suffix" in cols:
|
|
1800
|
-
|
|
1801
|
-
unmatched = (
|
|
1802
|
-
unmatched
|
|
1803
|
-
.with_columns(
|
|
1804
|
-
*clean_other_name(cols["suffix"])
|
|
1742
|
+
# middle-name variants
|
|
1743
|
+
if "mname" in cols:
|
|
1744
|
+
unmatched = unmatched.with_columns(
|
|
1745
|
+
*clean_other_name(cols["mname"])
|
|
1805
1746
|
)
|
|
1806
|
-
)
|
|
1807
1747
|
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
[
|
|
1748
|
+
mname_stages = [
|
|
1749
|
+
(
|
|
1750
|
+
"left exact -> right fname + mname",
|
|
1751
|
+
lookups["mname"],
|
|
1752
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1814
1753
|
pl.col(f"{cols['lname']}_clean"),
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1754
|
+
),
|
|
1755
|
+
(
|
|
1756
|
+
"left exact -> right mname + lname",
|
|
1757
|
+
lookups["mname_lname"],
|
|
1758
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1759
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1760
|
+
),
|
|
1761
|
+
(
|
|
1762
|
+
"left exact -> right fname + mname no space",
|
|
1763
|
+
lookups["mname_nospace"],
|
|
1764
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1765
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1766
|
+
),
|
|
1767
|
+
(
|
|
1768
|
+
"left fname + mname -> right exact",
|
|
1769
|
+
lookups["exact"],
|
|
1770
|
+
pl.concat_str(
|
|
1771
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
|
|
1772
|
+
separator=" ",
|
|
1773
|
+
),
|
|
1774
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1775
|
+
),
|
|
1776
|
+
(
|
|
1777
|
+
"left mname + lname -> right exact",
|
|
1778
|
+
lookups["exact"],
|
|
1779
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1780
|
+
pl.concat_str(
|
|
1781
|
+
[pl.col(f"{cols['mname']}_clean"), pl.col(f"{cols['lname']}_clean")],
|
|
1782
|
+
separator=" ",
|
|
1783
|
+
),
|
|
1784
|
+
),
|
|
1785
|
+
(
|
|
1786
|
+
"left fname + mname no space -> right exact",
|
|
1787
|
+
lookups["exact"],
|
|
1788
|
+
pl.concat_str(
|
|
1789
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
|
|
1790
|
+
separator="",
|
|
1791
|
+
),
|
|
1792
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1793
|
+
),
|
|
1794
|
+
]
|
|
1822
1795
|
|
|
1823
|
-
|
|
1796
|
+
for label, lookup, fname_expr, lname_expr in mname_stages:
|
|
1797
|
+
matched, unmatched = _run_match_stage(
|
|
1798
|
+
unmatched,
|
|
1799
|
+
lookup=lookup,
|
|
1800
|
+
fname_expr=fname_expr,
|
|
1801
|
+
lname_expr=lname_expr,
|
|
1802
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1803
|
+
label=label,
|
|
1804
|
+
)
|
|
1805
|
+
matched_frames.append(matched)
|
|
1824
1806
|
|
|
1825
|
-
#
|
|
1826
|
-
|
|
1827
|
-
|
|
1828
|
-
|
|
1807
|
+
# suffix variants
|
|
1808
|
+
if "suffix" in cols:
|
|
1809
|
+
unmatched = unmatched.with_columns(
|
|
1810
|
+
*clean_other_name(cols["suffix"])
|
|
1811
|
+
)
|
|
1829
1812
|
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
1813
|
+
suffix_stages = [
|
|
1814
|
+
(
|
|
1815
|
+
"left exact -> right lname + suffix",
|
|
1816
|
+
lookups["suffix"],
|
|
1817
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1818
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1819
|
+
),
|
|
1820
|
+
(
|
|
1821
|
+
"left exact -> right fname + suffix",
|
|
1822
|
+
lookups["suffix_fname"],
|
|
1823
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1824
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1825
|
+
),
|
|
1826
|
+
(
|
|
1827
|
+
"left exact -> right fname + suffix no space",
|
|
1828
|
+
lookups["suffix_fname_nospace"],
|
|
1829
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1830
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1831
|
+
),
|
|
1832
|
+
(
|
|
1833
|
+
"left exact -> right lname + suffix no space",
|
|
1834
|
+
lookups["suffix_lname_nospace"],
|
|
1835
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1836
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1837
|
+
),
|
|
1838
|
+
(
|
|
1839
|
+
"left fname + suffix -> right exact",
|
|
1840
|
+
lookups["exact"],
|
|
1841
|
+
pl.concat_str(
|
|
1842
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1843
|
+
separator=" ",
|
|
1844
|
+
),
|
|
1845
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1846
|
+
),
|
|
1847
|
+
(
|
|
1848
|
+
"left fname + suffix no space -> right exact",
|
|
1849
|
+
lookups["exact"],
|
|
1850
|
+
pl.concat_str(
|
|
1851
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1852
|
+
separator="",
|
|
1853
|
+
),
|
|
1854
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1855
|
+
),
|
|
1856
|
+
(
|
|
1857
|
+
"left lname + suffix -> right exact",
|
|
1858
|
+
lookups["exact"],
|
|
1859
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1860
|
+
pl.concat_str(
|
|
1861
|
+
[pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1862
|
+
separator=" ",
|
|
1863
|
+
),
|
|
1864
|
+
),
|
|
1865
|
+
(
|
|
1866
|
+
"left lname + suffix no space -> right exact",
|
|
1867
|
+
lookups["exact"],
|
|
1868
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1869
|
+
pl.concat_str(
|
|
1870
|
+
[pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1871
|
+
separator="",
|
|
1872
|
+
),
|
|
1873
|
+
),
|
|
1874
|
+
]
|
|
1838
1875
|
|
|
1839
|
-
|
|
1876
|
+
for label, lookup, fname_expr, lname_expr in suffix_stages:
|
|
1877
|
+
matched, unmatched = _run_match_stage(
|
|
1878
|
+
unmatched,
|
|
1879
|
+
lookup=lookup,
|
|
1880
|
+
fname_expr=fname_expr,
|
|
1881
|
+
lname_expr=lname_expr,
|
|
1882
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1883
|
+
label=label,
|
|
1884
|
+
)
|
|
1885
|
+
matched_frames.append(matched)
|
|
1886
|
+
|
|
1887
|
+
# dob_imp variants
|
|
1888
|
+
dob_imp_stages = [
|
|
1889
|
+
"dob_imp",
|
|
1890
|
+
"dob_imp_minus_1",
|
|
1891
|
+
"dob_imp_plus_1",
|
|
1892
|
+
"dob_imp_minus_2",
|
|
1893
|
+
"dob_imp_plus_2",
|
|
1894
|
+
]
|
|
1840
1895
|
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
|
|
1896
|
+
for key in dob_imp_stages:
|
|
1897
|
+
matched, unmatched = _run_match_stage(
|
|
1898
|
+
unmatched,
|
|
1899
|
+
lookup=lookups[key],
|
|
1900
|
+
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1901
|
+
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1902
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1903
|
+
label=key,
|
|
1904
|
+
)
|
|
1905
|
+
matched_frames.append(matched)
|
|
1844
1906
|
|
|
1845
|
-
result = pl.concat(
|
|
1846
|
-
|
|
1847
|
-
|
|
1848
|
-
)
|
|
1907
|
+
result = pl.concat(
|
|
1908
|
+
matched_frames + [unmatched],
|
|
1909
|
+
how="diagonal_relaxed",
|
|
1910
|
+
)
|
|
1849
1911
|
|
|
1850
|
-
print(
|
|
1851
|
-
|
|
1852
|
-
)
|
|
1912
|
+
print(
|
|
1913
|
+
f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
|
|
1914
|
+
)
|
|
1853
1915
|
|
|
1854
|
-
return result
|
|
1855
|
-
```
|
|
1916
|
+
return result
|
|
1856
1917
|
|
|
1857
1918
|
#
|
|
1858
1919
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|