ltc-code 0.1.4__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ltc_code-0.1.4 → ltc_code-0.1.5}/PKG-INFO +1 -1
- {ltc_code-0.1.4 → ltc_code-0.1.5}/pyproject.toml +1 -1
- {ltc_code-0.1.4 → ltc_code-0.1.5}/src/ltc_code/may27.py +338 -346
- {ltc_code-0.1.4 → ltc_code-0.1.5}/README.md +0 -0
- {ltc_code-0.1.4 → ltc_code-0.1.5}/src/ltc_code/__init__.py +0 -0
- {ltc_code-0.1.4 → ltc_code-0.1.5}/src/ltc_code/polars_dates.py +0 -0
|
@@ -1428,431 +1428,423 @@ def lookup_sid_cepr(
|
|
|
1428
1428
|
|
|
1429
1429
|
|
|
1430
1430
|
def _build_lookup(
|
|
1431
|
-
census: pl.DataFrame,
|
|
1432
|
-
*,
|
|
1433
|
-
fname_expr: pl.Expr,
|
|
1434
|
-
lname_expr: pl.Expr,
|
|
1435
|
-
dob_col: str,
|
|
1436
|
-
label: str,
|
|
1431
|
+
census: pl.DataFrame,
|
|
1432
|
+
*,
|
|
1433
|
+
fname_expr: pl.Expr,
|
|
1434
|
+
lname_expr: pl.Expr,
|
|
1435
|
+
dob_col: str,
|
|
1436
|
+
label: str,
|
|
1437
1437
|
) -> pl.DataFrame:
|
|
1438
|
-
"""
|
|
1439
|
-
Build a deterministic SID lookup table.
|
|
1440
|
-
|
|
1441
|
-
```
|
|
1442
|
-
Output schema:
|
|
1443
|
-
_fname_key
|
|
1444
|
-
_lname_key
|
|
1445
|
-
_dob_key
|
|
1446
|
-
sid_cepr
|
|
1447
|
-
|
|
1448
|
-
Ambiguous keys are removed.
|
|
1449
|
-
"""
|
|
1450
|
-
|
|
1451
|
-
lookup = (
|
|
1452
|
-
census
|
|
1453
|
-
.select(
|
|
1454
|
-
[
|
|
1455
|
-
fname_expr.alias("_fname_key"),
|
|
1456
|
-
lname_expr.alias("_lname_key"),
|
|
1457
|
-
pl.col(dob_col).alias("_dob_key"),
|
|
1458
|
-
pl.col("sid_cepr"),
|
|
1459
|
-
]
|
|
1460
|
-
)
|
|
1461
|
-
.drop_nulls(
|
|
1462
|
-
[
|
|
1463
|
-
"_fname_key",
|
|
1464
|
-
"_lname_key",
|
|
1465
|
-
"_dob_key",
|
|
1466
|
-
"sid_cepr",
|
|
1467
|
-
]
|
|
1468
|
-
)
|
|
1469
|
-
.group_by(
|
|
1470
|
-
[
|
|
1471
|
-
"_fname_key",
|
|
1472
|
-
"_lname_key",
|
|
1473
|
-
"_dob_key",
|
|
1474
|
-
]
|
|
1475
|
-
)
|
|
1476
|
-
.agg(
|
|
1477
|
-
pl.col("sid_cepr").unique().alias("_sids")
|
|
1478
|
-
)
|
|
1479
|
-
.with_columns(
|
|
1480
|
-
pl.col("_sids").list.len().alias("_sid_count")
|
|
1481
|
-
)
|
|
1482
|
-
.filter(
|
|
1483
|
-
pl.col("_sid_count") == 1
|
|
1484
|
-
)
|
|
1485
|
-
.select(
|
|
1486
|
-
[
|
|
1487
|
-
"_fname_key",
|
|
1488
|
-
"_lname_key",
|
|
1489
|
-
"_dob_key",
|
|
1490
|
-
pl.col("_sids").list.first().alias("sid_cepr"),
|
|
1491
|
-
]
|
|
1492
|
-
)
|
|
1493
|
-
)
|
|
1494
|
-
|
|
1495
|
-
print(f"built lookup: {label}")
|
|
1496
|
-
|
|
1497
|
-
return lookup
|
|
1498
|
-
```
|
|
1499
|
-
|
|
1500
|
-
def build_census_lookups(
|
|
1501
|
-
*,
|
|
1502
|
-
cmo_name: str,
|
|
1503
|
-
) -> dict[str, pl.DataFrame]:
|
|
1504
|
-
|
|
1505
|
-
```
|
|
1506
|
-
try:
|
|
1507
|
-
import mappings
|
|
1508
|
-
except ImportError:
|
|
1509
|
-
import mapppings as mappings
|
|
1510
|
-
|
|
1511
|
-
annual_frames = []
|
|
1438
|
+
"""
|
|
1439
|
+
Build a deterministic SID lookup table.
|
|
1512
1440
|
|
|
1513
|
-
|
|
1441
|
+
Output schema:
|
|
1442
|
+
_fname_key
|
|
1443
|
+
_lname_key
|
|
1444
|
+
_dob_key
|
|
1445
|
+
sid_cepr
|
|
1514
1446
|
|
|
1515
|
-
|
|
1447
|
+
Ambiguous keys are removed.
|
|
1448
|
+
"""
|
|
1516
1449
|
|
|
1517
|
-
|
|
1518
|
-
|
|
1519
|
-
path,
|
|
1520
|
-
infer_schema=False,
|
|
1521
|
-
null_values=[],
|
|
1522
|
-
try_parse_dates=False,
|
|
1523
|
-
ignore_errors=False,
|
|
1524
|
-
)
|
|
1450
|
+
lookup = (
|
|
1451
|
+
census
|
|
1525
1452
|
.select(
|
|
1526
1453
|
[
|
|
1527
|
-
"
|
|
1454
|
+
fname_expr.alias("_fname_key"),
|
|
1455
|
+
lname_expr.alias("_lname_key"),
|
|
1456
|
+
pl.col(dob_col).alias("_dob_key"),
|
|
1457
|
+
pl.col("sid_cepr"),
|
|
1458
|
+
]
|
|
1459
|
+
)
|
|
1460
|
+
.drop_nulls(
|
|
1461
|
+
[
|
|
1462
|
+
"_fname_key",
|
|
1463
|
+
"_lname_key",
|
|
1464
|
+
"_dob_key",
|
|
1528
1465
|
"sid_cepr",
|
|
1529
|
-
"fname_clean",
|
|
1530
|
-
"lname_clean",
|
|
1531
|
-
"mname_clean",
|
|
1532
|
-
"suff_clean",
|
|
1533
|
-
"birthdate_clean",
|
|
1534
|
-
"birthdate_imp",
|
|
1535
1466
|
]
|
|
1536
1467
|
)
|
|
1537
|
-
.
|
|
1538
|
-
|
|
1539
|
-
"
|
|
1540
|
-
"
|
|
1541
|
-
"
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1468
|
+
.group_by(
|
|
1469
|
+
[
|
|
1470
|
+
"_fname_key",
|
|
1471
|
+
"_lname_key",
|
|
1472
|
+
"_dob_key",
|
|
1473
|
+
]
|
|
1474
|
+
)
|
|
1475
|
+
.agg(
|
|
1476
|
+
pl.col("sid_cepr").unique().alias("_sids")
|
|
1546
1477
|
)
|
|
1547
1478
|
.with_columns(
|
|
1548
|
-
pl.col("
|
|
1549
|
-
.replace(mappings.CMO_CODE_TO_NAME)
|
|
1550
|
-
.alias("cmo_name")
|
|
1479
|
+
pl.col("_sids").list.len().alias("_sid_count")
|
|
1551
1480
|
)
|
|
1552
1481
|
.filter(
|
|
1553
|
-
pl.col("
|
|
1482
|
+
pl.col("_sid_count") == 1
|
|
1554
1483
|
)
|
|
1555
|
-
.
|
|
1556
|
-
*clean_name("fname"),
|
|
1557
|
-
*clean_name("lname"),
|
|
1558
|
-
*clean_other_name("mname"),
|
|
1559
|
-
*clean_other_name("suffix"),
|
|
1560
|
-
*clean_dob(col="dob"),
|
|
1561
|
-
*clean_dob(col="dob_imp"),
|
|
1562
|
-
)
|
|
1563
|
-
.drop(
|
|
1484
|
+
.select(
|
|
1564
1485
|
[
|
|
1565
|
-
"
|
|
1566
|
-
"
|
|
1486
|
+
"_fname_key",
|
|
1487
|
+
"_lname_key",
|
|
1488
|
+
"_dob_key",
|
|
1489
|
+
pl.col("_sids").list.first().alias("sid_cepr"),
|
|
1567
1490
|
]
|
|
1568
1491
|
)
|
|
1569
|
-
.rename(
|
|
1570
|
-
{
|
|
1571
|
-
"dob_clean": "dob",
|
|
1572
|
-
"dob_imp_clean": "dob_imp",
|
|
1573
|
-
}
|
|
1574
|
-
)
|
|
1575
1492
|
)
|
|
1576
1493
|
|
|
1577
|
-
|
|
1494
|
+
print(f"built lookup: {label}")
|
|
1578
1495
|
|
|
1579
|
-
|
|
1580
|
-
# MATERIALIZE ONCE
|
|
1581
|
-
#
|
|
1496
|
+
return lookup
|
|
1582
1497
|
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
)
|
|
1588
|
-
.collect()
|
|
1589
|
-
)
|
|
1498
|
+
def build_census_lookups(
|
|
1499
|
+
*,
|
|
1500
|
+
cmo_name: str,
|
|
1501
|
+
) -> dict[str, pl.DataFrame]:
|
|
1590
1502
|
|
|
1591
|
-
|
|
1503
|
+
try:
|
|
1504
|
+
import mappings
|
|
1505
|
+
except ImportError:
|
|
1506
|
+
import mapppings as mappings
|
|
1592
1507
|
|
|
1593
|
-
|
|
1594
|
-
# BUILD LOOKUPS ONCE
|
|
1595
|
-
#
|
|
1508
|
+
annual_frames = []
|
|
1596
1509
|
|
|
1597
|
-
|
|
1598
|
-
census,
|
|
1599
|
-
fname_expr=pl.col("fname"),
|
|
1600
|
-
lname_expr=pl.col("lname"),
|
|
1601
|
-
dob_col="dob",
|
|
1602
|
-
label="exact",
|
|
1603
|
-
)
|
|
1510
|
+
for year in range(1994, 2023):
|
|
1604
1511
|
|
|
1605
|
-
|
|
1606
|
-
census,
|
|
1607
|
-
fname_expr=pl.concat_str(
|
|
1608
|
-
[
|
|
1609
|
-
pl.col("fname"),
|
|
1610
|
-
pl.col("mname"),
|
|
1611
|
-
],
|
|
1612
|
-
separator=" ",
|
|
1613
|
-
),
|
|
1614
|
-
lname_expr=pl.col("lname"),
|
|
1615
|
-
dob_col="dob",
|
|
1616
|
-
label="mname",
|
|
1617
|
-
)
|
|
1512
|
+
path = CENSUS_STUDENTS / f"census_student_{year}.csv"
|
|
1618
1513
|
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
|
|
1514
|
+
annual = (
|
|
1515
|
+
pl.scan_csv(
|
|
1516
|
+
path,
|
|
1517
|
+
infer_schema=False,
|
|
1518
|
+
null_values=[],
|
|
1519
|
+
try_parse_dates=False,
|
|
1520
|
+
ignore_errors=False,
|
|
1521
|
+
)
|
|
1522
|
+
.select(
|
|
1523
|
+
[
|
|
1524
|
+
"cmo_code",
|
|
1525
|
+
"sid_cepr",
|
|
1526
|
+
"fname_clean",
|
|
1527
|
+
"lname_clean",
|
|
1528
|
+
"mname_clean",
|
|
1529
|
+
"suff_clean",
|
|
1530
|
+
"birthdate_clean",
|
|
1531
|
+
"birthdate_imp",
|
|
1532
|
+
]
|
|
1533
|
+
)
|
|
1534
|
+
.rename(
|
|
1535
|
+
{
|
|
1536
|
+
"fname_clean": "fname",
|
|
1537
|
+
"lname_clean": "lname",
|
|
1538
|
+
"mname_clean": "mname",
|
|
1539
|
+
"suff_clean": "suffix",
|
|
1540
|
+
"birthdate_clean": "dob",
|
|
1541
|
+
"birthdate_imp": "dob_imp",
|
|
1542
|
+
}
|
|
1543
|
+
)
|
|
1544
|
+
.with_columns(
|
|
1545
|
+
pl.col("cmo_code")
|
|
1546
|
+
.replace(mappings.CMO_CODE_TO_NAME)
|
|
1547
|
+
.alias("cmo_name")
|
|
1548
|
+
)
|
|
1549
|
+
.filter(
|
|
1550
|
+
pl.col("cmo_name") == cmo_name
|
|
1551
|
+
)
|
|
1552
|
+
.with_columns(
|
|
1553
|
+
*clean_name("fname"),
|
|
1554
|
+
*clean_name("lname"),
|
|
1555
|
+
*clean_other_name("mname"),
|
|
1556
|
+
*clean_other_name("suffix"),
|
|
1557
|
+
*clean_dob(col="dob"),
|
|
1558
|
+
*clean_dob(col="dob_imp"),
|
|
1559
|
+
)
|
|
1560
|
+
.drop(
|
|
1561
|
+
[
|
|
1562
|
+
"dob",
|
|
1563
|
+
"dob_imp",
|
|
1564
|
+
]
|
|
1565
|
+
)
|
|
1566
|
+
.rename(
|
|
1567
|
+
{
|
|
1568
|
+
"dob_clean": "dob",
|
|
1569
|
+
"dob_imp_clean": "dob_imp",
|
|
1570
|
+
}
|
|
1571
|
+
)
|
|
1572
|
+
)
|
|
1632
1573
|
|
|
1633
|
-
|
|
1634
|
-
census,
|
|
1635
|
-
fname_expr=pl.col("fname"),
|
|
1636
|
-
lname_expr=pl.col("lname"),
|
|
1637
|
-
dob_col="dob_imp",
|
|
1638
|
-
label="dob_imp",
|
|
1639
|
-
)
|
|
1574
|
+
annual_frames.append(annual)
|
|
1640
1575
|
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
"suffix": lookup_suffix,
|
|
1645
|
-
"dob_imp": lookup_dob_imp,
|
|
1646
|
-
}
|
|
1647
|
-
```
|
|
1576
|
+
#
|
|
1577
|
+
# MATERIALIZE ONCE
|
|
1578
|
+
#
|
|
1648
1579
|
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
label: str,
|
|
1657
|
-
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
|
1580
|
+
census = (
|
|
1581
|
+
pl.concat(
|
|
1582
|
+
annual_frames,
|
|
1583
|
+
how="vertical_relaxed",
|
|
1584
|
+
)
|
|
1585
|
+
.collect()
|
|
1586
|
+
)
|
|
1658
1587
|
|
|
1659
|
-
|
|
1660
|
-
before = len(unmatched)
|
|
1588
|
+
print(f"census rows: {len(census):,}")
|
|
1661
1589
|
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
[
|
|
1666
|
-
fname_expr.alias("_fname_key"),
|
|
1667
|
-
lname_expr.alias("_lname_key"),
|
|
1668
|
-
dob_expr.alias("_dob_key"),
|
|
1669
|
-
]
|
|
1670
|
-
)
|
|
1671
|
-
.join(
|
|
1672
|
-
lookup,
|
|
1673
|
-
on=[
|
|
1674
|
-
"_fname_key",
|
|
1675
|
-
"_lname_key",
|
|
1676
|
-
"_dob_key",
|
|
1677
|
-
],
|
|
1678
|
-
how="left",
|
|
1679
|
-
validate="m:1",
|
|
1680
|
-
)
|
|
1681
|
-
.drop(
|
|
1682
|
-
[
|
|
1683
|
-
"_fname_key",
|
|
1684
|
-
"_lname_key",
|
|
1685
|
-
"_dob_key",
|
|
1686
|
-
]
|
|
1687
|
-
)
|
|
1688
|
-
)
|
|
1590
|
+
#
|
|
1591
|
+
# BUILD LOOKUPS ONCE
|
|
1592
|
+
#
|
|
1689
1593
|
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
pl.col("
|
|
1594
|
+
lookup_exact = _build_lookup(
|
|
1595
|
+
census,
|
|
1596
|
+
fname_expr=pl.col("fname"),
|
|
1597
|
+
lname_expr=pl.col("lname"),
|
|
1598
|
+
dob_col="dob",
|
|
1599
|
+
label="exact",
|
|
1694
1600
|
)
|
|
1695
|
-
)
|
|
1696
1601
|
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1602
|
+
lookup_mname = _build_lookup(
|
|
1603
|
+
census,
|
|
1604
|
+
fname_expr=pl.concat_str(
|
|
1605
|
+
[
|
|
1606
|
+
pl.col("fname"),
|
|
1607
|
+
pl.col("mname"),
|
|
1608
|
+
],
|
|
1609
|
+
separator=" ",
|
|
1610
|
+
),
|
|
1611
|
+
lname_expr=pl.col("lname"),
|
|
1612
|
+
dob_col="dob",
|
|
1613
|
+
label="mname",
|
|
1701
1614
|
)
|
|
1702
|
-
.drop("sid_cepr")
|
|
1703
|
-
)
|
|
1704
1615
|
|
|
1705
|
-
|
|
1616
|
+
lookup_suffix = _build_lookup(
|
|
1617
|
+
census,
|
|
1618
|
+
fname_expr=pl.col("fname"),
|
|
1619
|
+
lname_expr=pl.concat_str(
|
|
1620
|
+
[
|
|
1621
|
+
pl.col("lname"),
|
|
1622
|
+
pl.col("suffix"),
|
|
1623
|
+
],
|
|
1624
|
+
separator=" ",
|
|
1625
|
+
),
|
|
1626
|
+
dob_col="dob",
|
|
1627
|
+
label="suffix",
|
|
1628
|
+
)
|
|
1706
1629
|
|
|
1707
|
-
|
|
1708
|
-
|
|
1709
|
-
)
|
|
1630
|
+
lookup_dob_imp = _build_lookup(
|
|
1631
|
+
census,
|
|
1632
|
+
fname_expr=pl.col("fname"),
|
|
1633
|
+
lname_expr=pl.col("lname"),
|
|
1634
|
+
dob_col="dob_imp",
|
|
1635
|
+
label="dob_imp",
|
|
1636
|
+
)
|
|
1710
1637
|
|
|
1711
|
-
return
|
|
1712
|
-
|
|
1638
|
+
return {
|
|
1639
|
+
"exact": lookup_exact,
|
|
1640
|
+
"mname": lookup_mname,
|
|
1641
|
+
"suffix": lookup_suffix,
|
|
1642
|
+
"dob_imp": lookup_dob_imp,
|
|
1643
|
+
}
|
|
1713
1644
|
|
|
1714
|
-
def
|
|
1715
|
-
|
|
1716
|
-
*,
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1645
|
+
def _run_match_stage(
|
|
1646
|
+
unmatched: pl.DataFrame,
|
|
1647
|
+
*,
|
|
1648
|
+
lookup: pl.DataFrame,
|
|
1649
|
+
fname_expr: pl.Expr,
|
|
1650
|
+
lname_expr: pl.Expr,
|
|
1651
|
+
dob_expr: pl.Expr,
|
|
1652
|
+
label: str,
|
|
1653
|
+
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
|
1720
1654
|
|
|
1721
|
-
|
|
1722
|
-
is_lazy = isinstance(frame, pl.LazyFrame)
|
|
1655
|
+
before = len(unmatched)
|
|
1723
1656
|
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
)
|
|
1657
|
+
stage = (
|
|
1658
|
+
unmatched
|
|
1659
|
+
.with_columns(
|
|
1660
|
+
[
|
|
1661
|
+
fname_expr.alias("_fname_key"),
|
|
1662
|
+
lname_expr.alias("_lname_key"),
|
|
1663
|
+
dob_expr.alias("_dob_key"),
|
|
1664
|
+
]
|
|
1665
|
+
)
|
|
1666
|
+
.join(
|
|
1667
|
+
lookup,
|
|
1668
|
+
on=[
|
|
1669
|
+
"_fname_key",
|
|
1670
|
+
"_lname_key",
|
|
1671
|
+
"_dob_key",
|
|
1672
|
+
],
|
|
1673
|
+
how="left",
|
|
1674
|
+
validate="m:1",
|
|
1675
|
+
)
|
|
1676
|
+
.drop(
|
|
1677
|
+
[
|
|
1678
|
+
"_fname_key",
|
|
1679
|
+
"_lname_key",
|
|
1680
|
+
"_dob_key",
|
|
1681
|
+
]
|
|
1682
|
+
)
|
|
1683
|
+
)
|
|
1729
1684
|
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
|
|
1685
|
+
matched = (
|
|
1686
|
+
stage
|
|
1687
|
+
.filter(
|
|
1688
|
+
pl.col("sid_cepr").is_not_null()
|
|
1689
|
+
)
|
|
1690
|
+
)
|
|
1733
1691
|
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
1692
|
+
unmatched = (
|
|
1693
|
+
stage
|
|
1694
|
+
.filter(
|
|
1695
|
+
pl.col("sid_cepr").is_null()
|
|
1696
|
+
)
|
|
1697
|
+
.drop("sid_cepr")
|
|
1740
1698
|
)
|
|
1741
|
-
)
|
|
1742
1699
|
|
|
1743
|
-
|
|
1700
|
+
added = len(matched)
|
|
1744
1701
|
|
|
1745
|
-
|
|
1702
|
+
print(
|
|
1703
|
+
f"{label}: matched {added:,}/{before:,}"
|
|
1704
|
+
)
|
|
1746
1705
|
|
|
1747
|
-
|
|
1748
|
-
# STAGE 1
|
|
1749
|
-
# EXACT
|
|
1750
|
-
#
|
|
1706
|
+
return matched, unmatched
|
|
1751
1707
|
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
label="exact",
|
|
1759
|
-
)
|
|
1708
|
+
def lookup_sid_cepr(
|
|
1709
|
+
frame: Frame,
|
|
1710
|
+
*,
|
|
1711
|
+
cols: Mapping[str, str],
|
|
1712
|
+
lookups: dict[str, pl.DataFrame],
|
|
1713
|
+
) -> Frame:
|
|
1760
1714
|
|
|
1761
|
-
|
|
1715
|
+
is_lazy = isinstance(frame, pl.LazyFrame)
|
|
1762
1716
|
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1717
|
+
current = (
|
|
1718
|
+
frame.collect()
|
|
1719
|
+
if is_lazy
|
|
1720
|
+
else frame
|
|
1721
|
+
)
|
|
1767
1722
|
|
|
1768
|
-
|
|
1723
|
+
#
|
|
1724
|
+
# CLEAN LEFT SIDE
|
|
1725
|
+
#
|
|
1769
1726
|
|
|
1770
|
-
|
|
1771
|
-
|
|
1727
|
+
current = (
|
|
1728
|
+
current
|
|
1772
1729
|
.with_columns(
|
|
1773
|
-
*
|
|
1730
|
+
*clean_name(cols["fname"]),
|
|
1731
|
+
*clean_name(cols["lname"]),
|
|
1732
|
+
*clean_dob(col=cols["dob"]),
|
|
1774
1733
|
)
|
|
1775
1734
|
)
|
|
1776
1735
|
|
|
1736
|
+
matched_frames = []
|
|
1737
|
+
|
|
1738
|
+
unmatched = current
|
|
1739
|
+
|
|
1740
|
+
#
|
|
1741
|
+
# STAGE 1
|
|
1742
|
+
# EXACT
|
|
1743
|
+
#
|
|
1744
|
+
|
|
1777
1745
|
matched, unmatched = _run_match_stage(
|
|
1778
1746
|
unmatched,
|
|
1779
|
-
lookup=lookups["
|
|
1780
|
-
fname_expr=pl.
|
|
1781
|
-
[
|
|
1782
|
-
pl.col(f"{cols['fname']}_clean"),
|
|
1783
|
-
pl.col(f"{cols['mname']}_clean"),
|
|
1784
|
-
],
|
|
1785
|
-
separator=" ",
|
|
1786
|
-
),
|
|
1747
|
+
lookup=lookups["exact"],
|
|
1748
|
+
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1787
1749
|
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1788
1750
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1789
|
-
label="
|
|
1751
|
+
label="exact",
|
|
1790
1752
|
)
|
|
1791
1753
|
|
|
1792
1754
|
matched_frames.append(matched)
|
|
1793
1755
|
|
|
1794
|
-
#
|
|
1795
|
-
# STAGE
|
|
1796
|
-
#
|
|
1797
|
-
#
|
|
1756
|
+
#
|
|
1757
|
+
# STAGE 2
|
|
1758
|
+
# MNAME
|
|
1759
|
+
#
|
|
1798
1760
|
|
|
1799
|
-
if "
|
|
1761
|
+
if "mname" in cols:
|
|
1800
1762
|
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1763
|
+
unmatched = (
|
|
1764
|
+
unmatched
|
|
1765
|
+
.with_columns(
|
|
1766
|
+
*clean_other_name(cols["mname"])
|
|
1767
|
+
)
|
|
1805
1768
|
)
|
|
1806
|
-
|
|
1769
|
+
|
|
1770
|
+
matched, unmatched = _run_match_stage(
|
|
1771
|
+
unmatched,
|
|
1772
|
+
lookup=lookups["mname"],
|
|
1773
|
+
fname_expr=pl.concat_str(
|
|
1774
|
+
[
|
|
1775
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1776
|
+
pl.col(f"{cols['mname']}_clean"),
|
|
1777
|
+
],
|
|
1778
|
+
separator=" ",
|
|
1779
|
+
),
|
|
1780
|
+
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1781
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1782
|
+
label="mname",
|
|
1783
|
+
)
|
|
1784
|
+
|
|
1785
|
+
matched_frames.append(matched)
|
|
1786
|
+
|
|
1787
|
+
#
|
|
1788
|
+
# STAGE 3
|
|
1789
|
+
# SUFFIX
|
|
1790
|
+
#
|
|
1791
|
+
|
|
1792
|
+
if "suffix" in cols:
|
|
1793
|
+
|
|
1794
|
+
unmatched = (
|
|
1795
|
+
unmatched
|
|
1796
|
+
.with_columns(
|
|
1797
|
+
*clean_other_name(cols["suffix"])
|
|
1798
|
+
)
|
|
1799
|
+
)
|
|
1800
|
+
|
|
1801
|
+
matched, unmatched = _run_match_stage(
|
|
1802
|
+
unmatched,
|
|
1803
|
+
lookup=lookups["suffix"],
|
|
1804
|
+
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1805
|
+
lname_expr=pl.concat_str(
|
|
1806
|
+
[
|
|
1807
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1808
|
+
pl.col(f"{cols['suffix']}_clean"),
|
|
1809
|
+
],
|
|
1810
|
+
separator=" ",
|
|
1811
|
+
),
|
|
1812
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1813
|
+
label="suffix",
|
|
1814
|
+
)
|
|
1815
|
+
|
|
1816
|
+
matched_frames.append(matched)
|
|
1817
|
+
|
|
1818
|
+
#
|
|
1819
|
+
# STAGE 4
|
|
1820
|
+
# DOB IMP
|
|
1821
|
+
#
|
|
1807
1822
|
|
|
1808
1823
|
matched, unmatched = _run_match_stage(
|
|
1809
1824
|
unmatched,
|
|
1810
|
-
lookup=lookups["
|
|
1825
|
+
lookup=lookups["dob_imp"],
|
|
1811
1826
|
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1812
|
-
lname_expr=pl.
|
|
1813
|
-
[
|
|
1814
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1815
|
-
pl.col(f"{cols['suffix']}_clean"),
|
|
1816
|
-
],
|
|
1817
|
-
separator=" ",
|
|
1818
|
-
),
|
|
1827
|
+
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1819
1828
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1820
|
-
label="
|
|
1829
|
+
label="dob_imp",
|
|
1821
1830
|
)
|
|
1822
1831
|
|
|
1823
1832
|
matched_frames.append(matched)
|
|
1824
1833
|
|
|
1825
|
-
#
|
|
1826
|
-
#
|
|
1827
|
-
#
|
|
1828
|
-
#
|
|
1829
|
-
|
|
1830
|
-
matched, unmatched = _run_match_stage(
|
|
1831
|
-
unmatched,
|
|
1832
|
-
lookup=lookups["dob_imp"],
|
|
1833
|
-
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1834
|
-
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1835
|
-
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1836
|
-
label="dob_imp",
|
|
1837
|
-
)
|
|
1834
|
+
#
|
|
1835
|
+
# FINAL
|
|
1836
|
+
#
|
|
1838
1837
|
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
#
|
|
1844
|
-
|
|
1845
|
-
result = pl.concat(
|
|
1846
|
-
matched_frames + [unmatched],
|
|
1847
|
-
how="diagonal_relaxed",
|
|
1848
|
-
)
|
|
1838
|
+
result = pl.concat(
|
|
1839
|
+
matched_frames + [unmatched],
|
|
1840
|
+
how="diagonal_relaxed",
|
|
1841
|
+
)
|
|
1849
1842
|
|
|
1850
|
-
print(
|
|
1851
|
-
|
|
1852
|
-
)
|
|
1843
|
+
print(
|
|
1844
|
+
f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
|
|
1845
|
+
)
|
|
1853
1846
|
|
|
1854
|
-
return result
|
|
1855
|
-
```
|
|
1847
|
+
return result
|
|
1856
1848
|
|
|
1857
1849
|
#
|
|
1858
1850
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|