ltc-code 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ltc_code-0.1.3 → ltc_code-0.1.4}/PKG-INFO +1 -1
- {ltc_code-0.1.3 → ltc_code-0.1.4}/pyproject.toml +1 -1
- {ltc_code-0.1.3 → ltc_code-0.1.4}/src/ltc_code/may27.py +464 -0
- {ltc_code-0.1.3 → ltc_code-0.1.4}/README.md +0 -0
- {ltc_code-0.1.3 → ltc_code-0.1.4}/src/ltc_code/__init__.py +0 -0
- {ltc_code-0.1.3 → ltc_code-0.1.4}/src/ltc_code/polars_dates.py +0 -0
|
@@ -1414,3 +1414,467 @@ def lookup_sid_cepr(
|
|
|
1414
1414
|
if not is_lazy:
|
|
1415
1415
|
result = result.collect()
|
|
1416
1416
|
return result
|
|
1417
|
+
|
|
1418
|
+
|
|
1419
|
+
|
|
1420
|
+
|
|
1421
|
+
|
|
1422
|
+
####################################################################################
|
|
1423
|
+
# LOOK UP KEYS NEW
|
|
1424
|
+
####################################################################################
|
|
1425
|
+
|
|
1426
|
+
|
|
1427
|
+
|
|
1428
|
+
|
|
1429
|
+
|
|
1430
|
+
def _build_lookup(
    census: pl.DataFrame,
    *,
    fname_expr: pl.Expr,
    lname_expr: pl.Expr,
    dob_col: str,
    label: str,
) -> pl.DataFrame:
    """
    Build an unambiguous (first name, last name, DOB) -> ``sid_cepr`` lookup.

    Args:
        census: Eager frame containing ``sid_cepr``, the column named by
            ``dob_col``, and every column read by the two name expressions.
        fname_expr: Expression evaluated on ``census`` to produce the
            first-name key.
        lname_expr: Expression evaluated on ``census`` to produce the
            last-name key.
        dob_col: Name of the ``census`` column used as the date-of-birth key.
        label: Tag that only appears in the progress message.

    Returns:
        A frame with exactly these columns::

            _fname_key
            _lname_key
            _dob_key
            sid_cepr

        Rows with a null in any of the four columns are discarded first.
        Key triples that map to more than one distinct ``sid_cepr`` are
        ambiguous and are removed, so every remaining key appears once.
    """
    key_cols = ["_fname_key", "_lname_key", "_dob_key"]

    lookup = (
        census
        .select(
            [
                fname_expr.alias("_fname_key"),
                lname_expr.alias("_lname_key"),
                pl.col(dob_col).alias("_dob_key"),
                pl.col("sid_cepr"),
            ]
        )
        .drop_nulls([*key_cols, "sid_cepr"])
        .group_by(key_cols)
        # Collect the distinct SIDs seen for each key triple.
        .agg(pl.col("sid_cepr").unique().alias("_sids"))
        # Keep only keys that identify exactly one student.
        .filter(pl.col("_sids").list.len() == 1)
        .select(
            [
                *key_cols,
                pl.col("_sids").list.first().alias("sid_cepr"),
            ]
        )
    )

    print(f"built lookup: {label}")

    return lookup
|
|
1499
|
+
|
|
1500
|
+
def build_census_lookups(
    *,
    cmo_name: str,
) -> dict[str, pl.DataFrame]:
    """
    Load the annual census student files once and build all SID lookups.

    One ``census_student_{year}.csv`` file is scanned under
    ``CENSUS_STUDENTS`` for each year from 1994 through 2022. The rows are
    filtered to the requested CMO, concatenated, and collected a single time.
    Four lookup tables are then derived from that frame with
    ``_build_lookup``.

    Args:
        cmo_name: CMO name to keep. ``cmo_code`` is translated through
            ``mappings.CMO_CODE_TO_NAME`` and compared for equality with
            this value.

    Returns:
        A dict of lookup frames keyed by matching strategy:

        * ``"exact"``   - first name, last name, ``dob``
        * ``"mname"``   - ``"<fname> <mname>"``, last name, ``dob``
        * ``"suffix"``  - first name, ``"<lname> <suffix>"``, ``dob``
        * ``"dob_imp"`` - first name, last name, ``dob_imp``
    """
    # NOTE(review): the fallback module name "mapppings" (three p's) looks
    # like a typo, but it may be a real module in some environment - confirm
    # before changing it.
    try:
        import mappings
    except ImportError:
        import mapppings as mappings

    census_columns = [
        "cmo_code",
        "sid_cepr",
        "fname_clean",
        "lname_clean",
        "mname_clean",
        "suff_clean",
        "birthdate_clean",
        "birthdate_imp",
    ]

    # Census column name -> short name used by the rest of the pipeline.
    short_names = {
        "fname_clean": "fname",
        "lname_clean": "lname",
        "mname_clean": "mname",
        "suff_clean": "suffix",
        "birthdate_clean": "dob",
        "birthdate_imp": "dob_imp",
    }

    annual_frames = []

    for year in range(1994, 2023):
        path = CENSUS_STUDENTS / f"census_student_{year}.csv"

        annual = (
            # Read every column as a string; no null or date inference.
            pl.scan_csv(
                path,
                infer_schema=False,
                null_values=[],
                try_parse_dates=False,
                ignore_errors=False,
            )
            .select(census_columns)
            .rename(short_names)
            .with_columns(
                pl.col("cmo_code")
                .replace(mappings.CMO_CODE_TO_NAME)
                .alias("cmo_name")
            )
            .filter(pl.col("cmo_name") == cmo_name)
            # NOTE(review): the lookups below key on "fname"/"lname"/
            # "mname"/"suffix" (the census *_clean values), not on whatever
            # columns clean_name/clean_other_name add here, whereas
            # lookup_sid_cepr keys the left side on "<col>_clean". Confirm
            # that both sides end up normalised the same way.
            .with_columns(
                *clean_name("fname"),
                *clean_name("lname"),
                *clean_other_name("mname"),
                *clean_other_name("suffix"),
                *clean_dob(col="dob"),
                *clean_dob(col="dob_imp"),
            )
            # Replace the raw date columns with their cleaned versions.
            .drop(["dob", "dob_imp"])
            .rename(
                {
                    "dob_clean": "dob",
                    "dob_imp_clean": "dob_imp",
                }
            )
        )

        annual_frames.append(annual)

    #
    # MATERIALIZE ONCE
    #

    census = pl.concat(annual_frames, how="vertical_relaxed").collect()

    print(f"census rows: {len(census):,}")

    #
    # BUILD LOOKUPS ONCE
    #

    lookup_exact = _build_lookup(
        census,
        fname_expr=pl.col("fname"),
        lname_expr=pl.col("lname"),
        dob_col="dob",
        label="exact",
    )

    lookup_mname = _build_lookup(
        census,
        fname_expr=pl.concat_str(
            [pl.col("fname"), pl.col("mname")],
            separator=" ",
        ),
        lname_expr=pl.col("lname"),
        dob_col="dob",
        label="mname",
    )

    lookup_suffix = _build_lookup(
        census,
        fname_expr=pl.col("fname"),
        lname_expr=pl.concat_str(
            [pl.col("lname"), pl.col("suffix")],
            separator=" ",
        ),
        dob_col="dob",
        label="suffix",
    )

    lookup_dob_imp = _build_lookup(
        census,
        fname_expr=pl.col("fname"),
        lname_expr=pl.col("lname"),
        dob_col="dob_imp",
        label="dob_imp",
    )

    return {
        "exact": lookup_exact,
        "mname": lookup_mname,
        "suffix": lookup_suffix,
        "dob_imp": lookup_dob_imp,
    }
|
|
1648
|
+
|
|
1649
|
+
def _run_match_stage(
    unmatched: pl.DataFrame,
    *,
    lookup: pl.DataFrame,
    fname_expr: pl.Expr,
    lname_expr: pl.Expr,
    dob_expr: pl.Expr,
    label: str,
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """
    Run one matching stage against a prebuilt lookup table.

    Temporary key columns are computed on ``unmatched``, left-joined to
    ``lookup`` and then dropped again. The joined rows are split on whether
    ``sid_cepr`` was filled in.

    Args:
        unmatched: Rows that still need a ``sid_cepr``.
            NOTE(review): presumably this frame has no ``sid_cepr`` column
            of its own; if it did, the join would suffix the lookup's
            column and the split below would read the wrong one - confirm
            against callers.
        lookup: Frame with ``_fname_key``, ``_lname_key``, ``_dob_key`` and
            ``sid_cepr`` columns.
        fname_expr: Expression producing the first-name key on ``unmatched``.
        lname_expr: Expression producing the last-name key on ``unmatched``.
        dob_expr: Expression producing the date-of-birth key on ``unmatched``.
        label: Stage name shown in the progress message.

    Returns:
        ``(matched, still_unmatched)``. ``matched`` keeps the joined
        ``sid_cepr`` column; ``still_unmatched`` has it dropped so it can be
        fed to the next stage.
    """
    key_cols = ["_fname_key", "_lname_key", "_dob_key"]
    before = len(unmatched)

    stage = (
        unmatched
        .with_columns(
            [
                fname_expr.alias("_fname_key"),
                lname_expr.alias("_lname_key"),
                dob_expr.alias("_dob_key"),
            ]
        )
        .join(
            lookup,
            on=key_cols,
            how="left",
            # Ask Polars to verify the lookup side has unique keys.
            validate="m:1",
        )
        .drop(key_cols)
    )

    matched = stage.filter(pl.col("sid_cepr").is_not_null())

    still_unmatched = (
        stage
        .filter(pl.col("sid_cepr").is_null())
        .drop("sid_cepr")
    )

    added = len(matched)

    print(
        f"{label}: matched {added:,}/{before:,}"
    )

    return matched, still_unmatched
|
|
1713
|
+
|
|
1714
|
+
def lookup_sid_cepr(
    frame: Frame,
    *,
    cols: Mapping[str, str],
    lookups: dict[str, pl.DataFrame],
) -> Frame:
    """
    Attach ``sid_cepr`` to ``frame`` through a cascade of matching stages.

    Stages run in order and each one only sees rows the earlier stages did
    not match:

    1. ``exact``   - first name, last name, DOB
    2. ``mname``   - ``"<fname> <mname>"``, last name, DOB
       (only when ``"mname"`` is in ``cols``)
    3. ``suffix``  - first name, ``"<lname> <suffix>"``, DOB
       (only when ``"suffix"`` is in ``cols``)
    4. ``dob_imp`` - first name, last name, DOB against the ``dob_imp``
       lookup

    Args:
        frame: Eager or lazy frame to match. A lazy frame is collected
            before matching.
        cols: Maps the logical names ``"fname"``, ``"lname"``, ``"dob"``
            (required) and ``"mname"``, ``"suffix"`` (optional) to the
            column names in ``frame``.
        lookups: Lookup tables keyed ``"exact"``, ``"mname"``, ``"suffix"``
            and ``"dob_imp"``, as returned by ``build_census_lookups``.

    Returns:
        All input rows, matched stages first and the remaining unmatched
        rows last, with a ``sid_cepr`` column that is null where no stage
        matched. Cleaned helper columns (``<col>_clean``) are left in the
        output. The result is lazy when ``frame`` was lazy, and eager
        otherwise.

    NOTE(review): the diff context shows an earlier ``lookup_sid_cepr`` in
    this module; this definition rebinds that name.
    """
    is_lazy = isinstance(frame, pl.LazyFrame)
    current = frame.collect() if is_lazy else frame

    #
    # CLEAN LEFT SIDE
    #

    current = current.with_columns(
        *clean_name(cols["fname"]),
        *clean_name(cols["lname"]),
        *clean_dob(col=cols["dob"]),
    )

    # Key expressions shared by every stage.
    fname_clean = pl.col(f"{cols['fname']}_clean")
    lname_clean = pl.col(f"{cols['lname']}_clean")
    dob_clean = pl.col(f"{cols['dob']}_clean")

    matched_frames = []
    unmatched = current

    #
    # STAGE 1
    # EXACT
    #

    matched, unmatched = _run_match_stage(
        unmatched,
        lookup=lookups["exact"],
        fname_expr=fname_clean,
        lname_expr=lname_clean,
        dob_expr=dob_clean,
        label="exact",
    )
    matched_frames.append(matched)

    #
    # STAGE 2
    # MNAME
    #

    if "mname" in cols:
        # The middle name is cleaned only for rows that are still unmatched.
        unmatched = unmatched.with_columns(
            *clean_other_name(cols["mname"])
        )

        matched, unmatched = _run_match_stage(
            unmatched,
            lookup=lookups["mname"],
            fname_expr=pl.concat_str(
                [
                    fname_clean,
                    pl.col(f"{cols['mname']}_clean"),
                ],
                separator=" ",
            ),
            lname_expr=lname_clean,
            dob_expr=dob_clean,
            label="mname",
        )
        matched_frames.append(matched)

    #
    # STAGE 3
    # SUFFIX
    #

    if "suffix" in cols:
        unmatched = unmatched.with_columns(
            *clean_other_name(cols["suffix"])
        )

        matched, unmatched = _run_match_stage(
            unmatched,
            lookup=lookups["suffix"],
            fname_expr=fname_clean,
            lname_expr=pl.concat_str(
                [
                    lname_clean,
                    pl.col(f"{cols['suffix']}_clean"),
                ],
                separator=" ",
            ),
            dob_expr=dob_clean,
            label="suffix",
        )
        matched_frames.append(matched)

    #
    # STAGE 4
    # DOB IMP
    #

    # Same keys as stage 1, but matched against the dob_imp lookup.
    matched, unmatched = _run_match_stage(
        unmatched,
        lookup=lookups["dob_imp"],
        fname_expr=fname_clean,
        lname_expr=lname_clean,
        dob_expr=dob_clean,
        label="dob_imp",
    )
    matched_frames.append(matched)

    #
    # FINAL
    #

    # Diagonal concat: later stages add helper columns that earlier frames
    # lack, and the unmatched remainder has no sid_cepr column.
    result = pl.concat(
        matched_frames + [unmatched],
        how="diagonal_relaxed",
    )

    print(
        f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
    )

    # Give lazy callers a lazy frame back, honouring the Frame -> Frame
    # signature.
    return result.lazy() if is_lazy else result
|
|
1856
|
+
|
|
1857
|
+
#
# EXAMPLE USAGE
#
# Guarded so that importing this module does not scan the census files.
# NOTE(review): `df` is not defined in the visible part of this file; it
# must be an eager or lazy frame with fname/lname/mname/suffix/dob columns
# - confirm where it comes from before running this.

if __name__ == "__main__":
    lookups = build_census_lookups(
        cmo_name="Aspire",
    )

    result = df.pipe(
        lookup_sid_cepr,
        cols={
            "fname": "fname",
            "lname": "lname",
            "mname": "mname",
            "suffix": "suffix",
            "dob": "dob",
        },
        lookups=lookups,
    )
|
|
File without changes
|
|
File without changes
|
|
File without changes
|