ltc-code 0.1.3__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ltc-code
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: Add your description here
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ltc-code"
3
- version = "0.1.3"
3
+ version = "0.1.4"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -1414,3 +1414,467 @@ def lookup_sid_cepr(
1414
1414
  if not is_lazy:
1415
1415
  result = result.collect()
1416
1416
  return result
1417
+
1418
+
1419
+
1420
+
1421
+
1422
+ ####################################################################################
1423
+ # LOOK UP KEYS NEW
1424
+ ####################################################################################
1425
+
1426
+
1427
+
1428
+
1429
+
1430
def _build_lookup(
    census: pl.DataFrame,
    *,
    fname_expr: pl.Expr,
    lname_expr: pl.Expr,
    dob_col: str,
    label: str,
) -> pl.DataFrame:
    """Build a SID lookup table keyed on first name, last name and birth date.

    Args:
        census: Frame containing a ``sid_cepr`` column, the ``dob_col``
            column, and whatever columns the two name expressions refer to.
        fname_expr: Expression that produces the first-name key.
        lname_expr: Expression that produces the last-name key.
        dob_col: Name of the column used as the date-of-birth key.
        label: Name of this lookup; used only in the progress message.

    Returns:
        A frame with the columns ``_fname_key``, ``_lname_key``,
        ``_dob_key`` and ``sid_cepr``. Rows with a null in any of those
        columns are dropped, and keys that map to more than one distinct
        ``sid_cepr`` (ambiguous keys) are removed, so each key appears once.
    """
    key_cols = ["_fname_key", "_lname_key", "_dob_key"]

    lookup = (
        census
        .select(
            [
                fname_expr.alias("_fname_key"),
                lname_expr.alias("_lname_key"),
                pl.col(dob_col).alias("_dob_key"),
                pl.col("sid_cepr"),
            ]
        )
        .drop_nulls([*key_cols, "sid_cepr"])
        .group_by(key_cols)
        .agg(
            # Nulls were dropped above, so the distinct count is exactly the
            # number of different SIDs seen for this key.
            pl.col("sid_cepr").n_unique().alias("_sid_count"),
            pl.col("sid_cepr").first(),
        )
        # Keep only keys that identify a single SID.
        .filter(pl.col("_sid_count") == 1)
        .select([*key_cols, "sid_cepr"])
    )

    print(f"built lookup: {label}")

    return lookup
1499
+
1500
def build_census_lookups(
    *,
    cmo_name: str,
) -> dict[str, pl.DataFrame]:
    """Load the annual census student files for one CMO and build SID lookups.

    For every year in ``range(1994, 2023)`` the file
    ``census_student_<year>.csv`` under ``CENSUS_STUDENTS`` is scanned
    lazily with schema inference disabled. ``cmo_code`` is mapped through
    ``mappings.CMO_CODE_TO_NAME`` and only rows whose mapped name equals
    ``cmo_name`` are kept. The yearly frames are concatenated and collected
    once, and four lookup tables are built from the result.

    Args:
        cmo_name: CMO name to keep, compared against the mapped ``cmo_code``.

    Returns:
        A dict with the keys ``"exact"``, ``"mname"``, ``"suffix"`` and
        ``"dob_imp"``, each holding the frame returned by ``_build_lookup``
        for that matching strategy.
    """
    # NOTE(review): the fallback module name "mapppings" (three p's) looks
    # like a typo; it is kept unchanged because the intended module cannot be
    # confirmed from this file. Verify and correct if needed.
    try:
        import mappings
    except ImportError:
        import mapppings as mappings

    source_columns = [
        "cmo_code",
        "sid_cepr",
        "fname_clean",
        "lname_clean",
        "mname_clean",
        "suff_clean",
        "birthdate_clean",
        "birthdate_imp",
    ]
    short_names = {
        "fname_clean": "fname",
        "lname_clean": "lname",
        "mname_clean": "mname",
        "suff_clean": "suffix",
        "birthdate_clean": "dob",
        "birthdate_imp": "dob_imp",
    }

    annual_frames = []

    for year in range(1994, 2023):
        path = CENSUS_STUDENTS / f"census_student_{year}.csv"

        annual = (
            pl.scan_csv(
                path,
                infer_schema=False,
                null_values=[],
                try_parse_dates=False,
                ignore_errors=False,
            )
            .select(source_columns)
            .rename(short_names)
            .with_columns(
                pl.col("cmo_code")
                .replace(mappings.CMO_CODE_TO_NAME)
                .alias("cmo_name")
            )
            .filter(pl.col("cmo_name") == cmo_name)
            .with_columns(
                *clean_name("fname"),
                *clean_name("lname"),
                *clean_other_name("mname"),
                *clean_other_name("suffix"),
                *clean_dob(col="dob"),
                *clean_dob(col="dob_imp"),
            )
            # Replace the raw date columns with their cleaned versions.
            .drop(["dob", "dob_imp"])
            .rename(
                {
                    "dob_clean": "dob",
                    "dob_imp_clean": "dob_imp",
                }
            )
        )

        annual_frames.append(annual)

    # Materialize once so the four lookups below share one in-memory frame.
    census = pl.concat(annual_frames, how="vertical_relaxed").collect()

    print(f"census rows: {len(census):,}")

    # NOTE(review): the lookups key on `fname` / `lname` / `mname` / `suffix`,
    # not on any `<col>_clean` columns the clean_* helpers may add above --
    # confirm this is intended.
    fname = pl.col("fname")
    lname = pl.col("lname")

    return {
        # First name, last name and date of birth as-is.
        "exact": _build_lookup(
            census,
            fname_expr=fname,
            lname_expr=lname,
            dob_col="dob",
            label="exact",
        ),
        # First-name key is "<fname> <mname>".
        "mname": _build_lookup(
            census,
            fname_expr=pl.concat_str([fname, pl.col("mname")], separator=" "),
            lname_expr=lname,
            dob_col="dob",
            label="mname",
        ),
        # Last-name key is "<lname> <suffix>".
        "suffix": _build_lookup(
            census,
            fname_expr=fname,
            lname_expr=pl.concat_str([lname, pl.col("suffix")], separator=" "),
            dob_col="dob",
            label="suffix",
        ),
        # Same name keys as "exact", but keyed on the `dob_imp` column.
        "dob_imp": _build_lookup(
            census,
            fname_expr=fname,
            lname_expr=lname,
            dob_col="dob_imp",
            label="dob_imp",
        ),
    }
1648
+
1649
def _run_match_stage(
    unmatched: pl.DataFrame,
    *,
    lookup: pl.DataFrame,
    fname_expr: pl.Expr,
    lname_expr: pl.Expr,
    dob_expr: pl.Expr,
    label: str,
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Run one matching stage against a lookup table.

    Temporary key columns are computed on ``unmatched``, left-joined to
    ``lookup`` and then dropped again. Rows that received a ``sid_cepr`` are
    returned as matched; the rest are returned without ``sid_cepr`` so they
    can be fed to the next stage.

    Args:
        unmatched: Rows that still have no SID. Presumably has no
            ``sid_cepr`` column of its own -- verify against callers.
        lookup: Table with ``_fname_key``, ``_lname_key``, ``_dob_key`` and
            ``sid_cepr``; must have at most one row per key because the join
            is validated as many-to-one.
        fname_expr: Expression producing the first-name key on ``unmatched``.
        lname_expr: Expression producing the last-name key on ``unmatched``.
        dob_expr: Expression producing the date-of-birth key on ``unmatched``.
        label: Stage name; used only in the progress message.

    Returns:
        ``(matched, unmatched)``: rows with a non-null ``sid_cepr``, and the
        remaining rows with the ``sid_cepr`` column removed.
    """
    key_cols = ["_fname_key", "_lname_key", "_dob_key"]

    before = len(unmatched)

    stage = (
        unmatched
        .with_columns(
            [
                fname_expr.alias("_fname_key"),
                lname_expr.alias("_lname_key"),
                dob_expr.alias("_dob_key"),
            ]
        )
        .join(
            lookup,
            on=key_cols,
            how="left",
            validate="m:1",
        )
        .drop(key_cols)
    )

    has_sid = pl.col("sid_cepr").is_not_null()

    matched = stage.filter(has_sid)
    # Drop the all-null SID column so the next stage's join can add it again.
    unmatched = stage.filter(~has_sid).drop("sid_cepr")

    added = len(matched)

    print(
        f"{label}: matched {added:,}/{before:,}"
    )

    return matched, unmatched
1713
+
1714
def lookup_sid_cepr(
    frame: Frame,
    *,
    cols: Mapping[str, str],
    lookups: dict[str, pl.DataFrame],
) -> Frame:
    """Attach ``sid_cepr`` to ``frame`` by matching in successive stages.

    Each stage only sees the rows the previous stages left unmatched:

    1. ``exact``   -- cleaned first name, last name and date of birth.
    2. ``mname``   -- first-name key is "<fname> <mname>" (only if
       ``"mname"`` is in ``cols``).
    3. ``suffix``  -- last-name key is "<lname> <suffix>" (only if
       ``"suffix"`` is in ``cols``).
    4. ``dob_imp`` -- same keys as stage 1, against the ``dob_imp`` lookup.

    Args:
        frame: Eager or lazy frame to match. A lazy frame is collected first.
        cols: Maps the logical names ``"fname"``, ``"lname"``, ``"dob"`` and
            optionally ``"mname"`` / ``"suffix"`` to column names in
            ``frame``.
        lookups: Lookup tables keyed by ``"exact"``, ``"mname"``,
            ``"suffix"`` and ``"dob_imp"``.

    Returns:
        All input rows with a ``sid_cepr`` column (null where no stage
        matched), in the same eager/lazy flavour as ``frame``. Rows are
        ordered by the stage that matched them, with unmatched rows last,
        not in input order. Columns added in later stages are null for rows
        matched earlier.
    """
    is_lazy = isinstance(frame, pl.LazyFrame)

    current = frame.collect() if is_lazy else frame

    # Clean the left side once; every stage reuses these columns.
    current = current.with_columns(
        *clean_name(cols["fname"]),
        *clean_name(cols["lname"]),
        *clean_dob(col=cols["dob"]),
    )

    fname_clean = pl.col(f"{cols['fname']}_clean")
    lname_clean = pl.col(f"{cols['lname']}_clean")
    dob_clean = pl.col(f"{cols['dob']}_clean")

    matched_frames = []

    unmatched = current

    # Stage 1: exact
    matched, unmatched = _run_match_stage(
        unmatched,
        lookup=lookups["exact"],
        fname_expr=fname_clean,
        lname_expr=lname_clean,
        dob_expr=dob_clean,
        label="exact",
    )
    matched_frames.append(matched)

    # Stage 2: first name + middle name
    if "mname" in cols:
        unmatched = unmatched.with_columns(
            *clean_other_name(cols["mname"])
        )

        matched, unmatched = _run_match_stage(
            unmatched,
            lookup=lookups["mname"],
            fname_expr=pl.concat_str(
                [
                    fname_clean,
                    pl.col(f"{cols['mname']}_clean"),
                ],
                separator=" ",
            ),
            lname_expr=lname_clean,
            dob_expr=dob_clean,
            label="mname",
        )
        matched_frames.append(matched)

    # Stage 3: last name + suffix
    if "suffix" in cols:
        unmatched = unmatched.with_columns(
            *clean_other_name(cols["suffix"])
        )

        matched, unmatched = _run_match_stage(
            unmatched,
            lookup=lookups["suffix"],
            fname_expr=fname_clean,
            lname_expr=pl.concat_str(
                [
                    lname_clean,
                    pl.col(f"{cols['suffix']}_clean"),
                ],
                separator=" ",
            ),
            dob_expr=dob_clean,
            label="suffix",
        )
        matched_frames.append(matched)

    # Stage 4: the `dob_imp` lookup
    matched, unmatched = _run_match_stage(
        unmatched,
        lookup=lookups["dob_imp"],
        fname_expr=fname_clean,
        lname_expr=lname_clean,
        dob_expr=dob_clean,
        label="dob_imp",
    )
    matched_frames.append(matched)

    # The diagonal concat fills columns missing from a frame with nulls:
    # `sid_cepr` for the unmatched rows, and stage-specific cleaned columns
    # for rows matched before those columns were added.
    result = pl.concat(
        [*matched_frames, unmatched],
        how="diagonal_relaxed",
    )

    print(
        f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
    )

    # Hand back the flavour the caller passed in, as the `Frame` return
    # type promises.
    return result.lazy() if is_lazy else result
1856
+
1857
#
# EXAMPLE USAGE
#
# NOTE(review): these statements sit at module level, so they run whenever
# the module is imported, and `df` is not defined anywhere in the visible
# part of this file -- confirm whether this example was meant to ship.

lookups = build_census_lookups(cmo_name="Aspire")

result = lookup_sid_cepr(
    df,
    cols={
        "fname": "fname",
        "lname": "lname",
        "mname": "mname",
        "suffix": "suffix",
        "dob": "dob",
    },
    lookups=lookups,
)
File without changes