ltc-code 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ltc-code
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Add your description here
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ltc-code"
3
- version = "0.1.3"
3
+ version = "0.1.5"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -1414,3 +1414,459 @@ def lookup_sid_cepr(
1414
1414
  if not is_lazy:
1415
1415
  result = result.collect()
1416
1416
  return result
1417
+
1418
+
1419
+
1420
+
1421
+
1422
+ ####################################################################################
1423
+ # LOOK UP KEYS NEW
1424
+ ####################################################################################
1425
+
1426
+
1427
+
1428
+
1429
+
1430
def _build_lookup(
    census: pl.DataFrame,
    *,
    fname_expr: pl.Expr,
    lname_expr: pl.Expr,
    dob_col: str,
    label: str,
) -> pl.DataFrame:
    """Build a SID lookup table keyed on (first name, last name, date of birth).

    The key columns are derived from ``census`` through ``fname_expr``,
    ``lname_expr`` and the column named ``dob_col``.  Rows with a null in any
    key column or in ``sid_cepr`` are discarded, and every key that maps to
    more than one distinct ``sid_cepr`` is removed as ambiguous, so each
    remaining key identifies exactly one SID.

    Output columns:
        _fname_key
        _lname_key
        _dob_key
        sid_cepr

    ``label`` is only used in the progress message printed after the table
    has been built.
    """
    key_cols = ["_fname_key", "_lname_key", "_dob_key"]

    keyed = census.select(
        fname_expr.alias("_fname_key"),
        lname_expr.alias("_lname_key"),
        pl.col(dob_col).alias("_dob_key"),
        pl.col("sid_cepr"),
    ).drop_nulls(key_cols + ["sid_cepr"])

    # One row per key, carrying the distinct SIDs observed for that key.
    sids_per_key = (
        keyed
        .group_by(key_cols)
        .agg(pl.col("sid_cepr").unique().alias("_sids"))
        .with_columns(pl.col("_sids").list.len().alias("_sid_count"))
    )

    # Keep unambiguous keys only and unwrap their single SID.
    table = sids_per_key.filter(pl.col("_sid_count") == 1).select(
        *key_cols,
        pl.col("_sids").list.first().alias("sid_cepr"),
    )

    print(f"built lookup: {label}")

    return table
1497
+
1498
def _scan_census_year(
    year: int,
    *,
    cmo_name: str,
    code_to_name: Mapping,
) -> pl.LazyFrame:
    """Lazily scan one annual census CSV, keep ``cmo_name`` rows and clean them.

    The returned frame has the columns ``cmo_code``, ``sid_cepr``, ``cmo_name``
    plus the name / date columns renamed to ``fname``, ``lname``, ``mname``,
    ``suffix``, ``dob`` and ``dob_imp``, together with whatever the cleaning
    helpers add.  ``dob`` and ``dob_imp`` are the *cleaned* values on output.
    """
    path = CENSUS_STUDENTS / f"census_student_{year}.csv"

    return (
        pl.scan_csv(
            path,
            # No schema inference and no date parsing: the cleaning helpers
            # below receive the raw CSV text.
            infer_schema=False,
            null_values=[],
            try_parse_dates=False,
            ignore_errors=False,
        )
        .select(
            [
                "cmo_code",
                "sid_cepr",
                "fname_clean",
                "lname_clean",
                "mname_clean",
                "suff_clean",
                "birthdate_clean",
                "birthdate_imp",
            ]
        )
        # Drop the census "_clean" suffixes so the cleaning helpers can emit
        # their own "<col>_clean" outputs without colliding.
        .rename(
            {
                "fname_clean": "fname",
                "lname_clean": "lname",
                "mname_clean": "mname",
                "suff_clean": "suffix",
                "birthdate_clean": "dob",
                "birthdate_imp": "dob_imp",
            }
        )
        .with_columns(
            pl.col("cmo_code")
            .replace(code_to_name)
            .alias("cmo_name")
        )
        .filter(pl.col("cmo_name") == cmo_name)
        .with_columns(
            *clean_name("fname"),
            *clean_name("lname"),
            *clean_other_name("mname"),
            *clean_other_name("suffix"),
            *clean_dob(col="dob"),
            *clean_dob(col="dob_imp"),
        )
        # Replace the raw date columns with their cleaned counterparts.
        .drop(["dob", "dob_imp"])
        .rename(
            {
                "dob_clean": "dob",
                "dob_imp_clean": "dob_imp",
            }
        )
    )


def build_census_lookups(
    *,
    cmo_name: str,
    start_year: int = 1994,
    end_year: int = 2022,
) -> dict[str, pl.DataFrame]:
    """Read the annual census files once and build every SID lookup table.

    Args:
        cmo_name: Only census rows whose mapped ``cmo_name`` equals this value
            are used.
        start_year: First census year to read (inclusive).
        end_year: Last census year to read (inclusive).  The defaults cover
            1994 through 2022.

    Returns:
        A dict with the keys ``"exact"``, ``"mname"``, ``"suffix"`` and
        ``"dob_imp"``, each holding the table produced by ``_build_lookup``
        for that matching strategy.

    Raises:
        ValueError: If ``start_year`` is greater than ``end_year``.
    """
    # Validate before any import or file access so a bad range fails fast
    # instead of surfacing as an opaque error from concatenating nothing.
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not be greater than "
            f"end_year ({end_year})"
        )

    try:
        import mappings
    except ImportError:
        # NOTE(review): "mapppings" (three p's) looks like a typo -- confirm
        # the intended fallback module name. Kept unchanged because the real
        # module layout is not visible from here.
        import mapppings as mappings

    code_to_name = mappings.CMO_CODE_TO_NAME

    annual_frames = [
        _scan_census_year(year, cmo_name=cmo_name, code_to_name=code_to_name)
        for year in range(start_year, end_year + 1)
    ]

    # Materialize once; every lookup below is derived from this single frame.
    census = pl.concat(annual_frames, how="vertical_relaxed").collect()

    print(f"census rows: {len(census):,}")

    fname = pl.col("fname")
    lname = pl.col("lname")

    return {
        "exact": _build_lookup(
            census,
            fname_expr=fname,
            lname_expr=lname,
            dob_col="dob",
            label="exact",
        ),
        "mname": _build_lookup(
            census,
            fname_expr=pl.concat_str([fname, pl.col("mname")], separator=" "),
            lname_expr=lname,
            dob_col="dob",
            label="mname",
        ),
        "suffix": _build_lookup(
            census,
            fname_expr=fname,
            lname_expr=pl.concat_str([lname, pl.col("suffix")], separator=" "),
            dob_col="dob",
            label="suffix",
        ),
        "dob_imp": _build_lookup(
            census,
            fname_expr=fname,
            lname_expr=lname,
            dob_col="dob_imp",
            label="dob_imp",
        ),
    }
1644
+
1645
def _run_match_stage(
    unmatched: pl.DataFrame,
    *,
    lookup: pl.DataFrame,
    fname_expr: pl.Expr,
    lname_expr: pl.Expr,
    dob_expr: pl.Expr,
    label: str,
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Try to attach ``sid_cepr`` to the still-unmatched rows using one lookup.

    Temporary key columns are computed from ``fname_expr``, ``lname_expr`` and
    ``dob_expr``, left-joined against ``lookup`` (which must hold at most one
    row per key -- enforced by ``validate="m:1"``) and dropped again.

    Returns:
        ``(matched, unmatched)``: rows that received a non-null ``sid_cepr``,
        and the remaining rows with the ``sid_cepr`` column removed so they
        can be fed to the next stage.

    ``label`` is only used in the progress message.
    """
    join_keys = ["_fname_key", "_lname_key", "_dob_key"]
    n_candidates = len(unmatched)

    keyed = unmatched.with_columns(
        fname_expr.alias("_fname_key"),
        lname_expr.alias("_lname_key"),
        dob_expr.alias("_dob_key"),
    )
    joined = keyed.join(
        lookup,
        on=join_keys,
        how="left",
        validate="m:1",
    ).drop(join_keys)

    hits = joined.filter(pl.col("sid_cepr").is_not_null())
    # Drop the all-null SID column so the next stage's join can add it afresh.
    misses = joined.filter(pl.col("sid_cepr").is_null()).drop("sid_cepr")

    print(f"{label}: matched {len(hits):,}/{n_candidates:,}")

    return hits, misses
1707
+
1708
def lookup_sid_cepr(
    frame: Frame,
    *,
    cols: Mapping[str, str],
    lookups: dict[str, pl.DataFrame],
) -> Frame:
    """Attach ``sid_cepr`` to ``frame`` by staged matching against prebuilt lookups.

    Stages run in order, each one only on the rows the previous stages left
    unmatched:

    1. ``exact``   -- first name, last name, date of birth
    2. ``mname``   -- "first middle", last name, date of birth (only if
       ``"mname"`` is in ``cols``)
    3. ``suffix``  -- first name, "last suffix", date of birth (only if
       ``"suffix"`` is in ``cols``)
    4. ``dob_imp`` -- first name, last name, date of birth, against the
       lookup built from the census imputed birthdate

    Args:
        frame: Input rows.  A ``LazyFrame`` is collected first.
        cols: Maps the logical names ``"fname"``, ``"lname"``, ``"dob"`` (and
            optionally ``"mname"``, ``"suffix"``) to column names in ``frame``.
        lookups: Lookup tables keyed by stage label, as returned by
            ``build_census_lookups``.

    Returns:
        An eager ``DataFrame`` (also for lazy input) holding the matched rows
        of each stage followed by the rows that never matched; the input row
        order is therefore not preserved.  Unmatched rows have a null
        ``sid_cepr``.  The ``*_clean`` helper columns are kept.

    Raises:
        ValueError: If ``frame`` already has a ``sid_cepr`` column.
        KeyError: If ``cols`` lacks a required entry or ``lookups`` lacks the
            table for a stage that runs.

    NOTE(review): the diff context shows an earlier ``lookup_sid_cepr``
    definition in this module; this definition replaces it at import time.
    Confirm the earlier one can be removed.
    """
    current = frame.collect() if isinstance(frame, pl.LazyFrame) else frame

    # A pre-existing "sid_cepr" would make every join emit "sid_cepr_right"
    # and the match filters would then read the stale column instead.
    if "sid_cepr" in current.columns:
        raise ValueError(
            "frame already contains a 'sid_cepr' column; "
            "drop or rename it before calling lookup_sid_cepr"
        )

    #
    # CLEAN LEFT SIDE
    #
    # All cleaning happens once, up front, so every output row carries the
    # same helper columns regardless of the stage at which it matched.
    cleaners = [
        *clean_name(cols["fname"]),
        *clean_name(cols["lname"]),
        *clean_dob(col=cols["dob"]),
    ]
    for optional in ("mname", "suffix"):
        if optional in cols:
            cleaners.extend(clean_other_name(cols[optional]))

    unmatched = current.with_columns(*cleaners)

    fname = pl.col(f"{cols['fname']}_clean")
    lname = pl.col(f"{cols['lname']}_clean")
    dob = pl.col(f"{cols['dob']}_clean")

    #
    # STAGES: (lookup label, first-name key, last-name key)
    #
    stages = [("exact", fname, lname)]

    if "mname" in cols:
        fname_with_mname = pl.concat_str(
            [fname, pl.col(f"{cols['mname']}_clean")],
            separator=" ",
        )
        stages.append(("mname", fname_with_mname, lname))

    if "suffix" in cols:
        lname_with_suffix = pl.concat_str(
            [lname, pl.col(f"{cols['suffix']}_clean")],
            separator=" ",
        )
        stages.append(("suffix", fname, lname_with_suffix))

    # Same left-hand keys as "exact", but matched against the imputed-DOB lookup.
    stages.append(("dob_imp", fname, lname))

    matched_frames = []

    for label, fname_expr, lname_expr in stages:
        matched, unmatched = _run_match_stage(
            unmatched,
            lookup=lookups[label],
            fname_expr=fname_expr,
            lname_expr=lname_expr,
            dob_expr=dob,
            label=label,
        )
        matched_frames.append(matched)

    #
    # FINAL
    #
    # "diagonal" because the leftover unmatched frame has no sid_cepr column.
    result = pl.concat(
        matched_frames + [unmatched],
        how="diagonal_relaxed",
    )

    print(
        f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
    )

    return result
1848
+
1849
#
# EXAMPLE USAGE
#
# Kept inside a function so that importing this module neither reads the
# census files nor references a frame that is not defined here.


def _example_usage(df: Frame) -> Frame:
    """Show the intended call pattern: build the lookups once, then match ``df``.

    ``df`` is expected to provide the columns ``fname``, ``lname``, ``mname``,
    ``suffix`` and ``dob``.  The lookups are built for the "Aspire" CMO.
    """
    lookups = build_census_lookups(
        cmo_name="Aspire",
    )

    return df.pipe(
        lookup_sid_cepr,
        cols={
            "fname": "fname",
            "lname": "lname",
            "mname": "mname",
            "suffix": "suffix",
            "dob": "dob",
        },
        lookups=lookups,
    )
File without changes