ltc-code 0.1.4__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ltc-code
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: Add your description here
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ltc-code"
3
- version = "0.1.4"
3
+ version = "0.1.5"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -1428,431 +1428,423 @@ def lookup_sid_cepr(
1428
1428
 
1429
1429
 
1430
1430
  def _build_lookup(
1431
- census: pl.DataFrame,
1432
- *,
1433
- fname_expr: pl.Expr,
1434
- lname_expr: pl.Expr,
1435
- dob_col: str,
1436
- label: str,
1431
+ census: pl.DataFrame,
1432
+ *,
1433
+ fname_expr: pl.Expr,
1434
+ lname_expr: pl.Expr,
1435
+ dob_col: str,
1436
+ label: str,
1437
1437
  ) -> pl.DataFrame:
1438
- """
1439
- Build a deterministic SID lookup table.
1440
-
1441
- ```
1442
- Output schema:
1443
- _fname_key
1444
- _lname_key
1445
- _dob_key
1446
- sid_cepr
1447
-
1448
- Ambiguous keys are removed.
1449
- """
1450
-
1451
- lookup = (
1452
- census
1453
- .select(
1454
- [
1455
- fname_expr.alias("_fname_key"),
1456
- lname_expr.alias("_lname_key"),
1457
- pl.col(dob_col).alias("_dob_key"),
1458
- pl.col("sid_cepr"),
1459
- ]
1460
- )
1461
- .drop_nulls(
1462
- [
1463
- "_fname_key",
1464
- "_lname_key",
1465
- "_dob_key",
1466
- "sid_cepr",
1467
- ]
1468
- )
1469
- .group_by(
1470
- [
1471
- "_fname_key",
1472
- "_lname_key",
1473
- "_dob_key",
1474
- ]
1475
- )
1476
- .agg(
1477
- pl.col("sid_cepr").unique().alias("_sids")
1478
- )
1479
- .with_columns(
1480
- pl.col("_sids").list.len().alias("_sid_count")
1481
- )
1482
- .filter(
1483
- pl.col("_sid_count") == 1
1484
- )
1485
- .select(
1486
- [
1487
- "_fname_key",
1488
- "_lname_key",
1489
- "_dob_key",
1490
- pl.col("_sids").list.first().alias("sid_cepr"),
1491
- ]
1492
- )
1493
- )
1494
-
1495
- print(f"built lookup: {label}")
1496
-
1497
- return lookup
1498
- ```
1499
-
1500
- def build_census_lookups(
1501
- *,
1502
- cmo_name: str,
1503
- ) -> dict[str, pl.DataFrame]:
1504
-
1505
- ```
1506
- try:
1507
- import mappings
1508
- except ImportError:
1509
- import mapppings as mappings
1510
-
1511
- annual_frames = []
1438
+ """
1439
+ Build a deterministic SID lookup table.
1512
1440
 
1513
- for year in range(1994, 2023):
1441
+ Output schema:
1442
+ _fname_key
1443
+ _lname_key
1444
+ _dob_key
1445
+ sid_cepr
1514
1446
 
1515
- path = CENSUS_STUDENTS / f"census_student_{year}.csv"
1447
+ Ambiguous keys are removed.
1448
+ """
1516
1449
 
1517
- annual = (
1518
- pl.scan_csv(
1519
- path,
1520
- infer_schema=False,
1521
- null_values=[],
1522
- try_parse_dates=False,
1523
- ignore_errors=False,
1524
- )
1450
+ lookup = (
1451
+ census
1525
1452
  .select(
1526
1453
  [
1527
- "cmo_code",
1454
+ fname_expr.alias("_fname_key"),
1455
+ lname_expr.alias("_lname_key"),
1456
+ pl.col(dob_col).alias("_dob_key"),
1457
+ pl.col("sid_cepr"),
1458
+ ]
1459
+ )
1460
+ .drop_nulls(
1461
+ [
1462
+ "_fname_key",
1463
+ "_lname_key",
1464
+ "_dob_key",
1528
1465
  "sid_cepr",
1529
- "fname_clean",
1530
- "lname_clean",
1531
- "mname_clean",
1532
- "suff_clean",
1533
- "birthdate_clean",
1534
- "birthdate_imp",
1535
1466
  ]
1536
1467
  )
1537
- .rename(
1538
- {
1539
- "fname_clean": "fname",
1540
- "lname_clean": "lname",
1541
- "mname_clean": "mname",
1542
- "suff_clean": "suffix",
1543
- "birthdate_clean": "dob",
1544
- "birthdate_imp": "dob_imp",
1545
- }
1468
+ .group_by(
1469
+ [
1470
+ "_fname_key",
1471
+ "_lname_key",
1472
+ "_dob_key",
1473
+ ]
1474
+ )
1475
+ .agg(
1476
+ pl.col("sid_cepr").unique().alias("_sids")
1546
1477
  )
1547
1478
  .with_columns(
1548
- pl.col("cmo_code")
1549
- .replace(mappings.CMO_CODE_TO_NAME)
1550
- .alias("cmo_name")
1479
+ pl.col("_sids").list.len().alias("_sid_count")
1551
1480
  )
1552
1481
  .filter(
1553
- pl.col("cmo_name") == cmo_name
1482
+ pl.col("_sid_count") == 1
1554
1483
  )
1555
- .with_columns(
1556
- *clean_name("fname"),
1557
- *clean_name("lname"),
1558
- *clean_other_name("mname"),
1559
- *clean_other_name("suffix"),
1560
- *clean_dob(col="dob"),
1561
- *clean_dob(col="dob_imp"),
1562
- )
1563
- .drop(
1484
+ .select(
1564
1485
  [
1565
- "dob",
1566
- "dob_imp",
1486
+ "_fname_key",
1487
+ "_lname_key",
1488
+ "_dob_key",
1489
+ pl.col("_sids").list.first().alias("sid_cepr"),
1567
1490
  ]
1568
1491
  )
1569
- .rename(
1570
- {
1571
- "dob_clean": "dob",
1572
- "dob_imp_clean": "dob_imp",
1573
- }
1574
- )
1575
1492
  )
1576
1493
 
1577
- annual_frames.append(annual)
1494
+ print(f"built lookup: {label}")
1578
1495
 
1579
- #
1580
- # MATERIALIZE ONCE
1581
- #
1496
+ return lookup
1582
1497
 
1583
- census = (
1584
- pl.concat(
1585
- annual_frames,
1586
- how="vertical_relaxed",
1587
- )
1588
- .collect()
1589
- )
1498
+ def build_census_lookups(
1499
+ *,
1500
+ cmo_name: str,
1501
+ ) -> dict[str, pl.DataFrame]:
1590
1502
 
1591
- print(f"census rows: {len(census):,}")
1503
+ try:
1504
+ import mappings
1505
+ except ImportError:
1506
+ import mapppings as mappings
1592
1507
 
1593
- #
1594
- # BUILD LOOKUPS ONCE
1595
- #
1508
+ annual_frames = []
1596
1509
 
1597
- lookup_exact = _build_lookup(
1598
- census,
1599
- fname_expr=pl.col("fname"),
1600
- lname_expr=pl.col("lname"),
1601
- dob_col="dob",
1602
- label="exact",
1603
- )
1510
+ for year in range(1994, 2023):
1604
1511
 
1605
- lookup_mname = _build_lookup(
1606
- census,
1607
- fname_expr=pl.concat_str(
1608
- [
1609
- pl.col("fname"),
1610
- pl.col("mname"),
1611
- ],
1612
- separator=" ",
1613
- ),
1614
- lname_expr=pl.col("lname"),
1615
- dob_col="dob",
1616
- label="mname",
1617
- )
1512
+ path = CENSUS_STUDENTS / f"census_student_{year}.csv"
1618
1513
 
1619
- lookup_suffix = _build_lookup(
1620
- census,
1621
- fname_expr=pl.col("fname"),
1622
- lname_expr=pl.concat_str(
1623
- [
1624
- pl.col("lname"),
1625
- pl.col("suffix"),
1626
- ],
1627
- separator=" ",
1628
- ),
1629
- dob_col="dob",
1630
- label="suffix",
1631
- )
1514
+ annual = (
1515
+ pl.scan_csv(
1516
+ path,
1517
+ infer_schema=False,
1518
+ null_values=[],
1519
+ try_parse_dates=False,
1520
+ ignore_errors=False,
1521
+ )
1522
+ .select(
1523
+ [
1524
+ "cmo_code",
1525
+ "sid_cepr",
1526
+ "fname_clean",
1527
+ "lname_clean",
1528
+ "mname_clean",
1529
+ "suff_clean",
1530
+ "birthdate_clean",
1531
+ "birthdate_imp",
1532
+ ]
1533
+ )
1534
+ .rename(
1535
+ {
1536
+ "fname_clean": "fname",
1537
+ "lname_clean": "lname",
1538
+ "mname_clean": "mname",
1539
+ "suff_clean": "suffix",
1540
+ "birthdate_clean": "dob",
1541
+ "birthdate_imp": "dob_imp",
1542
+ }
1543
+ )
1544
+ .with_columns(
1545
+ pl.col("cmo_code")
1546
+ .replace(mappings.CMO_CODE_TO_NAME)
1547
+ .alias("cmo_name")
1548
+ )
1549
+ .filter(
1550
+ pl.col("cmo_name") == cmo_name
1551
+ )
1552
+ .with_columns(
1553
+ *clean_name("fname"),
1554
+ *clean_name("lname"),
1555
+ *clean_other_name("mname"),
1556
+ *clean_other_name("suffix"),
1557
+ *clean_dob(col="dob"),
1558
+ *clean_dob(col="dob_imp"),
1559
+ )
1560
+ .drop(
1561
+ [
1562
+ "dob",
1563
+ "dob_imp",
1564
+ ]
1565
+ )
1566
+ .rename(
1567
+ {
1568
+ "dob_clean": "dob",
1569
+ "dob_imp_clean": "dob_imp",
1570
+ }
1571
+ )
1572
+ )
1632
1573
 
1633
- lookup_dob_imp = _build_lookup(
1634
- census,
1635
- fname_expr=pl.col("fname"),
1636
- lname_expr=pl.col("lname"),
1637
- dob_col="dob_imp",
1638
- label="dob_imp",
1639
- )
1574
+ annual_frames.append(annual)
1640
1575
 
1641
- return {
1642
- "exact": lookup_exact,
1643
- "mname": lookup_mname,
1644
- "suffix": lookup_suffix,
1645
- "dob_imp": lookup_dob_imp,
1646
- }
1647
- ```
1576
+ #
1577
+ # MATERIALIZE ONCE
1578
+ #
1648
1579
 
1649
- def _run_match_stage(
1650
- unmatched: pl.DataFrame,
1651
- *,
1652
- lookup: pl.DataFrame,
1653
- fname_expr: pl.Expr,
1654
- lname_expr: pl.Expr,
1655
- dob_expr: pl.Expr,
1656
- label: str,
1657
- ) -> tuple[pl.DataFrame, pl.DataFrame]:
1580
+ census = (
1581
+ pl.concat(
1582
+ annual_frames,
1583
+ how="vertical_relaxed",
1584
+ )
1585
+ .collect()
1586
+ )
1658
1587
 
1659
- ```
1660
- before = len(unmatched)
1588
+ print(f"census rows: {len(census):,}")
1661
1589
 
1662
- stage = (
1663
- unmatched
1664
- .with_columns(
1665
- [
1666
- fname_expr.alias("_fname_key"),
1667
- lname_expr.alias("_lname_key"),
1668
- dob_expr.alias("_dob_key"),
1669
- ]
1670
- )
1671
- .join(
1672
- lookup,
1673
- on=[
1674
- "_fname_key",
1675
- "_lname_key",
1676
- "_dob_key",
1677
- ],
1678
- how="left",
1679
- validate="m:1",
1680
- )
1681
- .drop(
1682
- [
1683
- "_fname_key",
1684
- "_lname_key",
1685
- "_dob_key",
1686
- ]
1687
- )
1688
- )
1590
+ #
1591
+ # BUILD LOOKUPS ONCE
1592
+ #
1689
1593
 
1690
- matched = (
1691
- stage
1692
- .filter(
1693
- pl.col("sid_cepr").is_not_null()
1594
+ lookup_exact = _build_lookup(
1595
+ census,
1596
+ fname_expr=pl.col("fname"),
1597
+ lname_expr=pl.col("lname"),
1598
+ dob_col="dob",
1599
+ label="exact",
1694
1600
  )
1695
- )
1696
1601
 
1697
- unmatched = (
1698
- stage
1699
- .filter(
1700
- pl.col("sid_cepr").is_null()
1602
+ lookup_mname = _build_lookup(
1603
+ census,
1604
+ fname_expr=pl.concat_str(
1605
+ [
1606
+ pl.col("fname"),
1607
+ pl.col("mname"),
1608
+ ],
1609
+ separator=" ",
1610
+ ),
1611
+ lname_expr=pl.col("lname"),
1612
+ dob_col="dob",
1613
+ label="mname",
1701
1614
  )
1702
- .drop("sid_cepr")
1703
- )
1704
1615
 
1705
- added = len(matched)
1616
+ lookup_suffix = _build_lookup(
1617
+ census,
1618
+ fname_expr=pl.col("fname"),
1619
+ lname_expr=pl.concat_str(
1620
+ [
1621
+ pl.col("lname"),
1622
+ pl.col("suffix"),
1623
+ ],
1624
+ separator=" ",
1625
+ ),
1626
+ dob_col="dob",
1627
+ label="suffix",
1628
+ )
1706
1629
 
1707
- print(
1708
- f"{label}: matched {added:,}/{before:,}"
1709
- )
1630
+ lookup_dob_imp = _build_lookup(
1631
+ census,
1632
+ fname_expr=pl.col("fname"),
1633
+ lname_expr=pl.col("lname"),
1634
+ dob_col="dob_imp",
1635
+ label="dob_imp",
1636
+ )
1710
1637
 
1711
- return matched, unmatched
1712
- ```
1638
+ return {
1639
+ "exact": lookup_exact,
1640
+ "mname": lookup_mname,
1641
+ "suffix": lookup_suffix,
1642
+ "dob_imp": lookup_dob_imp,
1643
+ }
1713
1644
 
1714
- def lookup_sid_cepr(
1715
- frame: Frame,
1716
- *,
1717
- cols: Mapping[str, str],
1718
- lookups: dict[str, pl.DataFrame],
1719
- ) -> Frame:
1645
+ def _run_match_stage(
1646
+ unmatched: pl.DataFrame,
1647
+ *,
1648
+ lookup: pl.DataFrame,
1649
+ fname_expr: pl.Expr,
1650
+ lname_expr: pl.Expr,
1651
+ dob_expr: pl.Expr,
1652
+ label: str,
1653
+ ) -> tuple[pl.DataFrame, pl.DataFrame]:
1720
1654
 
1721
- ```
1722
- is_lazy = isinstance(frame, pl.LazyFrame)
1655
+ before = len(unmatched)
1723
1656
 
1724
- current = (
1725
- frame.collect()
1726
- if is_lazy
1727
- else frame
1728
- )
1657
+ stage = (
1658
+ unmatched
1659
+ .with_columns(
1660
+ [
1661
+ fname_expr.alias("_fname_key"),
1662
+ lname_expr.alias("_lname_key"),
1663
+ dob_expr.alias("_dob_key"),
1664
+ ]
1665
+ )
1666
+ .join(
1667
+ lookup,
1668
+ on=[
1669
+ "_fname_key",
1670
+ "_lname_key",
1671
+ "_dob_key",
1672
+ ],
1673
+ how="left",
1674
+ validate="m:1",
1675
+ )
1676
+ .drop(
1677
+ [
1678
+ "_fname_key",
1679
+ "_lname_key",
1680
+ "_dob_key",
1681
+ ]
1682
+ )
1683
+ )
1729
1684
 
1730
- #
1731
- # CLEAN LEFT SIDE
1732
- #
1685
+ matched = (
1686
+ stage
1687
+ .filter(
1688
+ pl.col("sid_cepr").is_not_null()
1689
+ )
1690
+ )
1733
1691
 
1734
- current = (
1735
- current
1736
- .with_columns(
1737
- *clean_name(cols["fname"]),
1738
- *clean_name(cols["lname"]),
1739
- *clean_dob(col=cols["dob"]),
1692
+ unmatched = (
1693
+ stage
1694
+ .filter(
1695
+ pl.col("sid_cepr").is_null()
1696
+ )
1697
+ .drop("sid_cepr")
1740
1698
  )
1741
- )
1742
1699
 
1743
- matched_frames = []
1700
+ added = len(matched)
1744
1701
 
1745
- unmatched = current
1702
+ print(
1703
+ f"{label}: matched {added:,}/{before:,}"
1704
+ )
1746
1705
 
1747
- #
1748
- # STAGE 1
1749
- # EXACT
1750
- #
1706
+ return matched, unmatched
1751
1707
 
1752
- matched, unmatched = _run_match_stage(
1753
- unmatched,
1754
- lookup=lookups["exact"],
1755
- fname_expr=pl.col(f"{cols['fname']}_clean"),
1756
- lname_expr=pl.col(f"{cols['lname']}_clean"),
1757
- dob_expr=pl.col(f"{cols['dob']}_clean"),
1758
- label="exact",
1759
- )
1708
+ def lookup_sid_cepr(
1709
+ frame: Frame,
1710
+ *,
1711
+ cols: Mapping[str, str],
1712
+ lookups: dict[str, pl.DataFrame],
1713
+ ) -> Frame:
1760
1714
 
1761
- matched_frames.append(matched)
1715
+ is_lazy = isinstance(frame, pl.LazyFrame)
1762
1716
 
1763
- #
1764
- # STAGE 2
1765
- # MNAME
1766
- #
1717
+ current = (
1718
+ frame.collect()
1719
+ if is_lazy
1720
+ else frame
1721
+ )
1767
1722
 
1768
- if "mname" in cols:
1723
+ #
1724
+ # CLEAN LEFT SIDE
1725
+ #
1769
1726
 
1770
- unmatched = (
1771
- unmatched
1727
+ current = (
1728
+ current
1772
1729
  .with_columns(
1773
- *clean_other_name(cols["mname"])
1730
+ *clean_name(cols["fname"]),
1731
+ *clean_name(cols["lname"]),
1732
+ *clean_dob(col=cols["dob"]),
1774
1733
  )
1775
1734
  )
1776
1735
 
1736
+ matched_frames = []
1737
+
1738
+ unmatched = current
1739
+
1740
+ #
1741
+ # STAGE 1
1742
+ # EXACT
1743
+ #
1744
+
1777
1745
  matched, unmatched = _run_match_stage(
1778
1746
  unmatched,
1779
- lookup=lookups["mname"],
1780
- fname_expr=pl.concat_str(
1781
- [
1782
- pl.col(f"{cols['fname']}_clean"),
1783
- pl.col(f"{cols['mname']}_clean"),
1784
- ],
1785
- separator=" ",
1786
- ),
1747
+ lookup=lookups["exact"],
1748
+ fname_expr=pl.col(f"{cols['fname']}_clean"),
1787
1749
  lname_expr=pl.col(f"{cols['lname']}_clean"),
1788
1750
  dob_expr=pl.col(f"{cols['dob']}_clean"),
1789
- label="mname",
1751
+ label="exact",
1790
1752
  )
1791
1753
 
1792
1754
  matched_frames.append(matched)
1793
1755
 
1794
- #
1795
- # STAGE 3
1796
- # SUFFIX
1797
- #
1756
+ #
1757
+ # STAGE 2
1758
+ # MNAME
1759
+ #
1798
1760
 
1799
- if "suffix" in cols:
1761
+ if "mname" in cols:
1800
1762
 
1801
- unmatched = (
1802
- unmatched
1803
- .with_columns(
1804
- *clean_other_name(cols["suffix"])
1763
+ unmatched = (
1764
+ unmatched
1765
+ .with_columns(
1766
+ *clean_other_name(cols["mname"])
1767
+ )
1805
1768
  )
1806
- )
1769
+
1770
+ matched, unmatched = _run_match_stage(
1771
+ unmatched,
1772
+ lookup=lookups["mname"],
1773
+ fname_expr=pl.concat_str(
1774
+ [
1775
+ pl.col(f"{cols['fname']}_clean"),
1776
+ pl.col(f"{cols['mname']}_clean"),
1777
+ ],
1778
+ separator=" ",
1779
+ ),
1780
+ lname_expr=pl.col(f"{cols['lname']}_clean"),
1781
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1782
+ label="mname",
1783
+ )
1784
+
1785
+ matched_frames.append(matched)
1786
+
1787
+ #
1788
+ # STAGE 3
1789
+ # SUFFIX
1790
+ #
1791
+
1792
+ if "suffix" in cols:
1793
+
1794
+ unmatched = (
1795
+ unmatched
1796
+ .with_columns(
1797
+ *clean_other_name(cols["suffix"])
1798
+ )
1799
+ )
1800
+
1801
+ matched, unmatched = _run_match_stage(
1802
+ unmatched,
1803
+ lookup=lookups["suffix"],
1804
+ fname_expr=pl.col(f"{cols['fname']}_clean"),
1805
+ lname_expr=pl.concat_str(
1806
+ [
1807
+ pl.col(f"{cols['lname']}_clean"),
1808
+ pl.col(f"{cols['suffix']}_clean"),
1809
+ ],
1810
+ separator=" ",
1811
+ ),
1812
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1813
+ label="suffix",
1814
+ )
1815
+
1816
+ matched_frames.append(matched)
1817
+
1818
+ #
1819
+ # STAGE 4
1820
+ # DOB IMP
1821
+ #
1807
1822
 
1808
1823
  matched, unmatched = _run_match_stage(
1809
1824
  unmatched,
1810
- lookup=lookups["suffix"],
1825
+ lookup=lookups["dob_imp"],
1811
1826
  fname_expr=pl.col(f"{cols['fname']}_clean"),
1812
- lname_expr=pl.concat_str(
1813
- [
1814
- pl.col(f"{cols['lname']}_clean"),
1815
- pl.col(f"{cols['suffix']}_clean"),
1816
- ],
1817
- separator=" ",
1818
- ),
1827
+ lname_expr=pl.col(f"{cols['lname']}_clean"),
1819
1828
  dob_expr=pl.col(f"{cols['dob']}_clean"),
1820
- label="suffix",
1829
+ label="dob_imp",
1821
1830
  )
1822
1831
 
1823
1832
  matched_frames.append(matched)
1824
1833
 
1825
- #
1826
- # STAGE 4
1827
- # DOB IMP
1828
- #
1829
-
1830
- matched, unmatched = _run_match_stage(
1831
- unmatched,
1832
- lookup=lookups["dob_imp"],
1833
- fname_expr=pl.col(f"{cols['fname']}_clean"),
1834
- lname_expr=pl.col(f"{cols['lname']}_clean"),
1835
- dob_expr=pl.col(f"{cols['dob']}_clean"),
1836
- label="dob_imp",
1837
- )
1834
+ #
1835
+ # FINAL
1836
+ #
1838
1837
 
1839
- matched_frames.append(matched)
1840
-
1841
- #
1842
- # FINAL
1843
- #
1844
-
1845
- result = pl.concat(
1846
- matched_frames + [unmatched],
1847
- how="diagonal_relaxed",
1848
- )
1838
+ result = pl.concat(
1839
+ matched_frames + [unmatched],
1840
+ how="diagonal_relaxed",
1841
+ )
1849
1842
 
1850
- print(
1851
- f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
1852
- )
1843
+ print(
1844
+ f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
1845
+ )
1853
1846
 
1854
- return result
1855
- ```
1847
+ return result
1856
1848
 
1857
1849
  #
1858
1850
 
File without changes