ltc-code 0.1.4__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ltc-code
3
- Version: 0.1.4
3
+ Version: 0.1.6
4
4
  Summary: Add your description here
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ltc-code"
3
- version = "0.1.4"
3
+ version = "0.1.6"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -1428,431 +1428,492 @@ def lookup_sid_cepr(
1428
1428
 
1429
1429
 
1430
1430
  def _build_lookup(
1431
- census: pl.DataFrame,
1432
- *,
1433
- fname_expr: pl.Expr,
1434
- lname_expr: pl.Expr,
1435
- dob_col: str,
1436
- label: str,
1431
+ census: pl.DataFrame,
1432
+ *,
1433
+ fname_expr: pl.Expr,
1434
+ lname_expr: pl.Expr,
1435
+ dob_col: str,
1436
+ label: str,
1437
1437
  ) -> pl.DataFrame:
1438
- """
1439
- Build a deterministic SID lookup table.
1440
-
1441
- ```
1442
- Output schema:
1443
- _fname_key
1444
- _lname_key
1445
- _dob_key
1446
- sid_cepr
1447
-
1448
- Ambiguous keys are removed.
1449
- """
1450
-
1451
- lookup = (
1452
- census
1453
- .select(
1454
- [
1455
- fname_expr.alias("_fname_key"),
1456
- lname_expr.alias("_lname_key"),
1457
- pl.col(dob_col).alias("_dob_key"),
1458
- pl.col("sid_cepr"),
1459
- ]
1460
- )
1461
- .drop_nulls(
1462
- [
1463
- "_fname_key",
1464
- "_lname_key",
1465
- "_dob_key",
1466
- "sid_cepr",
1467
- ]
1468
- )
1469
- .group_by(
1470
- [
1471
- "_fname_key",
1472
- "_lname_key",
1473
- "_dob_key",
1474
- ]
1475
- )
1476
- .agg(
1477
- pl.col("sid_cepr").unique().alias("_sids")
1478
- )
1479
- .with_columns(
1480
- pl.col("_sids").list.len().alias("_sid_count")
1481
- )
1482
- .filter(
1483
- pl.col("_sid_count") == 1
1484
- )
1485
- .select(
1486
- [
1487
- "_fname_key",
1488
- "_lname_key",
1489
- "_dob_key",
1490
- pl.col("_sids").list.first().alias("sid_cepr"),
1491
- ]
1492
- )
1493
- )
1494
-
1495
- print(f"built lookup: {label}")
1496
-
1497
- return lookup
1498
- ```
1499
-
1500
- def build_census_lookups(
1501
- *,
1502
- cmo_name: str,
1503
- ) -> dict[str, pl.DataFrame]:
1504
-
1505
- ```
1506
- try:
1507
- import mappings
1508
- except ImportError:
1509
- import mapppings as mappings
1510
-
1511
- annual_frames = []
1438
+ """
1439
+ Build a deterministic SID lookup table.
1512
1440
 
1513
- for year in range(1994, 2023):
1441
+ Output schema:
1442
+ _fname_key
1443
+ _lname_key
1444
+ _dob_key
1445
+ sid_cepr
1514
1446
 
1515
- path = CENSUS_STUDENTS / f"census_student_{year}.csv"
1447
+ Ambiguous keys are removed.
1448
+ """
1516
1449
 
1517
- annual = (
1518
- pl.scan_csv(
1519
- path,
1520
- infer_schema=False,
1521
- null_values=[],
1522
- try_parse_dates=False,
1523
- ignore_errors=False,
1524
- )
1450
+ lookup = (
1451
+ census
1525
1452
  .select(
1526
1453
  [
1527
- "cmo_code",
1454
+ fname_expr.alias("_fname_key"),
1455
+ lname_expr.alias("_lname_key"),
1456
+ pl.col(dob_col).alias("_dob_key"),
1457
+ pl.col("sid_cepr"),
1458
+ ]
1459
+ )
1460
+ .drop_nulls(
1461
+ [
1462
+ "_fname_key",
1463
+ "_lname_key",
1464
+ "_dob_key",
1528
1465
  "sid_cepr",
1529
- "fname_clean",
1530
- "lname_clean",
1531
- "mname_clean",
1532
- "suff_clean",
1533
- "birthdate_clean",
1534
- "birthdate_imp",
1535
1466
  ]
1536
1467
  )
1537
- .rename(
1538
- {
1539
- "fname_clean": "fname",
1540
- "lname_clean": "lname",
1541
- "mname_clean": "mname",
1542
- "suff_clean": "suffix",
1543
- "birthdate_clean": "dob",
1544
- "birthdate_imp": "dob_imp",
1545
- }
1468
+ .group_by(
1469
+ [
1470
+ "_fname_key",
1471
+ "_lname_key",
1472
+ "_dob_key",
1473
+ ]
1474
+ )
1475
+ .agg(
1476
+ pl.col("sid_cepr").unique().alias("_sids")
1546
1477
  )
1547
1478
  .with_columns(
1548
- pl.col("cmo_code")
1549
- .replace(mappings.CMO_CODE_TO_NAME)
1550
- .alias("cmo_name")
1479
+ pl.col("_sids").list.len().alias("_sid_count")
1551
1480
  )
1552
1481
  .filter(
1553
- pl.col("cmo_name") == cmo_name
1482
+ pl.col("_sid_count") == 1
1554
1483
  )
1555
- .with_columns(
1556
- *clean_name("fname"),
1557
- *clean_name("lname"),
1558
- *clean_other_name("mname"),
1559
- *clean_other_name("suffix"),
1560
- *clean_dob(col="dob"),
1561
- *clean_dob(col="dob_imp"),
1562
- )
1563
- .drop(
1484
+ .select(
1564
1485
  [
1565
- "dob",
1566
- "dob_imp",
1486
+ "_fname_key",
1487
+ "_lname_key",
1488
+ "_dob_key",
1489
+ pl.col("_sids").list.first().alias("sid_cepr"),
1567
1490
  ]
1568
1491
  )
1569
- .rename(
1570
- {
1571
- "dob_clean": "dob",
1572
- "dob_imp_clean": "dob_imp",
1573
- }
1574
- )
1575
1492
  )
1576
1493
 
1577
- annual_frames.append(annual)
1494
+ print(f"built lookup: {label}")
1578
1495
 
1579
- #
1580
- # MATERIALIZE ONCE
1581
- #
1496
+ return lookup
1497
+
1498
+ def build_census_lookups(
1499
+ *,
1500
+ cmo_name: str,
1501
+ ) -> dict[str, pl.DataFrame]:
1502
+
1503
+ try:
1504
+ import mappings
1505
+ except ImportError:
1506
+ import mapppings as mappings
1507
+
1508
+ annual_frames = []
1509
+
1510
+ for year in range(1994, 2023):
1511
+
1512
+ path = CENSUS_STUDENTS / f"census_student_{year}.csv"
1513
+
1514
+ annual = (
1515
+ pl.scan_csv(
1516
+ path,
1517
+ infer_schema=False,
1518
+ null_values=[],
1519
+ try_parse_dates=False,
1520
+ ignore_errors=False,
1521
+ )
1522
+ .select(
1523
+ [
1524
+ "cmo_code",
1525
+ "sid_cepr",
1526
+ "fname_clean",
1527
+ "lname_clean",
1528
+ "mname_clean",
1529
+ "suff_clean",
1530
+ "birthdate_clean",
1531
+ "birthdate_imp",
1532
+ ]
1533
+ )
1534
+ .rename(
1535
+ {
1536
+ "fname_clean": "fname",
1537
+ "lname_clean": "lname",
1538
+ "mname_clean": "mname",
1539
+ "suff_clean": "suffix",
1540
+ "birthdate_clean": "dob",
1541
+ "birthdate_imp": "dob_imp",
1542
+ }
1543
+ )
1544
+ .with_columns(
1545
+ pl.col("cmo_code")
1546
+ .replace(mappings.CMO_CODE_TO_NAME)
1547
+ .alias("cmo_name")
1548
+ )
1549
+ .filter(
1550
+ pl.col("cmo_name") == cmo_name
1551
+ )
1552
+ .with_columns(
1553
+ *clean_name("fname"),
1554
+ *clean_name("lname"),
1555
+ *clean_other_name("mname"),
1556
+ *clean_other_name("suffix"),
1557
+ *clean_dob(col="dob"),
1558
+ *clean_dob(col="dob_imp"),
1559
+ )
1560
+ .drop(
1561
+ [
1562
+ "dob",
1563
+ "dob_imp",
1564
+ ]
1565
+ )
1566
+ .rename(
1567
+ {
1568
+ "dob_clean": "dob",
1569
+ "dob_imp_clean": "dob_imp",
1570
+ }
1571
+ )
1572
+ )
1573
+
1574
+ annual_frames.append(annual)
1582
1575
 
1583
- census = (
1584
- pl.concat(
1585
- annual_frames,
1586
- how="vertical_relaxed",
1576
+ #
1577
+ # MATERIALIZE ONCE
1578
+ #
1579
+
1580
+ census = (
1581
+ pl.concat(
1582
+ annual_frames,
1583
+ how="vertical_relaxed",
1584
+ )
1585
+ .collect()
1587
1586
  )
1588
- .collect()
1589
- )
1590
1587
 
1591
- print(f"census rows: {len(census):,}")
1588
+ print(f"census rows: {len(census):,}")
1592
1589
 
1593
- #
1594
- # BUILD LOOKUPS ONCE
1595
- #
1590
+ #
1591
+ # BUILD LOOKUPS ONCE
1592
+ #
1596
1593
 
1597
- lookup_exact = _build_lookup(
1598
- census,
1599
- fname_expr=pl.col("fname"),
1600
- lname_expr=pl.col("lname"),
1601
- dob_col="dob",
1602
- label="exact",
1603
- )
1594
+ lookup_exact = _build_lookup(
1595
+ census,
1596
+ fname_expr=pl.col("fname"),
1597
+ lname_expr=pl.col("lname"),
1598
+ dob_col="dob",
1599
+ label="exact",
1600
+ )
1604
1601
 
1605
- lookup_mname = _build_lookup(
1606
- census,
1607
- fname_expr=pl.concat_str(
1608
- [
1609
- pl.col("fname"),
1610
- pl.col("mname"),
1611
- ],
1612
- separator=" ",
1613
- ),
1614
- lname_expr=pl.col("lname"),
1615
- dob_col="dob",
1616
- label="mname",
1617
- )
1602
+ lookup_mname = _build_lookup(
1603
+ census,
1604
+ fname_expr=pl.concat_str(
1605
+ [
1606
+ pl.col("fname"),
1607
+ pl.col("mname"),
1608
+ ],
1609
+ separator=" ",
1610
+ ),
1611
+ lname_expr=pl.col("lname"),
1612
+ dob_col="dob",
1613
+ label="mname",
1614
+ )
1618
1615
 
1619
- lookup_suffix = _build_lookup(
1620
- census,
1621
- fname_expr=pl.col("fname"),
1622
- lname_expr=pl.concat_str(
1623
- [
1624
- pl.col("lname"),
1625
- pl.col("suffix"),
1626
- ],
1627
- separator=" ",
1628
- ),
1629
- dob_col="dob",
1630
- label="suffix",
1631
- )
1616
+ lookup_suffix = _build_lookup(
1617
+ census,
1618
+ fname_expr=pl.col("fname"),
1619
+ lname_expr=pl.concat_str(
1620
+ [
1621
+ pl.col("lname"),
1622
+ pl.col("suffix"),
1623
+ ],
1624
+ separator=" ",
1625
+ ),
1626
+ dob_col="dob",
1627
+ label="suffix",
1628
+ )
1632
1629
 
1633
- lookup_dob_imp = _build_lookup(
1634
- census,
1635
- fname_expr=pl.col("fname"),
1636
- lname_expr=pl.col("lname"),
1637
- dob_col="dob_imp",
1638
- label="dob_imp",
1639
- )
1630
+ lookup_dob_imp = _build_lookup(
1631
+ census,
1632
+ fname_expr=pl.col("fname"),
1633
+ lname_expr=pl.col("lname"),
1634
+ dob_col="dob_imp",
1635
+ label="dob_imp",
1636
+ )
1640
1637
 
1641
- return {
1642
- "exact": lookup_exact,
1643
- "mname": lookup_mname,
1644
- "suffix": lookup_suffix,
1645
- "dob_imp": lookup_dob_imp,
1646
- }
1647
- ```
1638
+ return {
1639
+ "exact": lookup_exact,
1640
+ "mname": lookup_mname,
1641
+ "suffix": lookup_suffix,
1642
+ "dob_imp": lookup_dob_imp,
1643
+ }
1648
1644
 
1649
1645
  def _run_match_stage(
1650
- unmatched: pl.DataFrame,
1651
- *,
1652
- lookup: pl.DataFrame,
1653
- fname_expr: pl.Expr,
1654
- lname_expr: pl.Expr,
1655
- dob_expr: pl.Expr,
1656
- label: str,
1646
+ unmatched: pl.DataFrame,
1647
+ *,
1648
+ lookup: pl.DataFrame,
1649
+ fname_expr: pl.Expr,
1650
+ lname_expr: pl.Expr,
1651
+ dob_expr: pl.Expr,
1652
+ label: str,
1657
1653
  ) -> tuple[pl.DataFrame, pl.DataFrame]:
1658
1654
 
1659
- ```
1660
- before = len(unmatched)
1655
+ before = len(unmatched)
1661
1656
 
1662
- stage = (
1663
- unmatched
1664
- .with_columns(
1665
- [
1666
- fname_expr.alias("_fname_key"),
1667
- lname_expr.alias("_lname_key"),
1668
- dob_expr.alias("_dob_key"),
1669
- ]
1670
- )
1671
- .join(
1672
- lookup,
1673
- on=[
1674
- "_fname_key",
1675
- "_lname_key",
1676
- "_dob_key",
1677
- ],
1678
- how="left",
1679
- validate="m:1",
1680
- )
1681
- .drop(
1682
- [
1683
- "_fname_key",
1684
- "_lname_key",
1685
- "_dob_key",
1686
- ]
1657
+ stage = (
1658
+ unmatched
1659
+ .with_columns(
1660
+ [
1661
+ fname_expr.alias("_fname_key"),
1662
+ lname_expr.alias("_lname_key"),
1663
+ dob_expr.alias("_dob_key"),
1664
+ ]
1665
+ )
1666
+ .join(
1667
+ lookup,
1668
+ on=[
1669
+ "_fname_key",
1670
+ "_lname_key",
1671
+ "_dob_key",
1672
+ ],
1673
+ how="left",
1674
+ validate="m:1",
1675
+ )
1676
+ .drop(
1677
+ [
1678
+ "_fname_key",
1679
+ "_lname_key",
1680
+ "_dob_key",
1681
+ ]
1682
+ )
1687
1683
  )
1688
- )
1689
1684
 
1690
- matched = (
1691
- stage
1692
- .filter(
1693
- pl.col("sid_cepr").is_not_null()
1685
+ matched = (
1686
+ stage
1687
+ .filter(
1688
+ pl.col("sid_cepr").is_not_null()
1689
+ )
1694
1690
  )
1695
- )
1696
1691
 
1697
- unmatched = (
1698
- stage
1699
- .filter(
1700
- pl.col("sid_cepr").is_null()
1692
+ unmatched = (
1693
+ stage
1694
+ .filter(
1695
+ pl.col("sid_cepr").is_null()
1696
+ )
1697
+ .drop("sid_cepr")
1701
1698
  )
1702
- .drop("sid_cepr")
1703
- )
1704
1699
 
1705
- added = len(matched)
1700
+ added = len(matched)
1706
1701
 
1707
- print(
1708
- f"{label}: matched {added:,}/{before:,}"
1709
- )
1702
+ print(
1703
+ f"{label}: matched {added:,}/{before:,}"
1704
+ )
1710
1705
 
1711
- return matched, unmatched
1712
- ```
1706
+ return matched, unmatched
1713
1707
 
1714
1708
  def lookup_sid_cepr(
1715
- frame: Frame,
1716
- *,
1717
- cols: Mapping[str, str],
1718
- lookups: dict[str, pl.DataFrame],
1709
+ frame: Frame,
1710
+ *,
1711
+ cols: Mapping[str, str],
1712
+ lookups: dict[str, pl.DataFrame],
1719
1713
  ) -> Frame:
1720
1714
 
1721
- ```
1722
- is_lazy = isinstance(frame, pl.LazyFrame)
1723
-
1724
- current = (
1725
- frame.collect()
1726
- if is_lazy
1727
- else frame
1728
- )
1729
-
1730
- #
1731
- # CLEAN LEFT SIDE
1732
- #
1715
+ is_lazy = isinstance(frame, pl.LazyFrame)
1716
+ current = frame.collect() if is_lazy else frame
1733
1717
 
1734
- current = (
1735
- current
1736
- .with_columns(
1718
+ current = current.with_columns(
1737
1719
  *clean_name(cols["fname"]),
1738
1720
  *clean_name(cols["lname"]),
1739
1721
  *clean_dob(col=cols["dob"]),
1740
1722
  )
1741
- )
1742
1723
 
1743
- matched_frames = []
1744
-
1745
- unmatched = current
1746
-
1747
- #
1748
- # STAGE 1
1749
- # EXACT
1750
- #
1751
-
1752
- matched, unmatched = _run_match_stage(
1753
- unmatched,
1754
- lookup=lookups["exact"],
1755
- fname_expr=pl.col(f"{cols['fname']}_clean"),
1756
- lname_expr=pl.col(f"{cols['lname']}_clean"),
1757
- dob_expr=pl.col(f"{cols['dob']}_clean"),
1758
- label="exact",
1759
- )
1760
-
1761
- matched_frames.append(matched)
1762
-
1763
- #
1764
- # STAGE 2
1765
- # MNAME
1766
- #
1767
-
1768
- if "mname" in cols:
1769
-
1770
- unmatched = (
1771
- unmatched
1772
- .with_columns(
1773
- *clean_other_name(cols["mname"])
1774
- )
1724
+ current = current.with_columns(
1725
+ _parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
1775
1726
  )
1776
1727
 
1728
+ matched_frames = []
1729
+ unmatched = current
1730
+
1731
+ # exact
1777
1732
  matched, unmatched = _run_match_stage(
1778
1733
  unmatched,
1779
- lookup=lookups["mname"],
1780
- fname_expr=pl.concat_str(
1781
- [
1782
- pl.col(f"{cols['fname']}_clean"),
1783
- pl.col(f"{cols['mname']}_clean"),
1784
- ],
1785
- separator=" ",
1786
- ),
1734
+ lookup=lookups["exact"],
1735
+ fname_expr=pl.col(f"{cols['fname']}_clean"),
1787
1736
  lname_expr=pl.col(f"{cols['lname']}_clean"),
1788
1737
  dob_expr=pl.col(f"{cols['dob']}_clean"),
1789
- label="mname",
1738
+ label="exact",
1790
1739
  )
1791
-
1792
1740
  matched_frames.append(matched)
1793
1741
 
1794
- #
1795
- # STAGE 3
1796
- # SUFFIX
1797
- #
1798
-
1799
- if "suffix" in cols:
1800
-
1801
- unmatched = (
1802
- unmatched
1803
- .with_columns(
1804
- *clean_other_name(cols["suffix"])
1742
+ # middle-name variants
1743
+ if "mname" in cols:
1744
+ unmatched = unmatched.with_columns(
1745
+ *clean_other_name(cols["mname"])
1805
1746
  )
1806
- )
1807
1747
 
1808
- matched, unmatched = _run_match_stage(
1809
- unmatched,
1810
- lookup=lookups["suffix"],
1811
- fname_expr=pl.col(f"{cols['fname']}_clean"),
1812
- lname_expr=pl.concat_str(
1813
- [
1748
+ mname_stages = [
1749
+ (
1750
+ "left exact -> right fname + mname",
1751
+ lookups["mname"],
1752
+ pl.col(f"{cols['fname']}_clean"),
1814
1753
  pl.col(f"{cols['lname']}_clean"),
1815
- pl.col(f"{cols['suffix']}_clean"),
1816
- ],
1817
- separator=" ",
1818
- ),
1819
- dob_expr=pl.col(f"{cols['dob']}_clean"),
1820
- label="suffix",
1821
- )
1754
+ ),
1755
+ (
1756
+ "left exact -> right mname + lname",
1757
+ lookups["mname_lname"],
1758
+ pl.col(f"{cols['fname']}_clean"),
1759
+ pl.col(f"{cols['lname']}_clean"),
1760
+ ),
1761
+ (
1762
+ "left exact -> right fname + mname no space",
1763
+ lookups["mname_nospace"],
1764
+ pl.col(f"{cols['fname']}_clean"),
1765
+ pl.col(f"{cols['lname']}_clean"),
1766
+ ),
1767
+ (
1768
+ "left fname + mname -> right exact",
1769
+ lookups["exact"],
1770
+ pl.concat_str(
1771
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
1772
+ separator=" ",
1773
+ ),
1774
+ pl.col(f"{cols['lname']}_clean"),
1775
+ ),
1776
+ (
1777
+ "left mname + lname -> right exact",
1778
+ lookups["exact"],
1779
+ pl.col(f"{cols['fname']}_clean"),
1780
+ pl.concat_str(
1781
+ [pl.col(f"{cols['mname']}_clean"), pl.col(f"{cols['lname']}_clean")],
1782
+ separator=" ",
1783
+ ),
1784
+ ),
1785
+ (
1786
+ "left fname + mname no space -> right exact",
1787
+ lookups["exact"],
1788
+ pl.concat_str(
1789
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
1790
+ separator="",
1791
+ ),
1792
+ pl.col(f"{cols['lname']}_clean"),
1793
+ ),
1794
+ ]
1822
1795
 
1823
- matched_frames.append(matched)
1796
+ for label, lookup, fname_expr, lname_expr in mname_stages:
1797
+ matched, unmatched = _run_match_stage(
1798
+ unmatched,
1799
+ lookup=lookup,
1800
+ fname_expr=fname_expr,
1801
+ lname_expr=lname_expr,
1802
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1803
+ label=label,
1804
+ )
1805
+ matched_frames.append(matched)
1824
1806
 
1825
- #
1826
- # STAGE 4
1827
- # DOB IMP
1828
- #
1807
+ # suffix variants
1808
+ if "suffix" in cols:
1809
+ unmatched = unmatched.with_columns(
1810
+ *clean_other_name(cols["suffix"])
1811
+ )
1829
1812
 
1830
- matched, unmatched = _run_match_stage(
1831
- unmatched,
1832
- lookup=lookups["dob_imp"],
1833
- fname_expr=pl.col(f"{cols['fname']}_clean"),
1834
- lname_expr=pl.col(f"{cols['lname']}_clean"),
1835
- dob_expr=pl.col(f"{cols['dob']}_clean"),
1836
- label="dob_imp",
1837
- )
1813
+ suffix_stages = [
1814
+ (
1815
+ "left exact -> right lname + suffix",
1816
+ lookups["suffix"],
1817
+ pl.col(f"{cols['fname']}_clean"),
1818
+ pl.col(f"{cols['lname']}_clean"),
1819
+ ),
1820
+ (
1821
+ "left exact -> right fname + suffix",
1822
+ lookups["suffix_fname"],
1823
+ pl.col(f"{cols['fname']}_clean"),
1824
+ pl.col(f"{cols['lname']}_clean"),
1825
+ ),
1826
+ (
1827
+ "left exact -> right fname + suffix no space",
1828
+ lookups["suffix_fname_nospace"],
1829
+ pl.col(f"{cols['fname']}_clean"),
1830
+ pl.col(f"{cols['lname']}_clean"),
1831
+ ),
1832
+ (
1833
+ "left exact -> right lname + suffix no space",
1834
+ lookups["suffix_lname_nospace"],
1835
+ pl.col(f"{cols['fname']}_clean"),
1836
+ pl.col(f"{cols['lname']}_clean"),
1837
+ ),
1838
+ (
1839
+ "left fname + suffix -> right exact",
1840
+ lookups["exact"],
1841
+ pl.concat_str(
1842
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1843
+ separator=" ",
1844
+ ),
1845
+ pl.col(f"{cols['lname']}_clean"),
1846
+ ),
1847
+ (
1848
+ "left fname + suffix no space -> right exact",
1849
+ lookups["exact"],
1850
+ pl.concat_str(
1851
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1852
+ separator="",
1853
+ ),
1854
+ pl.col(f"{cols['lname']}_clean"),
1855
+ ),
1856
+ (
1857
+ "left lname + suffix -> right exact",
1858
+ lookups["exact"],
1859
+ pl.col(f"{cols['fname']}_clean"),
1860
+ pl.concat_str(
1861
+ [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1862
+ separator=" ",
1863
+ ),
1864
+ ),
1865
+ (
1866
+ "left lname + suffix no space -> right exact",
1867
+ lookups["exact"],
1868
+ pl.col(f"{cols['fname']}_clean"),
1869
+ pl.concat_str(
1870
+ [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1871
+ separator="",
1872
+ ),
1873
+ ),
1874
+ ]
1838
1875
 
1839
- matched_frames.append(matched)
1876
+ for label, lookup, fname_expr, lname_expr in suffix_stages:
1877
+ matched, unmatched = _run_match_stage(
1878
+ unmatched,
1879
+ lookup=lookup,
1880
+ fname_expr=fname_expr,
1881
+ lname_expr=lname_expr,
1882
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1883
+ label=label,
1884
+ )
1885
+ matched_frames.append(matched)
1886
+
1887
+ # dob_imp variants
1888
+ dob_imp_stages = [
1889
+ "dob_imp",
1890
+ "dob_imp_minus_1",
1891
+ "dob_imp_plus_1",
1892
+ "dob_imp_minus_2",
1893
+ "dob_imp_plus_2",
1894
+ ]
1840
1895
 
1841
- #
1842
- # FINAL
1843
- #
1896
+ for key in dob_imp_stages:
1897
+ matched, unmatched = _run_match_stage(
1898
+ unmatched,
1899
+ lookup=lookups[key],
1900
+ fname_expr=pl.col(f"{cols['fname']}_clean"),
1901
+ lname_expr=pl.col(f"{cols['lname']}_clean"),
1902
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1903
+ label=key,
1904
+ )
1905
+ matched_frames.append(matched)
1844
1906
 
1845
- result = pl.concat(
1846
- matched_frames + [unmatched],
1847
- how="diagonal_relaxed",
1848
- )
1907
+ result = pl.concat(
1908
+ matched_frames + [unmatched],
1909
+ how="diagonal_relaxed",
1910
+ )
1849
1911
 
1850
- print(
1851
- f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
1852
- )
1912
+ print(
1913
+ f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
1914
+ )
1853
1915
 
1854
- return result
1855
- ```
1916
+ return result
1856
1917
 
1857
1918
  #
1858
1919
 
File without changes