ltc-code 0.1.6__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ltc-code
3
- Version: 0.1.6
3
+ Version: 0.1.8
4
4
  Summary: Add your description here
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ltc-code"
3
- version = "0.1.6"
3
+ version = "0.1.8"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -1424,7 +1424,22 @@ def lookup_sid_cepr(
1424
1424
  ####################################################################################
1425
1425
 
1426
1426
 
1427
+ def _parse_dob_expr(col: str) -> pl.Expr:
1428
+ return pl.coalesce(
1429
+ [
1430
+ pl.col(col).cast(pl.String).str.strptime(pl.Date, "%m/%d/%Y", strict=False),
1431
+ pl.col(col).cast(pl.String).str.strptime(pl.Date, "%m/%d/%y", strict=False),
1432
+ pl.col(col).cast(pl.String).str.strptime(pl.Date, "%Y-%m-%d", strict=False),
1433
+ ]
1434
+ )
1435
+
1436
+
1437
+ def _first_word_expr(col: str) -> pl.Expr:
1438
+ return pl.col(col).cast(pl.String).str.split(" ").list.first()
1439
+
1427
1440
 
1441
+ def _second_word_expr(col: str) -> pl.Expr:
1442
+ return pl.col(col).cast(pl.String).str.split(" ").list.get(1, null_on_oob=True)
1428
1443
 
1429
1444
 
1430
1445
  def _build_lookup(
@@ -1435,80 +1450,33 @@ def _build_lookup(
1435
1450
  dob_col: str,
1436
1451
  label: str,
1437
1452
  ) -> pl.DataFrame:
1438
- """
1439
- Build a deterministic SID lookup table.
1440
-
1441
- Output schema:
1442
- _fname_key
1443
- _lname_key
1444
- _dob_key
1445
- sid_cepr
1446
-
1447
- Ambiguous keys are removed.
1448
- """
1449
-
1450
1453
  lookup = (
1451
1454
  census
1452
1455
  .select(
1453
- [
1454
- fname_expr.alias("_fname_key"),
1455
- lname_expr.alias("_lname_key"),
1456
- pl.col(dob_col).alias("_dob_key"),
1457
- pl.col("sid_cepr"),
1458
- ]
1459
- )
1460
- .drop_nulls(
1461
- [
1462
- "_fname_key",
1463
- "_lname_key",
1464
- "_dob_key",
1465
- "sid_cepr",
1466
- ]
1467
- )
1468
- .group_by(
1469
- [
1470
- "_fname_key",
1471
- "_lname_key",
1472
- "_dob_key",
1473
- ]
1474
- )
1475
- .agg(
1476
- pl.col("sid_cepr").unique().alias("_sids")
1477
- )
1478
- .with_columns(
1479
- pl.col("_sids").list.len().alias("_sid_count")
1480
- )
1481
- .filter(
1482
- pl.col("_sid_count") == 1
1456
+ fname_expr.alias("_fname_key"),
1457
+ lname_expr.alias("_lname_key"),
1458
+ pl.col(dob_col).alias("_dob_key"),
1459
+ pl.col("sid_cepr"),
1483
1460
  )
1461
+ .drop_nulls(["_fname_key", "_lname_key", "_dob_key", "sid_cepr"])
1462
+ .group_by(["_fname_key", "_lname_key", "_dob_key"])
1463
+ .agg(pl.col("sid_cepr").unique().alias("_sids"))
1484
1464
  .select(
1485
- [
1486
- "_fname_key",
1487
- "_lname_key",
1488
- "_dob_key",
1489
- pl.col("_sids").list.first().alias("sid_cepr"),
1490
- ]
1465
+ "_fname_key",
1466
+ "_lname_key",
1467
+ "_dob_key",
1468
+ pl.col("_sids").list.sort().list.first().alias("sid_cepr"),
1491
1469
  )
1492
1470
  )
1493
1471
 
1494
- print(f"built lookup: {label}")
1495
-
1472
+ print(f"built lookup: {label} ({len(lookup):,} usable keys)")
1496
1473
  return lookup
1497
1474
 
1498
- def build_census_lookups(
1499
- *,
1500
- cmo_name: str,
1501
- ) -> dict[str, pl.DataFrame]:
1502
-
1503
- try:
1504
- import mappings
1505
- except ImportError:
1506
- import mapppings as mappings
1507
1475
 
1476
+ def build_census_lookups(*, cmo_name: str) -> dict[str, pl.DataFrame]:
1508
1477
  annual_frames = []
1509
1478
 
1510
1479
  for year in range(1994, 2023):
1511
-
1512
1480
  path = CENSUS_STUDENTS / f"census_student_{year}.csv"
1513
1481
 
1514
1482
  annual = (
@@ -1520,16 +1488,14 @@ def build_census_lookups(
1520
1488
  ignore_errors=False,
1521
1489
  )
1522
1490
  .select(
1523
- [
1524
- "cmo_code",
1525
- "sid_cepr",
1526
- "fname_clean",
1527
- "lname_clean",
1528
- "mname_clean",
1529
- "suff_clean",
1530
- "birthdate_clean",
1531
- "birthdate_imp",
1532
- ]
1491
+ "cmo_code",
1492
+ "sid_cepr",
1493
+ "fname_clean",
1494
+ "lname_clean",
1495
+ "mname_clean",
1496
+ "suff_clean",
1497
+ "birthdate_clean",
1498
+ "birthdate_imp",
1533
1499
  )
1534
1500
  .rename(
1535
1501
  {
@@ -1542,92 +1508,129 @@ def build_census_lookups(
1542
1508
  }
1543
1509
  )
1544
1510
  .with_columns(
1545
- pl.col("cmo_code")
1546
- .replace(mappings.CMO_CODE_TO_NAME)
1547
- .alias("cmo_name")
1548
- )
1549
- .filter(
1550
- pl.col("cmo_name") == cmo_name
1511
+ pl.col("cmo_code").replace(cmo_map).alias("cmo_name")
1551
1512
  )
1513
+ .filter(pl.col("cmo_name") == cmo_name)
1552
1514
  .with_columns(
1553
1515
  *clean_name("fname"),
1554
1516
  *clean_name("lname"),
1555
1517
  *clean_other_name("mname"),
1556
1518
  *clean_other_name("suffix"),
1557
- *clean_dob(col="dob"),
1558
- *clean_dob(col="dob_imp"),
1559
1519
  )
1560
- .drop(
1561
- [
1562
- "dob",
1563
- "dob_imp",
1564
- ]
1520
+ .with_columns(*clean_dob(col="dob"))
1521
+ .with_columns(*clean_dob(col="dob_imp"))
1522
+ .with_columns(
1523
+ _parse_dob_expr("dob_clean").alias("dob"),
1524
+ _parse_dob_expr("dob_imp_clean").alias("dob_imp"),
1565
1525
  )
1566
- .rename(
1567
- {
1568
- "dob_clean": "dob",
1569
- "dob_imp_clean": "dob_imp",
1570
- }
1526
+ .select(
1527
+ "sid_cepr",
1528
+ pl.col("fname_clean").alias("fname"),
1529
+ pl.col("lname_clean").alias("lname"),
1530
+ pl.col("mname_clean").alias("mname"),
1531
+ pl.col("suffix_clean").alias("suffix"),
1532
+ "dob",
1533
+ "dob_imp",
1571
1534
  )
1572
1535
  )
1573
1536
 
1574
1537
  annual_frames.append(annual)
1575
1538
 
1576
- #
1577
- # MATERIALIZE ONCE
1578
- #
1539
+ census = pl.concat(annual_frames, how="vertical_relaxed").collect()
1579
1540
 
1580
- census = (
1581
- pl.concat(
1582
- annual_frames,
1583
- how="vertical_relaxed",
1584
- )
1585
- .collect()
1541
+ print(f"census rows after CMO filter: {len(census):,}")
1542
+
1543
+ lookups = {}
1544
+
1545
+ lookups["exact"] = _build_lookup(
1546
+ census,
1547
+ fname_expr=pl.col("fname"),
1548
+ lname_expr=pl.col("lname"),
1549
+ dob_col="dob",
1550
+ label="exact",
1586
1551
  )
1587
1552
 
1588
- print(f"census rows: {len(census):,}")
1553
+ lookups["mname"] = _build_lookup(
1554
+ census,
1555
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=" "),
1556
+ lname_expr=pl.col("lname"),
1557
+ dob_col="dob",
1558
+ label="right fname + mname",
1559
+ )
1589
1560
 
1590
- #
1591
- # BUILD LOOKUPS ONCE
1592
- #
1561
+ lookups["mname_lname"] = _build_lookup(
1562
+ census,
1563
+ fname_expr=pl.col("fname"),
1564
+ lname_expr=pl.concat_str([pl.col("mname"), pl.col("lname")], separator=" "),
1565
+ dob_col="dob",
1566
+ label="right mname + lname",
1567
+ )
1593
1568
 
1594
- lookup_exact = _build_lookup(
1569
+ lookups["mname_nospace"] = _build_lookup(
1570
+ census,
1571
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=""),
1572
+ lname_expr=pl.col("lname"),
1573
+ dob_col="dob",
1574
+ label="right fname + mname no space",
1575
+ )
1576
+
1577
+ lookups["suffix"] = _build_lookup(
1595
1578
  census,
1596
1579
  fname_expr=pl.col("fname"),
1580
+ lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=" "),
1581
+ dob_col="dob",
1582
+ label="right lname + suffix",
1583
+ )
1584
+
1585
+ lookups["suffix_fname"] = _build_lookup(
1586
+ census,
1587
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=" "),
1597
1588
  lname_expr=pl.col("lname"),
1598
1589
  dob_col="dob",
1599
- label="exact",
1590
+ label="right fname + suffix",
1600
1591
  )
1601
1592
 
1602
- lookup_mname = _build_lookup(
1593
+ lookups["suffix_fname_nospace"] = _build_lookup(
1603
1594
  census,
1604
- fname_expr=pl.concat_str(
1605
- [
1606
- pl.col("fname"),
1607
- pl.col("mname"),
1608
- ],
1609
- separator=" ",
1610
- ),
1595
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=""),
1611
1596
  lname_expr=pl.col("lname"),
1612
1597
  dob_col="dob",
1613
- label="mname",
1598
+ label="right fname + suffix no space",
1614
1599
  )
1615
1600
 
1616
- lookup_suffix = _build_lookup(
1601
+ lookups["suffix_lname_nospace"] = _build_lookup(
1617
1602
  census,
1618
1603
  fname_expr=pl.col("fname"),
1619
- lname_expr=pl.concat_str(
1620
- [
1621
- pl.col("lname"),
1622
- pl.col("suffix"),
1623
- ],
1624
- separator=" ",
1625
- ),
1604
+ lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=""),
1605
+ dob_col="dob",
1606
+ label="right lname + suffix no space",
1607
+ )
1608
+
1609
+ lookups["fname_first_word"] = _build_lookup(
1610
+ census,
1611
+ fname_expr=_first_word_expr("fname"),
1612
+ lname_expr=pl.col("lname"),
1613
+ dob_col="dob",
1614
+ label="right fname first word",
1615
+ )
1616
+
1617
+ lookups["lname_first_word"] = _build_lookup(
1618
+ census,
1619
+ fname_expr=pl.col("fname"),
1620
+ lname_expr=_first_word_expr("lname"),
1626
1621
  dob_col="dob",
1627
- label="suffix",
1622
+ label="right lname first word",
1628
1623
  )
1629
1624
 
1630
- lookup_dob_imp = _build_lookup(
1625
+ lookups["lname_second_word"] = _build_lookup(
1626
+ census,
1627
+ fname_expr=pl.col("fname"),
1628
+ lname_expr=_second_word_expr("lname"),
1629
+ dob_col="dob",
1630
+ label="right lname second word",
1631
+ )
1632
+
1633
+ lookups["dob_imp"] = _build_lookup(
1631
1634
  census,
1632
1635
  fname_expr=pl.col("fname"),
1633
1636
  lname_expr=pl.col("lname"),
@@ -1635,12 +1638,22 @@ def build_census_lookups(
1635
1638
  label="dob_imp",
1636
1639
  )
1637
1640
 
1638
- return {
1639
- "exact": lookup_exact,
1640
- "mname": lookup_mname,
1641
- "suffix": lookup_suffix,
1642
- "dob_imp": lookup_dob_imp,
1643
- }
1641
+ for offset, key in [
1642
+ ("-1y", "dob_imp_minus_1"),
1643
+ ("1y", "dob_imp_plus_1"),
1644
+ ("-2y", "dob_imp_minus_2"),
1645
+ ("2y", "dob_imp_plus_2"),
1646
+ ]:
1647
+ lookups[key] = _build_lookup(
1648
+ census.with_columns(pl.col("dob_imp").dt.offset_by(offset).alias(key)),
1649
+ fname_expr=pl.col("fname"),
1650
+ lname_expr=pl.col("lname"),
1651
+ dob_col=key,
1652
+ label=key,
1653
+ )
1654
+
1655
+ return lookups
1656
+
1644
1657
 
1645
1658
  def _run_match_stage(
1646
1659
  unmatched: pl.DataFrame,
@@ -1651,84 +1664,60 @@ def _run_match_stage(
1651
1664
  dob_expr: pl.Expr,
1652
1665
  label: str,
1653
1666
  ) -> tuple[pl.DataFrame, pl.DataFrame]:
1654
-
1655
1667
  before = len(unmatched)
1656
1668
 
1657
1669
  stage = (
1658
1670
  unmatched
1659
1671
  .with_columns(
1660
- [
1661
- fname_expr.alias("_fname_key"),
1662
- lname_expr.alias("_lname_key"),
1663
- dob_expr.alias("_dob_key"),
1664
- ]
1672
+ fname_expr.alias("_fname_key"),
1673
+ lname_expr.alias("_lname_key"),
1674
+ dob_expr.alias("_dob_key"),
1665
1675
  )
1666
1676
  .join(
1667
1677
  lookup,
1668
- on=[
1669
- "_fname_key",
1670
- "_lname_key",
1671
- "_dob_key",
1672
- ],
1678
+ on=["_fname_key", "_lname_key", "_dob_key"],
1673
1679
  how="left",
1674
1680
  validate="m:1",
1675
1681
  )
1676
- .drop(
1677
- [
1678
- "_fname_key",
1679
- "_lname_key",
1680
- "_dob_key",
1681
- ]
1682
- )
1683
- )
1684
-
1685
- matched = (
1686
- stage
1687
- .filter(
1688
- pl.col("sid_cepr").is_not_null()
1689
- )
1690
- )
1691
-
1692
- unmatched = (
1693
- stage
1694
- .filter(
1695
- pl.col("sid_cepr").is_null()
1696
- )
1697
- .drop("sid_cepr")
1682
+ .drop(["_fname_key", "_lname_key", "_dob_key"])
1698
1683
  )
1699
1684
 
1700
- added = len(matched)
1701
-
1702
- print(
1703
- f"{label}: matched {added:,}/{before:,}"
1704
- )
1685
+ matched = stage.filter(pl.col("sid_cepr").is_not_null())
1686
+ unmatched = stage.filter(pl.col("sid_cepr").is_null()).drop("sid_cepr")
1705
1687
 
1688
+ print(f"{label}: matched {len(matched):,}/{before:,}")
1706
1689
  return matched, unmatched
1707
1690
 
1691
+
1708
1692
  def lookup_sid_cepr(
1709
- frame: Frame,
1693
+ frame,
1710
1694
  *,
1711
1695
  cols: Mapping[str, str],
1712
1696
  lookups: dict[str, pl.DataFrame],
1713
- ) -> Frame:
1714
-
1697
+ ):
1715
1698
  is_lazy = isinstance(frame, pl.LazyFrame)
1716
1699
  current = frame.collect() if is_lazy else frame
1717
1700
 
1701
+ input_columns = current.columns
1702
+
1703
+ current = current.with_row_index("_row_id")
1704
+
1718
1705
  current = current.with_columns(
1719
1706
  *clean_name(cols["fname"]),
1720
1707
  *clean_name(cols["lname"]),
1721
- *clean_dob(col=cols["dob"]),
1722
1708
  )
1723
1709
 
1724
- current = current.with_columns(
1725
- _parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
1710
+ current = (
1711
+ current
1712
+ .with_columns(*clean_dob(col=cols["dob"]))
1713
+ .with_columns(
1714
+ _parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
1715
+ )
1726
1716
  )
1727
1717
 
1728
1718
  matched_frames = []
1729
1719
  unmatched = current
1730
1720
 
1731
- # exact
1732
1721
  matched, unmatched = _run_match_stage(
1733
1722
  unmatched,
1734
1723
  lookup=lookups["exact"],
@@ -1739,34 +1728,61 @@ def lookup_sid_cepr(
1739
1728
  )
1740
1729
  matched_frames.append(matched)
1741
1730
 
1742
- # middle-name variants
1743
- if "mname" in cols:
1744
- unmatched = unmatched.with_columns(
1745
- *clean_other_name(cols["mname"])
1731
+ for label, lookup in [
1732
+ ("left exact -> right fname + mname", lookups["mname"]),
1733
+ ("left exact -> right mname + lname", lookups["mname_lname"]),
1734
+ ("left exact -> right fname + mname no space", lookups["mname_nospace"]),
1735
+ ("left exact -> right lname + suffix", lookups["suffix"]),
1736
+ ("left exact -> right fname + suffix", lookups["suffix_fname"]),
1737
+ ("left exact -> right fname + suffix no space", lookups["suffix_fname_nospace"]),
1738
+ ("left exact -> right lname + suffix no space", lookups["suffix_lname_nospace"]),
1739
+ ("left exact -> right fname first word", lookups["fname_first_word"]),
1740
+ ("left exact -> right lname first word", lookups["lname_first_word"]),
1741
+ ("left exact -> right lname second word", lookups["lname_second_word"]),
1742
+ ]:
1743
+ matched, unmatched = _run_match_stage(
1744
+ unmatched,
1745
+ lookup=lookup,
1746
+ fname_expr=pl.col(f"{cols['fname']}_clean"),
1747
+ lname_expr=pl.col(f"{cols['lname']}_clean"),
1748
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1749
+ label=label,
1746
1750
  )
1751
+ matched_frames.append(matched)
1747
1752
 
1748
- mname_stages = [
1749
- (
1750
- "left exact -> right fname + mname",
1751
- lookups["mname"],
1752
- pl.col(f"{cols['fname']}_clean"),
1753
- pl.col(f"{cols['lname']}_clean"),
1754
- ),
1755
- (
1756
- "left exact -> right mname + lname",
1757
- lookups["mname_lname"],
1758
- pl.col(f"{cols['fname']}_clean"),
1759
- pl.col(f"{cols['lname']}_clean"),
1760
- ),
1761
- (
1762
- "left exact -> right fname + mname no space",
1763
- lookups["mname_nospace"],
1764
- pl.col(f"{cols['fname']}_clean"),
1765
- pl.col(f"{cols['lname']}_clean"),
1766
- ),
1753
+ for label, fname_expr, lname_expr in [
1754
+ (
1755
+ "left fname first word -> right exact",
1756
+ _first_word_expr(f"{cols['fname']}_clean"),
1757
+ pl.col(f"{cols['lname']}_clean"),
1758
+ ),
1759
+ (
1760
+ "left lname first word -> right exact",
1761
+ pl.col(f"{cols['fname']}_clean"),
1762
+ _first_word_expr(f"{cols['lname']}_clean"),
1763
+ ),
1764
+ (
1765
+ "left lname second word -> right exact",
1766
+ pl.col(f"{cols['fname']}_clean"),
1767
+ _second_word_expr(f"{cols['lname']}_clean"),
1768
+ ),
1769
+ ]:
1770
+ matched, unmatched = _run_match_stage(
1771
+ unmatched,
1772
+ lookup=lookups["exact"],
1773
+ fname_expr=fname_expr,
1774
+ lname_expr=lname_expr,
1775
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1776
+ label=label,
1777
+ )
1778
+ matched_frames.append(matched)
1779
+
1780
+ if "mname" in cols:
1781
+ unmatched = unmatched.with_columns(*clean_other_name(cols["mname"]))
1782
+
1783
+ for label, fname_expr, lname_expr in [
1767
1784
  (
1768
1785
  "left fname + mname -> right exact",
1769
- lookups["exact"],
1770
1786
  pl.concat_str(
1771
1787
  [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
1772
1788
  separator=" ",
@@ -1775,7 +1791,6 @@ def lookup_sid_cepr(
1775
1791
  ),
1776
1792
  (
1777
1793
  "left mname + lname -> right exact",
1778
- lookups["exact"],
1779
1794
  pl.col(f"{cols['fname']}_clean"),
1780
1795
  pl.concat_str(
1781
1796
  [pl.col(f"{cols['mname']}_clean"), pl.col(f"{cols['lname']}_clean")],
@@ -1784,19 +1799,16 @@ def lookup_sid_cepr(
1784
1799
  ),
1785
1800
  (
1786
1801
  "left fname + mname no space -> right exact",
1787
- lookups["exact"],
1788
1802
  pl.concat_str(
1789
1803
  [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
1790
1804
  separator="",
1791
1805
  ),
1792
1806
  pl.col(f"{cols['lname']}_clean"),
1793
1807
  ),
1794
- ]
1795
-
1796
- for label, lookup, fname_expr, lname_expr in mname_stages:
1808
+ ]:
1797
1809
  matched, unmatched = _run_match_stage(
1798
1810
  unmatched,
1799
- lookup=lookup,
1811
+ lookup=lookups["exact"],
1800
1812
  fname_expr=fname_expr,
1801
1813
  lname_expr=lname_expr,
1802
1814
  dob_expr=pl.col(f"{cols['dob']}_clean"),
@@ -1804,40 +1816,12 @@ def lookup_sid_cepr(
1804
1816
  )
1805
1817
  matched_frames.append(matched)
1806
1818
 
1807
- # suffix variants
1808
1819
  if "suffix" in cols:
1809
- unmatched = unmatched.with_columns(
1810
- *clean_other_name(cols["suffix"])
1811
- )
1820
+ unmatched = unmatched.with_columns(*clean_other_name(cols["suffix"]))
1812
1821
 
1813
- suffix_stages = [
1814
- (
1815
- "left exact -> right lname + suffix",
1816
- lookups["suffix"],
1817
- pl.col(f"{cols['fname']}_clean"),
1818
- pl.col(f"{cols['lname']}_clean"),
1819
- ),
1820
- (
1821
- "left exact -> right fname + suffix",
1822
- lookups["suffix_fname"],
1823
- pl.col(f"{cols['fname']}_clean"),
1824
- pl.col(f"{cols['lname']}_clean"),
1825
- ),
1826
- (
1827
- "left exact -> right fname + suffix no space",
1828
- lookups["suffix_fname_nospace"],
1829
- pl.col(f"{cols['fname']}_clean"),
1830
- pl.col(f"{cols['lname']}_clean"),
1831
- ),
1832
- (
1833
- "left exact -> right lname + suffix no space",
1834
- lookups["suffix_lname_nospace"],
1835
- pl.col(f"{cols['fname']}_clean"),
1836
- pl.col(f"{cols['lname']}_clean"),
1837
- ),
1822
+ for label, fname_expr, lname_expr in [
1838
1823
  (
1839
1824
  "left fname + suffix -> right exact",
1840
- lookups["exact"],
1841
1825
  pl.concat_str(
1842
1826
  [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1843
1827
  separator=" ",
@@ -1846,7 +1830,6 @@ def lookup_sid_cepr(
1846
1830
  ),
1847
1831
  (
1848
1832
  "left fname + suffix no space -> right exact",
1849
- lookups["exact"],
1850
1833
  pl.concat_str(
1851
1834
  [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1852
1835
  separator="",
@@ -1855,7 +1838,6 @@ def lookup_sid_cepr(
1855
1838
  ),
1856
1839
  (
1857
1840
  "left lname + suffix -> right exact",
1858
- lookups["exact"],
1859
1841
  pl.col(f"{cols['fname']}_clean"),
1860
1842
  pl.concat_str(
1861
1843
  [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
@@ -1864,19 +1846,16 @@ def lookup_sid_cepr(
1864
1846
  ),
1865
1847
  (
1866
1848
  "left lname + suffix no space -> right exact",
1867
- lookups["exact"],
1868
1849
  pl.col(f"{cols['fname']}_clean"),
1869
1850
  pl.concat_str(
1870
1851
  [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1871
1852
  separator="",
1872
1853
  ),
1873
1854
  ),
1874
- ]
1875
-
1876
- for label, lookup, fname_expr, lname_expr in suffix_stages:
1855
+ ]:
1877
1856
  matched, unmatched = _run_match_stage(
1878
1857
  unmatched,
1879
- lookup=lookup,
1858
+ lookup=lookups["exact"],
1880
1859
  fname_expr=fname_expr,
1881
1860
  lname_expr=lname_expr,
1882
1861
  dob_expr=pl.col(f"{cols['dob']}_clean"),
@@ -1884,16 +1863,13 @@ def lookup_sid_cepr(
1884
1863
  )
1885
1864
  matched_frames.append(matched)
1886
1865
 
1887
- # dob_imp variants
1888
- dob_imp_stages = [
1866
+ for key in [
1889
1867
  "dob_imp",
1890
1868
  "dob_imp_minus_1",
1891
1869
  "dob_imp_plus_1",
1892
1870
  "dob_imp_minus_2",
1893
1871
  "dob_imp_plus_2",
1894
- ]
1895
-
1896
- for key in dob_imp_stages:
1872
+ ]:
1897
1873
  matched, unmatched = _run_match_stage(
1898
1874
  unmatched,
1899
1875
  lookup=lookups[key],
@@ -1904,18 +1880,20 @@ def lookup_sid_cepr(
1904
1880
  )
1905
1881
  matched_frames.append(matched)
1906
1882
 
1907
- result = pl.concat(
1908
- matched_frames + [unmatched],
1909
- how="diagonal_relaxed",
1883
+ result = (
1884
+ pl.concat(matched_frames + [unmatched], how="diagonal_relaxed")
1885
+ .sort("_row_id")
1886
+ .drop("_row_id")
1910
1887
  )
1911
1888
 
1912
- print(
1913
- f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
1914
- )
1889
+ if "sid_cepr" not in input_columns:
1890
+ input_columns = input_columns + ["sid_cepr"]
1915
1891
 
1916
- return result
1892
+ result = result.select(input_columns)
1917
1893
 
1918
- #
1894
+ print(f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}")
1895
+
1896
+ return result
1919
1897
 
1920
1898
  # EXAMPLE USAGE
1921
1899
 
File without changes