ltc-code 0.1.6__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ltc-code
3
- Version: 0.1.6
3
+ Version: 0.1.7
4
4
  Summary: Add your description here
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ltc-code"
3
- version = "0.1.6"
3
+ version = "0.1.7"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -1427,6 +1427,16 @@ def lookup_sid_cepr(
1427
1427
 
1428
1428
 
1429
1429
 
1430
+ def _parse_dob_expr(col: str) -> pl.Expr:
1431
+ return pl.coalesce(
1432
+ [
1433
+ pl.col(col).cast(pl.String).str.strptime(pl.Date, "%m/%d/%Y", strict=False),
1434
+ pl.col(col).cast(pl.String).str.strptime(pl.Date, "%m/%d/%y", strict=False),
1435
+ pl.col(col).cast(pl.String).str.strptime(pl.Date, "%Y-%m-%d", strict=False),
1436
+ ]
1437
+ )
1438
+
1439
+
1430
1440
  def _build_lookup(
1431
1441
  census: pl.DataFrame,
1432
1442
  *,
@@ -1435,80 +1445,35 @@ def _build_lookup(
1435
1445
  dob_col: str,
1436
1446
  label: str,
1437
1447
  ) -> pl.DataFrame:
1438
- """
1439
- Build a deterministic SID lookup table.
1440
-
1441
- Output schema:
1442
- _fname_key
1443
- _lname_key
1444
- _dob_key
1445
- sid_cepr
1446
-
1447
- Ambiguous keys are removed.
1448
- """
1449
-
1450
1448
  lookup = (
1451
1449
  census
1452
1450
  .select(
1453
- [
1454
- fname_expr.alias("_fname_key"),
1455
- lname_expr.alias("_lname_key"),
1456
- pl.col(dob_col).alias("_dob_key"),
1457
- pl.col("sid_cepr"),
1458
- ]
1459
- )
1460
- .drop_nulls(
1461
- [
1462
- "_fname_key",
1463
- "_lname_key",
1464
- "_dob_key",
1465
- "sid_cepr",
1466
- ]
1467
- )
1468
- .group_by(
1469
- [
1470
- "_fname_key",
1471
- "_lname_key",
1472
- "_dob_key",
1473
- ]
1474
- )
1475
- .agg(
1476
- pl.col("sid_cepr").unique().alias("_sids")
1477
- )
1478
- .with_columns(
1479
- pl.col("_sids").list.len().alias("_sid_count")
1480
- )
1481
- .filter(
1482
- pl.col("_sid_count") == 1
1451
+ fname_expr.alias("_fname_key"),
1452
+ lname_expr.alias("_lname_key"),
1453
+ pl.col(dob_col).alias("_dob_key"),
1454
+ pl.col("sid_cepr"),
1483
1455
  )
1456
+ .drop_nulls(["_fname_key", "_lname_key", "_dob_key", "sid_cepr"])
1457
+ .group_by(["_fname_key", "_lname_key", "_dob_key"])
1458
+ .agg(pl.col("sid_cepr").unique().alias("_sids"))
1459
+ .with_columns(pl.col("_sids").list.len().alias("_sid_count"))
1460
+ .filter(pl.col("_sid_count") == 1)
1484
1461
  .select(
1485
- [
1486
- "_fname_key",
1487
- "_lname_key",
1488
- "_dob_key",
1489
- pl.col("_sids").list.first().alias("sid_cepr"),
1490
- ]
1462
+ "_fname_key",
1463
+ "_lname_key",
1464
+ "_dob_key",
1465
+ pl.col("_sids").list.first().alias("sid_cepr"),
1491
1466
  )
1492
1467
  )
1493
1468
 
1494
- print(f"built lookup: {label}")
1495
-
1469
+ print(f"built lookup: {label} ({len(lookup):,} usable keys)")
1496
1470
  return lookup
1497
1471
 
1498
- def build_census_lookups(
1499
- *,
1500
- cmo_name: str,
1501
- ) -> dict[str, pl.DataFrame]:
1502
-
1503
- try:
1504
- import mappings
1505
- except ImportError:
1506
- import mapppings as mappings
1507
1472
 
1473
+ def build_census_lookups(*, cmo_name: str) -> dict[str, pl.DataFrame]:
1508
1474
  annual_frames = []
1509
1475
 
1510
1476
  for year in range(1994, 2023):
1511
-
1512
1477
  path = CENSUS_STUDENTS / f"census_student_{year}.csv"
1513
1478
 
1514
1479
  annual = (
@@ -1520,16 +1485,14 @@ def build_census_lookups(
1520
1485
  ignore_errors=False,
1521
1486
  )
1522
1487
  .select(
1523
- [
1524
- "cmo_code",
1525
- "sid_cepr",
1526
- "fname_clean",
1527
- "lname_clean",
1528
- "mname_clean",
1529
- "suff_clean",
1530
- "birthdate_clean",
1531
- "birthdate_imp",
1532
- ]
1488
+ "cmo_code",
1489
+ "sid_cepr",
1490
+ "fname_clean",
1491
+ "lname_clean",
1492
+ "mname_clean",
1493
+ "suff_clean",
1494
+ "birthdate_clean",
1495
+ "birthdate_imp",
1533
1496
  )
1534
1497
  .rename(
1535
1498
  {
@@ -1542,92 +1505,105 @@ def build_census_lookups(
1542
1505
  }
1543
1506
  )
1544
1507
  .with_columns(
1545
- pl.col("cmo_code")
1546
- .replace(mappings.CMO_CODE_TO_NAME)
1547
- .alias("cmo_name")
1548
- )
1549
- .filter(
1550
- pl.col("cmo_name") == cmo_name
1508
+ pl.col("cmo_code").replace(cmo_map).alias("cmo_name")
1551
1509
  )
1510
+ .filter(pl.col("cmo_name") == cmo_name)
1552
1511
  .with_columns(
1553
1512
  *clean_name("fname"),
1554
1513
  *clean_name("lname"),
1555
1514
  *clean_other_name("mname"),
1556
1515
  *clean_other_name("suffix"),
1557
- *clean_dob(col="dob"),
1558
- *clean_dob(col="dob_imp"),
1559
1516
  )
1560
- .drop(
1561
- [
1562
- "dob",
1563
- "dob_imp",
1564
- ]
1517
+ .with_columns(*clean_dob(col="dob"))
1518
+ .with_columns(*clean_dob(col="dob_imp"))
1519
+ .with_columns(
1520
+ _parse_dob_expr("dob_clean").alias("dob"),
1521
+ _parse_dob_expr("dob_imp_clean").alias("dob_imp"),
1565
1522
  )
1566
- .rename(
1567
- {
1568
- "dob_clean": "dob",
1569
- "dob_imp_clean": "dob_imp",
1570
- }
1523
+ .select(
1524
+ "sid_cepr",
1525
+ pl.col("fname_clean").alias("fname"),
1526
+ pl.col("lname_clean").alias("lname"),
1527
+ pl.col("mname_clean").alias("mname"),
1528
+ pl.col("suffix_clean").alias("suffix"),
1529
+ "dob",
1530
+ "dob_imp",
1571
1531
  )
1572
1532
  )
1573
1533
 
1574
1534
  annual_frames.append(annual)
1575
1535
 
1576
- #
1577
- # MATERIALIZE ONCE
1578
- #
1536
+ census = pl.concat(annual_frames, how="vertical_relaxed").collect()
1579
1537
 
1580
- census = (
1581
- pl.concat(
1582
- annual_frames,
1583
- how="vertical_relaxed",
1584
- )
1585
- .collect()
1538
+ print(f"census rows after CMO filter: {len(census):,}")
1539
+
1540
+ lookups = {}
1541
+
1542
+ lookups["exact"] = _build_lookup(
1543
+ census,
1544
+ fname_expr=pl.col("fname"),
1545
+ lname_expr=pl.col("lname"),
1546
+ dob_col="dob",
1547
+ label="exact",
1548
+ )
1549
+
1550
+ lookups["mname"] = _build_lookup(
1551
+ census,
1552
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=" "),
1553
+ lname_expr=pl.col("lname"),
1554
+ dob_col="dob",
1555
+ label="right fname + mname",
1586
1556
  )
1587
1557
 
1588
- print(f"census rows: {len(census):,}")
1558
+ lookups["mname_lname"] = _build_lookup(
1559
+ census,
1560
+ fname_expr=pl.col("fname"),
1561
+ lname_expr=pl.concat_str([pl.col("mname"), pl.col("lname")], separator=" "),
1562
+ dob_col="dob",
1563
+ label="right mname + lname",
1564
+ )
1589
1565
 
1590
- #
1591
- # BUILD LOOKUPS ONCE
1592
- #
1566
+ lookups["mname_nospace"] = _build_lookup(
1567
+ census,
1568
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=""),
1569
+ lname_expr=pl.col("lname"),
1570
+ dob_col="dob",
1571
+ label="right fname + mname no space",
1572
+ )
1593
1573
 
1594
- lookup_exact = _build_lookup(
1574
+ lookups["suffix"] = _build_lookup(
1595
1575
  census,
1596
1576
  fname_expr=pl.col("fname"),
1577
+ lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=" "),
1578
+ dob_col="dob",
1579
+ label="right lname + suffix",
1580
+ )
1581
+
1582
+ lookups["suffix_fname"] = _build_lookup(
1583
+ census,
1584
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=" "),
1597
1585
  lname_expr=pl.col("lname"),
1598
1586
  dob_col="dob",
1599
- label="exact",
1587
+ label="right fname + suffix",
1600
1588
  )
1601
1589
 
1602
- lookup_mname = _build_lookup(
1590
+ lookups["suffix_fname_nospace"] = _build_lookup(
1603
1591
  census,
1604
- fname_expr=pl.concat_str(
1605
- [
1606
- pl.col("fname"),
1607
- pl.col("mname"),
1608
- ],
1609
- separator=" ",
1610
- ),
1592
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=""),
1611
1593
  lname_expr=pl.col("lname"),
1612
1594
  dob_col="dob",
1613
- label="mname",
1595
+ label="right fname + suffix no space",
1614
1596
  )
1615
1597
 
1616
- lookup_suffix = _build_lookup(
1598
+ lookups["suffix_lname_nospace"] = _build_lookup(
1617
1599
  census,
1618
1600
  fname_expr=pl.col("fname"),
1619
- lname_expr=pl.concat_str(
1620
- [
1621
- pl.col("lname"),
1622
- pl.col("suffix"),
1623
- ],
1624
- separator=" ",
1625
- ),
1601
+ lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=""),
1626
1602
  dob_col="dob",
1627
- label="suffix",
1603
+ label="right lname + suffix no space",
1628
1604
  )
1629
1605
 
1630
- lookup_dob_imp = _build_lookup(
1606
+ lookups["dob_imp"] = _build_lookup(
1631
1607
  census,
1632
1608
  fname_expr=pl.col("fname"),
1633
1609
  lname_expr=pl.col("lname"),
@@ -1635,12 +1611,22 @@ def build_census_lookups(
1635
1611
  label="dob_imp",
1636
1612
  )
1637
1613
 
1638
- return {
1639
- "exact": lookup_exact,
1640
- "mname": lookup_mname,
1641
- "suffix": lookup_suffix,
1642
- "dob_imp": lookup_dob_imp,
1643
- }
1614
+ for offset, key in [
1615
+ ("-1y", "dob_imp_minus_1"),
1616
+ ("1y", "dob_imp_plus_1"),
1617
+ ("-2y", "dob_imp_minus_2"),
1618
+ ("2y", "dob_imp_plus_2"),
1619
+ ]:
1620
+ lookups[key] = _build_lookup(
1621
+ census.with_columns(pl.col("dob_imp").dt.offset_by(offset).alias(key)),
1622
+ fname_expr=pl.col("fname"),
1623
+ lname_expr=pl.col("lname"),
1624
+ dob_col=key,
1625
+ label=key,
1626
+ )
1627
+
1628
+ return lookups
1629
+
1644
1630
 
1645
1631
  def _run_match_stage(
1646
1632
  unmatched: pl.DataFrame,
@@ -1651,84 +1637,61 @@ def _run_match_stage(
1651
1637
  dob_expr: pl.Expr,
1652
1638
  label: str,
1653
1639
  ) -> tuple[pl.DataFrame, pl.DataFrame]:
1654
-
1655
1640
  before = len(unmatched)
1656
1641
 
1657
1642
  stage = (
1658
1643
  unmatched
1659
1644
  .with_columns(
1660
- [
1661
- fname_expr.alias("_fname_key"),
1662
- lname_expr.alias("_lname_key"),
1663
- dob_expr.alias("_dob_key"),
1664
- ]
1645
+ fname_expr.alias("_fname_key"),
1646
+ lname_expr.alias("_lname_key"),
1647
+ dob_expr.alias("_dob_key"),
1665
1648
  )
1666
1649
  .join(
1667
1650
  lookup,
1668
- on=[
1669
- "_fname_key",
1670
- "_lname_key",
1671
- "_dob_key",
1672
- ],
1651
+ on=["_fname_key", "_lname_key", "_dob_key"],
1673
1652
  how="left",
1674
1653
  validate="m:1",
1675
1654
  )
1676
- .drop(
1677
- [
1678
- "_fname_key",
1679
- "_lname_key",
1680
- "_dob_key",
1681
- ]
1682
- )
1683
- )
1684
-
1685
- matched = (
1686
- stage
1687
- .filter(
1688
- pl.col("sid_cepr").is_not_null()
1689
- )
1690
- )
1691
-
1692
- unmatched = (
1693
- stage
1694
- .filter(
1695
- pl.col("sid_cepr").is_null()
1696
- )
1697
- .drop("sid_cepr")
1655
+ .drop(["_fname_key", "_lname_key", "_dob_key"])
1698
1656
  )
1699
1657
 
1700
- added = len(matched)
1658
+ matched = stage.filter(pl.col("sid_cepr").is_not_null())
1659
+ unmatched = stage.filter(pl.col("sid_cepr").is_null()).drop("sid_cepr")
1701
1660
 
1702
- print(
1703
- f"{label}: matched {added:,}/{before:,}"
1704
- )
1661
+ print(f"{label}: matched {len(matched):,}/{before:,}")
1705
1662
 
1706
1663
  return matched, unmatched
1707
1664
 
1665
+
1708
1666
  def lookup_sid_cepr(
1709
- frame: Frame,
1667
+ frame,
1710
1668
  *,
1711
1669
  cols: Mapping[str, str],
1712
1670
  lookups: dict[str, pl.DataFrame],
1713
- ) -> Frame:
1714
-
1671
+ ):
1715
1672
  is_lazy = isinstance(frame, pl.LazyFrame)
1716
1673
  current = frame.collect() if is_lazy else frame
1717
1674
 
1675
+ input_columns = current.columns
1676
+
1677
+ current = current.with_row_index("_row_id")
1678
+
1718
1679
  current = current.with_columns(
1719
1680
  *clean_name(cols["fname"]),
1720
1681
  *clean_name(cols["lname"]),
1721
- *clean_dob(col=cols["dob"]),
1722
1682
  )
1723
1683
 
1724
- current = current.with_columns(
1725
- _parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
1684
+ current = (
1685
+ current
1686
+ .with_columns(*clean_dob(col=cols["dob"]))
1687
+ .with_columns(
1688
+ _parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
1689
+ )
1726
1690
  )
1727
1691
 
1728
1692
  matched_frames = []
1729
1693
  unmatched = current
1730
1694
 
1731
- # exact
1732
1695
  matched, unmatched = _run_match_stage(
1733
1696
  unmatched,
1734
1697
  lookup=lookups["exact"],
@@ -1739,34 +1702,31 @@ def lookup_sid_cepr(
1739
1702
  )
1740
1703
  matched_frames.append(matched)
1741
1704
 
1742
- # middle-name variants
1743
- if "mname" in cols:
1744
- unmatched = unmatched.with_columns(
1745
- *clean_other_name(cols["mname"])
1705
+ for label, lookup in [
1706
+ ("left exact -> right fname + mname", lookups["mname"]),
1707
+ ("left exact -> right mname + lname", lookups["mname_lname"]),
1708
+ ("left exact -> right fname + mname no space", lookups["mname_nospace"]),
1709
+ ("left exact -> right lname + suffix", lookups["suffix"]),
1710
+ ("left exact -> right fname + suffix", lookups["suffix_fname"]),
1711
+ ("left exact -> right fname + suffix no space", lookups["suffix_fname_nospace"]),
1712
+ ("left exact -> right lname + suffix no space", lookups["suffix_lname_nospace"]),
1713
+ ]:
1714
+ matched, unmatched = _run_match_stage(
1715
+ unmatched,
1716
+ lookup=lookup,
1717
+ fname_expr=pl.col(f"{cols['fname']}_clean"),
1718
+ lname_expr=pl.col(f"{cols['lname']}_clean"),
1719
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1720
+ label=label,
1746
1721
  )
1722
+ matched_frames.append(matched)
1747
1723
 
1748
- mname_stages = [
1749
- (
1750
- "left exact -> right fname + mname",
1751
- lookups["mname"],
1752
- pl.col(f"{cols['fname']}_clean"),
1753
- pl.col(f"{cols['lname']}_clean"),
1754
- ),
1755
- (
1756
- "left exact -> right mname + lname",
1757
- lookups["mname_lname"],
1758
- pl.col(f"{cols['fname']}_clean"),
1759
- pl.col(f"{cols['lname']}_clean"),
1760
- ),
1761
- (
1762
- "left exact -> right fname + mname no space",
1763
- lookups["mname_nospace"],
1764
- pl.col(f"{cols['fname']}_clean"),
1765
- pl.col(f"{cols['lname']}_clean"),
1766
- ),
1724
+ if "mname" in cols:
1725
+ unmatched = unmatched.with_columns(*clean_other_name(cols["mname"]))
1726
+
1727
+ for label, fname_expr, lname_expr in [
1767
1728
  (
1768
1729
  "left fname + mname -> right exact",
1769
- lookups["exact"],
1770
1730
  pl.concat_str(
1771
1731
  [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
1772
1732
  separator=" ",
@@ -1775,7 +1735,6 @@ def lookup_sid_cepr(
1775
1735
  ),
1776
1736
  (
1777
1737
  "left mname + lname -> right exact",
1778
- lookups["exact"],
1779
1738
  pl.col(f"{cols['fname']}_clean"),
1780
1739
  pl.concat_str(
1781
1740
  [pl.col(f"{cols['mname']}_clean"), pl.col(f"{cols['lname']}_clean")],
@@ -1784,19 +1743,16 @@ def lookup_sid_cepr(
1784
1743
  ),
1785
1744
  (
1786
1745
  "left fname + mname no space -> right exact",
1787
- lookups["exact"],
1788
1746
  pl.concat_str(
1789
1747
  [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
1790
1748
  separator="",
1791
1749
  ),
1792
1750
  pl.col(f"{cols['lname']}_clean"),
1793
1751
  ),
1794
- ]
1795
-
1796
- for label, lookup, fname_expr, lname_expr in mname_stages:
1752
+ ]:
1797
1753
  matched, unmatched = _run_match_stage(
1798
1754
  unmatched,
1799
- lookup=lookup,
1755
+ lookup=lookups["exact"],
1800
1756
  fname_expr=fname_expr,
1801
1757
  lname_expr=lname_expr,
1802
1758
  dob_expr=pl.col(f"{cols['dob']}_clean"),
@@ -1804,40 +1760,12 @@ def lookup_sid_cepr(
1804
1760
  )
1805
1761
  matched_frames.append(matched)
1806
1762
 
1807
- # suffix variants
1808
1763
  if "suffix" in cols:
1809
- unmatched = unmatched.with_columns(
1810
- *clean_other_name(cols["suffix"])
1811
- )
1764
+ unmatched = unmatched.with_columns(*clean_other_name(cols["suffix"]))
1812
1765
 
1813
- suffix_stages = [
1814
- (
1815
- "left exact -> right lname + suffix",
1816
- lookups["suffix"],
1817
- pl.col(f"{cols['fname']}_clean"),
1818
- pl.col(f"{cols['lname']}_clean"),
1819
- ),
1820
- (
1821
- "left exact -> right fname + suffix",
1822
- lookups["suffix_fname"],
1823
- pl.col(f"{cols['fname']}_clean"),
1824
- pl.col(f"{cols['lname']}_clean"),
1825
- ),
1826
- (
1827
- "left exact -> right fname + suffix no space",
1828
- lookups["suffix_fname_nospace"],
1829
- pl.col(f"{cols['fname']}_clean"),
1830
- pl.col(f"{cols['lname']}_clean"),
1831
- ),
1832
- (
1833
- "left exact -> right lname + suffix no space",
1834
- lookups["suffix_lname_nospace"],
1835
- pl.col(f"{cols['fname']}_clean"),
1836
- pl.col(f"{cols['lname']}_clean"),
1837
- ),
1766
+ for label, fname_expr, lname_expr in [
1838
1767
  (
1839
1768
  "left fname + suffix -> right exact",
1840
- lookups["exact"],
1841
1769
  pl.concat_str(
1842
1770
  [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1843
1771
  separator=" ",
@@ -1846,7 +1774,6 @@ def lookup_sid_cepr(
1846
1774
  ),
1847
1775
  (
1848
1776
  "left fname + suffix no space -> right exact",
1849
- lookups["exact"],
1850
1777
  pl.concat_str(
1851
1778
  [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1852
1779
  separator="",
@@ -1855,7 +1782,6 @@ def lookup_sid_cepr(
1855
1782
  ),
1856
1783
  (
1857
1784
  "left lname + suffix -> right exact",
1858
- lookups["exact"],
1859
1785
  pl.col(f"{cols['fname']}_clean"),
1860
1786
  pl.concat_str(
1861
1787
  [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
@@ -1864,19 +1790,16 @@ def lookup_sid_cepr(
1864
1790
  ),
1865
1791
  (
1866
1792
  "left lname + suffix no space -> right exact",
1867
- lookups["exact"],
1868
1793
  pl.col(f"{cols['fname']}_clean"),
1869
1794
  pl.concat_str(
1870
1795
  [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1871
1796
  separator="",
1872
1797
  ),
1873
1798
  ),
1874
- ]
1875
-
1876
- for label, lookup, fname_expr, lname_expr in suffix_stages:
1799
+ ]:
1877
1800
  matched, unmatched = _run_match_stage(
1878
1801
  unmatched,
1879
- lookup=lookup,
1802
+ lookup=lookups["exact"],
1880
1803
  fname_expr=fname_expr,
1881
1804
  lname_expr=lname_expr,
1882
1805
  dob_expr=pl.col(f"{cols['dob']}_clean"),
@@ -1884,16 +1807,13 @@ def lookup_sid_cepr(
1884
1807
  )
1885
1808
  matched_frames.append(matched)
1886
1809
 
1887
- # dob_imp variants
1888
- dob_imp_stages = [
1810
+ for key in [
1889
1811
  "dob_imp",
1890
1812
  "dob_imp_minus_1",
1891
1813
  "dob_imp_plus_1",
1892
1814
  "dob_imp_minus_2",
1893
1815
  "dob_imp_plus_2",
1894
- ]
1895
-
1896
- for key in dob_imp_stages:
1816
+ ]:
1897
1817
  matched, unmatched = _run_match_stage(
1898
1818
  unmatched,
1899
1819
  lookup=lookups[key],
@@ -1904,17 +1824,20 @@ def lookup_sid_cepr(
1904
1824
  )
1905
1825
  matched_frames.append(matched)
1906
1826
 
1907
- result = pl.concat(
1908
- matched_frames + [unmatched],
1909
- how="diagonal_relaxed",
1827
+ result = (
1828
+ pl.concat(matched_frames + [unmatched], how="diagonal_relaxed")
1829
+ .sort("_row_id")
1830
+ .drop("_row_id")
1910
1831
  )
1911
1832
 
1912
- print(
1913
- f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
1914
- )
1833
+ if "sid_cepr" not in input_columns:
1834
+ input_columns = input_columns + ["sid_cepr"]
1915
1835
 
1916
- return result
1836
+ result = result.select(input_columns)
1917
1837
 
1838
+ print(f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}")
1839
+
1840
+ return result
1918
1841
  #
1919
1842
 
1920
1843
  # EXAMPLE USAGE
File without changes