ltc-code 0.1.5__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ltc-code
3
- Version: 0.1.5
3
+ Version: 0.1.7
4
4
  Summary: Add your description here
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ltc-code"
3
- version = "0.1.5"
3
+ version = "0.1.7"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -1427,6 +1427,16 @@ def lookup_sid_cepr(
1427
1427
 
1428
1428
 
1429
1429
 
1430
+ def _parse_dob_expr(col: str) -> pl.Expr:
1431
+ return pl.coalesce(
1432
+ [
1433
+ pl.col(col).cast(pl.String).str.strptime(pl.Date, "%m/%d/%Y", strict=False),
1434
+ pl.col(col).cast(pl.String).str.strptime(pl.Date, "%m/%d/%y", strict=False),
1435
+ pl.col(col).cast(pl.String).str.strptime(pl.Date, "%Y-%m-%d", strict=False),
1436
+ ]
1437
+ )
1438
+
1439
+
1430
1440
  def _build_lookup(
1431
1441
  census: pl.DataFrame,
1432
1442
  *,
@@ -1435,80 +1445,35 @@ def _build_lookup(
1435
1445
  dob_col: str,
1436
1446
  label: str,
1437
1447
  ) -> pl.DataFrame:
1438
- """
1439
- Build a deterministic SID lookup table.
1440
-
1441
- Output schema:
1442
- _fname_key
1443
- _lname_key
1444
- _dob_key
1445
- sid_cepr
1446
-
1447
- Ambiguous keys are removed.
1448
- """
1449
-
1450
1448
  lookup = (
1451
1449
  census
1452
1450
  .select(
1453
- [
1454
- fname_expr.alias("_fname_key"),
1455
- lname_expr.alias("_lname_key"),
1456
- pl.col(dob_col).alias("_dob_key"),
1457
- pl.col("sid_cepr"),
1458
- ]
1459
- )
1460
- .drop_nulls(
1461
- [
1462
- "_fname_key",
1463
- "_lname_key",
1464
- "_dob_key",
1465
- "sid_cepr",
1466
- ]
1467
- )
1468
- .group_by(
1469
- [
1470
- "_fname_key",
1471
- "_lname_key",
1472
- "_dob_key",
1473
- ]
1474
- )
1475
- .agg(
1476
- pl.col("sid_cepr").unique().alias("_sids")
1477
- )
1478
- .with_columns(
1479
- pl.col("_sids").list.len().alias("_sid_count")
1480
- )
1481
- .filter(
1482
- pl.col("_sid_count") == 1
1451
+ fname_expr.alias("_fname_key"),
1452
+ lname_expr.alias("_lname_key"),
1453
+ pl.col(dob_col).alias("_dob_key"),
1454
+ pl.col("sid_cepr"),
1483
1455
  )
1456
+ .drop_nulls(["_fname_key", "_lname_key", "_dob_key", "sid_cepr"])
1457
+ .group_by(["_fname_key", "_lname_key", "_dob_key"])
1458
+ .agg(pl.col("sid_cepr").unique().alias("_sids"))
1459
+ .with_columns(pl.col("_sids").list.len().alias("_sid_count"))
1460
+ .filter(pl.col("_sid_count") == 1)
1484
1461
  .select(
1485
- [
1486
- "_fname_key",
1487
- "_lname_key",
1488
- "_dob_key",
1489
- pl.col("_sids").list.first().alias("sid_cepr"),
1490
- ]
1462
+ "_fname_key",
1463
+ "_lname_key",
1464
+ "_dob_key",
1465
+ pl.col("_sids").list.first().alias("sid_cepr"),
1491
1466
  )
1492
1467
  )
1493
1468
 
1494
- print(f"built lookup: {label}")
1495
-
1469
+ print(f"built lookup: {label} ({len(lookup):,} usable keys)")
1496
1470
  return lookup
1497
1471
 
1498
- def build_census_lookups(
1499
- *,
1500
- cmo_name: str,
1501
- ) -> dict[str, pl.DataFrame]:
1502
-
1503
- try:
1504
- import mappings
1505
- except ImportError:
1506
- import mapppings as mappings
1507
1472
 
1473
+ def build_census_lookups(*, cmo_name: str) -> dict[str, pl.DataFrame]:
1508
1474
  annual_frames = []
1509
1475
 
1510
1476
  for year in range(1994, 2023):
1511
-
1512
1477
  path = CENSUS_STUDENTS / f"census_student_{year}.csv"
1513
1478
 
1514
1479
  annual = (
@@ -1520,16 +1485,14 @@ def build_census_lookups(
1520
1485
  ignore_errors=False,
1521
1486
  )
1522
1487
  .select(
1523
- [
1524
- "cmo_code",
1525
- "sid_cepr",
1526
- "fname_clean",
1527
- "lname_clean",
1528
- "mname_clean",
1529
- "suff_clean",
1530
- "birthdate_clean",
1531
- "birthdate_imp",
1532
- ]
1488
+ "cmo_code",
1489
+ "sid_cepr",
1490
+ "fname_clean",
1491
+ "lname_clean",
1492
+ "mname_clean",
1493
+ "suff_clean",
1494
+ "birthdate_clean",
1495
+ "birthdate_imp",
1533
1496
  )
1534
1497
  .rename(
1535
1498
  {
@@ -1542,92 +1505,105 @@ def build_census_lookups(
1542
1505
  }
1543
1506
  )
1544
1507
  .with_columns(
1545
- pl.col("cmo_code")
1546
- .replace(mappings.CMO_CODE_TO_NAME)
1547
- .alias("cmo_name")
1548
- )
1549
- .filter(
1550
- pl.col("cmo_name") == cmo_name
1508
+ pl.col("cmo_code").replace(cmo_map).alias("cmo_name")
1551
1509
  )
1510
+ .filter(pl.col("cmo_name") == cmo_name)
1552
1511
  .with_columns(
1553
1512
  *clean_name("fname"),
1554
1513
  *clean_name("lname"),
1555
1514
  *clean_other_name("mname"),
1556
1515
  *clean_other_name("suffix"),
1557
- *clean_dob(col="dob"),
1558
- *clean_dob(col="dob_imp"),
1559
1516
  )
1560
- .drop(
1561
- [
1562
- "dob",
1563
- "dob_imp",
1564
- ]
1517
+ .with_columns(*clean_dob(col="dob"))
1518
+ .with_columns(*clean_dob(col="dob_imp"))
1519
+ .with_columns(
1520
+ _parse_dob_expr("dob_clean").alias("dob"),
1521
+ _parse_dob_expr("dob_imp_clean").alias("dob_imp"),
1565
1522
  )
1566
- .rename(
1567
- {
1568
- "dob_clean": "dob",
1569
- "dob_imp_clean": "dob_imp",
1570
- }
1523
+ .select(
1524
+ "sid_cepr",
1525
+ pl.col("fname_clean").alias("fname"),
1526
+ pl.col("lname_clean").alias("lname"),
1527
+ pl.col("mname_clean").alias("mname"),
1528
+ pl.col("suffix_clean").alias("suffix"),
1529
+ "dob",
1530
+ "dob_imp",
1571
1531
  )
1572
1532
  )
1573
1533
 
1574
1534
  annual_frames.append(annual)
1575
1535
 
1576
- #
1577
- # MATERIALIZE ONCE
1578
- #
1536
+ census = pl.concat(annual_frames, how="vertical_relaxed").collect()
1579
1537
 
1580
- census = (
1581
- pl.concat(
1582
- annual_frames,
1583
- how="vertical_relaxed",
1584
- )
1585
- .collect()
1538
+ print(f"census rows after CMO filter: {len(census):,}")
1539
+
1540
+ lookups = {}
1541
+
1542
+ lookups["exact"] = _build_lookup(
1543
+ census,
1544
+ fname_expr=pl.col("fname"),
1545
+ lname_expr=pl.col("lname"),
1546
+ dob_col="dob",
1547
+ label="exact",
1548
+ )
1549
+
1550
+ lookups["mname"] = _build_lookup(
1551
+ census,
1552
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=" "),
1553
+ lname_expr=pl.col("lname"),
1554
+ dob_col="dob",
1555
+ label="right fname + mname",
1586
1556
  )
1587
1557
 
1588
- print(f"census rows: {len(census):,}")
1558
+ lookups["mname_lname"] = _build_lookup(
1559
+ census,
1560
+ fname_expr=pl.col("fname"),
1561
+ lname_expr=pl.concat_str([pl.col("mname"), pl.col("lname")], separator=" "),
1562
+ dob_col="dob",
1563
+ label="right mname + lname",
1564
+ )
1589
1565
 
1590
- #
1591
- # BUILD LOOKUPS ONCE
1592
- #
1566
+ lookups["mname_nospace"] = _build_lookup(
1567
+ census,
1568
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("mname")], separator=""),
1569
+ lname_expr=pl.col("lname"),
1570
+ dob_col="dob",
1571
+ label="right fname + mname no space",
1572
+ )
1593
1573
 
1594
- lookup_exact = _build_lookup(
1574
+ lookups["suffix"] = _build_lookup(
1595
1575
  census,
1596
1576
  fname_expr=pl.col("fname"),
1577
+ lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=" "),
1578
+ dob_col="dob",
1579
+ label="right lname + suffix",
1580
+ )
1581
+
1582
+ lookups["suffix_fname"] = _build_lookup(
1583
+ census,
1584
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=" "),
1597
1585
  lname_expr=pl.col("lname"),
1598
1586
  dob_col="dob",
1599
- label="exact",
1587
+ label="right fname + suffix",
1600
1588
  )
1601
1589
 
1602
- lookup_mname = _build_lookup(
1590
+ lookups["suffix_fname_nospace"] = _build_lookup(
1603
1591
  census,
1604
- fname_expr=pl.concat_str(
1605
- [
1606
- pl.col("fname"),
1607
- pl.col("mname"),
1608
- ],
1609
- separator=" ",
1610
- ),
1592
+ fname_expr=pl.concat_str([pl.col("fname"), pl.col("suffix")], separator=""),
1611
1593
  lname_expr=pl.col("lname"),
1612
1594
  dob_col="dob",
1613
- label="mname",
1595
+ label="right fname + suffix no space",
1614
1596
  )
1615
1597
 
1616
- lookup_suffix = _build_lookup(
1598
+ lookups["suffix_lname_nospace"] = _build_lookup(
1617
1599
  census,
1618
1600
  fname_expr=pl.col("fname"),
1619
- lname_expr=pl.concat_str(
1620
- [
1621
- pl.col("lname"),
1622
- pl.col("suffix"),
1623
- ],
1624
- separator=" ",
1625
- ),
1601
+ lname_expr=pl.concat_str([pl.col("lname"), pl.col("suffix")], separator=""),
1626
1602
  dob_col="dob",
1627
- label="suffix",
1603
+ label="right lname + suffix no space",
1628
1604
  )
1629
1605
 
1630
- lookup_dob_imp = _build_lookup(
1606
+ lookups["dob_imp"] = _build_lookup(
1631
1607
  census,
1632
1608
  fname_expr=pl.col("fname"),
1633
1609
  lname_expr=pl.col("lname"),
@@ -1635,12 +1611,22 @@ def build_census_lookups(
1635
1611
  label="dob_imp",
1636
1612
  )
1637
1613
 
1638
- return {
1639
- "exact": lookup_exact,
1640
- "mname": lookup_mname,
1641
- "suffix": lookup_suffix,
1642
- "dob_imp": lookup_dob_imp,
1643
- }
1614
+ for offset, key in [
1615
+ ("-1y", "dob_imp_minus_1"),
1616
+ ("1y", "dob_imp_plus_1"),
1617
+ ("-2y", "dob_imp_minus_2"),
1618
+ ("2y", "dob_imp_plus_2"),
1619
+ ]:
1620
+ lookups[key] = _build_lookup(
1621
+ census.with_columns(pl.col("dob_imp").dt.offset_by(offset).alias(key)),
1622
+ fname_expr=pl.col("fname"),
1623
+ lname_expr=pl.col("lname"),
1624
+ dob_col=key,
1625
+ label=key,
1626
+ )
1627
+
1628
+ return lookups
1629
+
1644
1630
 
1645
1631
  def _run_match_stage(
1646
1632
  unmatched: pl.DataFrame,
@@ -1651,97 +1637,61 @@ def _run_match_stage(
1651
1637
  dob_expr: pl.Expr,
1652
1638
  label: str,
1653
1639
  ) -> tuple[pl.DataFrame, pl.DataFrame]:
1654
-
1655
1640
  before = len(unmatched)
1656
1641
 
1657
1642
  stage = (
1658
1643
  unmatched
1659
1644
  .with_columns(
1660
- [
1661
- fname_expr.alias("_fname_key"),
1662
- lname_expr.alias("_lname_key"),
1663
- dob_expr.alias("_dob_key"),
1664
- ]
1645
+ fname_expr.alias("_fname_key"),
1646
+ lname_expr.alias("_lname_key"),
1647
+ dob_expr.alias("_dob_key"),
1665
1648
  )
1666
1649
  .join(
1667
1650
  lookup,
1668
- on=[
1669
- "_fname_key",
1670
- "_lname_key",
1671
- "_dob_key",
1672
- ],
1651
+ on=["_fname_key", "_lname_key", "_dob_key"],
1673
1652
  how="left",
1674
1653
  validate="m:1",
1675
1654
  )
1676
- .drop(
1677
- [
1678
- "_fname_key",
1679
- "_lname_key",
1680
- "_dob_key",
1681
- ]
1682
- )
1655
+ .drop(["_fname_key", "_lname_key", "_dob_key"])
1683
1656
  )
1684
1657
 
1685
- matched = (
1686
- stage
1687
- .filter(
1688
- pl.col("sid_cepr").is_not_null()
1689
- )
1690
- )
1691
-
1692
- unmatched = (
1693
- stage
1694
- .filter(
1695
- pl.col("sid_cepr").is_null()
1696
- )
1697
- .drop("sid_cepr")
1698
- )
1658
+ matched = stage.filter(pl.col("sid_cepr").is_not_null())
1659
+ unmatched = stage.filter(pl.col("sid_cepr").is_null()).drop("sid_cepr")
1699
1660
 
1700
- added = len(matched)
1701
-
1702
- print(
1703
- f"{label}: matched {added:,}/{before:,}"
1704
- )
1661
+ print(f"{label}: matched {len(matched):,}/{before:,}")
1705
1662
 
1706
1663
  return matched, unmatched
1707
1664
 
1665
+
1708
1666
  def lookup_sid_cepr(
1709
- frame: Frame,
1667
+ frame,
1710
1668
  *,
1711
1669
  cols: Mapping[str, str],
1712
1670
  lookups: dict[str, pl.DataFrame],
1713
- ) -> Frame:
1714
-
1671
+ ):
1715
1672
  is_lazy = isinstance(frame, pl.LazyFrame)
1673
+ current = frame.collect() if is_lazy else frame
1716
1674
 
1717
- current = (
1718
- frame.collect()
1719
- if is_lazy
1720
- else frame
1721
- )
1675
+ input_columns = current.columns
1722
1676
 
1723
- #
1724
- # CLEAN LEFT SIDE
1725
- #
1677
+ current = current.with_row_index("_row_id")
1678
+
1679
+ current = current.with_columns(
1680
+ *clean_name(cols["fname"]),
1681
+ *clean_name(cols["lname"]),
1682
+ )
1726
1683
 
1727
1684
  current = (
1728
1685
  current
1686
+ .with_columns(*clean_dob(col=cols["dob"]))
1729
1687
  .with_columns(
1730
- *clean_name(cols["fname"]),
1731
- *clean_name(cols["lname"]),
1732
- *clean_dob(col=cols["dob"]),
1688
+ _parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
1733
1689
  )
1734
1690
  )
1735
1691
 
1736
1692
  matched_frames = []
1737
-
1738
1693
  unmatched = current
1739
1694
 
1740
- #
1741
- # STAGE 1
1742
- # EXACT
1743
- #
1744
-
1745
1695
  matched, unmatched = _run_match_stage(
1746
1696
  unmatched,
1747
1697
  lookup=lookups["exact"],
@@ -1750,102 +1700,144 @@ def lookup_sid_cepr(
1750
1700
  dob_expr=pl.col(f"{cols['dob']}_clean"),
1751
1701
  label="exact",
1752
1702
  )
1753
-
1754
1703
  matched_frames.append(matched)
1755
1704
 
1756
- #
1757
- # STAGE 2
1758
- # MNAME
1759
- #
1760
-
1761
- if "mname" in cols:
1762
-
1763
- unmatched = (
1764
- unmatched
1765
- .with_columns(
1766
- *clean_other_name(cols["mname"])
1767
- )
1768
- )
1769
-
1705
+ for label, lookup in [
1706
+ ("left exact -> right fname + mname", lookups["mname"]),
1707
+ ("left exact -> right mname + lname", lookups["mname_lname"]),
1708
+ ("left exact -> right fname + mname no space", lookups["mname_nospace"]),
1709
+ ("left exact -> right lname + suffix", lookups["suffix"]),
1710
+ ("left exact -> right fname + suffix", lookups["suffix_fname"]),
1711
+ ("left exact -> right fname + suffix no space", lookups["suffix_fname_nospace"]),
1712
+ ("left exact -> right lname + suffix no space", lookups["suffix_lname_nospace"]),
1713
+ ]:
1770
1714
  matched, unmatched = _run_match_stage(
1771
1715
  unmatched,
1772
- lookup=lookups["mname"],
1773
- fname_expr=pl.concat_str(
1774
- [
1775
- pl.col(f"{cols['fname']}_clean"),
1776
- pl.col(f"{cols['mname']}_clean"),
1777
- ],
1778
- separator=" ",
1779
- ),
1716
+ lookup=lookup,
1717
+ fname_expr=pl.col(f"{cols['fname']}_clean"),
1780
1718
  lname_expr=pl.col(f"{cols['lname']}_clean"),
1781
1719
  dob_expr=pl.col(f"{cols['dob']}_clean"),
1782
- label="mname",
1720
+ label=label,
1783
1721
  )
1784
-
1785
1722
  matched_frames.append(matched)
1786
1723
 
1787
- #
1788
- # STAGE 3
1789
- # SUFFIX
1790
- #
1724
+ if "mname" in cols:
1725
+ unmatched = unmatched.with_columns(*clean_other_name(cols["mname"]))
1726
+
1727
+ for label, fname_expr, lname_expr in [
1728
+ (
1729
+ "left fname + mname -> right exact",
1730
+ pl.concat_str(
1731
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
1732
+ separator=" ",
1733
+ ),
1734
+ pl.col(f"{cols['lname']}_clean"),
1735
+ ),
1736
+ (
1737
+ "left mname + lname -> right exact",
1738
+ pl.col(f"{cols['fname']}_clean"),
1739
+ pl.concat_str(
1740
+ [pl.col(f"{cols['mname']}_clean"), pl.col(f"{cols['lname']}_clean")],
1741
+ separator=" ",
1742
+ ),
1743
+ ),
1744
+ (
1745
+ "left fname + mname no space -> right exact",
1746
+ pl.concat_str(
1747
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
1748
+ separator="",
1749
+ ),
1750
+ pl.col(f"{cols['lname']}_clean"),
1751
+ ),
1752
+ ]:
1753
+ matched, unmatched = _run_match_stage(
1754
+ unmatched,
1755
+ lookup=lookups["exact"],
1756
+ fname_expr=fname_expr,
1757
+ lname_expr=lname_expr,
1758
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1759
+ label=label,
1760
+ )
1761
+ matched_frames.append(matched)
1791
1762
 
1792
1763
  if "suffix" in cols:
1764
+ unmatched = unmatched.with_columns(*clean_other_name(cols["suffix"]))
1793
1765
 
1794
- unmatched = (
1795
- unmatched
1796
- .with_columns(
1797
- *clean_other_name(cols["suffix"])
1766
+ for label, fname_expr, lname_expr in [
1767
+ (
1768
+ "left fname + suffix -> right exact",
1769
+ pl.concat_str(
1770
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1771
+ separator=" ",
1772
+ ),
1773
+ pl.col(f"{cols['lname']}_clean"),
1774
+ ),
1775
+ (
1776
+ "left fname + suffix no space -> right exact",
1777
+ pl.concat_str(
1778
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1779
+ separator="",
1780
+ ),
1781
+ pl.col(f"{cols['lname']}_clean"),
1782
+ ),
1783
+ (
1784
+ "left lname + suffix -> right exact",
1785
+ pl.col(f"{cols['fname']}_clean"),
1786
+ pl.concat_str(
1787
+ [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1788
+ separator=" ",
1789
+ ),
1790
+ ),
1791
+ (
1792
+ "left lname + suffix no space -> right exact",
1793
+ pl.col(f"{cols['fname']}_clean"),
1794
+ pl.concat_str(
1795
+ [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1796
+ separator="",
1797
+ ),
1798
+ ),
1799
+ ]:
1800
+ matched, unmatched = _run_match_stage(
1801
+ unmatched,
1802
+ lookup=lookups["exact"],
1803
+ fname_expr=fname_expr,
1804
+ lname_expr=lname_expr,
1805
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1806
+ label=label,
1798
1807
  )
1799
- )
1800
-
1808
+ matched_frames.append(matched)
1809
+
1810
+ for key in [
1811
+ "dob_imp",
1812
+ "dob_imp_minus_1",
1813
+ "dob_imp_plus_1",
1814
+ "dob_imp_minus_2",
1815
+ "dob_imp_plus_2",
1816
+ ]:
1801
1817
  matched, unmatched = _run_match_stage(
1802
1818
  unmatched,
1803
- lookup=lookups["suffix"],
1819
+ lookup=lookups[key],
1804
1820
  fname_expr=pl.col(f"{cols['fname']}_clean"),
1805
- lname_expr=pl.concat_str(
1806
- [
1807
- pl.col(f"{cols['lname']}_clean"),
1808
- pl.col(f"{cols['suffix']}_clean"),
1809
- ],
1810
- separator=" ",
1811
- ),
1821
+ lname_expr=pl.col(f"{cols['lname']}_clean"),
1812
1822
  dob_expr=pl.col(f"{cols['dob']}_clean"),
1813
- label="suffix",
1823
+ label=key,
1814
1824
  )
1815
-
1816
1825
  matched_frames.append(matched)
1817
1826
 
1818
- #
1819
- # STAGE 4
1820
- # DOB IMP
1821
- #
1822
-
1823
- matched, unmatched = _run_match_stage(
1824
- unmatched,
1825
- lookup=lookups["dob_imp"],
1826
- fname_expr=pl.col(f"{cols['fname']}_clean"),
1827
- lname_expr=pl.col(f"{cols['lname']}_clean"),
1828
- dob_expr=pl.col(f"{cols['dob']}_clean"),
1829
- label="dob_imp",
1827
+ result = (
1828
+ pl.concat(matched_frames + [unmatched], how="diagonal_relaxed")
1829
+ .sort("_row_id")
1830
+ .drop("_row_id")
1830
1831
  )
1831
1832
 
1832
- matched_frames.append(matched)
1833
+ if "sid_cepr" not in input_columns:
1834
+ input_columns = input_columns + ["sid_cepr"]
1833
1835
 
1834
- #
1835
- # FINAL
1836
- #
1836
+ result = result.select(input_columns)
1837
1837
 
1838
- result = pl.concat(
1839
- matched_frames + [unmatched],
1840
- how="diagonal_relaxed",
1841
- )
1842
-
1843
- print(
1844
- f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}"
1845
- )
1838
+ print(f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}")
1846
1839
 
1847
1840
  return result
1848
-
1849
1841
  #
1850
1842
 
1851
1843
  # EXAMPLE USAGE
File without changes