ltc-code 0.1.7__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ltc-code
3
- Version: 0.1.7
3
+ Version: 0.1.8
4
4
  Summary: Add your description here
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ltc-code"
3
- version = "0.1.7"
3
+ version = "0.1.8"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -1424,9 +1424,6 @@ def lookup_sid_cepr(
1424
1424
  ####################################################################################
1425
1425
 
1426
1426
 
1427
-
1428
-
1429
-
1430
1427
  def _parse_dob_expr(col: str) -> pl.Expr:
1431
1428
  return pl.coalesce(
1432
1429
  [
@@ -1437,6 +1434,14 @@ def _parse_dob_expr(col: str) -> pl.Expr:
1437
1434
  )
1438
1435
 
1439
1436
 
1437
+ def _first_word_expr(col: str) -> pl.Expr:
1438
+ return pl.col(col).cast(pl.String).str.split(" ").list.first()
1439
+
1440
+
1441
+ def _second_word_expr(col: str) -> pl.Expr:
1442
+ return pl.col(col).cast(pl.String).str.split(" ").list.get(1, null_on_oob=True)
1443
+
1444
+
1440
1445
  def _build_lookup(
1441
1446
  census: pl.DataFrame,
1442
1447
  *,
@@ -1456,13 +1461,11 @@ def _build_lookup(
1456
1461
  .drop_nulls(["_fname_key", "_lname_key", "_dob_key", "sid_cepr"])
1457
1462
  .group_by(["_fname_key", "_lname_key", "_dob_key"])
1458
1463
  .agg(pl.col("sid_cepr").unique().alias("_sids"))
1459
- .with_columns(pl.col("_sids").list.len().alias("_sid_count"))
1460
- .filter(pl.col("_sid_count") == 1)
1461
1464
  .select(
1462
1465
  "_fname_key",
1463
1466
  "_lname_key",
1464
1467
  "_dob_key",
1465
- pl.col("_sids").list.first().alias("sid_cepr"),
1468
+ pl.col("_sids").list.sort().list.first().alias("sid_cepr"),
1466
1469
  )
1467
1470
  )
1468
1471
 
@@ -1603,6 +1606,30 @@ def build_census_lookups(*, cmo_name: str) -> dict[str, pl.DataFrame]:
1603
1606
  label="right lname + suffix no space",
1604
1607
  )
1605
1608
 
1609
+ lookups["fname_first_word"] = _build_lookup(
1610
+ census,
1611
+ fname_expr=_first_word_expr("fname"),
1612
+ lname_expr=pl.col("lname"),
1613
+ dob_col="dob",
1614
+ label="right fname first word",
1615
+ )
1616
+
1617
+ lookups["lname_first_word"] = _build_lookup(
1618
+ census,
1619
+ fname_expr=pl.col("fname"),
1620
+ lname_expr=_first_word_expr("lname"),
1621
+ dob_col="dob",
1622
+ label="right lname first word",
1623
+ )
1624
+
1625
+ lookups["lname_second_word"] = _build_lookup(
1626
+ census,
1627
+ fname_expr=pl.col("fname"),
1628
+ lname_expr=_second_word_expr("lname"),
1629
+ dob_col="dob",
1630
+ label="right lname second word",
1631
+ )
1632
+
1606
1633
  lookups["dob_imp"] = _build_lookup(
1607
1634
  census,
1608
1635
  fname_expr=pl.col("fname"),
@@ -1659,7 +1686,6 @@ def _run_match_stage(
1659
1686
  unmatched = stage.filter(pl.col("sid_cepr").is_null()).drop("sid_cepr")
1660
1687
 
1661
1688
  print(f"{label}: matched {len(matched):,}/{before:,}")
1662
-
1663
1689
  return matched, unmatched
1664
1690
 
1665
1691
 
@@ -1710,6 +1736,9 @@ def lookup_sid_cepr(
1710
1736
  ("left exact -> right fname + suffix", lookups["suffix_fname"]),
1711
1737
  ("left exact -> right fname + suffix no space", lookups["suffix_fname_nospace"]),
1712
1738
  ("left exact -> right lname + suffix no space", lookups["suffix_lname_nospace"]),
1739
+ ("left exact -> right fname first word", lookups["fname_first_word"]),
1740
+ ("left exact -> right lname first word", lookups["lname_first_word"]),
1741
+ ("left exact -> right lname second word", lookups["lname_second_word"]),
1713
1742
  ]:
1714
1743
  matched, unmatched = _run_match_stage(
1715
1744
  unmatched,
@@ -1721,6 +1750,33 @@ def lookup_sid_cepr(
1721
1750
  )
1722
1751
  matched_frames.append(matched)
1723
1752
 
1753
+ for label, fname_expr, lname_expr in [
1754
+ (
1755
+ "left fname first word -> right exact",
1756
+ _first_word_expr(f"{cols['fname']}_clean"),
1757
+ pl.col(f"{cols['lname']}_clean"),
1758
+ ),
1759
+ (
1760
+ "left lname first word -> right exact",
1761
+ pl.col(f"{cols['fname']}_clean"),
1762
+ _first_word_expr(f"{cols['lname']}_clean"),
1763
+ ),
1764
+ (
1765
+ "left lname second word -> right exact",
1766
+ pl.col(f"{cols['fname']}_clean"),
1767
+ _second_word_expr(f"{cols['lname']}_clean"),
1768
+ ),
1769
+ ]:
1770
+ matched, unmatched = _run_match_stage(
1771
+ unmatched,
1772
+ lookup=lookups["exact"],
1773
+ fname_expr=fname_expr,
1774
+ lname_expr=lname_expr,
1775
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1776
+ label=label,
1777
+ )
1778
+ matched_frames.append(matched)
1779
+
1724
1780
  if "mname" in cols:
1725
1781
  unmatched = unmatched.with_columns(*clean_other_name(cols["mname"]))
1726
1782
 
@@ -1838,7 +1894,6 @@ def lookup_sid_cepr(
1838
1894
  print(f"final matched: {result['sid_cepr'].is_not_null().sum():,}/{len(result):,}")
1839
1895
 
1840
1896
  return result
1841
- #
1842
1897
 
1843
1898
  # EXAMPLE USAGE
1844
1899
 
File without changes