ltc-code 0.1.5__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ltc-code
3
- Version: 0.1.5
3
+ Version: 0.1.6
4
4
  Summary: Add your description here
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ltc-code"
3
- version = "0.1.5"
3
+ version = "0.1.6"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -1713,35 +1713,22 @@ def lookup_sid_cepr(
1713
1713
  ) -> Frame:
1714
1714
 
1715
1715
  is_lazy = isinstance(frame, pl.LazyFrame)
1716
+ current = frame.collect() if is_lazy else frame
1716
1717
 
1717
- current = (
1718
- frame.collect()
1719
- if is_lazy
1720
- else frame
1718
+ current = current.with_columns(
1719
+ *clean_name(cols["fname"]),
1720
+ *clean_name(cols["lname"]),
1721
+ *clean_dob(col=cols["dob"]),
1721
1722
  )
1722
1723
 
1723
- #
1724
- # CLEAN LEFT SIDE
1725
- #
1726
-
1727
- current = (
1728
- current
1729
- .with_columns(
1730
- *clean_name(cols["fname"]),
1731
- *clean_name(cols["lname"]),
1732
- *clean_dob(col=cols["dob"]),
1733
- )
1724
+ current = current.with_columns(
1725
+ _parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
1734
1726
  )
1735
1727
 
1736
1728
  matched_frames = []
1737
-
1738
1729
  unmatched = current
1739
1730
 
1740
- #
1741
- # STAGE 1
1742
- # EXACT
1743
- #
1744
-
1731
+ # exact
1745
1732
  matched, unmatched = _run_match_stage(
1746
1733
  unmatched,
1747
1734
  lookup=lookups["exact"],
@@ -1750,91 +1737,173 @@ def lookup_sid_cepr(
1750
1737
  dob_expr=pl.col(f"{cols['dob']}_clean"),
1751
1738
  label="exact",
1752
1739
  )
1753
-
1754
1740
  matched_frames.append(matched)
1755
1741
 
1756
- #
1757
- # STAGE 2
1758
- # MNAME
1759
- #
1760
-
1742
+ # middle-name variants
1761
1743
  if "mname" in cols:
1762
-
1763
- unmatched = (
1764
- unmatched
1765
- .with_columns(
1766
- *clean_other_name(cols["mname"])
1767
- )
1744
+ unmatched = unmatched.with_columns(
1745
+ *clean_other_name(cols["mname"])
1768
1746
  )
1769
1747
 
1770
- matched, unmatched = _run_match_stage(
1771
- unmatched,
1772
- lookup=lookups["mname"],
1773
- fname_expr=pl.concat_str(
1774
- [
1775
- pl.col(f"{cols['fname']}_clean"),
1776
- pl.col(f"{cols['mname']}_clean"),
1777
- ],
1778
- separator=" ",
1748
+ mname_stages = [
1749
+ (
1750
+ "left exact -> right fname + mname",
1751
+ lookups["mname"],
1752
+ pl.col(f"{cols['fname']}_clean"),
1753
+ pl.col(f"{cols['lname']}_clean"),
1779
1754
  ),
1780
- lname_expr=pl.col(f"{cols['lname']}_clean"),
1781
- dob_expr=pl.col(f"{cols['dob']}_clean"),
1782
- label="mname",
1783
- )
1784
-
1785
- matched_frames.append(matched)
1755
+ (
1756
+ "left exact -> right mname + lname",
1757
+ lookups["mname_lname"],
1758
+ pl.col(f"{cols['fname']}_clean"),
1759
+ pl.col(f"{cols['lname']}_clean"),
1760
+ ),
1761
+ (
1762
+ "left exact -> right fname + mname no space",
1763
+ lookups["mname_nospace"],
1764
+ pl.col(f"{cols['fname']}_clean"),
1765
+ pl.col(f"{cols['lname']}_clean"),
1766
+ ),
1767
+ (
1768
+ "left fname + mname -> right exact",
1769
+ lookups["exact"],
1770
+ pl.concat_str(
1771
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
1772
+ separator=" ",
1773
+ ),
1774
+ pl.col(f"{cols['lname']}_clean"),
1775
+ ),
1776
+ (
1777
+ "left mname + lname -> right exact",
1778
+ lookups["exact"],
1779
+ pl.col(f"{cols['fname']}_clean"),
1780
+ pl.concat_str(
1781
+ [pl.col(f"{cols['mname']}_clean"), pl.col(f"{cols['lname']}_clean")],
1782
+ separator=" ",
1783
+ ),
1784
+ ),
1785
+ (
1786
+ "left fname + mname no space -> right exact",
1787
+ lookups["exact"],
1788
+ pl.concat_str(
1789
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
1790
+ separator="",
1791
+ ),
1792
+ pl.col(f"{cols['lname']}_clean"),
1793
+ ),
1794
+ ]
1786
1795
 
1787
- #
1788
- # STAGE 3
1789
- # SUFFIX
1790
- #
1796
+ for label, lookup, fname_expr, lname_expr in mname_stages:
1797
+ matched, unmatched = _run_match_stage(
1798
+ unmatched,
1799
+ lookup=lookup,
1800
+ fname_expr=fname_expr,
1801
+ lname_expr=lname_expr,
1802
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1803
+ label=label,
1804
+ )
1805
+ matched_frames.append(matched)
1791
1806
 
1807
+ # suffix variants
1792
1808
  if "suffix" in cols:
1809
+ unmatched = unmatched.with_columns(
1810
+ *clean_other_name(cols["suffix"])
1811
+ )
1793
1812
 
1794
- unmatched = (
1795
- unmatched
1796
- .with_columns(
1797
- *clean_other_name(cols["suffix"])
1813
+ suffix_stages = [
1814
+ (
1815
+ "left exact -> right lname + suffix",
1816
+ lookups["suffix"],
1817
+ pl.col(f"{cols['fname']}_clean"),
1818
+ pl.col(f"{cols['lname']}_clean"),
1819
+ ),
1820
+ (
1821
+ "left exact -> right fname + suffix",
1822
+ lookups["suffix_fname"],
1823
+ pl.col(f"{cols['fname']}_clean"),
1824
+ pl.col(f"{cols['lname']}_clean"),
1825
+ ),
1826
+ (
1827
+ "left exact -> right fname + suffix no space",
1828
+ lookups["suffix_fname_nospace"],
1829
+ pl.col(f"{cols['fname']}_clean"),
1830
+ pl.col(f"{cols['lname']}_clean"),
1831
+ ),
1832
+ (
1833
+ "left exact -> right lname + suffix no space",
1834
+ lookups["suffix_lname_nospace"],
1835
+ pl.col(f"{cols['fname']}_clean"),
1836
+ pl.col(f"{cols['lname']}_clean"),
1837
+ ),
1838
+ (
1839
+ "left fname + suffix -> right exact",
1840
+ lookups["exact"],
1841
+ pl.concat_str(
1842
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1843
+ separator=" ",
1844
+ ),
1845
+ pl.col(f"{cols['lname']}_clean"),
1846
+ ),
1847
+ (
1848
+ "left fname + suffix no space -> right exact",
1849
+ lookups["exact"],
1850
+ pl.concat_str(
1851
+ [pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1852
+ separator="",
1853
+ ),
1854
+ pl.col(f"{cols['lname']}_clean"),
1855
+ ),
1856
+ (
1857
+ "left lname + suffix -> right exact",
1858
+ lookups["exact"],
1859
+ pl.col(f"{cols['fname']}_clean"),
1860
+ pl.concat_str(
1861
+ [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1862
+ separator=" ",
1863
+ ),
1864
+ ),
1865
+ (
1866
+ "left lname + suffix no space -> right exact",
1867
+ lookups["exact"],
1868
+ pl.col(f"{cols['fname']}_clean"),
1869
+ pl.concat_str(
1870
+ [pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
1871
+ separator="",
1872
+ ),
1873
+ ),
1874
+ ]
1875
+
1876
+ for label, lookup, fname_expr, lname_expr in suffix_stages:
1877
+ matched, unmatched = _run_match_stage(
1878
+ unmatched,
1879
+ lookup=lookup,
1880
+ fname_expr=fname_expr,
1881
+ lname_expr=lname_expr,
1882
+ dob_expr=pl.col(f"{cols['dob']}_clean"),
1883
+ label=label,
1798
1884
  )
1799
- )
1885
+ matched_frames.append(matched)
1886
+
1887
+ # dob_imp variants
1888
+ dob_imp_stages = [
1889
+ "dob_imp",
1890
+ "dob_imp_minus_1",
1891
+ "dob_imp_plus_1",
1892
+ "dob_imp_minus_2",
1893
+ "dob_imp_plus_2",
1894
+ ]
1800
1895
 
1896
+ for key in dob_imp_stages:
1801
1897
  matched, unmatched = _run_match_stage(
1802
1898
  unmatched,
1803
- lookup=lookups["suffix"],
1899
+ lookup=lookups[key],
1804
1900
  fname_expr=pl.col(f"{cols['fname']}_clean"),
1805
- lname_expr=pl.concat_str(
1806
- [
1807
- pl.col(f"{cols['lname']}_clean"),
1808
- pl.col(f"{cols['suffix']}_clean"),
1809
- ],
1810
- separator=" ",
1811
- ),
1901
+ lname_expr=pl.col(f"{cols['lname']}_clean"),
1812
1902
  dob_expr=pl.col(f"{cols['dob']}_clean"),
1813
- label="suffix",
1903
+ label=key,
1814
1904
  )
1815
-
1816
1905
  matched_frames.append(matched)
1817
1906
 
1818
- #
1819
- # STAGE 4
1820
- # DOB IMP
1821
- #
1822
-
1823
- matched, unmatched = _run_match_stage(
1824
- unmatched,
1825
- lookup=lookups["dob_imp"],
1826
- fname_expr=pl.col(f"{cols['fname']}_clean"),
1827
- lname_expr=pl.col(f"{cols['lname']}_clean"),
1828
- dob_expr=pl.col(f"{cols['dob']}_clean"),
1829
- label="dob_imp",
1830
- )
1831
-
1832
- matched_frames.append(matched)
1833
-
1834
- #
1835
- # FINAL
1836
- #
1837
-
1838
1907
  result = pl.concat(
1839
1908
  matched_frames + [unmatched],
1840
1909
  how="diagonal_relaxed",
File without changes