ltc-code 0.1.5__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ltc_code-0.1.5 → ltc_code-0.1.6}/PKG-INFO +1 -1
- {ltc_code-0.1.5 → ltc_code-0.1.6}/pyproject.toml +1 -1
- {ltc_code-0.1.5 → ltc_code-0.1.6}/src/ltc_code/may27.py +156 -87
- {ltc_code-0.1.5 → ltc_code-0.1.6}/README.md +0 -0
- {ltc_code-0.1.5 → ltc_code-0.1.6}/src/ltc_code/__init__.py +0 -0
- {ltc_code-0.1.5 → ltc_code-0.1.6}/src/ltc_code/polars_dates.py +0 -0
|
@@ -1713,35 +1713,22 @@ def lookup_sid_cepr(
|
|
|
1713
1713
|
) -> Frame:
|
|
1714
1714
|
|
|
1715
1715
|
is_lazy = isinstance(frame, pl.LazyFrame)
|
|
1716
|
+
current = frame.collect() if is_lazy else frame
|
|
1716
1717
|
|
|
1717
|
-
current = (
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1718
|
+
current = current.with_columns(
|
|
1719
|
+
*clean_name(cols["fname"]),
|
|
1720
|
+
*clean_name(cols["lname"]),
|
|
1721
|
+
*clean_dob(col=cols["dob"]),
|
|
1721
1722
|
)
|
|
1722
1723
|
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
#
|
|
1726
|
-
|
|
1727
|
-
current = (
|
|
1728
|
-
current
|
|
1729
|
-
.with_columns(
|
|
1730
|
-
*clean_name(cols["fname"]),
|
|
1731
|
-
*clean_name(cols["lname"]),
|
|
1732
|
-
*clean_dob(col=cols["dob"]),
|
|
1733
|
-
)
|
|
1724
|
+
current = current.with_columns(
|
|
1725
|
+
_parse_dob_expr(f"{cols['dob']}_clean").alias(f"{cols['dob']}_clean")
|
|
1734
1726
|
)
|
|
1735
1727
|
|
|
1736
1728
|
matched_frames = []
|
|
1737
|
-
|
|
1738
1729
|
unmatched = current
|
|
1739
1730
|
|
|
1740
|
-
#
|
|
1741
|
-
# STAGE 1
|
|
1742
|
-
# EXACT
|
|
1743
|
-
#
|
|
1744
|
-
|
|
1731
|
+
# exact
|
|
1745
1732
|
matched, unmatched = _run_match_stage(
|
|
1746
1733
|
unmatched,
|
|
1747
1734
|
lookup=lookups["exact"],
|
|
@@ -1750,91 +1737,173 @@ def lookup_sid_cepr(
|
|
|
1750
1737
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1751
1738
|
label="exact",
|
|
1752
1739
|
)
|
|
1753
|
-
|
|
1754
1740
|
matched_frames.append(matched)
|
|
1755
1741
|
|
|
1756
|
-
#
|
|
1757
|
-
# STAGE 2
|
|
1758
|
-
# MNAME
|
|
1759
|
-
#
|
|
1760
|
-
|
|
1742
|
+
# middle-name variants
|
|
1761
1743
|
if "mname" in cols:
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
unmatched
|
|
1765
|
-
.with_columns(
|
|
1766
|
-
*clean_other_name(cols["mname"])
|
|
1767
|
-
)
|
|
1744
|
+
unmatched = unmatched.with_columns(
|
|
1745
|
+
*clean_other_name(cols["mname"])
|
|
1768
1746
|
)
|
|
1769
1747
|
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
[
|
|
1775
|
-
|
|
1776
|
-
pl.col(f"{cols['mname']}_clean"),
|
|
1777
|
-
],
|
|
1778
|
-
separator=" ",
|
|
1748
|
+
mname_stages = [
|
|
1749
|
+
(
|
|
1750
|
+
"left exact -> right fname + mname",
|
|
1751
|
+
lookups["mname"],
|
|
1752
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1753
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1779
1754
|
),
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
1755
|
+
(
|
|
1756
|
+
"left exact -> right mname + lname",
|
|
1757
|
+
lookups["mname_lname"],
|
|
1758
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1759
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1760
|
+
),
|
|
1761
|
+
(
|
|
1762
|
+
"left exact -> right fname + mname no space",
|
|
1763
|
+
lookups["mname_nospace"],
|
|
1764
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1765
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1766
|
+
),
|
|
1767
|
+
(
|
|
1768
|
+
"left fname + mname -> right exact",
|
|
1769
|
+
lookups["exact"],
|
|
1770
|
+
pl.concat_str(
|
|
1771
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
|
|
1772
|
+
separator=" ",
|
|
1773
|
+
),
|
|
1774
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1775
|
+
),
|
|
1776
|
+
(
|
|
1777
|
+
"left mname + lname -> right exact",
|
|
1778
|
+
lookups["exact"],
|
|
1779
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1780
|
+
pl.concat_str(
|
|
1781
|
+
[pl.col(f"{cols['mname']}_clean"), pl.col(f"{cols['lname']}_clean")],
|
|
1782
|
+
separator=" ",
|
|
1783
|
+
),
|
|
1784
|
+
),
|
|
1785
|
+
(
|
|
1786
|
+
"left fname + mname no space -> right exact",
|
|
1787
|
+
lookups["exact"],
|
|
1788
|
+
pl.concat_str(
|
|
1789
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['mname']}_clean")],
|
|
1790
|
+
separator="",
|
|
1791
|
+
),
|
|
1792
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1793
|
+
),
|
|
1794
|
+
]
|
|
1786
1795
|
|
|
1787
|
-
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1796
|
+
for label, lookup, fname_expr, lname_expr in mname_stages:
|
|
1797
|
+
matched, unmatched = _run_match_stage(
|
|
1798
|
+
unmatched,
|
|
1799
|
+
lookup=lookup,
|
|
1800
|
+
fname_expr=fname_expr,
|
|
1801
|
+
lname_expr=lname_expr,
|
|
1802
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1803
|
+
label=label,
|
|
1804
|
+
)
|
|
1805
|
+
matched_frames.append(matched)
|
|
1791
1806
|
|
|
1807
|
+
# suffix variants
|
|
1792
1808
|
if "suffix" in cols:
|
|
1809
|
+
unmatched = unmatched.with_columns(
|
|
1810
|
+
*clean_other_name(cols["suffix"])
|
|
1811
|
+
)
|
|
1793
1812
|
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1813
|
+
suffix_stages = [
|
|
1814
|
+
(
|
|
1815
|
+
"left exact -> right lname + suffix",
|
|
1816
|
+
lookups["suffix"],
|
|
1817
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1818
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1819
|
+
),
|
|
1820
|
+
(
|
|
1821
|
+
"left exact -> right fname + suffix",
|
|
1822
|
+
lookups["suffix_fname"],
|
|
1823
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1824
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1825
|
+
),
|
|
1826
|
+
(
|
|
1827
|
+
"left exact -> right fname + suffix no space",
|
|
1828
|
+
lookups["suffix_fname_nospace"],
|
|
1829
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1830
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1831
|
+
),
|
|
1832
|
+
(
|
|
1833
|
+
"left exact -> right lname + suffix no space",
|
|
1834
|
+
lookups["suffix_lname_nospace"],
|
|
1835
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1836
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1837
|
+
),
|
|
1838
|
+
(
|
|
1839
|
+
"left fname + suffix -> right exact",
|
|
1840
|
+
lookups["exact"],
|
|
1841
|
+
pl.concat_str(
|
|
1842
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1843
|
+
separator=" ",
|
|
1844
|
+
),
|
|
1845
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1846
|
+
),
|
|
1847
|
+
(
|
|
1848
|
+
"left fname + suffix no space -> right exact",
|
|
1849
|
+
lookups["exact"],
|
|
1850
|
+
pl.concat_str(
|
|
1851
|
+
[pl.col(f"{cols['fname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1852
|
+
separator="",
|
|
1853
|
+
),
|
|
1854
|
+
pl.col(f"{cols['lname']}_clean"),
|
|
1855
|
+
),
|
|
1856
|
+
(
|
|
1857
|
+
"left lname + suffix -> right exact",
|
|
1858
|
+
lookups["exact"],
|
|
1859
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1860
|
+
pl.concat_str(
|
|
1861
|
+
[pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1862
|
+
separator=" ",
|
|
1863
|
+
),
|
|
1864
|
+
),
|
|
1865
|
+
(
|
|
1866
|
+
"left lname + suffix no space -> right exact",
|
|
1867
|
+
lookups["exact"],
|
|
1868
|
+
pl.col(f"{cols['fname']}_clean"),
|
|
1869
|
+
pl.concat_str(
|
|
1870
|
+
[pl.col(f"{cols['lname']}_clean"), pl.col(f"{cols['suffix']}_clean")],
|
|
1871
|
+
separator="",
|
|
1872
|
+
),
|
|
1873
|
+
),
|
|
1874
|
+
]
|
|
1875
|
+
|
|
1876
|
+
for label, lookup, fname_expr, lname_expr in suffix_stages:
|
|
1877
|
+
matched, unmatched = _run_match_stage(
|
|
1878
|
+
unmatched,
|
|
1879
|
+
lookup=lookup,
|
|
1880
|
+
fname_expr=fname_expr,
|
|
1881
|
+
lname_expr=lname_expr,
|
|
1882
|
+
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1883
|
+
label=label,
|
|
1798
1884
|
)
|
|
1799
|
-
|
|
1885
|
+
matched_frames.append(matched)
|
|
1886
|
+
|
|
1887
|
+
# dob_imp variants
|
|
1888
|
+
dob_imp_stages = [
|
|
1889
|
+
"dob_imp",
|
|
1890
|
+
"dob_imp_minus_1",
|
|
1891
|
+
"dob_imp_plus_1",
|
|
1892
|
+
"dob_imp_minus_2",
|
|
1893
|
+
"dob_imp_plus_2",
|
|
1894
|
+
]
|
|
1800
1895
|
|
|
1896
|
+
for key in dob_imp_stages:
|
|
1801
1897
|
matched, unmatched = _run_match_stage(
|
|
1802
1898
|
unmatched,
|
|
1803
|
-
lookup=lookups[
|
|
1899
|
+
lookup=lookups[key],
|
|
1804
1900
|
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1805
|
-
lname_expr=pl.
|
|
1806
|
-
[
|
|
1807
|
-
pl.col(f"{cols['lname']}_clean"),
|
|
1808
|
-
pl.col(f"{cols['suffix']}_clean"),
|
|
1809
|
-
],
|
|
1810
|
-
separator=" ",
|
|
1811
|
-
),
|
|
1901
|
+
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1812
1902
|
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1813
|
-
label=
|
|
1903
|
+
label=key,
|
|
1814
1904
|
)
|
|
1815
|
-
|
|
1816
1905
|
matched_frames.append(matched)
|
|
1817
1906
|
|
|
1818
|
-
#
|
|
1819
|
-
# STAGE 4
|
|
1820
|
-
# DOB IMP
|
|
1821
|
-
#
|
|
1822
|
-
|
|
1823
|
-
matched, unmatched = _run_match_stage(
|
|
1824
|
-
unmatched,
|
|
1825
|
-
lookup=lookups["dob_imp"],
|
|
1826
|
-
fname_expr=pl.col(f"{cols['fname']}_clean"),
|
|
1827
|
-
lname_expr=pl.col(f"{cols['lname']}_clean"),
|
|
1828
|
-
dob_expr=pl.col(f"{cols['dob']}_clean"),
|
|
1829
|
-
label="dob_imp",
|
|
1830
|
-
)
|
|
1831
|
-
|
|
1832
|
-
matched_frames.append(matched)
|
|
1833
|
-
|
|
1834
|
-
#
|
|
1835
|
-
# FINAL
|
|
1836
|
-
#
|
|
1837
|
-
|
|
1838
1907
|
result = pl.concat(
|
|
1839
1908
|
matched_frames + [unmatched],
|
|
1840
1909
|
how="diagonal_relaxed",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|