PyPI - ltc-code - Versions diffs - 0.1.2__tar.gz → 0.1.3__tar.gz - Mend

ltc-code 0.1.2tar.gz → 0.1.3tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

{ltc_code-0.1.2 → ltc_code-0.1.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ltc-code
-Version: 0.1.2
+Version: 0.1.3
 Summary: Add your description here
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown

{ltc_code-0.1.2 → ltc_code-0.1.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "ltc-code"
-version = "0.1.2"
+version = "0.1.3"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.9"

{ltc_code-0.1.2 → ltc_code-0.1.3}/src/ltc_code/may27.py RENAMED Viewed

@@ -26,6 +26,7 @@ __all__ = [
     "consolidate_columns",
     "select_lottery_columns",
     "select_enr_columns",
+    "lookup_sid_cepr",
 ]
@@ -972,3 +973,444 @@ def select_enr_columns(frame: Frame) -> Frame:
             "spec_ed_flag": "sped",
         }
     )
+###############################################################################
+# SID/CEPR LOOKUP FROM ANNUAL CENSUS FILES
+#
+# Public entry point:
+#     matched = source.pipe(
+#         lookup_sid_cepr,
+#         cols=[
+#             "fname_clean", "lname_clean", "mname_clean",
+#             "suffix_clean", "dob_clean", "nickname_clean",
+#         ],
+#         cmo_name="Example CMO",
+#     )
+#
+# Census file locations come from paths.CENSUS_STUDENTS and the CMO code
+# mapping from the mappings module, so neither is passed with each dataset
+# call. Any fixed year-specific edits belong inside the per-year census loop.
+###############################################################################
+def _sid_name_key_expressions(
+    fname: str,
+    lname: str,
+    mname: Optional[str] = None,
+    suffix: Optional[str] = None,
+    nickname: Optional[str] = None,
+) -> List[Tuple[str, pl.Expr, pl.Expr]]:
+    # Start with the plain first-name / last-name key.
+    modes = [("fname, lname", pl.col(fname), pl.col(lname))]
+    # If a middle-name column exists, build keys where the middle name may have
+    # been appended to first name, prepended to last name, or glued to first name.
+    if mname:
+        has_mname = pl.col(mname).fill_null("").cast(pl.String).str.strip_chars().ne("")
+        modes.extend(
+            [
+                (
+                    "fname + mname, lname",
+                    pl.when(has_mname).then(pl.concat_str([fname, mname], separator=" ")),
+                    pl.col(lname),
+                ),
+                (
+                    "fname, mname + lname",
+                    pl.col(fname),
+                    pl.when(has_mname).then(pl.concat_str([mname, lname], separator=" ")),
+                ),
+                (
+                    "fname + mname without space, lname",
+                    pl.when(has_mname).then(pl.concat_str([fname, mname], separator="")),
+                    pl.col(lname),
+                ),
+            ]
+        )
+    # If a suffix column exists, build keys where suffix may sit with first or
+    # last name, either separated by a space or attached directly.
+    if suffix:
+        has_suffix = pl.col(suffix).fill_null("").cast(pl.String).str.strip_chars().ne("")
+        modes.extend(
+            [
+                (
+                    "fname + suffix, lname",
+                    pl.when(has_suffix).then(pl.concat_str([fname, suffix], separator=" ")),
+                    pl.col(lname),
+                ),
+                (
+                    "fname, lname + suffix",
+                    pl.col(fname),
+                    pl.when(has_suffix).then(pl.concat_str([lname, suffix], separator=" ")),
+                ),
+                (
+                    "fname + suffix without space, lname",
+                    pl.when(has_suffix).then(pl.concat_str([fname, suffix], separator="")),
+                    pl.col(lname),
+                ),
+                (
+                    "fname, lname + suffix without space",
+                    pl.col(fname),
+                    pl.when(has_suffix).then(pl.concat_str([lname, suffix], separator="")),
+                ),
+            ]
+        )
+    # If a nickname column exists on the left data, allow nickname to replace
+    # first name while keeping last name fixed.
+    if nickname:
+        has_nickname = (
+            pl.col(nickname).fill_null("").cast(pl.String).str.strip_chars().ne("")
+        )
+        modes.append(
+            (
+                "nickname, lname",
+                pl.when(has_nickname).then(pl.col(nickname)),
+                pl.col(lname),
+            )
+        )
+    # Add broad text-position variants that can happen even without middle,
+    # suffix, or nickname columns.
+    modes.extend(
+        [
+            (
+                "fname first word only, lname",
+                pl.col(fname).cast(pl.String).str.split(" ").list.first(),
+                pl.col(lname),
+            ),
+            (
+                "fname, lname after first word",
+                pl.col(fname),
+                pl.when(pl.col(lname).cast(pl.String).str.contains(r"\S+\s+\S+"))
+                .then(pl.col(lname).cast(pl.String).str.replace(r"^\S+\s+", ""))
+                .otherwise(pl.lit(None, dtype=pl.String)),
+            ),
+            (
+                "fname + lname in fname, lname",
+                pl.concat_str([fname, lname], separator=" "),
+                pl.col(lname),
+            ),
+        ]
+    )
+    # Return a list of labeled first-name and last-name expressions. The caller
+    # uses the label in printed diagnostics.
+    return modes
+def lookup_sid_cepr(
+    frame: Frame,
+    *,
+    cols: Union[Sequence[str], Mapping[str, str]],
+    cmo_name: str,
+) -> Frame:
+    """Add ``sid_cepr`` from annual census files using staged name/DOB keys.
+    ``cols`` can be a mapping with at least ``fname``, ``lname``, and ``dob``;
+    optional keys are ``mname``, ``suffix``, and ``nickname``. For a short list,
+    pass ``[fname, lname, dob]``. For the old six-column order, pass
+    ``[fname, lname, mname, suffix, dob, nickname]``.
+    """
+    # Get the census folder/path template from the project paths module.
+    from paths import CENSUS_STUDENTS
+    # Get the CMO code-to-name map. The fallback spelling handles the typo you
+    # mentioned in case the module is actually named mapppings.py.
+    try:
+        import mappings
+    except ImportError:
+        import mapppings as mappings
+    # This helper supports both eager and lazy Polars frames.
+    if not isinstance(frame, (pl.DataFrame, pl.LazyFrame)):
+        raise TypeError("lookup_sid_cepr expects a polars.DataFrame or polars.LazyFrame.")
+    # Normalize cols into a role dictionary, so later code can refer to
+    # "fname", "lname", "dob", etc. regardless of how the caller supplied it.
+    if isinstance(cols, Mapping):
+        left_columns = dict(cols)
+    else:
+        # Minimal list syntax: first name, last name, DOB.
+        if len(cols) == 3:
+            left_columns = {"fname": cols[0], "lname": cols[1], "dob": cols[2]}
+        # Full list syntax: first, last, middle, suffix, DOB, nickname.
+        elif len(cols) == 6:
+            left_columns = {
+                "fname": cols[0],
+                "lname": cols[1],
+                "mname": cols[2],
+                "suffix": cols[3],
+                "dob": cols[4],
+                "nickname": cols[5],
+            }
+        else:
+            raise ValueError(
+                "cols must be either [fname, lname, dob], "
+                "[fname, lname, mname, suffix, dob, nickname], or a mapping."
+            )
+    # First name, last name, and DOB are required for any lookup.
+    missing_roles = [role for role in ("fname", "lname", "dob") if role not in left_columns]
+    if missing_roles:
+        raise ValueError("cols is missing required roles: %s" % ", ".join(missing_roles))
+    # Preserve the input frame type at the end.
+    is_lazy = isinstance(frame, pl.LazyFrame)
+    # Get the source column names without collecting the whole frame if it is lazy.
+    input_columns = frame.collect_schema().names() if is_lazy else frame.columns
+    # Make sure every supplied left-side column actually exists.
+    supplied_columns = [column for column in left_columns.values() if column]
+    missing = [column for column in supplied_columns if column not in input_columns]
+    if missing:
+        raise ValueError("Lookup source is missing columns: %s" % ", ".join(missing))
+    # Find the CMO mapping dictionary from mappings.py. The first block checks
+    # common names; the fallback finds any dict whose values include cmo_name.
+    cmo_codes = None
+    for mapping_name in (
+        "CMO_CODE_TO_NAME",
+        "CMO_CODE_TO_CMO_NAME",
+        "CMO_CODES",
+        "CMO_CODE_MAP",
+        "cmo_code_to_name",
+        "cmo_code_to_cmo_name",
+        "cmo_codes",
+    ):
+        cmo_codes = getattr(mappings, mapping_name, None)
+        if cmo_codes is not None:
+            break
+    if cmo_codes is None:
+        for value in vars(mappings).values():
+            if isinstance(value, dict) and cmo_name in set(value.values()):
+                cmo_codes = value
+                break
+    if cmo_codes is None:
+        raise ValueError("Could not find a CMO code-to-name mapping in mappings.py.")
+    # Scan every annual census CSV lazily, standardize the small set of columns
+    # needed for matching, map CMO code to CMO name, filter to the requested CMO,
+    # and store each lazy frame for appending.
+    annual_frames = []
+    for year in range(1994, 2023):
+        # CENSUS_STUDENTS can either be a folder or a path template with {year}.
+        census_folder = str(CENSUS_STUDENTS)
+        if "{year}" in census_folder:
+            path = census_folder.format(year=year)
+        else:
+            path = "%s/census_students_%s.csv" % (census_folder.rstrip("/"), year)
+        # Read all columns as strings so IDs, codes, and DOB text are not
+        # guessed into inconsistent types across years.
+        annual = pl.scan_csv(path, infer_schema=False)
+        # Some years may not have every optional column; add null versions so
+        # all years can be appended with the same schema.
+        annual_columns = annual.collect_schema().names()
+        optional = []
+        if "mname_clean" not in annual_columns:
+            optional.append(pl.lit(None, dtype=pl.String).alias("mname_clean"))
+        if "suffix_clean" not in annual_columns:
+            optional.append(pl.lit(None, dtype=pl.String).alias("suffix_clean"))
+        if "dob_imp" not in annual_columns:
+            optional.append(pl.lit(None, dtype=pl.String).alias("dob_imp"))
+        # Apply year-specific cleanup here if needed before appending.
+        annual_frames.append(
+            annual.with_columns(optional)
+            # Convert CMO code to CMO name.
+            .with_columns(
+                pl.col("cmo_code")
+                .cast(pl.String)
+                .replace_strict(
+                    {str(code): name for code, name in cmo_codes.items()},
+                    default=None,
+                    return_dtype=pl.String,
+                )
+                .alias("cmo_name")
+            )
+            # Keep only the CMO requested by the caller.
+            .filter(pl.col("cmo_name") == cmo_name)
+            # Keep only right-side columns needed for matching and the SID.
+            .select(
+                [
+                    "sid_cepr",
+                    "fname_clean",
+                    "mname_clean",
+                    "lname_clean",
+                    "suffix_clean",
+                    "dob_clean",
+                    "dob_imp",
+                ]
+            )
+        )
+    # Append all annual census lazy frames into one lazy right-side frame.
+    census = pl.concat(annual_frames, how="vertical_relaxed")
+    # Do the matching lazily even when the caller gave an eager DataFrame.
+    current = frame if is_lazy else frame.lazy()
+    # If the left data already has sid_cepr, preserve it as the initial matched
+    # value; otherwise start every row unmatched.
+    if "sid_cepr" in input_columns:
+        current = current.with_columns(pl.col("sid_cepr").alias("_sid_matched"))
+    else:
+        current = current.with_columns(pl.lit(None, dtype=pl.String).alias("_sid_matched"))
+    # Count total left-side rows once for diagnostics.
+    total_rows = current.select(pl.len().alias("rows")).collect().item()
+    # Count how many already had sid_cepr before any lookup pass.
+    previous_matched = (
+        current.select(pl.col("_sid_matched").is_not_null().sum()).collect().item()
+    )
+    # Build possible left-side name keys from the caller's available columns.
+    left_name_modes = _sid_name_key_expressions(
+        left_columns["fname"],
+        left_columns["lname"],
+        left_columns.get("mname"),
+        left_columns.get("suffix"),
+        left_columns.get("nickname"),
+    )
+    # Build possible right-side name keys from the census cleaned columns.
+    right_name_modes = _sid_name_key_expressions(
+        "fname_clean", "lname_clean", "mname_clean", "suffix_clean"
+    )
+    # First try exact DOB. Then, for remaining unmatched rows, use imputed DOB
+    # as-is and shifted by one or two years in either direction.
+    dob_passes = [("dob_clean", 0, "dob_clean")]
+    dob_passes.extend(
+        [
+            ("dob_imp", 0, "dob_imp"),
+            ("dob_imp", -1, "dob_imp - 1 year"),
+            ("dob_imp", 1, "dob_imp + 1 year"),
+            ("dob_imp", -2, "dob_imp - 2 years"),
+            ("dob_imp", 2, "dob_imp + 2 years"),
+        ]
+    )
+    print("SID/CEPR lookup diagnostics for CMO %r (%s rows)" % (cmo_name, total_rows))
+    # Try every ordered DOB pass, then name-key pass, printing coverage each time.
+    for dob_column, day_offset, dob_description in dob_passes:
+        for left_label, left_fname, left_lname in left_name_modes:
+            for right_label, right_fname, right_lname in right_name_modes:
+                # Avoid very loose fuzzy matching: mutate only one side at a
+                # time, except the exact-vs-exact pass.
+                if left_label != "fname, lname" and right_label != "fname, lname":
+                    continue
+                # Build the right-side DOB key, optionally shifted by calendar years.
+                right_dob = pl.col(dob_column).cast(pl.Date, strict=False)
+                if day_offset:
+                    right_dob = right_dob.dt.offset_by("%+dy" % day_offset)
+                # Collapse the census/right side to one row per lookup key.
+                # A key is usable only when it points to exactly one sid_cepr.
+                candidates = (
+                    census.select(
+                        [
+                            right_fname.alias("_fname_key"),
+                            right_lname.alias("_lname_key"),
+                            right_dob.alias("_dob_key"),
+                            "sid_cepr",
+                        ]
+                    )
+                    .drop_nulls(["_fname_key", "_lname_key", "_dob_key", "sid_cepr"])
+                    .group_by(["_fname_key", "_lname_key", "_dob_key"])
+                    .agg(pl.col("sid_cepr").unique().alias("_sids"))
+                    .with_columns(pl.col("_sids").list.len().alias("_sid_count"))
+                )
+                # Count ambiguous right-side keys for diagnostics; these are
+                # deliberately omitted rather than guessed.
+                ambiguous = (
+                    candidates.filter(pl.col("_sid_count") > 1)
+                    .select(pl.len())
+                    .collect()
+                    .item()
+                )
+                # Keep only unambiguous right-side keys and extract their SID.
+                lookup = candidates.filter(pl.col("_sid_count") == 1).select(
+                    [
+                        "_fname_key",
+                        "_lname_key",
+                        "_dob_key",
+                        pl.col("_sids").list.first().alias("_sid_candidate"),
+                    ]
+                )
+                # Join the current left frame to this lookup key. Existing
+                # matches win; this pass only fills still-null _sid_matched.
+                current = (
+                    current.with_columns(
+                        [
+                            left_fname.alias("_fname_key"),
+                            left_lname.alias("_lname_key"),
+                            pl.col(left_columns["dob"])
+                            .cast(pl.Date, strict=False)
+                            .alias("_dob_key"),
+                        ]
+                    )
+                    .join(
+                        lookup,
+                        on=["_fname_key", "_lname_key", "_dob_key"],
+                        how="left",
+                        validate="m:1",
+                    )
+                    .with_columns(
+                        pl.coalesce(["_sid_matched", "_sid_candidate"]).alias(
+                            "_sid_matched"
+                        )
+                    )
+                    .drop(["_fname_key", "_lname_key", "_dob_key", "_sid_candidate"])
+                )
+                # Count cumulative matches after this pass.
+                matched = (
+                    current.select(pl.col("_sid_matched").is_not_null().sum())
+                    .collect()
+                    .item()
+                )
+                # Print the cumulative match count and how much this pass added.
+                lookup_key = "%s -> %s using %s" % (
+                    left_label,
+                    right_label,
+                    dob_description,
+                )
+                ambiguity = (
+                    "; %s ambiguous right keys omitted" % ambiguous if ambiguous else ""
+                )
+                print(
+                    "  %s: %s/%s matched (+%s)%s"
+                    % (
+                        lookup_key,
+                        matched,
+                        total_rows,
+                        matched - previous_matched,
+                        ambiguity,
+                    )
+                )
+                previous_matched = matched
+    # Final unmatched count after all lookup passes.
+    unmatched_rows = total_rows - previous_matched
+    print("  final unmatched sid_cepr rows: %s/%s" % (unmatched_rows, total_rows))
+    # Return exactly the input columns plus sid_cepr if it was not already there.
+    output_columns = list(input_columns)
+    if "sid_cepr" not in output_columns:
+        output_columns.append("sid_cepr")
+    # Replace/add sid_cepr from the internal matched SID column.
+    result = current.with_columns(pl.col("_sid_matched").alias("sid_cepr")).select(output_columns)
+    # Restore eager output when the input was eager.
+    if not is_lazy:
+        result = result.collect()
+    return result

{ltc_code-0.1.2 → ltc_code-0.1.3}/README.md RENAMED Viewed

File without changes

{ltc_code-0.1.2 → ltc_code-0.1.3}/src/ltc_code/__init__.py RENAMED Viewed

File without changes

{ltc_code-0.1.2 → ltc_code-0.1.3}/src/ltc_code/polars_dates.py RENAMED Viewed

File without changes

ltc-code 0.1.2__tar.gz → 0.1.3__tar.gz

ltc-code 0.1.2tar.gz → 0.1.3tar.gz