ltc-code 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ltc-code
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: Add your description here
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ltc-code"
3
- version = "0.1.1"
3
+ version = "0.1.2"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -0,0 +1,974 @@
1
+ """Reusable helper functions for the LTC project."""
2
+
3
+ from collections import Counter
4
+ from dataclasses import dataclass
5
+ from functools import lru_cache
6
+ from itertools import combinations
7
+ import math
8
+ import re
9
+ import warnings
10
+ from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, Union
11
+
12
+ import polars as pl
13
+ with warnings.catch_warnings():
14
+ warnings.filterwarnings("ignore", message="urllib3 v2 only supports OpenSSL")
15
+ from scourgify import normalize_address_record
16
+ from scourgify.exceptions import AddressNormalizationError
17
+ import usaddress
18
+
19
+
20
# Public API of this module, grouped by the section that defines each name.
__all__ = [
    # Address standardization.
    "clean_addresses",
    "clean_address_columns",
    # Horizontal column consolidation.
    "ColumnConflictError",
    "ConsolidationDiagnostics",
    "GroupDiagnostics",
    "consolidate_columns",
    # Source-specific column selection.
    "select_lottery_columns",
    "select_enr_columns",
]
30
+
31
+
32
+ ###############################################################################
33
+ # ADDRESS STANDARDIZATION AND CONFLICT REVIEW
34
+ #
35
+ # Public entry point:
36
+ # cleaned = frame.pipe(clean_addresses)
37
+ #
38
+ # This section reconciles one to three optional address-bearing columns
39
+ # (`address`, `address1`, and `address2` by default) into standardized output
40
+ # fields and an integer `address_conflict` review flag.
41
+ #
42
+ # Operational notes:
43
+ # - Parsing uses usaddress-scourgify with usaddress fallback for fragments.
44
+ # - Parsed text is standardized, not verified against an official address file.
45
+ # - Repeated exact raw values are cached in memory only for the Python process.
46
+ # - DataFrame inputs print diagnostics while returning a DataFrame.
47
+ # - LazyFrame inputs also print diagnostics; doing so executes separate summary
48
+ # and example queries immediately, while the returned result remains lazy.
49
+ ###############################################################################
50
+
51
# Column names looked up automatically when the caller names no address columns.
DEFAULT_ADDRESS_COLUMNS = ("address", "address1", "address2")

# Struct schema returned per row by the address reconciliation step; the field
# order here is the order of the columns produced when the struct is unnested.
OUTPUT_DTYPE = pl.Struct(
    [
        pl.Field("address1_clean", pl.String),
        pl.Field("address2_clean", pl.String),
        pl.Field("zipcode_clean", pl.String),
        pl.Field("city_clean", pl.String),
        pl.Field("state_clean", pl.String),
        pl.Field("address_conflict", pl.Int8),
        pl.Field("address_conflict_reason", pl.String),
        pl.Field("address_parse_ok", pl.Boolean),
    ]
)
64
+
65
# Upper-case full state name -> two-letter postal abbreviation (50 states,
# alphabetical by full name; insertion order is preserved).
STATE_ABBREVIATIONS = dict(
    (
        ("ALABAMA", "AL"),
        ("ALASKA", "AK"),
        ("ARIZONA", "AZ"),
        ("ARKANSAS", "AR"),
        ("CALIFORNIA", "CA"),
        ("COLORADO", "CO"),
        ("CONNECTICUT", "CT"),
        ("DELAWARE", "DE"),
        ("FLORIDA", "FL"),
        ("GEORGIA", "GA"),
        ("HAWAII", "HI"),
        ("IDAHO", "ID"),
        ("ILLINOIS", "IL"),
        ("INDIANA", "IN"),
        ("IOWA", "IA"),
        ("KANSAS", "KS"),
        ("KENTUCKY", "KY"),
        ("LOUISIANA", "LA"),
        ("MAINE", "ME"),
        ("MARYLAND", "MD"),
        ("MASSACHUSETTS", "MA"),
        ("MICHIGAN", "MI"),
        ("MINNESOTA", "MN"),
        ("MISSISSIPPI", "MS"),
        ("MISSOURI", "MO"),
        ("MONTANA", "MT"),
        ("NEBRASKA", "NE"),
        ("NEVADA", "NV"),
        ("NEW HAMPSHIRE", "NH"),
        ("NEW JERSEY", "NJ"),
        ("NEW MEXICO", "NM"),
        ("NEW YORK", "NY"),
        ("NORTH CAROLINA", "NC"),
        ("NORTH DAKOTA", "ND"),
        ("OHIO", "OH"),
        ("OKLAHOMA", "OK"),
        ("OREGON", "OR"),
        ("PENNSYLVANIA", "PA"),
        ("RHODE ISLAND", "RI"),
        ("SOUTH CAROLINA", "SC"),
        ("SOUTH DAKOTA", "SD"),
        ("TENNESSEE", "TN"),
        ("TEXAS", "TX"),
        ("UTAH", "UT"),
        ("VERMONT", "VT"),
        ("VIRGINIA", "VA"),
        ("WASHINGTON", "WA"),
        ("WEST VIRGINIA", "WV"),
        ("WISCONSIN", "WI"),
        ("WYOMING", "WY"),
    )
)
117
+
118
# Keys of a parsed-address candidate dict, in the order components are resolved.
COMPONENT_KEYS = ("address_line_1", "address_line_2", "city", "state", "postal_code")
125
+
126
+
127
+ def _clean_text(value: Any) -> Optional[str]:
128
+ if value is None:
129
+ return None
130
+ text = str(value).strip()
131
+ return text or None
132
+
133
+
134
def _prepare_address(value: str) -> str:
    """Upper-case and lightly rewrite a raw address string before parsing.

    Steps, in order:
      1. upper-case the text and drop a literal ``ZIP`` / ``ZIP:`` marker;
      2. turn ``|``, ``;``, ``/`` and runs of two or more dashes into ``", "``;
      3. drop a ``#`` that directly precedes APT/STE/SUITE/UNIT/FLOOR;
      4. abbreviate a full state name when it is followed (optionally after a
         comma) by a ZIP code or by the end of the string;
      5. reorder ``line, city, ZIP, ST`` into ``line, city, ST ZIP``;
      6. reorder ``line, city ST-ZIP rest`` into ``line, rest, city, ST ZIP``;
      7. collapse whitespace and strip leading/trailing spaces and commas.

    Args:
        value: Raw, non-empty address text.

    Returns:
        The rewritten text; it may be empty if nothing but markers remained.
    """
    text = value.upper()
    text = re.sub(r"\bZIP\s*:?\s*", "", text)
    text = re.sub(r"\s*[|;/]\s*", ", ", text)
    text = re.sub(r"\s*--+\s*", ", ", text)
    text = re.sub(r"#\s*(APT|STE|SUITE|UNIT|FLOOR)\b", r"\1", text)
    # Longest names first: otherwise "VIRGINIA" is rewritten inside
    # "WEST VIRGINIA" and the result is the bogus "WEST VA".
    for state in sorted(STATE_ABBREVIATIONS, key=len, reverse=True):
        text = re.sub(
            r"\b%s\b(?=\s*,?\s*(?:\d{5}(?:-\d{4})?\b|$))" % re.escape(state),
            STATE_ABBREVIATIONS[state],
            text,
        )
    text = re.sub(
        r"^(.*?),\s*(.*?),\s*(\d{5}(?:-\d{4})?),\s*([A-Z]{2})$",
        r"\1, \2, \4 \3",
        text,
    )
    text = re.sub(
        r"^(.*?),\s*(.*?)\s+([A-Z]{2})-(\d{5}(?:-\d{4})?)\s+(.+)$",
        r"\1, \5, \2, \3 \4",
        text,
    )
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ,")
158
+
159
+
160
+ def _join_tag_values(tags: Dict[str, str], keys: Iterable[str]) -> Optional[str]:
161
+ values = [tags[key].upper() for key in keys if tags.get(key)]
162
+ return " ".join(values) if values else None
163
+
164
+
165
def _from_usaddress(value: str) -> Dict[str, Optional[str]]:
    """Build a candidate component dict from ``usaddress.tag`` output.

    When ``usaddress`` raises ``RepeatedLabelError`` every component is
    ``None``. Otherwise the tagged parts are upper-cased and grouped into the
    five ``COMPONENT_KEYS`` entries; parts that were not tagged become ``None``.
    """
    try:
        tags, _ = usaddress.tag(value)
    except usaddress.RepeatedLabelError:
        return dict.fromkeys(COMPONENT_KEYS)

    # Tags that make up the first address line, in output order.
    street_tags = (
        "USPSBoxType",
        "USPSBoxID",
        "AddressNumber",
        "StreetNamePreDirectional",
        "StreetNamePreType",
        "StreetName",
        "StreetNamePostType",
        "StreetNamePostDirectional",
    )
    # Tags that make up the second (unit) line, in output order.
    unit_tags = (
        "OccupancyType",
        "OccupancyIdentifier",
        "SubaddressType",
        "SubaddressIdentifier",
    )
    state = tags.get("StateName")
    city = tags.get("PlaceName", "").upper()
    postal_code = tags.get("ZipCode", "").upper()
    return {
        "address_line_1": _join_tag_values(tags, street_tags),
        "address_line_2": _join_tag_values(tags, unit_tags),
        "city": city or None,
        "state": state.upper() if state else None,
        "postal_code": postal_code or None,
    }
196
+
197
+
198
@lru_cache(maxsize=100000)
def _parse_address(value: str) -> Dict[str, Optional[str]]:
    """Parse one raw address string into the ``COMPONENT_KEYS`` components.

    The prepared text is passed to scourgify's ``normalize_address_record``;
    if that raises ``AddressNormalizationError`` the ``usaddress``-based
    fallback is used on the same prepared text. Results are memoized per exact
    raw ``value``, so the returned dict object is shared between calls with the
    same input and should be treated as read-only.
    """
    prepared = _prepare_address(value)
    try:
        normalized = normalize_address_record(prepared)
    except AddressNormalizationError:
        return _from_usaddress(prepared)
    else:
        return {key: normalized.get(key) for key in COMPONENT_KEYS}
206
+
207
+
208
+ def _candidate_score(candidate: Dict[str, Optional[str]]) -> int:
209
+ return (
210
+ (4 if candidate["address_line_1"] else 0)
211
+ + (1 if candidate["address_line_2"] else 0)
212
+ + (1 if candidate["city"] else 0)
213
+ + (1 if candidate["state"] else 0)
214
+ + (1 if candidate["postal_code"] else 0)
215
+ )
216
+
217
+
218
+ def _comparison_value(key: str, value: str) -> str:
219
+ text = re.sub(r"[^A-Z0-9# ]+", " ", value.upper())
220
+ text = re.sub(r"\s+", " ", text).strip()
221
+ if key == "address_line_2":
222
+ text = re.sub(r"^APT\s+", "APT ", text)
223
+ text = re.sub(r"^(FLOOR|FL)\s+", "FL ", text)
224
+ text = re.sub(r"^(SUITE|STE)\s+", "STE ", text)
225
+ text = re.sub(r"^(APARTMENT)\s+", "APT ", text)
226
+ return text
227
+
228
+
229
def _resolve_component(
    candidates: List[Dict[str, Optional[str]]], key: str
) -> Tuple[Optional[str], List[str]]:
    """Pick one value for ``key`` across candidates and list distinct variants.

    Values are compared through ``_comparison_value``. The chosen value is the
    raw value from the highest-scoring candidate (ties keep input order) whose
    normalized form is among the most frequently observed forms.

    Returns:
        ``(chosen, variants)`` where ``variants`` is the sorted list of raw
        values, one per distinct normalized form (the first one seen). More
        than one variant means the sources disagree. ``(None, [])`` when no
        candidate has a value for ``key``.
    """
    observed = [candidate[key] for candidate in candidates if candidate[key]]
    if not observed:
        return None, []

    tallies = Counter(_comparison_value(key, raw) for raw in observed)
    top_count = max(tallies.values())
    leaders = {form for form, count in tallies.items() if count == top_count}

    ranked = sorted(candidates, key=_candidate_score, reverse=True)
    chosen = next(
        candidate[key]
        for candidate in ranked
        if candidate[key] and _comparison_value(key, candidate[key]) in leaders
    )

    # Keep the first raw spelling seen for each normalized form.
    first_seen = {}
    for raw in observed:
        first_seen.setdefault(_comparison_value(key, raw), raw)
    return chosen, sorted(first_seen.values())
251
+
252
+
253
def _reconcile_row(row: Dict[str, Any], address_columns: Sequence[str]) -> Dict[str, Any]:
    """Reconcile the address columns of one row into the output struct fields.

    Each non-blank value in ``address_columns`` is parsed into a candidate;
    every component is then resolved across candidates. A component with more
    than one distinct variant is recorded as ``"<key>: a <> b"`` in the
    conflict reason.

    Returns:
        A dict whose keys match the fields of ``OUTPUT_DTYPE``.
        ``address_conflict`` is 1 when any component conflicts, else 0;
        ``address_parse_ok`` is true when a first address line or a postal
        code was resolved.
    """
    candidates = []
    for column in address_columns:
        text = _clean_text(row.get(column))
        if text:
            candidates.append(_parse_address(text))

    resolved = {}
    conflicts = []
    for key in COMPONENT_KEYS:
        winner, variants = _resolve_component(candidates, key)
        resolved[key] = winner
        if len(variants) > 1:
            conflicts.append("%s: %s" % (key, " <> ".join(variants)))

    parse_ok = bool(resolved["address_line_1"] or resolved["postal_code"])
    reason = "; ".join(conflicts) or None
    return {
        "address1_clean": resolved["address_line_1"],
        "address2_clean": resolved["address_line_2"],
        "zipcode_clean": resolved["postal_code"],
        "city_clean": resolved["city"],
        "state_clean": resolved["state"],
        "address_conflict": int(bool(conflicts)),
        "address_conflict_reason": reason,
        "address_parse_ok": parse_ok,
    }
273
+
274
+
275
def _print_address_diagnostics(
    frame: Union[pl.DataFrame, pl.LazyFrame],
    address_columns: Sequence[str],
    example_count: int = 3,
) -> None:
    """Print a summary of an address-cleaned frame plus conflict examples.

    Args:
        frame: Frame that already contains the cleaned output columns
            (``address_parse_ok``, ``address_conflict``, ...). A ``LazyFrame``
            is collected here, once for the summary and once more for the
            examples when conflicts exist.
        address_columns: Source address columns to count and to show.
        example_count: Maximum number of conflicting rows to print.

    Returns:
        None; the report is written with ``print``.
    """
    # Cast to String first so the blank test also works for address columns
    # that are not string-typed (the cleaning step stringifies any value).
    nonblank_expressions = [
        pl.col(column).cast(pl.String).fill_null("").str.strip_chars().ne("")
        for column in address_columns
    ]
    summary_expressions = [
        pl.len().alias("rows"),
        *[
            expression.sum().alias("nonblank_%s" % column)
            for column, expression in zip(address_columns, nonblank_expressions)
        ],
        pl.any_horizontal(nonblank_expressions).sum().alias("rows_with_address"),
        pl.col("address_parse_ok").sum().alias("parsed_rows"),
        pl.col("address_conflict").sum().alias("conflicting_rows"),
    ]
    summary = frame.select(summary_expressions)
    if isinstance(summary, pl.LazyFrame):
        summary = summary.collect()
    totals = summary.row(0, named=True)
    conflicting_rows = totals["conflicting_rows"]

    print("Address cleaning diagnostics")
    print(" rows: %s" % totals["rows"])
    for column in address_columns:
        # The count excludes nulls and whitespace-only values, hence "nonblank".
        print(" nonblank %s: %s" % (column, totals["nonblank_%s" % column]))
    print(" rows with any supplied address: %s" % totals["rows_with_address"])
    print(" parsed successfully: %s" % totals["parsed_rows"])
    print(" contradictions: %s" % conflicting_rows)

    if not conflicting_rows:
        return

    print(" contradiction examples:")
    example_columns = list(address_columns) + [
        "address1_clean",
        "address2_clean",
        "zipcode_clean",
        "address_conflict",
        "address_conflict_reason",
    ]
    examples = (
        frame.filter(pl.col("address_conflict") == 1)
        .select(example_columns)
        .head(example_count)
    )
    if isinstance(examples, pl.LazyFrame):
        examples = examples.collect()
    for number, row in enumerate(examples.iter_rows(named=True), 1):
        values = [
            "%s=%s" % (column, row[column] if row[column] is not None else "NULL")
            for column in address_columns
        ]
        print(" %s. %s" % (number, " | ".join(values)))
        print(
            " address1_clean=%s | address2_clean=%s | zipcode_clean=%s"
            % (
                row["address1_clean"] or "NULL",
                row["address2_clean"] or "NULL",
                row["zipcode_clean"] or "NULL",
            )
        )
        print(" address_conflict=1 | reason=%s" % row["address_conflict_reason"])
338
+
339
+
340
def clean_addresses(
    frame: Union[pl.DataFrame, pl.LazyFrame],
    address_columns: Optional[Union[str, Sequence[str]]] = None,
) -> Union[pl.DataFrame, pl.LazyFrame]:
    """Standardize one to three address columns; intended for ``frame.pipe``.

    Args:
        frame: Eager or lazy Polars frame holding the address columns.
        address_columns: A single column name, a sequence of one to three
            names, or ``None`` to use whichever of ``address``, ``address1``
            and ``address2`` exist in ``frame``.

    Returns:
        ``frame`` with the fields of ``OUTPUT_DTYPE`` appended as columns. All
        selected non-blank columns feed the resolved values, and disagreements
        are kept in the conflict audit columns. A ``LazyFrame`` input yields a
        ``LazyFrame``.

    Raises:
        ValueError: If fewer than one or more than three columns are selected,
            if names repeat, or if a named column is not in ``frame``.

    A short diagnostic summary with conflict examples is printed on every
    call; for a ``LazyFrame`` this runs separate diagnostic queries right away.
    """
    if isinstance(frame, pl.LazyFrame):
        available = set(frame.collect_schema().names())
    else:
        available = set(frame.columns)

    if isinstance(address_columns, str):
        columns = [address_columns]
    elif address_columns is None:
        columns = [name for name in DEFAULT_ADDRESS_COLUMNS if name in available]
    else:
        columns = list(address_columns)

    if len(columns) < 1 or len(columns) > 3:
        raise ValueError("Provide one, two, or three address column names.")
    if len(columns) != len(set(columns)):
        raise ValueError("Address column names must be unique.")
    absent = [name for name in columns if name not in available]
    if absent:
        raise ValueError("Missing address columns: %s" % ", ".join(absent))

    def reconcile(row):
        return _reconcile_row(row, columns)

    cleaned = (
        pl.struct(columns)
        .map_elements(reconcile, return_dtype=OUTPUT_DTYPE)
        .alias("_address_clean")
    )
    result = frame.with_columns(cleaned).unnest("_address_clean")
    _print_address_diagnostics(result, columns)
    return result


# Second public name bound to the same function object.
clean_address_columns = clean_addresses
382
+
383
+
384
+ ###############################################################################
385
+ # HORIZONTAL COLUMN CONSOLIDATION AND CONFLICT REVIEW
386
+ #
387
+ # Public entry point:
388
+ # reviewed = frame.pipe(
389
+ # consolidate_columns,
390
+ # {"dob": ["dob_1", "dob_2"], "fname": ["fname_1", "fname_2"]},
391
+ # )
392
+ #
393
+ # This section merges repeated representations of the same field, such as
394
+ # several DOB columns into `dob` and several first-name columns into `fname`.
395
+ # Values are compared exactly as stored; this helper does not trim, parse,
396
+ # standardize, or normalize them.
397
+ #
398
+ # Operational notes:
399
+ # - This helper uses in-memory Polars processing only. It makes no API calls
400
+ # and does not read from or write to a database.
401
+ # - It accepts a DataFrame or LazyFrame and returns the same frame type.
402
+ # - Printing diagnostics for a LazyFrame evaluates only the relevant source
403
+ # projection immediately; the returned transformed result remains lazy.
404
+ # - On contradictions, the default is to retain source evidence and add
405
+ # integer `<output>_conflict` columns where `1` indicates review is needed.
406
+ # - When all compared values agree, the input source columns are dropped.
407
+ # - Rows are never dropped.
408
+ ###############################################################################
409
+
410
# Either Polars frame flavour accepted and returned by the helpers below.
Frame = Union[pl.DataFrame, pl.LazyFrame]
# Return type of consolidate_columns: the frame alone, or the frame paired
# with its diagnostics object.
ConsolidatedResult = Union[
    Frame,
    Tuple[Frame, "ConsolidationDiagnostics"],
]
412
+
413
+
414
@dataclass
class GroupDiagnostics:
    """Audit figures for a single consolidated output column."""

    # Name of the consolidated column and the inputs it was built from.
    output_column: str
    source_columns: List[str]
    # Identifier columns carried along for display in examples.
    context_columns: List[str]
    # Row counts gathered while scanning the group.
    total_rows: int
    rows_with_any_value: int
    rows_with_multiple_values: int
    conflicting_rows: int
    # Temporary row numbers of the conflicting rows.
    conflicting_row_numbers: List[int]
    # Tabular detail used by the printed report.
    conflict_column_pairs: pl.DataFrame
    value_patterns: pl.DataFrame
    example_conflicts: pl.DataFrame

    @property
    def conflict_rate(self) -> float:
        """Share of all rows that conflict; 0.0 when there are no rows."""
        return self.conflicting_rows / self.total_rows if self.total_rows else 0.0

    @property
    def rows_with_output(self) -> int:
        """Rows that have a source value and no contradiction."""
        populated = self.rows_with_any_value
        return populated - self.conflicting_rows

    @property
    def rows_without_value(self) -> int:
        """Rows in which every source column is missing."""
        populated = self.rows_with_any_value
        return self.total_rows - populated
445
+
446
+
447
@dataclass
class ConsolidationDiagnostics:
    """Per-output audit results collected during one consolidation call."""

    # Output column name -> diagnostics for that output.
    groups: Dict[str, GroupDiagnostics]

    @property
    def has_conflicts(self) -> bool:
        """True when at least one output has a conflicting row."""
        for group in self.groups.values():
            if group.conflicting_rows:
                return True
        return False

    @property
    def total_conflicting_rows(self) -> int:
        """Number of distinct rows that conflict in any output."""
        numbers = {
            number
            for group in self.groups.values()
            for number in group.conflicting_row_numbers
        }
        return len(numbers)

    def summary(self) -> pl.DataFrame:
        """Build a frame with one summary row per consolidated output."""
        records = []
        for group in self.groups.values():
            records.append(
                {
                    "output_column": group.output_column,
                    "source_columns": ", ".join(group.source_columns),
                    "total_rows": group.total_rows,
                    "rows_with_output": group.rows_with_output,
                    "rows_without_value": group.rows_without_value,
                    "conflicting_rows": group.conflicting_rows,
                    "conflict_rate": group.conflict_rate,
                }
            )
        return pl.DataFrame(records)

    def format_report(self, action: Optional[str] = None) -> str:
        """Return the report text: headline, summary table, conflict details.

        Detail sections are included only for outputs that have conflicts.
        When ``action`` is a non-empty string, an ``Action:`` line is appended.
        """
        if not self.has_conflicts:
            headline = "Column consolidation completed with no contradictions."
        else:
            headline = "Column consolidation found contradictions in %s unique rows." % (
                self.total_conflicting_rows
            )
        # Tables are rendered inside the Config context so its settings apply.
        with pl.Config(tbl_cols=-1, tbl_width_chars=220):
            lines = [headline, str(self.summary())]
            for group in self.groups.values():
                if group.conflicting_rows:
                    lines += [
                        "",
                        "Output %r: conflicting source-column pairs:" % group.output_column,
                        str(group.conflict_column_pairs),
                        "Most frequent contradictory value patterns:",
                        str(group.value_patterns),
                        "Example contradictory rows:",
                        str(group.example_conflicts),
                    ]
        if action:
            lines += ["", "Action: %s" % action]
        return "\n".join(lines)
508
+
509
+
510
class ColumnConflictError(ValueError):
    """Error raised when consolidation is told to stop on contradictions.

    The exception message is the formatted diagnostics report, and the
    diagnostics object itself is available as ``.diagnostics``.
    """

    def __init__(self, diagnostics: ConsolidationDiagnostics) -> None:
        report = diagnostics.format_report()
        super().__init__(report)
        self.diagnostics = diagnostics
516
+
517
+
518
+ def _consolidation_non_missing(value: Any) -> Optional[Any]:
519
+ if value is None:
520
+ return None
521
+ if isinstance(value, float) and math.isnan(value):
522
+ return None
523
+ return value
524
+
525
+
526
+ def _consolidation_comparable_value(value: Any) -> Any:
527
+ try:
528
+ hash(value)
529
+ except TypeError:
530
+ return repr(value)
531
+ return value
532
+
533
+
534
def _consolidation_row_values(
    row: Mapping[str, Any],
    source_columns: Sequence[str],
) -> Tuple[List[Tuple[str, Any, Any]], List[Any]]:
    """Collect the non-missing source values of one row.

    Returns:
        ``(populated, distinct)``: ``populated`` holds a
        ``(column, value, comparable)`` triple for every source column with a
        non-missing value, in column order; ``distinct`` holds each comparable
        value once, in first-seen order.
    """
    populated: List[Tuple[str, Any, Any]] = []
    distinct: List[Any] = []
    for name in source_columns:
        raw = _consolidation_non_missing(row[name])
        if raw is not None:
            key = _consolidation_comparable_value(raw)
            populated.append((name, raw, key))
            if key not in distinct:
                distinct.append(key)
    return populated, distinct
549
+
550
+
551
def _analyze_consolidation_group(
    frame: pl.DataFrame,
    output_column: str,
    source_columns: Sequence[str],
    context_columns: Sequence[str],
    max_examples: int,
    row_number_column: str,
) -> Tuple[List[Any], List[int], GroupDiagnostics]:
    """Consolidate one group of source columns row by row and audit it.

    Args:
        frame: Eager frame containing the context and source columns.
        output_column: Name of the consolidated output (used in diagnostics).
        source_columns: Columns whose values are compared.
        context_columns: Identifier columns shown with example conflicts.
        max_examples: Cap on the reported value patterns and example rows.
        row_number_column: Name for the temporary row index column.

    Returns:
        ``(output_values, conflict_flags, diagnostics)``. For each row the
        output value is the first non-missing source value when the sources
        agree, and ``None`` when they contradict or are all missing; the flag
        is 1 for contradicting rows and 0 otherwise.
    """
    wanted = [row_number_column, *context_columns, *source_columns]
    selected = frame.with_row_index(row_number_column).select(wanted)

    output_values = []
    conflict_flags = []
    any_value_count = 0
    multi_value_count = 0
    conflict_examples = []
    conflict_numbers = []
    pattern_tally = Counter()
    pair_tally = Counter()

    for row in selected.iter_rows(named=True):
        populated, distinct = _consolidation_row_values(row, source_columns)
        if populated:
            any_value_count += 1
        if len(populated) > 1:
            multi_value_count += 1

        if len(distinct) > 1:
            output_values.append(None)
            conflict_flags.append(1)
            conflict_examples.append(row)
            conflict_numbers.append(row[row_number_column])
            pattern_tally[" <> ".join(repr(item) for item in distinct)] += 1
            # Count each pair of source columns that disagree on this row.
            for first, second in combinations(populated, 2):
                if first[2] != second[2]:
                    pair_tally[(first[0], second[0])] += 1
        else:
            output_values.append(populated[0][1] if populated else None)
            conflict_flags.append(0)

    pair_records = []
    for (column_a, column_b), count in pair_tally.most_common():
        pair_records.append(
            {"column_a": column_a, "column_b": column_b, "conflicting_rows": count}
        )
    pattern_records = []
    for pattern, count in pattern_tally.most_common(max_examples):
        pattern_records.append({"observed_values": pattern, "rows": count})

    pair_frame = pl.DataFrame(
        pair_records,
        schema={"column_a": pl.String, "column_b": pl.String, "conflicting_rows": pl.Int64},
    )
    pattern_frame = pl.DataFrame(
        pattern_records, schema={"observed_values": pl.String, "rows": pl.Int64}
    )
    example_frame = pl.DataFrame(
        conflict_examples[:max_examples],
        schema=selected.schema,
    )
    diagnostics = GroupDiagnostics(
        output_column=output_column,
        source_columns=list(source_columns),
        context_columns=list(context_columns),
        total_rows=frame.height,
        rows_with_any_value=any_value_count,
        rows_with_multiple_values=multi_value_count,
        conflicting_rows=len(conflict_examples),
        conflicting_row_numbers=conflict_numbers,
        conflict_column_pairs=pair_frame,
        value_patterns=pattern_frame,
        example_conflicts=example_frame,
    )
    return output_values, conflict_flags, diagnostics
623
+
624
+
625
+ def _temporary_consolidation_row_number(existing: Sequence[str]) -> str:
626
+ """Choose an internal diagnostic row-number column not present in the input."""
627
+ column = "_row_number"
628
+ while column in existing:
629
+ column = "_" + column
630
+ return column
631
+
632
+
633
def consolidate_columns(
    frame: Frame,
    columns: Mapping[str, Sequence[str]],
    *,
    context_columns: Optional[Sequence[str]] = None,
    on_conflict: str = "keep",
    max_examples: int = 5,
    return_diagnostics: bool = False,
) -> ConsolidatedResult:
    """Coalesce repeated exact-value columns and print conflict diagnostics.

    Args:
        frame: Polars ``DataFrame`` or ``LazyFrame``.
        columns: Maps each new output column to the source columns it is built
            from. Values are compared exactly as stored; nothing is trimmed or
            normalized.
        context_columns: Optional identifier columns (for example
            ``["person_id"]``) shown in conflict examples. A temporary row
            number is always shown in examples and is never returned.
        on_conflict: ``"keep"`` (default) retains the source columns and adds
            integer ``<output>_conflict`` flags when contradictions exist;
            ``"raise"`` prints the report and raises; ``"drop"`` removes the
            source columns and leaves contradicted outputs null.
        max_examples: Maximum value patterns and example rows per output.
        return_diagnostics: When true, return ``(frame, diagnostics)``.

    Returns:
        A frame of the same kind as ``frame`` with the output columns added.
        Source columns are dropped when there are no contradictions or when
        ``on_conflict="drop"``. Rows are never dropped.

    Raises:
        TypeError: If ``frame`` is not a Polars frame.
        ValueError: On an empty mapping, an unknown ``on_conflict``,
            ``max_examples < 1``, missing or duplicated columns, an output
            name that already exists, or a source feeding two outputs.
        ColumnConflictError: With ``on_conflict="raise"`` when values conflict.

    For a ``LazyFrame`` the context and source columns are collected right
    away for analysis; the returned frame stays lazy and receives the results
    through a join on a temporary row index, so the lazy query is expected to
    yield its rows in the same order when it is collected later.
    """
    if not isinstance(frame, (pl.DataFrame, pl.LazyFrame)):
        raise TypeError(
            "consolidate_columns expects a polars.DataFrame or polars.LazyFrame."
        )
    if not columns:
        raise ValueError("Provide at least one output-to-source column mapping.")
    if on_conflict not in {"raise", "keep", "drop"}:
        raise ValueError("on_conflict must be 'raise', 'keep', or 'drop'.")
    if max_examples < 1:
        raise ValueError("max_examples must be at least 1.")

    # De-duplicate context columns while keeping the caller's order.
    context_columns = list(dict.fromkeys(context_columns or []))
    is_lazy = isinstance(frame, pl.LazyFrame)
    input_columns = frame.collect_schema().names() if is_lazy else frame.columns
    existing = set(input_columns)
    missing_context = [column for column in context_columns if column not in existing]
    if missing_context:
        raise ValueError(
            "Missing diagnostic context columns: %s" % ", ".join(missing_context)
        )

    used_sources = set()
    for output_column, source_columns in columns.items():
        if not source_columns:
            raise ValueError("Output %r has no source columns." % output_column)
        if len(set(source_columns)) != len(source_columns):
            raise ValueError("Output %r contains duplicate source columns." % output_column)
        if output_column in source_columns:
            raise ValueError("Output %r cannot also be one of its sources." % output_column)
        if output_column in existing:
            raise ValueError(
                "Output %r already exists; choose a new output name or remove it first."
                % output_column
            )
        if on_conflict == "keep" and "%s_conflict" % output_column in existing:
            raise ValueError(
                "Audit output %r already exists; remove it or choose a new output name."
                % ("%s_conflict" % output_column)
            )
        missing = [column for column in source_columns if column not in existing]
        if missing:
            raise ValueError(
                "Output %r has missing source columns: %s"
                % (output_column, ", ".join(missing))
            )
        overlap = used_sources.intersection(source_columns)
        if overlap:
            raise ValueError(
                "Source columns cannot feed more than one output: %s"
                % ", ".join(sorted(overlap))
            )
        used_sources.update(source_columns)

    diagnostic_columns = list(
        dict.fromkeys(
            list(context_columns)
            + [
                source_column
                for source_columns in columns.values()
                for source_column in source_columns
            ]
        )
    )
    diagnostic_source = frame.select(diagnostic_columns)
    if isinstance(diagnostic_source, pl.LazyFrame):
        diagnostic_source = diagnostic_source.collect()

    # The temporary index name must avoid the input columns and also the
    # columns this call is about to add, or it would overwrite one of them.
    reserved_names = (
        list(input_columns)
        + list(columns)
        + ["%s_conflict" % output_column for output_column in columns]
    )
    row_number_column = _temporary_consolidation_row_number(reserved_names)

    outputs = {}
    conflict_flags = {}
    reports = {}
    for output_column, source_columns in columns.items():
        group_context = [
            column for column in context_columns if column not in source_columns
        ]
        (
            outputs[output_column],
            conflict_flags[output_column],
            reports[output_column],
        ) = _analyze_consolidation_group(
            diagnostic_source,
            output_column,
            source_columns,
            group_context,
            max_examples,
            row_number_column,
        )

    diagnostics = ConsolidationDiagnostics(reports)
    if diagnostics.has_conflicts and on_conflict == "raise":
        action = "Source columns retained; no rows or columns changed because conflicts were detected."
    elif diagnostics.has_conflicts and on_conflict == "keep":
        action = (
            "Source columns retained and 0/1 conflict flags added for audit; "
            "no rows dropped."
        )
    elif diagnostics.has_conflicts:
        action = (
            "Source columns dropped as requested; conflicting output values "
            "remain null; no rows dropped."
        )
    else:
        action = "Source columns dropped after successful consolidation; no rows dropped."
    print(diagnostics.format_report(action))
    if diagnostics.has_conflicts and on_conflict == "raise":
        raise ColumnConflictError(diagnostics)

    update_values = dict(outputs)
    if diagnostics.has_conflicts and on_conflict == "keep":
        update_values.update(
            {
                "%s_conflict" % output_column: pl.Series(values, dtype=pl.Int8)
                for output_column, values in conflict_flags.items()
            }
        )
    if is_lazy:
        # Create the key with with_row_index on BOTH sides so the two join
        # keys have the same dtype; a key built from range() would be Int64
        # while with_row_index yields an unsigned index type.
        updates = (
            pl.DataFrame(update_values).with_row_index(row_number_column).lazy()
        )
        result = (
            frame.with_row_index(row_number_column)
            .join(updates, on=row_number_column, how="left")
            .drop(row_number_column)
        )
    else:
        result = frame.with_columns(
            [pl.Series(output_column, values) for output_column, values in update_values.items()]
        )
    if not diagnostics.has_conflicts or on_conflict == "drop":
        result = result.drop(list(used_sources))

    if return_diagnostics:
        return result, diagnostics
    return result
794
+
795
+
796
+ ###############################################################################
797
+ # DEMOCRACY PREP LOTTERY COLUMN SELECTION
798
+ ###############################################################################
799
+
800
def select_lottery_columns(frame: Frame) -> Frame:
    """Keep the lottery fields listed below, then rename a subset of them.

    The frame must contain every listed column. Columns are returned in the
    listed order; those present in the rename table get their new names.
    """
    keep = [
        "origorder",
        "year",
        "applygrade",
        "firstname",
        "lastname",
        "lottstatus",
        "offered",
        "address",
        "apt",
        "zip",
        "pfirstname",
        "plastname",
        "dob_orig",
        "waitlist",
        "dob",
        "sibling1name",
        "sibling1status",
        "sibling1school",
        "sibling1grade",
        "in_dist",
        "number",
        "oth_district",
        "in_district",
        "currentschool2",
        "datereceived",
        "siblingcurrent",
        "offered_school",
        "currentgrade",
        "dob2",
        "status",
        "appno",
        "app_hpcs",
        "app_dpe",
        "app_dph",
        "app_dpcs",
        "guardian2",
        "sibling2name",
        "sibling2status",
        "sibling2school",
        "sibling2grade",
        "sibling3name",
        "sibling3status",
        "sibling3school",
        "sibling3grade",
        "actualdistrict",
        "waitoffer",
        "waitaccept",
        "waitacceptdate",
        "waitdeclinedate",
        "waitacceptschool",
        "appid",
        "hpcs",
        "dpe",
        "dph",
        "dpcs",
        "acceptedschoolname",
        "guardian1name",
        "hpcsletter",
        "dpeletter",
        "dphletter",
        "dpcsletter",
        "anyletter",
        "currentgrade_orig",
        "siblingstatus",
        "guardian1lastname",
        "guardian1firstname",
        "guardian1street",
        "guardian1apt",
        "guardian1city",
        "guardian1zipcode",
        "guardian2lastname",
        "guardian2firstname",
        "guardian2street",
        "guardian2apt",
        "guardian2city",
        "guardian2zipcode",
        "currentschoolpreference",
        "applicationsource",
        "waitlistinfo",
        "offer_dpcs2",
        "offer_dpe2",
        "offer_dph2",
        "offer_hpcs2",
        "wait_dpcs2",
        "wait_dpe2",
        "wait_dph2",
        "wait_hpcs2",
        "whichschl",
        "dob_orig1415",
        "graderank_hpcs",
        "graderank_dph",
        "graderank_dpcs",
        "graderank_dpe",
        "prigroup",
        "preferencesort_hpcs",
        "preferencesort_dph",
        "preferencesort_dpcs",
        "preferencesort_dpe",
        "waitlist_best",
        "offered2",
        "Sid2",
        "sid_impute",
        "geodist",
    ]
    # Source name -> name in the returned frame.
    new_names = {
        "year": "school_year",
        "applygrade": "entry_grade",
        "firstname": "fname",
        "lastname": "lname",
        "apt": "address2",
        "pfirstname": "p_fname",
        "plastname": "p_lname",
        "waitlist": "waitlist_number",
        "datereceived": "application_date",
        "whichschl": "school_name",
        "guardian1name": "p_name",
        "guardian1firstname": "p1_fname",
        "guardian1lastname": "p1_lname",
        "guardian2firstname": "p2_fname",
        "guardian2lastname": "p2_lname",
    }
    selected = frame.select(keep)
    return selected.rename(new_names)
927
+
928
+
929
+ ###############################################################################
930
+ # ENR DATA COLUMN SELECTION
931
+ ###############################################################################
932
+
933
def select_enr_columns(frame: Frame) -> Frame:
    """Keep the ENR fields listed below, then rename a subset of them.

    The frame must contain every listed column. Columns are returned in the
    listed order; those present in the rename table get their new names.
    """
    keep = [
        "year",
        "datasource",
        "dbn_oct31",
        "dbn_sis",
        "first_name",
        "middle_name",
        "last_name",
        "dob",
        "dob_sis",
        "mealcode",
        "admitdate",
        "home_lang",
        "pob_code",
        "grade_level",
        "official_class",
        "createdate",
        "spec_ed_flag",
        "school_level",
        "building code",
        "residencestreetaddress",
        "residencezip",
        "contact1name",
        "contact1streetaddress",
        "contact1zip",
    ]
    # Source name -> name in the returned frame.
    new_names = {
        "year": "school_year",
        "first_name": "fname",
        "middle_name": "mname",
        "last_name": "lname",
        "grade_level": "enrollment_grade",
        "residencestreetaddress": "address",
        "residencezip": "zip",
        "contact1name": "p_name",
        "spec_ed_flag": "sped",
    }
    selected = frame.select(keep)
    return selected.rename(new_names)
File without changes