ltc-code 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ltc_code-0.1.1 → ltc_code-0.1.2}/PKG-INFO +1 -1
- {ltc_code-0.1.1 → ltc_code-0.1.2}/pyproject.toml +1 -1
- ltc_code-0.1.2/src/ltc_code/may27.py +974 -0
- {ltc_code-0.1.1 → ltc_code-0.1.2}/README.md +0 -0
- {ltc_code-0.1.1 → ltc_code-0.1.2}/src/ltc_code/__init__.py +0 -0
- {ltc_code-0.1.1 → ltc_code-0.1.2}/src/ltc_code/polars_dates.py +0 -0
|
@@ -0,0 +1,974 @@
|
|
|
1
|
+
"""Reusable helper functions for the LTC project."""
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from functools import lru_cache
|
|
6
|
+
from itertools import combinations
|
|
7
|
+
import math
|
|
8
|
+
import re
|
|
9
|
+
import warnings
|
|
10
|
+
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, Union
|
|
11
|
+
|
|
12
|
+
import polars as pl
|
|
13
|
+
with warnings.catch_warnings():
|
|
14
|
+
warnings.filterwarnings("ignore", message="urllib3 v2 only supports OpenSSL")
|
|
15
|
+
from scourgify import normalize_address_record
|
|
16
|
+
from scourgify.exceptions import AddressNormalizationError
|
|
17
|
+
import usaddress
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Names exported by ``from <module> import *``.
__all__ = [
    "clean_addresses",
    "clean_address_columns",
    "ColumnConflictError",
    "ConsolidationDiagnostics",
    "GroupDiagnostics",
    "consolidate_columns",
    "select_lottery_columns",
    "select_enr_columns",
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
###############################################################################
|
|
33
|
+
# ADDRESS STANDARDIZATION AND CONFLICT REVIEW
|
|
34
|
+
#
|
|
35
|
+
# Public entry point:
|
|
36
|
+
# cleaned = frame.pipe(clean_addresses)
|
|
37
|
+
#
|
|
38
|
+
# This section reconciles one to three optional address-bearing columns
|
|
39
|
+
# (`address`, `address1`, and `address2` by default) into standardized output
|
|
40
|
+
# fields and an integer `address_conflict` review flag.
|
|
41
|
+
#
|
|
42
|
+
# Operational notes:
|
|
43
|
+
# - Parsing uses usaddress-scourgify with usaddress fallback for fragments.
|
|
44
|
+
# - Parsed text is standardized, not verified against an official address file.
|
|
45
|
+
# - Repeated exact raw values are cached in memory only for the Python process.
|
|
46
|
+
# - DataFrame inputs print diagnostics while returning a DataFrame.
|
|
47
|
+
# - LazyFrame inputs also print diagnostics; doing so executes separate summary
|
|
48
|
+
# and example queries immediately, while the returned result remains lazy.
|
|
49
|
+
###############################################################################
|
|
50
|
+
|
|
51
|
+
# Column names that clean_addresses looks for when no address_columns
# argument is given.
DEFAULT_ADDRESS_COLUMNS = ("address", "address1", "address2")
# Struct dtype declared as the return_dtype of the per-row reconciliation in
# clean_addresses; its field names match the keys returned by _reconcile_row.
OUTPUT_DTYPE = pl.Struct(
    {
        "address1_clean": pl.String,
        "address2_clean": pl.String,
        "zipcode_clean": pl.String,
        "city_clean": pl.String,
        "state_clean": pl.String,
        "address_conflict": pl.Int8,
        "address_conflict_reason": pl.String,
        "address_parse_ok": pl.Boolean,
    }
)
|
|
64
|
+
|
|
65
|
+
# Upper-case full state name -> two-letter abbreviation, used by
# _prepare_address. Only the fifty states are listed (no DC or territories).
STATE_ABBREVIATIONS = {
    "ALABAMA": "AL",
    "ALASKA": "AK",
    "ARIZONA": "AZ",
    "ARKANSAS": "AR",
    "CALIFORNIA": "CA",
    "COLORADO": "CO",
    "CONNECTICUT": "CT",
    "DELAWARE": "DE",
    "FLORIDA": "FL",
    "GEORGIA": "GA",
    "HAWAII": "HI",
    "IDAHO": "ID",
    "ILLINOIS": "IL",
    "INDIANA": "IN",
    "IOWA": "IA",
    "KANSAS": "KS",
    "KENTUCKY": "KY",
    "LOUISIANA": "LA",
    "MAINE": "ME",
    "MARYLAND": "MD",
    "MASSACHUSETTS": "MA",
    "MICHIGAN": "MI",
    "MINNESOTA": "MN",
    "MISSISSIPPI": "MS",
    "MISSOURI": "MO",
    "MONTANA": "MT",
    "NEBRASKA": "NE",
    "NEVADA": "NV",
    "NEW HAMPSHIRE": "NH",
    "NEW JERSEY": "NJ",
    "NEW MEXICO": "NM",
    "NEW YORK": "NY",
    "NORTH CAROLINA": "NC",
    "NORTH DAKOTA": "ND",
    "OHIO": "OH",
    "OKLAHOMA": "OK",
    "OREGON": "OR",
    "PENNSYLVANIA": "PA",
    "RHODE ISLAND": "RI",
    "SOUTH CAROLINA": "SC",
    "SOUTH DAKOTA": "SD",
    "TENNESSEE": "TN",
    "TEXAS": "TX",
    "UTAH": "UT",
    "VERMONT": "VT",
    "VIRGINIA": "VA",
    "WASHINGTON": "WA",
    "WEST VIRGINIA": "WV",
    "WISCONSIN": "WI",
    "WYOMING": "WY",
}
|
|
117
|
+
|
|
118
|
+
# Keys of the parsed-address dictionaries produced by _parse_address and
# _from_usaddress, in the order _reconcile_row resolves them.
COMPONENT_KEYS = (
    "address_line_1",
    "address_line_2",
    "city",
    "state",
    "postal_code",
)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _clean_text(value: Any) -> Optional[str]:
|
|
128
|
+
if value is None:
|
|
129
|
+
return None
|
|
130
|
+
text = str(value).strip()
|
|
131
|
+
return text or None
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _prepare_address(value: str) -> str:
    """Rewrite a raw address string into a form that parses more reliably.

    The text is upper-cased, a literal ``ZIP`` label is removed, ``|``, ``;``,
    ``/`` and runs of dashes become comma separators, and a ``#`` directly
    before a unit keyword is dropped. A full state name that is followed by a
    ZIP code or the end of the string is replaced by its abbreviation, two
    unusual component orders are rearranged, and whitespace is collapsed.

    Args:
        value: Non-empty raw address text.

    Returns:
        The prepared text, without leading/trailing spaces or commas.
    """
    text = value.upper()
    text = re.sub(r"\bZIP\s*:?\s*", "", text)
    text = re.sub(r"\s*[|;/]\s*", ", ", text)
    text = re.sub(r"\s*--+\s*", ", ", text)
    text = re.sub(r"#\s*(APT|STE|SUITE|UNIT|FLOOR)\b", r"\1", text)
    # Longest names first: otherwise "VIRGINIA" is abbreviated inside
    # "WEST VIRGINIA" and the result is "WEST VA" instead of "WV".
    states_longest_first = sorted(
        STATE_ABBREVIATIONS.items(),
        key=lambda item: len(item[0]),
        reverse=True,
    )
    for state, abbreviation in states_longest_first:
        text = re.sub(
            r"\b%s\b(?=\s*,?\s*(?:\d{5}(?:-\d{4})?\b|$))" % re.escape(state),
            abbreviation,
            text,
        )
    # "<a>, <b>, <zip>, <ST>"  ->  "<a>, <b>, <ST> <zip>"
    text = re.sub(
        r"^(.*?),\s*(.*?),\s*(\d{5}(?:-\d{4})?),\s*([A-Z]{2})$",
        r"\1, \2, \4 \3",
        text,
    )
    # "<a>, <b> <ST>-<zip> <rest>"  ->  "<a>, <rest>, <b>, <ST> <zip>"
    text = re.sub(
        r"^(.*?),\s*(.*?)\s+([A-Z]{2})-(\d{5}(?:-\d{4})?)\s+(.+)$",
        r"\1, \5, \2, \3 \4",
        text,
    )
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ,")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _join_tag_values(tags: Dict[str, str], keys: Iterable[str]) -> Optional[str]:
|
|
161
|
+
values = [tags[key].upper() for key in keys if tags.get(key)]
|
|
162
|
+
return " ".join(values) if values else None
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _from_usaddress(value: str) -> Dict[str, Optional[str]]:
    """Build the component dictionary from ``usaddress.tag`` output.

    When ``usaddress`` raises ``RepeatedLabelError`` every component is
    ``None``. Otherwise street-level labels form ``address_line_1``,
    occupancy/subaddress labels form ``address_line_2``, and the place name,
    state name and ZIP code fill the remaining keys. All text is upper-cased
    and empty values become ``None``.
    """
    street_labels = (
        "USPSBoxType",
        "USPSBoxID",
        "AddressNumber",
        "StreetNamePreDirectional",
        "StreetNamePreType",
        "StreetName",
        "StreetNamePostType",
        "StreetNamePostDirectional",
    )
    unit_labels = (
        "OccupancyType",
        "OccupancyIdentifier",
        "SubaddressType",
        "SubaddressIdentifier",
    )
    try:
        # The second element of the returned pair is not used here.
        tags, _ = usaddress.tag(value)
    except usaddress.RepeatedLabelError:
        return dict.fromkeys(COMPONENT_KEYS)

    state = tags.get("StateName")
    components = {
        "address_line_1": _join_tag_values(tags, street_labels),
        "address_line_2": _join_tag_values(tags, unit_labels),
        "city": tags.get("PlaceName", "").upper() or None,
        "state": state.upper() if state else None,
        "postal_code": tags.get("ZipCode", "").upper() or None,
    }
    return components
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
@lru_cache(maxsize=100000)
def _parse_address(value: str) -> Dict[str, Optional[str]]:
    """Parse one raw address string into the ``COMPONENT_KEYS`` dictionary.

    The prepared text is first given to ``normalize_address_record``; if that
    raises ``AddressNormalizationError`` the ``usaddress``-based fallback is
    used instead. Results are memoized per exact input string, so the same
    dictionary object is handed back for repeated values and callers should
    treat it as read-only.
    """
    prepared = _prepare_address(value)
    try:
        normalized = normalize_address_record(prepared)
    except AddressNormalizationError:
        return _from_usaddress(prepared)
    else:
        return {key: normalized.get(key) for key in COMPONENT_KEYS}
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _candidate_score(candidate: Dict[str, Optional[str]]) -> int:
|
|
209
|
+
return (
|
|
210
|
+
(4 if candidate["address_line_1"] else 0)
|
|
211
|
+
+ (1 if candidate["address_line_2"] else 0)
|
|
212
|
+
+ (1 if candidate["city"] else 0)
|
|
213
|
+
+ (1 if candidate["state"] else 0)
|
|
214
|
+
+ (1 if candidate["postal_code"] else 0)
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _comparison_value(key: str, value: str) -> str:
|
|
219
|
+
text = re.sub(r"[^A-Z0-9# ]+", " ", value.upper())
|
|
220
|
+
text = re.sub(r"\s+", " ", text).strip()
|
|
221
|
+
if key == "address_line_2":
|
|
222
|
+
text = re.sub(r"^APT\s+", "APT ", text)
|
|
223
|
+
text = re.sub(r"^(FLOOR|FL)\s+", "FL ", text)
|
|
224
|
+
text = re.sub(r"^(SUITE|STE)\s+", "STE ", text)
|
|
225
|
+
text = re.sub(r"^(APARTMENT)\s+", "APT ", text)
|
|
226
|
+
return text
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _resolve_component(
    candidates: List[Dict[str, Optional[str]]], key: str
) -> Tuple[Optional[str], List[str]]:
    """Pick one value for ``key`` and list the distinct values that were seen.

    Values are grouped by their ``_comparison_value`` form. The chosen value
    comes from the highest-scoring candidate whose form is among the most
    frequent ones. The second item holds the first raw value seen for each
    distinct form, sorted; more than one entry means the candidates disagree.

    Returns:
        ``(None, [])`` when no candidate has a value for ``key``, otherwise
        ``(chosen_value, sorted_distinct_values)``.
    """
    observed = [candidate[key] for candidate in candidates if candidate[key]]
    if not observed:
        return None, []

    forms = [_comparison_value(key, raw) for raw in observed]
    tally = Counter(forms)
    top_count = max(tally.values())
    winners = {form for form, count in tally.items() if count == top_count}

    # sorted() is stable, so equally scored candidates keep their input order.
    ranked = sorted(candidates, key=_candidate_score, reverse=True)
    chosen = next(
        candidate[key]
        for candidate in ranked
        if candidate[key] and _comparison_value(key, candidate[key]) in winners
    )

    first_raw_by_form = {}
    for raw, form in zip(observed, forms):
        first_raw_by_form.setdefault(form, raw)
    return chosen, sorted(first_raw_by_form.values())
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _reconcile_row(row: Dict[str, Any], address_columns: Sequence[str]) -> Dict[str, Any]:
    """Reconcile the address columns of one row into the output struct fields.

    Each nonblank column value is parsed, every component is resolved across
    the parsed candidates, and components with more than one distinct value
    are reported in ``address_conflict_reason``.

    Returns:
        A dictionary with the eight keys declared in ``OUTPUT_DTYPE``.
    """
    candidates = []
    for column in address_columns:
        text = _clean_text(row.get(column))
        if text:
            candidates.append(_parse_address(text))

    resolved = {}
    conflicts = []
    for key in COMPONENT_KEYS:
        chosen, distinct = _resolve_component(candidates, key)
        resolved[key] = chosen
        if len(distinct) > 1:
            conflicts.append("%s: %s" % (key, " <> ".join(distinct)))

    # A row counts as parsed when it yields a street line or a postal code.
    parse_ok = bool(resolved["address_line_1"] or resolved["postal_code"])
    return {
        "address1_clean": resolved["address_line_1"],
        "address2_clean": resolved["address_line_2"],
        "zipcode_clean": resolved["postal_code"],
        "city_clean": resolved["city"],
        "state_clean": resolved["state"],
        "address_conflict": int(bool(conflicts)),
        "address_conflict_reason": "; ".join(conflicts) or None,
        "address_parse_ok": parse_ok,
    }
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _print_address_diagnostics(
    frame: Union[pl.DataFrame, pl.LazyFrame],
    address_columns: Sequence[str],
    example_count: int = 3,
) -> None:
    """Print row counts and a few conflict examples for a cleaned frame.

    Args:
        frame: Frame that already contains the cleaned output columns
            (``address_parse_ok``, ``address_conflict`` and so on). A
            ``LazyFrame`` is collected here, once for the summary and once
            more for the examples when conflicts exist.
        address_columns: The source address columns that were cleaned.
        example_count: Maximum number of conflicting rows to print.
    """
    # Cast to String first so the check also works for non-string columns,
    # which the row reconciliation stringifies as well.
    nonblank_expressions = [
        pl.col(column).cast(pl.String).fill_null("").str.strip_chars().ne("")
        for column in address_columns
    ]
    summary_expressions = [
        pl.len().alias("rows"),
        *[
            expression.sum().alias("nonblank_%s" % column)
            for column, expression in zip(address_columns, nonblank_expressions)
        ],
        pl.any_horizontal(nonblank_expressions).sum().alias("rows_with_address"),
        pl.col("address_parse_ok").sum().alias("parsed_rows"),
        pl.col("address_conflict").sum().alias("conflicting_rows"),
    ]
    summary = frame.select(summary_expressions)
    if isinstance(summary, pl.LazyFrame):
        summary = summary.collect()
    totals = summary.row(0, named=True)
    conflicting_rows = totals["conflicting_rows"]

    print("Address cleaning diagnostics")
    print("  rows: %s" % totals["rows"])
    for column in address_columns:
        # The count excludes nulls and whitespace-only values, hence "nonblank".
        print("  nonblank %s: %s" % (column, totals["nonblank_%s" % column]))
    print("  rows with any supplied address: %s" % totals["rows_with_address"])
    print("  parsed successfully: %s" % totals["parsed_rows"])
    print("  contradictions: %s" % conflicting_rows)

    if not conflicting_rows:
        return

    print("  contradiction examples:")
    example_columns = list(address_columns) + [
        "address1_clean",
        "address2_clean",
        "zipcode_clean",
        "address_conflict",
        "address_conflict_reason",
    ]
    examples = (
        frame.filter(pl.col("address_conflict") == 1)
        .select(example_columns)
        .head(example_count)
    )
    if isinstance(examples, pl.LazyFrame):
        examples = examples.collect()
    for number, row in enumerate(examples.iter_rows(named=True), 1):
        values = [
            "%s=%s" % (column, row[column] if row[column] is not None else "NULL")
            for column in address_columns
        ]
        print("    %s. %s" % (number, " | ".join(values)))
        print(
            "       address1_clean=%s | address2_clean=%s | zipcode_clean=%s"
            % (
                row["address1_clean"] or "NULL",
                row["address2_clean"] or "NULL",
                row["zipcode_clean"] or "NULL",
            )
        )
        print("       address_conflict=1 | reason=%s" % row["address_conflict_reason"])
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def clean_addresses(
    frame: Union[pl.DataFrame, pl.LazyFrame],
    address_columns: Optional[Union[str, Sequence[str]]] = None,
) -> Union[pl.DataFrame, pl.LazyFrame]:
    """Standardize one to three address columns; meant for ``frame.pipe(...)``.

    When ``address_columns`` is omitted, whichever of ``address``,
    ``address1`` and ``address2`` exist in the frame are used. Every nonblank
    value from the chosen columns feeds the resolved output, and
    disagreements are recorded in the conflict columns.

    A short diagnostic summary with conflict examples is printed on every
    call. For a ``LazyFrame`` those diagnostic queries run immediately, while
    the returned frame is still lazy.

    Args:
        frame: Input ``DataFrame`` or ``LazyFrame``.
        address_columns: One column name, a sequence of one to three unique
            names, or ``None`` to auto-detect the default names.

    Returns:
        ``frame`` with the fields of ``OUTPUT_DTYPE`` appended as columns.

    Raises:
        ValueError: If the number of columns is not 1-3, names repeat, or a
            named column does not exist.
    """
    if isinstance(frame, pl.LazyFrame):
        available = set(frame.collect_schema().names())
    else:
        available = set(frame.columns)

    if isinstance(address_columns, str):
        columns = [address_columns]
    elif address_columns is not None:
        columns = list(address_columns)
    else:
        columns = [name for name in DEFAULT_ADDRESS_COLUMNS if name in available]

    if len(columns) < 1 or len(columns) > 3:
        raise ValueError("Provide one, two, or three address column names.")
    if len(columns) != len(set(columns)):
        raise ValueError("Address column names must be unique.")
    absent = [name for name in columns if name not in available]
    if absent:
        raise ValueError("Missing address columns: %s" % ", ".join(absent))

    def _reconcile(row):
        return _reconcile_row(row, columns)

    # Pack the source columns into one struct so each row is reconciled with
    # a single Python call, then spread the resulting struct into columns.
    cleaned = (
        pl.struct(columns)
        .map_elements(_reconcile, return_dtype=OUTPUT_DTYPE)
        .alias("_address_clean")
    )
    result = frame.with_columns(cleaned).unnest("_address_clean")
    _print_address_diagnostics(result, columns)
    return result
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
# Second public name bound to the same function object as clean_addresses.
clean_address_columns = clean_addresses
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
###############################################################################
|
|
385
|
+
# HORIZONTAL COLUMN CONSOLIDATION AND CONFLICT REVIEW
|
|
386
|
+
#
|
|
387
|
+
# Public entry point:
|
|
388
|
+
# reviewed = frame.pipe(
|
|
389
|
+
# consolidate_columns,
|
|
390
|
+
# {"dob": ["dob_1", "dob_2"], "fname": ["fname_1", "fname_2"]},
|
|
391
|
+
# )
|
|
392
|
+
#
|
|
393
|
+
# This section merges repeated representations of the same field, such as
|
|
394
|
+
# several DOB columns into `dob` and several first-name columns into `fname`.
|
|
395
|
+
# Values are compared exactly as stored; this helper does not trim, parse,
|
|
396
|
+
# standardize, or normalize them.
|
|
397
|
+
#
|
|
398
|
+
# Operational notes:
|
|
399
|
+
# - This helper uses in-memory Polars processing only. It makes no API calls
|
|
400
|
+
# and does not read from or write to a database.
|
|
401
|
+
# - It accepts a DataFrame or LazyFrame and returns the same frame type.
|
|
402
|
+
# - Printing diagnostics for a LazyFrame evaluates only the relevant source
|
|
403
|
+
# projection immediately; the returned transformed result remains lazy.
|
|
404
|
+
# - On contradictions, the default is to retain source evidence and add
|
|
405
|
+
# integer `<output>_conflict` columns where `1` indicates review is needed.
|
|
406
|
+
# - When all compared values agree, the input source columns are dropped.
|
|
407
|
+
# - Rows are never dropped.
|
|
408
|
+
###############################################################################
|
|
409
|
+
|
|
410
|
+
# Either Polars frame type accepted by the consolidation helpers.
Frame = Union[pl.DataFrame, pl.LazyFrame]
# Return type of consolidate_columns: the frame alone, or the frame paired
# with its diagnostics (forward reference; the class is defined further down).
ConsolidatedResult = Union[Frame, Tuple[Frame, "ConsolidationDiagnostics"]]
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
@dataclass
class GroupDiagnostics:
    """Audit figures for a single consolidated output column."""

    # Name of the consolidated column and the columns it was built from.
    output_column: str
    source_columns: List[str]
    # Extra identifier columns shown next to conflict examples.
    context_columns: List[str]
    # Row counts gathered while scanning the source columns.
    total_rows: int
    rows_with_any_value: int
    rows_with_multiple_values: int
    conflicting_rows: int
    # Temporary row numbers of the rows whose sources disagree.
    conflicting_row_numbers: List[int]
    # Small report tables: disagreeing column pairs, most frequent value
    # patterns, and example rows.
    conflict_column_pairs: pl.DataFrame
    value_patterns: pl.DataFrame
    example_conflicts: pl.DataFrame

    @property
    def conflict_rate(self) -> float:
        """Share of all rows that conflict; ``0.0`` when there are no rows."""
        return self.conflicting_rows / self.total_rows if self.total_rows else 0.0

    @property
    def rows_with_output(self) -> int:
        """Rows with at least one source value and no contradiction."""
        populated = self.rows_with_any_value
        return populated - self.conflicting_rows

    @property
    def rows_without_value(self) -> int:
        """Rows in which every source column is missing."""
        populated = self.rows_with_any_value
        return self.total_rows - populated
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
@dataclass
class ConsolidationDiagnostics:
    """Audit results for every output column of one consolidation call."""

    # Per-output diagnostics keyed by output column name.
    groups: Dict[str, GroupDiagnostics]

    @property
    def has_conflicts(self) -> bool:
        """Whether any output group has at least one conflicting row."""
        for group in self.groups.values():
            if group.conflicting_rows:
                return True
        return False

    @property
    def total_conflicting_rows(self) -> int:
        """Number of distinct rows that conflict in at least one group."""
        row_numbers = set().union(
            *(group.conflicting_row_numbers for group in self.groups.values())
        )
        return len(row_numbers)

    def summary(self) -> pl.DataFrame:
        """Build a table with one summary row per consolidated output."""
        records = []
        for group in self.groups.values():
            records.append(
                {
                    "output_column": group.output_column,
                    "source_columns": ", ".join(group.source_columns),
                    "total_rows": group.total_rows,
                    "rows_with_output": group.rows_with_output,
                    "rows_without_value": group.rows_without_value,
                    "conflicting_rows": group.conflicting_rows,
                    "conflict_rate": group.conflict_rate,
                }
            )
        return pl.DataFrame(records)

    def format_report(self, action: Optional[str] = None) -> str:
        """Return the multi-line text report, optionally ending with ``action``."""
        if not self.has_conflicts:
            headline = "Column consolidation completed with no contradictions."
        else:
            headline = "Column consolidation found contradictions in %s unique rows." % (
                self.total_conflicting_rows
            )

        sections = [headline]
        # Tables are rendered inside the Config block so the column-count and
        # width settings apply to their string form.
        with pl.Config(tbl_cols=-1, tbl_width_chars=220):
            sections.append(str(self.summary()))
            for group in self.groups.values():
                if group.conflicting_rows:
                    sections += [
                        "",
                        "Output %r: conflicting source-column pairs:" % group.output_column,
                        str(group.conflict_column_pairs),
                        "Most frequent contradictory value patterns:",
                        str(group.value_patterns),
                        "Example contradictory rows:",
                        str(group.example_conflicts),
                    ]
        if action:
            sections += ["", "Action: %s" % action]
        return "\n".join(sections)
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
class ColumnConflictError(ValueError):
    """Error for contradictory source values when conflicts must stop the run.

    The exception message is the formatted diagnostics report; the
    diagnostics object itself stays available as ``.diagnostics``.
    """

    def __init__(self, diagnostics: ConsolidationDiagnostics) -> None:
        self.diagnostics = diagnostics
        report = diagnostics.format_report()
        super().__init__(report)
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
def _consolidation_non_missing(value: Any) -> Optional[Any]:
|
|
519
|
+
if value is None:
|
|
520
|
+
return None
|
|
521
|
+
if isinstance(value, float) and math.isnan(value):
|
|
522
|
+
return None
|
|
523
|
+
return value
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def _consolidation_comparable_value(value: Any) -> Any:
|
|
527
|
+
try:
|
|
528
|
+
hash(value)
|
|
529
|
+
except TypeError:
|
|
530
|
+
return repr(value)
|
|
531
|
+
return value
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
def _consolidation_row_values(
    row: Mapping[str, Any],
    source_columns: Sequence[str],
) -> Tuple[List[Tuple[str, Any, Any]], List[Any]]:
    """Collect the non-missing source values of one row.

    Returns:
        ``(populated, distinct)``. ``populated`` holds one
        ``(column, value, comparable)`` triple per non-missing source column,
        in ``source_columns`` order. ``distinct`` holds each comparable value
        once, in first-seen order.
    """
    populated = []
    for column in source_columns:
        value = _consolidation_non_missing(row[column])
        if value is not None:
            populated.append((column, value, _consolidation_comparable_value(value)))

    distinct = []
    for _, _, comparable in populated:
        # List membership (equality) rather than a set, matching how the
        # comparable values are later compared pairwise.
        if comparable not in distinct:
            distinct.append(comparable)
    return populated, distinct
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def _analyze_consolidation_group(
    frame: pl.DataFrame,
    output_column: str,
    source_columns: Sequence[str],
    context_columns: Sequence[str],
    max_examples: int,
    row_number_column: str,
) -> Tuple[List[Any], List[int], GroupDiagnostics]:
    """Consolidate one group of source columns and collect its diagnostics.

    Args:
        frame: Materialized frame holding the context and source columns.
        output_column: Name of the consolidated output being produced.
        source_columns: Columns whose values are compared row by row.
        context_columns: Identifier columns to show in conflict examples.
        max_examples: Maximum number of example rows and value patterns kept.
        row_number_column: Name for the temporary row-index column.

    Returns:
        ``(output_values, conflict_flags, diagnostics)``. ``output_values``
        has one entry per row: the agreed value, or ``None`` when all sources
        are missing or they disagree. ``conflict_flags`` has ``1`` for rows
        whose sources disagree and ``0`` otherwise.
    """
    numbered = frame.with_row_index(row_number_column)
    selected = numbered.select(
        [row_number_column] + list(context_columns) + list(source_columns)
    )
    output_values = []
    conflict_flags = []
    rows_with_any_value = 0
    rows_with_multiple_values = 0
    # Only the first max_examples conflicting rows are reported, so only those
    # are kept; the total comes from conflicting_row_numbers.
    example_rows = []
    conflicting_row_numbers = []
    patterns = Counter()
    pair_counts = Counter()

    for row in selected.iter_rows(named=True):
        populated, distinct = _consolidation_row_values(row, source_columns)
        if populated:
            rows_with_any_value += 1
        if len(populated) > 1:
            rows_with_multiple_values += 1
        if len(distinct) <= 1:
            # No value at all, or every populated source agrees.
            output_values.append(populated[0][1] if populated else None)
            conflict_flags.append(0)
            continue

        output_values.append(None)
        conflict_flags.append(1)
        if len(example_rows) < max_examples:
            example_rows.append(row)
        conflicting_row_numbers.append(row[row_number_column])
        pattern = " <> ".join(repr(value) for value in distinct)
        patterns[pattern] += 1
        for left, right in combinations(populated, 2):
            if left[2] != right[2]:
                pair_counts[(left[0], right[0])] += 1

    pair_rows = [
        {"column_a": pair[0], "column_b": pair[1], "conflicting_rows": count}
        for pair, count in pair_counts.most_common()
    ]
    pattern_rows = [
        {"observed_values": pattern, "rows": count}
        for pattern, count in patterns.most_common(max_examples)
    ]
    diagnostics = GroupDiagnostics(
        output_column=output_column,
        source_columns=list(source_columns),
        context_columns=list(context_columns),
        total_rows=frame.height,
        rows_with_any_value=rows_with_any_value,
        rows_with_multiple_values=rows_with_multiple_values,
        conflicting_rows=len(conflicting_row_numbers),
        conflicting_row_numbers=conflicting_row_numbers,
        conflict_column_pairs=pl.DataFrame(
            pair_rows,
            schema={"column_a": pl.String, "column_b": pl.String, "conflicting_rows": pl.Int64},
        ),
        value_patterns=pl.DataFrame(
            pattern_rows, schema={"observed_values": pl.String, "rows": pl.Int64}
        ),
        example_conflicts=pl.DataFrame(
            example_rows,
            schema=selected.schema,
        ),
    )
    return output_values, conflict_flags, diagnostics
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
def _temporary_consolidation_row_number(existing: Sequence[str]) -> str:
|
|
626
|
+
"""Choose an internal diagnostic row-number column not present in the input."""
|
|
627
|
+
column = "_row_number"
|
|
628
|
+
while column in existing:
|
|
629
|
+
column = "_" + column
|
|
630
|
+
return column
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
def consolidate_columns(
    frame: Frame,
    columns: Mapping[str, Sequence[str]],
    *,
    context_columns: Optional[Sequence[str]] = None,
    on_conflict: str = "keep",
    max_examples: int = 5,
    return_diagnostics: bool = False,
) -> ConsolidatedResult:
    """Coalesce repeated exact-value columns and print conflict diagnostics.

    No normalization is performed: values must match exactly to agree. A row
    whose source values disagree gets a null output value. Rows are never
    dropped, and no API or database calls are made.

    Args:
        frame: Polars ``DataFrame`` or ``LazyFrame``; the same frame type is
            returned. For a ``LazyFrame`` the context and source columns are
            collected immediately for the diagnostics, while the returned
            frame stays lazy.
        columns: Maps each new output column to its source columns. A single
            column name may be given as a plain string.
        context_columns: Optional identifier columns, such as
            ``["person_id"]``, shown in conflict examples. A temporary
            row-number column is always part of the examples and is never
            part of the returned frame.
        on_conflict: ``"keep"`` (default) retains the source columns and adds
            integer ``<output>_conflict`` flags when contradictions exist;
            ``"raise"`` prints the report and raises; ``"drop"`` removes the
            source columns and leaves contradicted outputs null. Without any
            contradiction the source columns are dropped in every mode.
        max_examples: Maximum example rows and value patterns per output.
        return_diagnostics: When true, return ``(frame, diagnostics)``.

    Returns:
        The transformed frame, or a ``(frame, ConsolidationDiagnostics)``
        tuple when ``return_diagnostics`` is true.

    Raises:
        TypeError: If ``frame`` is not a Polars ``DataFrame``/``LazyFrame``.
        ValueError: If an argument is invalid, a referenced column is
            missing, an output name already exists, or a source column feeds
            more than one output.
        ColumnConflictError: If contradictions exist and
            ``on_conflict="raise"``.
    """
    if not isinstance(frame, (pl.DataFrame, pl.LazyFrame)):
        raise TypeError(
            "consolidate_columns expects a polars.DataFrame or polars.LazyFrame."
        )
    if not columns:
        raise ValueError("Provide at least one output-to-source column mapping.")
    if on_conflict not in {"raise", "keep", "drop"}:
        raise ValueError("on_conflict must be 'raise', 'keep', or 'drop'.")
    if max_examples < 1:
        raise ValueError("max_examples must be at least 1.")

    # A bare string is one column name, not a sequence of one-letter names.
    if isinstance(context_columns, str):
        context_columns = [context_columns]
    context_columns = list(dict.fromkeys(context_columns or []))
    is_lazy = isinstance(frame, pl.LazyFrame)
    input_columns = frame.collect_schema().names() if is_lazy else frame.columns
    existing = set(input_columns)
    missing_context = [column for column in context_columns if column not in existing]
    if missing_context:
        raise ValueError(
            "Missing diagnostic context columns: %s" % ", ".join(missing_context)
        )

    used_sources = set()
    source_map = {}
    for output_column, source_columns in columns.items():
        if isinstance(source_columns, str):
            # Without this, the membership and duplicate checks below would
            # operate on the characters of the string.
            source_columns = [source_columns]
        if not source_columns:
            raise ValueError("Output %r has no source columns." % output_column)
        source_columns = list(source_columns)
        if len(set(source_columns)) != len(source_columns):
            raise ValueError("Output %r contains duplicate source columns." % output_column)
        if output_column in source_columns:
            raise ValueError("Output %r cannot also be one of its sources." % output_column)
        if output_column in existing:
            raise ValueError(
                "Output %r already exists; choose a new output name or remove it first."
                % output_column
            )
        if on_conflict == "keep" and "%s_conflict" % output_column in existing:
            raise ValueError(
                "Audit output %r already exists; remove it or choose a new output name."
                % ("%s_conflict" % output_column)
            )
        missing = [column for column in source_columns if column not in existing]
        if missing:
            raise ValueError(
                "Output %r has missing source columns: %s"
                % (output_column, ", ".join(missing))
            )
        overlap = used_sources.intersection(source_columns)
        if overlap:
            raise ValueError(
                "Source columns cannot feed more than one output: %s"
                % ", ".join(sorted(overlap))
            )
        used_sources.update(source_columns)
        source_map[output_column] = source_columns

    diagnostic_columns = list(
        dict.fromkeys(
            list(context_columns)
            + [
                source_column
                for source_columns in source_map.values()
                for source_column in source_columns
            ]
        )
    )
    diagnostic_source = frame.select(diagnostic_columns)
    if isinstance(diagnostic_source, pl.LazyFrame):
        diagnostic_source = diagnostic_source.collect()

    # The temporary key must not clash with input columns or with any column
    # this call is about to add.
    reserved_names = (
        list(input_columns)
        + list(source_map)
        + ["%s_conflict" % output_column for output_column in source_map]
    )
    row_number_column = _temporary_consolidation_row_number(reserved_names)

    outputs = {}
    conflict_flags = {}
    reports = {}
    for output_column, source_columns in source_map.items():
        group_context = [
            column for column in context_columns if column not in source_columns
        ]
        (
            outputs[output_column],
            conflict_flags[output_column],
            reports[output_column],
        ) = _analyze_consolidation_group(
            diagnostic_source,
            output_column,
            source_columns,
            group_context,
            max_examples,
            row_number_column,
        )

    diagnostics = ConsolidationDiagnostics(reports)
    if diagnostics.has_conflicts and on_conflict == "raise":
        action = "Source columns retained; no rows or columns changed because conflicts were detected."
    elif diagnostics.has_conflicts and on_conflict == "keep":
        action = (
            "Source columns retained and 0/1 conflict flags added for audit; "
            "no rows dropped."
        )
    elif diagnostics.has_conflicts:
        action = (
            "Source columns dropped as requested; conflicting output values "
            "remain null; no rows dropped."
        )
    else:
        action = "Source columns dropped after successful consolidation; no rows dropped."
    print(diagnostics.format_report(action))
    if diagnostics.has_conflicts and on_conflict == "raise":
        raise ColumnConflictError(diagnostics)

    update_values = dict(outputs)
    if diagnostics.has_conflicts and on_conflict == "keep":
        update_values.update(
            {
                "%s_conflict" % output_column: pl.Series(values, dtype=pl.Int8)
                for output_column, values in conflict_flags.items()
            }
        )
    if is_lazy:
        # Build the key with with_row_index on both sides so the two join
        # keys have the same dtype.
        updates = (
            pl.DataFrame(update_values).with_row_index(row_number_column).lazy()
        )
        result = (
            frame.with_row_index(row_number_column)
            .join(updates, on=row_number_column, how="left")
            .drop(row_number_column)
        )
    else:
        result = frame.with_columns(
            [pl.Series(output_column, values) for output_column, values in update_values.items()]
        )
    if not diagnostics.has_conflicts or on_conflict == "drop":
        result = result.drop(list(used_sources))

    if return_diagnostics:
        return result, diagnostics
    return result
|
|
794
|
+
|
|
795
|
+
|
|
796
|
+
###############################################################################
|
|
797
|
+
# DEMOCRACY PREP LOTTERY COLUMN SELECTION
|
|
798
|
+
###############################################################################
|
|
799
|
+
|
|
800
|
+
def select_lottery_columns(frame: Frame) -> Frame:
    """Keep the listed lottery columns, then rename a subset of them.

    The columns are selected in the order given in ``lottery_fields`` and
    the result is passed through ``rename`` with ``schema_names``; columns
    that are not keys of ``schema_names`` keep their source names.
    """
    # Source column names, in the order they appear in the output.
    lottery_fields = [
        "origorder", "year", "applygrade", "firstname", "lastname",
        "lottstatus", "offered", "address", "apt", "zip",
        "pfirstname", "plastname", "dob_orig", "waitlist", "dob",
        "sibling1name", "sibling1status", "sibling1school", "sibling1grade",
        "in_dist", "number", "oth_district", "in_district",
        "currentschool2", "datereceived", "siblingcurrent", "offered_school",
        "currentgrade", "dob2", "status", "appno",
        "app_hpcs", "app_dpe", "app_dph", "app_dpcs",
        "guardian2",
        "sibling2name", "sibling2status", "sibling2school", "sibling2grade",
        "sibling3name", "sibling3status", "sibling3school", "sibling3grade",
        "actualdistrict",
        "waitoffer", "waitaccept", "waitacceptdate", "waitdeclinedate",
        "waitacceptschool",
        "appid", "hpcs", "dpe", "dph", "dpcs",
        "acceptedschoolname", "guardian1name",
        "hpcsletter", "dpeletter", "dphletter", "dpcsletter", "anyletter",
        "currentgrade_orig", "siblingstatus",
        "guardian1lastname", "guardian1firstname", "guardian1street",
        "guardian1apt", "guardian1city", "guardian1zipcode",
        "guardian2lastname", "guardian2firstname", "guardian2street",
        "guardian2apt", "guardian2city", "guardian2zipcode",
        "currentschoolpreference", "applicationsource", "waitlistinfo",
        "offer_dpcs2", "offer_dpe2", "offer_dph2", "offer_hpcs2",
        "wait_dpcs2", "wait_dpe2", "wait_dph2", "wait_hpcs2",
        "whichschl", "dob_orig1415",
        "graderank_hpcs", "graderank_dph", "graderank_dpcs", "graderank_dpe",
        "prigroup",
        "preferencesort_hpcs", "preferencesort_dph", "preferencesort_dpcs",
        "preferencesort_dpe",
        "waitlist_best", "offered2", "Sid2", "sid_impute", "geodist",
    ]
    # Source name -> output name for the columns that are renamed.
    schema_names = {
        "year": "school_year",
        "applygrade": "entry_grade",
        "firstname": "fname",
        "lastname": "lname",
        "apt": "address2",
        "pfirstname": "p_fname",
        "plastname": "p_lname",
        "waitlist": "waitlist_number",
        "datereceived": "application_date",
        "whichschl": "school_name",
        "guardian1name": "p_name",
        "guardian1firstname": "p1_fname",
        "guardian1lastname": "p1_lname",
        "guardian2firstname": "p2_fname",
        "guardian2lastname": "p2_lname",
    }
    selected = frame.select(lottery_fields)
    return selected.rename(schema_names)
|
|
927
|
+
|
|
928
|
+
|
|
929
|
+
###############################################################################
|
|
930
|
+
# ENR DATA COLUMN SELECTION
|
|
931
|
+
###############################################################################
|
|
932
|
+
|
|
933
|
+
def select_enr_columns(frame: Frame) -> Frame:
    """Keep the listed ENR columns, then rename a subset of them.

    The columns are selected in the order given in ``enr_fields`` and the
    result is passed through ``rename`` with ``schema_names``; columns that
    are not keys of ``schema_names`` keep their source names.
    """
    # Source column names, in the order they appear in the output.
    enr_fields = [
        "year", "datasource", "dbn_oct31", "dbn_sis",
        "first_name", "middle_name", "last_name",
        "dob", "dob_sis",
        "mealcode", "admitdate", "home_lang", "pob_code",
        "grade_level", "official_class", "createdate",
        "spec_ed_flag", "school_level",
        "building code",  # this source name contains a space
        "residencestreetaddress", "residencezip",
        "contact1name", "contact1streetaddress", "contact1zip",
    ]
    # Source name -> output name for the columns that are renamed.
    schema_names = {
        "year": "school_year",
        "first_name": "fname",
        "middle_name": "mname",
        "last_name": "lname",
        "grade_level": "enrollment_grade",
        "residencestreetaddress": "address",
        "residencezip": "zip",
        "contact1name": "p_name",
        "spec_ed_flag": "sped",
    }
    selected = frame.select(enr_fields)
    return selected.rename(schema_names)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|