ultrasav 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ultrasav/__init__.py +280 -0
- ultrasav/_add_cases.py +227 -0
- ultrasav/_data.py +513 -0
- ultrasav/_make_dummy.py +137 -0
- ultrasav/_merge_data.py +435 -0
- ultrasav/_merge_meta.py +280 -0
- ultrasav/_metadata.py +570 -0
- ultrasav/_read_files.py +558 -0
- ultrasav/_write_files.py +111 -0
- ultrasav/metaman/__init__.py +91 -0
- ultrasav/metaman/def_detect_variable_type.py +454 -0
- ultrasav/metaman/def_get_meta.py +561 -0
- ultrasav/metaman/def_make_datamap.py +127 -0
- ultrasav/metaman/def_make_labels.py +833 -0
- ultrasav/metaman/def_map_engine.py +529 -0
- ultrasav/metaman/def_map_to_excel.py +294 -0
- ultrasav/metaman/def_write_excel_engine.py +298 -0
- ultrasav/metaman/pastel_color_schemes.py +185 -0
- ultrasav-0.1.4.dist-info/METADATA +550 -0
- ultrasav-0.1.4.dist-info/RECORD +21 -0
- ultrasav-0.1.4.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,833 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Make Labels Module (v2 - Polars Backend)
|
|
3
|
+
=========================================
|
|
4
|
+
A high-performance utility module for transforming Excel files containing label mappings
|
|
5
|
+
into Python dictionaries using Polars for vectorized operations.
|
|
6
|
+
|
|
7
|
+
This module provides the make_labels function to:
|
|
8
|
+
- Read column labels and value labels from Excel sheets using Polars
|
|
9
|
+
- Generate Python dictionary files for SPSS metadata labeling
|
|
10
|
+
- Support customizable sheet names, column names, and output formatting
|
|
11
|
+
|
|
12
|
+
Version: 2.0.0
|
|
13
|
+
Dependencies: polars, pathlib
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import polars as pl
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# =============================================================================
# Type Aliases (for clarity)
# =============================================================================
# Maps SPSS variable name -> human-readable column label.
ColumnLabelsDict = dict[str, str]
# Maps SPSS variable name -> {coded value -> value label}.
# Coded values keep their natural type: int preferred, then float, then str.
ValueLabelsDict = dict[str, dict[int | float | str, str]]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# =============================================================================
|
|
28
|
+
# Excel Reading (adapted from def_read_files.py)
|
|
29
|
+
# =============================================================================
|
|
30
|
+
|
|
31
|
+
def _read_excel_sheet(file_path: Path, sheet_name: str, engine: str = "calamine") -> pl.DataFrame:
    """
    Read a single Excel sheet into a Polars DataFrame with engine fallback.

    The caller's preferred engine is attempted first, then openpyxl,
    then xlsx2csv; the first engine that succeeds wins.

    Parameters
    ----------
    file_path : Path
        Path to the Excel file
    sheet_name : str
        Name of the sheet to read
    engine : str, default "calamine"
        Excel engine to use. Falls back to openpyxl, then xlsx2csv on failure.

    Returns
    -------
    pl.DataFrame
        The sheet data as a Polars DataFrame

    Raises
    ------
    ValueError
        If every engine fails; the message carries the last engine's error.
    """
    path_text = str(file_path)
    failure: Exception | None = None

    # Try each candidate engine in priority order; remember the most
    # recent failure so the final error message is informative.
    for candidate in (engine, "openpyxl", "xlsx2csv"):
        try:
            return pl.read_excel(path_text, sheet_name=sheet_name, engine=candidate)
        except Exception as exc:
            failure = exc

    raise ValueError(f"Failed to read sheet '{sheet_name}' from '{file_path}'. Last error: {failure}")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _get_sheet_names(file_path: Path) -> list[str]:
    """
    Get all sheet names from an Excel file.

    Uses Polars' sheet_id=0 to load all sheets as a dict, then extracts keys.
    Falls back to fastexcel (calamine's Python binding), then openpyxl.

    Parameters
    ----------
    file_path : Path
        Path to the Excel file

    Returns
    -------
    list[str]
        List of sheet names

    Raises
    ------
    ValueError
        If every strategy fails to read the file's sheet names.
    """
    file_str = str(file_path)

    # Primary: Use Polars with sheet_id=0 (returns {sheetname: DataFrame, ...} dict).
    # Note: this materializes every sheet just to read the names — acceptable
    # here, and the fastexcel fallback below avoids it when Polars fails.
    engines = ["calamine", "openpyxl", "xlsx2csv"]
    for engine in engines:
        try:
            sheets_dict = pl.read_excel(file_str, sheet_id=0, engine=engine)
            return list(sheets_dict.keys())
        except Exception:
            continue

    # Fallback: Try fastexcel directly (calamine's Python binding).
    # Fix: the original caught `(ImportError, Exception)` — a redundant tuple,
    # since Exception already subsumes ImportError.
    try:
        import fastexcel
        excel_file = fastexcel.read_excel(file_str)
        return excel_file.sheet_names
    except Exception:
        pass

    # Last resort: openpyxl directly (read_only avoids loading cell data).
    try:
        from openpyxl import load_workbook
        wb = load_workbook(file_path, read_only=True, data_only=True)
        names = wb.sheetnames
        wb.close()
        return names
    except Exception as e:
        raise ValueError(
            f"Failed to read sheet names from '{file_path}'. "
            f"Ensure the file exists and is a valid Excel file. Error: {e}"
        )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# =============================================================================
|
|
114
|
+
# Data Cleaning Helpers (Pure Functions)
|
|
115
|
+
# =============================================================================
|
|
116
|
+
|
|
117
|
+
def _clean_variable_column(df: pl.DataFrame, col_name: str) -> pl.DataFrame:
    """
    Normalize a variable-name column: cast to string, map "" to null.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    col_name : str
        Name of the variable column to clean

    Returns
    -------
    pl.DataFrame
        DataFrame with the cleaned variable column

    Notes
    -----
    Whitespace-only names are NOT collapsed to null — presumably the
    source sheets never contain them; confirm if that assumption changes.
    """
    normalized = pl.col(col_name).cast(pl.Utf8).replace("", None)
    return df.with_columns(normalized.alias(col_name))
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _clean_label_column(df: pl.DataFrame, col_name: str) -> pl.DataFrame:
    """
    Normalize a label column so every row carries a plain string label.

    Casts to Utf8, replaces nulls with "", and strips surrounding
    whitespace — the resulting column contains no nulls.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    col_name : str
        Name of the label column to clean

    Returns
    -------
    pl.DataFrame
        DataFrame with the cleaned label column
    """
    normalized = pl.col(col_name).cast(pl.Utf8).fill_null("").str.strip_chars()
    return df.with_columns(normalized.alias(col_name))
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _filter_valid_rows(df: pl.DataFrame, required_col: str) -> pl.DataFrame:
    """
    Keep only rows whose required column holds a non-null value.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    required_col : str
        Column that must not be null

    Returns
    -------
    pl.DataFrame
        Filtered DataFrame
    """
    predicate = pl.col(required_col).is_not_null()
    return df.filter(predicate)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# =============================================================================
|
|
186
|
+
# Value Conversion (Vectorized)
|
|
187
|
+
# =============================================================================
|
|
188
|
+
|
|
189
|
+
def _convert_values_vectorized(df: pl.DataFrame, value_col: str) -> pl.DataFrame:
    """
    Derive typed helper columns from the value column (vectorized).

    Adds three working columns that later stages consume:

    - ``_val_str``   : trimmed string form of the value
    - ``_val_float`` : Float64 parse of the string (null if not numeric)
    - ``_val_int``   : Int64 form, only where the float is integer-like

    ``_is_int`` is an intermediate flag marking integer-like floats.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    value_col : str
        Name of the value column to convert

    Returns
    -------
    pl.DataFrame
        DataFrame augmented with ``_val_str``, ``_val_float``, ``_is_int``
        and ``_val_int`` columns
    """
    # Each with_columns stage depends on the column created by the previous
    # one, so the calls are chained rather than merged.
    return (
        df
        .with_columns(
            pl.col(value_col).cast(pl.Utf8).str.strip_chars().alias("_val_str")
        )
        .with_columns(
            pl.col("_val_str").cast(pl.Float64, strict=False).alias("_val_float")
        )
        .with_columns(
            # Null floats propagate null here; the when() below treats
            # null as "not an int", matching the original semantics.
            (pl.col("_val_float") == pl.col("_val_float").floor()).alias("_is_int")
        )
        .with_columns(
            pl.when(pl.col("_is_int") & pl.col("_val_float").is_not_null())
            .then(pl.col("_val_float").cast(pl.Int64))
            .otherwise(None)
            .alias("_val_int")
        )
    )
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _extract_typed_value(row: dict) -> int | float | str | None:
|
|
238
|
+
"""
|
|
239
|
+
Extract the properly typed value from a row with conversion columns.
|
|
240
|
+
|
|
241
|
+
Parameters
|
|
242
|
+
----------
|
|
243
|
+
row : dict
|
|
244
|
+
Row dictionary with _val_int, _val_float, _val_str columns
|
|
245
|
+
|
|
246
|
+
Returns
|
|
247
|
+
-------
|
|
248
|
+
int | float | str | None
|
|
249
|
+
The value in its appropriate type
|
|
250
|
+
"""
|
|
251
|
+
if row.get("_val_int") is not None:
|
|
252
|
+
return int(row["_val_int"])
|
|
253
|
+
elif row.get("_val_float") is not None:
|
|
254
|
+
return float(row["_val_float"])
|
|
255
|
+
elif row.get("_val_str") is not None and row["_val_str"] != "":
|
|
256
|
+
return str(row["_val_str"])
|
|
257
|
+
return None
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# =============================================================================
|
|
261
|
+
# Duplicate Detection (Vectorized)
|
|
262
|
+
# =============================================================================
|
|
263
|
+
|
|
264
|
+
def _count_duplicates(df: pl.DataFrame, cols: list[str]) -> int:
    """
    Count the number of duplicate groups (groups with more than 1 row).

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    cols : list[str]
        Columns to group by for duplicate detection

    Returns
    -------
    int
        Number of duplicate groups (not the number of duplicate rows)
    """
    sizes = df.group_by(cols).agg(pl.len().alias("_count"))
    offenders = sizes.filter(pl.col("_count") > 1)
    return offenders.height
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
# =============================================================================
|
|
285
|
+
# Column Labels Processing
|
|
286
|
+
# =============================================================================
|
|
287
|
+
|
|
288
|
+
def _validate_column_labels_schema(df: pl.DataFrame, var_col: str, label_col: str) -> None:
    """
    Validate that required columns exist in the column labels DataFrame.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    var_col : str
        Expected variable column name
    label_col : str
        Expected label column name

    Raises
    ------
    ValueError
        If either required column is missing
    """
    present = set(df.columns)
    if not {var_col, label_col}.issubset(present):
        raise ValueError(f"Column labels sheet must have '{var_col}' and '{label_col}' columns")
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _process_column_labels_df(
    df: pl.DataFrame,
    var_col: str,
    label_col: str
) -> tuple[ColumnLabelsDict, int, int, int]:
    """
    Turn a raw column-labels DataFrame into a {variable: label} dict.

    Cleans both columns, drops rows lacking a variable name, and reports
    basic quality statistics alongside the mapping.

    Parameters
    ----------
    df : pl.DataFrame
        Raw column labels DataFrame
    var_col : str
        Variable column name
    label_col : str
        Label column name

    Returns
    -------
    tuple[ColumnLabelsDict, int, int, int]
        (column_labels_dict, initial_rows, removed_rows, duplicate_count)
    """
    total_rows = df.height

    # Normalize both columns, then drop rows without a variable name.
    prepared = _clean_label_column(_clean_variable_column(df, var_col), label_col)
    valid = _filter_valid_rows(prepared, var_col)
    dropped = total_rows - valid.height

    # Duplicated variable names: later rows silently win in the dict below.
    dup_groups = _count_duplicates(valid, [var_col])

    mapping: ColumnLabelsDict = {
        record[var_col]: record[label_col]
        for record in valid.select([var_col, label_col]).iter_rows(named=True)
    }

    return mapping, total_rows, dropped, dup_groups
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
# =============================================================================
|
|
354
|
+
# Value Labels Processing
|
|
355
|
+
# =============================================================================
|
|
356
|
+
|
|
357
|
+
def _validate_value_labels_schema(
    df: pl.DataFrame,
    var_col: str,
    value_col: str,
    label_col: str
) -> None:
    """
    Validate that required columns exist in the value labels DataFrame.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    var_col : str
        Expected variable column name
    value_col : str
        Expected value column name
    label_col : str
        Expected label column name

    Raises
    ------
    ValueError
        If any required column is missing; the message lists the gaps
    """
    missing = [name for name in (var_col, value_col, label_col) if name not in df.columns]
    if missing:
        raise ValueError(f"Value labels sheet missing columns: {missing}")
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def _process_value_labels_df(
    df: pl.DataFrame,
    var_col: str,
    value_col: str,
    label_col: str
) -> tuple[ValueLabelsDict, int, int, int]:
    """
    Turn a raw value-labels DataFrame into a nested {variable: {value: label}} dict.

    Pipeline: clean the variable and label columns, drop rows missing a
    variable or value, derive typed value columns, drop rows whose value
    failed conversion, then assemble the nested mapping.

    Parameters
    ----------
    df : pl.DataFrame
        Raw value labels DataFrame
    var_col : str
        Variable column name
    value_col : str
        Value column name
    label_col : str
        Label column name

    Returns
    -------
    tuple[ValueLabelsDict, int, int, int]
        (value_labels_dict, initial_rows, removed_rows, duplicate_count)
    """
    total_rows = df.height

    df = _clean_variable_column(df, var_col)
    df = _clean_label_column(df, label_col)

    # Both the variable and the raw value must be present.
    df = df.filter(
        pl.col(var_col).is_not_null() & pl.col(value_col).is_not_null()
    )

    # Add _val_int / _val_float / _val_str helper columns (vectorized).
    df = _convert_values_vectorized(df, value_col)

    # Keep rows where at least one typed representation survived.
    has_value = (
        pl.col("_val_int").is_not_null()
        | pl.col("_val_float").is_not_null()
        | (pl.col("_val_str").is_not_null() & (pl.col("_val_str") != ""))
    )
    usable = df.filter(has_value)

    dropped = total_rows - usable.height

    # Duplicates are judged on variable + original value string,
    # i.e. before type conversion.
    dup_groups = _count_duplicates(usable, [var_col, "_val_str"])

    labels_by_var: ValueLabelsDict = {}
    wanted = [var_col, label_col, "_val_int", "_val_float", "_val_str"]
    for record in usable.select(wanted).iter_rows(named=True):
        typed_value = _extract_typed_value(record)
        if typed_value is not None:
            labels_by_var.setdefault(record[var_col], {})[typed_value] = record[label_col]

    return labels_by_var, total_rows, dropped, dup_groups
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
# =============================================================================
|
|
463
|
+
# Output Formatting (Pure Functions)
|
|
464
|
+
# =============================================================================
|
|
465
|
+
|
|
466
|
+
def _format_column_dict_lines(
    data: ColumnLabelsDict,
    dict_name: str,
    quote_style: str,
    indent: str
) -> list[str]:
    """
    Render the column labels dictionary as Python source lines.

    Parameters
    ----------
    data : ColumnLabelsDict
        Column labels dictionary
    dict_name : str
        Name for the dictionary variable
    quote_style : str
        Quote style for values
    indent : str
        Indentation string

    Returns
    -------
    list[str]
        Lines of Python code

    Notes
    -----
    NOTE(review): labels or variable names containing the quote sequence
    itself would yield invalid Python output — assumed not to occur in
    the source sheets; confirm upstream if that changes.
    """
    entries = [
        f"{indent}'{variable}': {quote_style}{label}{quote_style},"
        for variable, label in data.items()
    ]
    return [f"{dict_name} = {{", *entries, "}"]
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
def _format_value_dict_lines(
    data: ValueLabelsDict,
    dict_name: str,
    quote_style: str,
    indent: str
) -> list[str]:
    """
    Render the nested value labels dictionary as Python source lines.

    String keys are single-quoted; numeric keys are emitted bare. The
    output starts with two blank lines so it can follow another block.

    Parameters
    ----------
    data : ValueLabelsDict
        Value labels dictionary
    dict_name : str
        Name for the dictionary variable
    quote_style : str
        Quote style for label values
    indent : str
        Indentation string

    Returns
    -------
    list[str]
        Lines of Python code
    """
    rendered: list[str] = [f"\n\n{dict_name} = {{"]
    inner = indent + indent
    for variable, value_dict in data.items():
        rendered.append(f"{indent}'{variable}': {{")
        for value, label in value_dict.items():
            key_repr = f"'{value}'" if isinstance(value, str) else str(value)
            rendered.append(f"{inner}{key_repr}: {quote_style}{label}{quote_style},")
        rendered.append(f"{indent}}},")
    rendered.append("}")
    return rendered
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
def _save_combined_output(
    col_labels: ColumnLabelsDict,
    val_labels: ValueLabelsDict,
    output_path: str,
    col_dict_name: str,
    value_dict_name: str,
    col_quote_style: str,
    value_quote_style: str,
    indent: str,
    encoding: str
) -> Path:
    """
    Write both dictionaries to a single Python file.

    Creates parent directories as needed; the column-labels dict comes
    first, followed by the value-labels dict.

    Parameters
    ----------
    col_labels : ColumnLabelsDict
        Column labels dictionary
    val_labels : ValueLabelsDict
        Value labels dictionary
    output_path : str
        Output file path
    col_dict_name : str
        Name for column labels dictionary
    value_dict_name : str
        Name for value labels dictionary
    col_quote_style : str
        Quote style for column labels
    value_quote_style : str
        Quote style for value labels
    indent : str
        Indentation string
    encoding : str
        File encoding

    Returns
    -------
    Path
        Path to the saved file
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    lines = _format_column_dict_lines(col_labels, col_dict_name, col_quote_style, indent)
    lines += _format_value_dict_lines(val_labels, value_dict_name, value_quote_style, indent)

    destination.write_text("\n".join(lines), encoding=encoding)
    return destination
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
# =============================================================================
|
|
588
|
+
# Summary Helpers
|
|
589
|
+
# =============================================================================
|
|
590
|
+
|
|
591
|
+
def _get_longest_labels(col_labels: ColumnLabelsDict, top_n: int = 3) -> list[tuple[str, str, int]]:
    """
    Return the variables with the longest (non-empty) labels.

    Parameters
    ----------
    col_labels : ColumnLabelsDict
        Column labels dictionary
    top_n : int, default 3
        Number of top results to return

    Returns
    -------
    list[tuple[str, str, int]]
        List of (variable, label, length) tuples, longest first
    """
    candidates = [
        (name, text, len(text))
        for name, text in col_labels.items()
        if text  # skip empty labels entirely
    ]
    candidates.sort(key=lambda item: item[2], reverse=True)
    return candidates[:top_n]
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
def _get_top_value_label_counts(val_labels: ValueLabelsDict, top_n: int = 5) -> list[tuple[str, int]]:
    """
    Return the variables carrying the most value labels.

    Parameters
    ----------
    val_labels : ValueLabelsDict
        Value labels dictionary
    top_n : int, default 5
        Number of top results to return

    Returns
    -------
    list[tuple[str, int]]
        List of (variable, count) tuples, largest count first
    """
    tally = [(name, len(mapping)) for name, mapping in val_labels.items()]
    tally.sort(key=lambda pair: pair[1], reverse=True)
    return tally[:top_n]
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
# =============================================================================
|
|
632
|
+
# Main Function
|
|
633
|
+
# =============================================================================
|
|
634
|
+
|
|
635
|
+
def make_labels(
    input_path: str,
    output_path: str | None = None,
    col_label_sheet: str = 'col_label',
    value_label_sheet: str = 'value_label',
    col_dict_name: str = 'user_column_labels',
    value_dict_name: str = 'user_variable_value_labels',
    col_quote_style: str = "'''",
    value_quote_style: str = "'''",
    indent: str = "    ",
    encoding: str = 'utf-8',
    col_variable_column: str = 'variable',
    col_label_column: str = 'label',
    value_variable_column: str = 'variable',
    value_value_column: str = 'value',
    value_label_column: str = 'label',
    verbose: bool = False
) -> tuple[ColumnLabelsDict, ValueLabelsDict]:
    """
    Transform Excel file with two sheets into a single Python file containing both
    column labels and value labels dictionaries.

    This v2 version uses Polars for high-performance vectorized operations.

    Parameters
    ----------
    input_path : str
        Path to input Excel file with two sheets: one for column labels, one for value labels
    output_path : str, optional
        Path where the output Python file will be saved. If None, no file is written (default: None)
    col_label_sheet : str, optional
        Name of the sheet containing column labels (default: 'col_label')
    value_label_sheet : str, optional
        Name of the sheet containing value labels (default: 'value_label')
    col_dict_name : str, optional
        Name of the column labels dictionary in output (default: 'user_column_labels')
    value_dict_name : str, optional
        Name of the value labels dictionary in output (default: 'user_variable_value_labels')
    col_quote_style : str, optional
        Quote style for column labels in output (default: triple quotes "'''")
    value_quote_style : str, optional
        Quote style for value labels in output (default: triple quotes "'''")
    indent : str, optional
        Indentation for dictionary items (default: 4 spaces)
    encoding : str, optional
        File encoding (default: 'utf-8')
    col_variable_column : str, optional
        Name of the column containing variable names in col_label sheet (default: 'variable')
    col_label_column : str, optional
        Name of the column containing labels in col_label sheet (default: 'label')
    value_variable_column : str, optional
        Name of the column containing variable names in value_label sheet (default: 'variable')
    value_value_column : str, optional
        Name of the column containing values in value_label sheet (default: 'value')
    value_label_column : str, optional
        Name of the column containing labels in value_label sheet (default: 'label')
    verbose : bool, optional
        Whether to print progress messages (default: False)

    Returns
    -------
    tuple[ColumnLabelsDict, ValueLabelsDict]
        Tuple containing (column_labels_dict, value_labels_dict)

    Raises
    ------
    FileNotFoundError
        If input_path does not exist.
    ValueError
        If the input is not an .xlsx/.xls file, a required sheet is absent,
        or a sheet is missing its required columns.

    Examples
    --------
    >>> # Basic usage
    >>> col_labels, val_labels = make_labels(
    ...     input_path="label_mapping.xlsx",
    ...     output_path="label_mapping.py"
    ... )

    >>> # Return only (no file output)
    >>> col_labels, val_labels = make_labels(
    ...     input_path="label_mapping.xlsx"
    ... )

    >>> # Custom configuration
    >>> col_labels, val_labels = make_labels(
    ...     input_path="mappings.xlsx",
    ...     output_path="all_labels.py",
    ...     col_label_sheet="columns",
    ...     value_label_sheet="values",
    ...     col_dict_name="column_labels",
    ...     value_dict_name="value_labels"
    ... )
    """

    def _print(msg: str) -> None:
        """Conditional print helper."""
        if verbose:
            print(msg)

    # NOTE(review): the decorative characters in the status strings below
    # ("š", "ā") look mojibake-garbled (likely originally emoji) — they are
    # runtime output and are left byte-for-byte as found; verify encoding.
    try:
        _print("=" * 60)
        _print("COMBINED LABEL MAKER (v2 - Polars) - Starting Processing")
        _print("=" * 60)
        _print(f"Input file: {input_path}")
        _print(f"Output file: {output_path if output_path else 'None (return only)'}")

        # Validate input file
        file_path = Path(input_path)
        if not file_path.exists():
            raise FileNotFoundError(f"Input file not found: {file_path}")

        if file_path.suffix.lower() not in ['.xlsx', '.xls']:
            raise ValueError(f"Input must be an Excel file, got: {file_path.suffix}")

        # Get available sheets
        available_sheets = _get_sheet_names(file_path)
        _print(f"\nAvailable sheets: {available_sheets}")

        # Validate required sheets
        if col_label_sheet not in available_sheets:
            raise ValueError(f"Sheet '{col_label_sheet}' not found. Available: {available_sheets}")
        if value_label_sheet not in available_sheets:
            raise ValueError(f"Sheet '{value_label_sheet}' not found. Available: {available_sheets}")

        # Process column labels
        _print("\nš Processing Column Labels Sheet...")
        df_col = _read_excel_sheet(file_path, col_label_sheet)
        _validate_column_labels_schema(df_col, col_variable_column, col_label_column)

        column_labels, col_initial, col_removed, col_dups = _process_column_labels_df(
            df_col, col_variable_column, col_label_column
        )

        _print(f"  Found {col_initial} rows")
        if col_removed > 0:
            _print(f"  ā Removed {col_removed} rows with null variable names")
        if col_dups > 0:
            _print(f"  ā Warning: Found {col_dups} duplicate variable names")
        empty_labels = sum(1 for label in column_labels.values() if label == "")
        _print(f"  ā Processed {len(column_labels)} column labels ({empty_labels} empty)")

        # Process value labels
        _print("\nš Processing Value Labels Sheet...")
        df_val = _read_excel_sheet(file_path, value_label_sheet)
        _validate_value_labels_schema(df_val, value_variable_column, value_value_column, value_label_column)

        value_labels, val_initial, val_removed, val_dups = _process_value_labels_df(
            df_val, value_variable_column, value_value_column, value_label_column
        )

        _print(f"  Found {val_initial} rows")
        if val_removed > 0:
            _print(f"  ā Removed {val_removed} invalid rows")
        if val_dups > 0:
            _print(f"  ā Warning: Found {val_dups} duplicate variable-value pairs")
        total_labels = sum(len(labels) for labels in value_labels.values())
        _print(f"  ā Processed {len(value_labels)} variables with {total_labels} total value labels")

        # Save output if path provided
        if output_path:
            saved_path = _save_combined_output(
                column_labels, value_labels, output_path,
                col_dict_name, value_dict_name,
                col_quote_style, value_quote_style,
                indent, encoding
            )
            _print(f"\nā Combined dictionaries saved to: {saved_path}")

        # Print summary
        _print("\n" + "=" * 60)
        _print("PROCESSING SUMMARY")
        _print("=" * 60)
        _print(f"Column Labels Dictionary: {len(column_labels)} variables")
        _print(f"Value Labels Dictionary: {len(value_labels)} variables")

        # Show longest labels
        if column_labels:
            longest = _get_longest_labels(column_labels)
            if longest:
                _print(f"\nVariables with longest labels:")
                for i, (var, label, length) in enumerate(longest, 1):
                    preview = label[:50] + "..." if len(label) > 50 else label
                    _print(f"  {i}. {var}: {length} chars - '{preview}'")

        # Show top value label counts
        if value_labels:
            top_counts = _get_top_value_label_counts(value_labels)
            _print(f"\nTop 5 variables by value label count:")
            for i, (var, count) in enumerate(top_counts, 1):
                _print(f"  {i}. {var}: {count} labels")

        _print("=" * 60)
        if output_path:
            _print("š Combined label transformation and file save completed successfully!")
        else:
            _print("š Combined label transformation completed successfully! (No file saved)")

        return column_labels, value_labels

    except Exception as e:
        # Surface the failure in verbose mode, then re-raise unchanged
        # so callers see the original exception type.
        _print(f"\nā Error during transformation: {str(e)}")
        raise
|
|
831
|
+
|
|
832
|
+
|
|
833
|
+
# Public API: only make_labels is exported via `from ... import *`;
# every other definition in this module is a private helper.
__all__ = ["make_labels"]
|