ultrasav 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,833 @@
1
+ """
2
+ Make Labels Module (v2 - Polars Backend)
3
+ =========================================
4
+ A high-performance utility module for transforming Excel files containing label mappings
5
+ into Python dictionaries using Polars for vectorized operations.
6
+
7
+ This module provides the make_labels function to:
8
+ - Read column labels and value labels from Excel sheets using Polars
9
+ - Generate Python dictionary files for SPSS metadata labeling
10
+ - Support customizable sheet names, column names, and output formatting
11
+
12
+ Version: 2.0.0
13
+ Dependencies: polars, pathlib
14
+ """
15
+
16
+ import polars as pl
17
+ from pathlib import Path
18
+
19
+
20
+ # =============================================================================
21
+ # Type Aliases (for clarity)
22
+ # =============================================================================
23
# Maps variable name -> human-readable column label.
ColumnLabelsDict = dict[str, str]
# Maps variable name -> {coded value -> label}; coded values may be int, float, or str.
ValueLabelsDict = dict[str, dict[int | float | str, str]]
25
+
26
+
27
+ # =============================================================================
28
+ # Excel Reading (adapted from def_read_files.py)
29
+ # =============================================================================
30
+
31
def _read_excel_sheet(file_path: Path, sheet_name: str, engine: str = "calamine") -> pl.DataFrame:
    """
    Read a single Excel sheet into a Polars DataFrame with engine fallback.

    Parameters
    ----------
    file_path : Path
        Path to the Excel file
    sheet_name : str
        Name of the sheet to read
    engine : str, default "calamine"
        Excel engine to use. Falls back to openpyxl, then xlsx2csv on failure.

    Returns
    -------
    pl.DataFrame
        The sheet data as a Polars DataFrame

    Raises
    ------
    ValueError
        If every engine in the fallback chain fails to read the sheet.
    """
    file_str = str(file_path)
    # De-duplicate while preserving order so a caller-supplied engine that is
    # already in the fallback chain (e.g. "openpyxl") is not retried twice.
    engines = list(dict.fromkeys([engine, "openpyxl", "xlsx2csv"]))

    last_error: Exception | None = None
    for eng in engines:
        try:
            return pl.read_excel(file_str, sheet_name=sheet_name, engine=eng)
        except Exception as e:
            # Remember the most recent failure so the final error is informative.
            last_error = e
            continue

    raise ValueError(f"Failed to read sheet '{sheet_name}' from '{file_path}'. Last error: {last_error}")
61
+
62
+
63
def _get_sheet_names(file_path: Path) -> list[str]:
    """
    Get all sheet names from an Excel file.

    Uses Polars' sheet_id=0 to load all sheets as a dict, then extracts keys.
    Falls back to fastexcel or openpyxl if needed.

    Parameters
    ----------
    file_path : Path
        Path to the Excel file

    Returns
    -------
    list[str]
        List of sheet names

    Raises
    ------
    ValueError
        If every strategy (Polars engines, fastexcel, openpyxl) fails.
    """
    file_str = str(file_path)

    # Primary: Use Polars with sheet_id=0 (returns {sheetname: DataFrame, ...} dict)
    engines = ["calamine", "openpyxl", "xlsx2csv"]
    for engine in engines:
        try:
            sheets_dict = pl.read_excel(file_str, sheet_id=0, engine=engine)
            return list(sheets_dict.keys())
        except Exception:
            continue

    # Fallback: Try fastexcel directly (calamine's Python binding).
    # NOTE: `except Exception` already covers ImportError; the original
    # `except (ImportError, Exception)` tuple was redundant.
    try:
        import fastexcel
        excel_file = fastexcel.read_excel(file_str)
        return excel_file.sheet_names
    except Exception:
        pass

    # Last resort: openpyxl directly
    try:
        from openpyxl import load_workbook
        wb = load_workbook(file_path, read_only=True, data_only=True)
        names = wb.sheetnames
        wb.close()
        return names
    except Exception as e:
        raise ValueError(
            f"Failed to read sheet names from '{file_path}'. "
            f"Ensure the file exists and is a valid Excel file. Error: {e}"
        )
111
+
112
+
113
+ # =============================================================================
114
+ # Data Cleaning Helpers (Pure Functions)
115
+ # =============================================================================
116
+
117
def _clean_variable_column(df: pl.DataFrame, col_name: str) -> pl.DataFrame:
    """
    Normalize a variable-name column in place.

    The column is cast to string and empty strings are replaced with null so
    that downstream filtering treats "" and missing values identically.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    col_name : str
        Name of the variable column to clean

    Returns
    -------
    pl.DataFrame
        DataFrame with the cleaned variable column
    """
    normalized = pl.col(col_name).cast(pl.Utf8).replace("", None)
    return df.with_columns(normalized.alias(col_name))
139
+
140
+
141
def _clean_label_column(df: pl.DataFrame, col_name: str) -> pl.DataFrame:
    """
    Normalize a label column in place.

    The column is cast to string, nulls become empty strings, and surrounding
    whitespace is stripped — so every label is a (possibly empty) clean string.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    col_name : str
        Name of the label column to clean

    Returns
    -------
    pl.DataFrame
        DataFrame with the cleaned label column
    """
    normalized = (
        pl.col(col_name)
        .cast(pl.Utf8)
        .fill_null("")
        .str.strip_chars()
    )
    return df.with_columns(normalized.alias(col_name))
164
+
165
+
166
def _filter_valid_rows(df: pl.DataFrame, required_col: str) -> pl.DataFrame:
    """
    Keep only rows where the required column holds a non-null value.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    required_col : str
        Column that must not be null

    Returns
    -------
    pl.DataFrame
        Filtered DataFrame
    """
    keep_mask = pl.col(required_col).is_not_null()
    return df.filter(keep_mask)
183
+
184
+
185
+ # =============================================================================
186
+ # Value Conversion (Vectorized)
187
+ # =============================================================================
188
+
189
def _convert_values_vectorized(df: pl.DataFrame, value_col: str) -> pl.DataFrame:
    """
    Derive typed helper columns for the value column (vectorized int > float > str).

    Adds four helper columns, each step building on the previous one:
      _val_str   -- stripped string form of the raw value
      _val_float -- Float64 parse of _val_str (null where parsing failed)
      _is_int    -- whether _val_float is integer-valued
      _val_int   -- Int64 form where _is_int holds, else null

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    value_col : str
        Name of the value column to convert

    Returns
    -------
    pl.DataFrame
        DataFrame augmented with the helper columns above
    """
    as_text = pl.col(value_col).cast(pl.Utf8).str.strip_chars()
    # Each with_columns stage may reference columns created by the previous
    # stage, hence the chained calls rather than a single expression list.
    return (
        df.with_columns(as_text.alias("_val_str"))
        .with_columns(
            pl.col("_val_str").cast(pl.Float64, strict=False).alias("_val_float")
        )
        .with_columns(
            (pl.col("_val_float") == pl.col("_val_float").floor()).alias("_is_int")
        )
        .with_columns(
            pl.when(pl.col("_is_int") & pl.col("_val_float").is_not_null())
            .then(pl.col("_val_float").cast(pl.Int64))
            .otherwise(None)
            .alias("_val_int")
        )
    )
235
+
236
+
237
+ def _extract_typed_value(row: dict) -> int | float | str | None:
238
+ """
239
+ Extract the properly typed value from a row with conversion columns.
240
+
241
+ Parameters
242
+ ----------
243
+ row : dict
244
+ Row dictionary with _val_int, _val_float, _val_str columns
245
+
246
+ Returns
247
+ -------
248
+ int | float | str | None
249
+ The value in its appropriate type
250
+ """
251
+ if row.get("_val_int") is not None:
252
+ return int(row["_val_int"])
253
+ elif row.get("_val_float") is not None:
254
+ return float(row["_val_float"])
255
+ elif row.get("_val_str") is not None and row["_val_str"] != "":
256
+ return str(row["_val_str"])
257
+ return None
258
+
259
+
260
+ # =============================================================================
261
+ # Duplicate Detection (Vectorized)
262
+ # =============================================================================
263
+
264
def _count_duplicates(df: pl.DataFrame, cols: list[str]) -> int:
    """
    Count key groups that occur more than once.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    cols : list[str]
        Columns to group by for duplicate detection

    Returns
    -------
    int
        Number of groups with more than one row
    """
    group_sizes = df.group_by(cols).agg(pl.len().alias("_n"))
    duplicated = group_sizes.filter(pl.col("_n") > 1)
    return duplicated.height
282
+
283
+
284
+ # =============================================================================
285
+ # Column Labels Processing
286
+ # =============================================================================
287
+
288
def _validate_column_labels_schema(df: pl.DataFrame, var_col: str, label_col: str) -> None:
    """
    Ensure the column-labels sheet exposes both required columns.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    var_col : str
        Expected variable column name
    label_col : str
        Expected label column name

    Raises
    ------
    ValueError
        If either required column is missing
    """
    present = df.columns
    if not (var_col in present and label_col in present):
        raise ValueError(f"Column labels sheet must have '{var_col}' and '{label_col}' columns")
308
+
309
+
310
def _process_column_labels_df(
    df: pl.DataFrame,
    var_col: str,
    label_col: str
) -> tuple[ColumnLabelsDict, int, int, int]:
    """
    Turn a raw column-labels DataFrame into a {variable: label} dict (pure function).

    Parameters
    ----------
    df : pl.DataFrame
        Raw column labels DataFrame
    var_col : str
        Variable column name
    label_col : str
        Label column name

    Returns
    -------
    tuple[ColumnLabelsDict, int, int, int]
        (column_labels_dict, initial_rows, removed_rows, duplicate_count)
    """
    total_rows = df.height

    # Normalize both columns, then drop rows without a variable name.
    cleaned = _filter_valid_rows(
        _clean_label_column(_clean_variable_column(df, var_col), label_col),
        var_col,
    )
    dropped = total_rows - cleaned.height

    # Duplicate variable names are reported, not merged (last one wins below).
    dup_groups = _count_duplicates(cleaned, [var_col])

    # iter_rows() yields (variable, label) tuples; dict() keeps the last
    # occurrence for duplicate keys, matching an assignment loop.
    mapping: ColumnLabelsDict = dict(
        cleaned.select([var_col, label_col]).iter_rows()
    )

    return mapping, total_rows, dropped, dup_groups
351
+
352
+
353
+ # =============================================================================
354
+ # Value Labels Processing
355
+ # =============================================================================
356
+
357
def _validate_value_labels_schema(
    df: pl.DataFrame,
    var_col: str,
    value_col: str,
    label_col: str
) -> None:
    """
    Ensure the value-labels sheet exposes all three required columns.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame
    var_col : str
        Expected variable column name
    value_col : str
        Expected value column name
    label_col : str
        Expected label column name

    Raises
    ------
    ValueError
        If any required column is missing
    """
    missing = [name for name in (var_col, value_col, label_col) if name not in df.columns]
    if missing:
        raise ValueError(f"Value labels sheet missing columns: {missing}")
386
+
387
+
388
def _process_value_labels_df(
    df: pl.DataFrame,
    var_col: str,
    value_col: str,
    label_col: str
) -> tuple[ValueLabelsDict, int, int, int]:
    """
    Turn a raw value-labels DataFrame into a nested {variable: {value: label}} dict.

    Parameters
    ----------
    df : pl.DataFrame
        Raw value labels DataFrame
    var_col : str
        Variable column name
    value_col : str
        Value column name
    label_col : str
        Label column name

    Returns
    -------
    tuple[ValueLabelsDict, int, int, int]
        (value_labels_dict, initial_rows, removed_rows, duplicate_count)
    """
    total_rows = df.height

    # Normalize variable/label columns, then drop rows missing either the
    # variable name or the raw value.
    prepared = _clean_label_column(_clean_variable_column(df, var_col), label_col).filter(
        pl.col(var_col).is_not_null() & pl.col(value_col).is_not_null()
    )

    # Derive the typed helper columns (_val_str/_val_float/_val_int) and keep
    # only rows where at least one usable representation survived.
    typed = _convert_values_vectorized(prepared, value_col)
    usable = typed.filter(
        pl.col("_val_int").is_not_null()
        | pl.col("_val_float").is_not_null()
        | (pl.col("_val_str").is_not_null() & (pl.col("_val_str") != ""))
    )

    dropped = total_rows - usable.height

    # Duplicates are judged on the raw string form, before type coercion.
    dup_groups = _count_duplicates(usable, [var_col, "_val_str"])

    # Build the nested dictionary; setdefault creates each inner dict lazily.
    nested: ValueLabelsDict = {}
    wanted = [var_col, label_col, "_val_int", "_val_float", "_val_str"]
    for record in usable.select(wanted).iter_rows(named=True):
        coerced = _extract_typed_value(record)
        if coerced is not None:
            nested.setdefault(record[var_col], {})[coerced] = record[label_col]

    return nested, total_rows, dropped, dup_groups
460
+
461
+
462
+ # =============================================================================
463
+ # Output Formatting (Pure Functions)
464
+ # =============================================================================
465
+
466
def _format_column_dict_lines(
    data: ColumnLabelsDict,
    dict_name: str,
    quote_style: str,
    indent: str
) -> list[str]:
    """
    Render the column-labels dictionary as lines of Python source.

    Parameters
    ----------
    data : ColumnLabelsDict
        Column labels dictionary
    dict_name : str
        Name for the dictionary variable
    quote_style : str
        Quote style wrapped around each label value
    indent : str
        Indentation string

    Returns
    -------
    list[str]
        Lines of Python code
    """
    entries = [
        f"{indent}'{variable}': {quote_style}{label}{quote_style},"
        for variable, label in data.items()
    ]
    return [f"{dict_name} = {{", *entries, "}"]
496
+
497
+
498
def _format_value_dict_lines(
    data: ValueLabelsDict,
    dict_name: str,
    quote_style: str,
    indent: str
) -> list[str]:
    """
    Render the nested value-labels dictionary as lines of Python source.

    Parameters
    ----------
    data : ValueLabelsDict
        Value labels dictionary
    dict_name : str
        Name for the dictionary variable
    quote_style : str
        Quote style wrapped around each label value
    indent : str
        Indentation string

    Returns
    -------
    list[str]
        Lines of Python code (leading blank lines separate it from prior output)
    """
    inner_indent = indent * 2
    out = [f"\n\n{dict_name} = {{"]
    for variable, mapping in data.items():
        out.append(f"{indent}'{variable}': {{")
        for value, label in mapping.items():
            # String keys are quoted; numeric keys are emitted bare.
            shown_key = f"'{value}'" if isinstance(value, str) else f"{value}"
            out.append(f"{inner_indent}{shown_key}: {quote_style}{label}{quote_style},")
        out.append(f"{indent}}},")
    out.append("}")
    return out
532
+
533
+
534
def _save_combined_output(
    col_labels: ColumnLabelsDict,
    val_labels: ValueLabelsDict,
    output_path: str,
    col_dict_name: str,
    value_dict_name: str,
    col_quote_style: str,
    value_quote_style: str,
    indent: str,
    encoding: str
) -> Path:
    """
    Write both dictionaries into a single Python source file.

    Parameters
    ----------
    col_labels : ColumnLabelsDict
        Column labels dictionary
    val_labels : ValueLabelsDict
        Value labels dictionary
    output_path : str
        Output file path (parent directories are created as needed)
    col_dict_name : str
        Name for column labels dictionary
    value_dict_name : str
        Name for value labels dictionary
    col_quote_style : str
        Quote style for column labels
    value_quote_style : str
        Quote style for value labels
    indent : str
        Indentation string
    encoding : str
        File encoding

    Returns
    -------
    Path
        Path to the saved file
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Column-labels block first, then the value-labels block (which begins
    # with blank lines to separate the two dictionaries).
    rendered = (
        _format_column_dict_lines(col_labels, col_dict_name, col_quote_style, indent)
        + _format_value_dict_lines(val_labels, value_dict_name, value_quote_style, indent)
    )

    # Joined without a trailing newline, matching the historical output.
    target.write_text("\n".join(rendered), encoding=encoding)

    return target
585
+
586
+
587
+ # =============================================================================
588
+ # Summary Helpers
589
+ # =============================================================================
590
+
591
def _get_longest_labels(col_labels: ColumnLabelsDict, top_n: int = 3) -> list[tuple[str, str, int]]:
    """
    Return the variables whose labels are longest.

    Empty labels are ignored; ties keep their original dictionary order
    (sorted() is stable).

    Parameters
    ----------
    col_labels : ColumnLabelsDict
        Column labels dictionary
    top_n : int, default 3
        Number of top results to return

    Returns
    -------
    list[tuple[str, str, int]]
        List of (variable, label, length) tuples, longest first
    """
    ranked = sorted(
        ((var, label, len(label)) for var, label in col_labels.items() if label),
        key=lambda entry: entry[2],
        reverse=True,
    )
    return ranked[:top_n]
609
+
610
+
611
def _get_top_value_label_counts(val_labels: ValueLabelsDict, top_n: int = 5) -> list[tuple[str, int]]:
    """
    Return the variables that carry the most value labels.

    Parameters
    ----------
    val_labels : ValueLabelsDict
        Value labels dictionary
    top_n : int, default 5
        Number of top results to return

    Returns
    -------
    list[tuple[str, int]]
        List of (variable, count) tuples, largest count first
    """
    tallies = ((var, len(mapping)) for var, mapping in val_labels.items())
    return sorted(tallies, key=lambda pair: pair[1], reverse=True)[:top_n]
629
+
630
+
631
+ # =============================================================================
632
+ # Main Function
633
+ # =============================================================================
634
+
635
def make_labels(
    input_path: str,
    output_path: str | None = None,
    col_label_sheet: str = 'col_label',
    value_label_sheet: str = 'value_label',
    col_dict_name: str = 'user_column_labels',
    value_dict_name: str = 'user_variable_value_labels',
    col_quote_style: str = "'''",
    value_quote_style: str = "'''",
    indent: str = " ",
    encoding: str = 'utf-8',
    col_variable_column: str = 'variable',
    col_label_column: str = 'label',
    value_variable_column: str = 'variable',
    value_value_column: str = 'value',
    value_label_column: str = 'label',
    verbose: bool = False
) -> tuple[ColumnLabelsDict, ValueLabelsDict]:
    """
    Transform Excel file with two sheets into a single Python file containing both
    column labels and value labels dictionaries.

    This v2 version uses Polars for high-performance vectorized operations.

    Parameters
    ----------
    input_path : str
        Path to input Excel file with two sheets: one for column labels, one for value labels
    output_path : str, optional
        Path where the output Python file will be saved. If None, no file is written (default: None)
    col_label_sheet : str, optional
        Name of the sheet containing column labels (default: 'col_label')
    value_label_sheet : str, optional
        Name of the sheet containing value labels (default: 'value_label')
    col_dict_name : str, optional
        Name of the column labels dictionary in output (default: 'user_column_labels')
    value_dict_name : str, optional
        Name of the value labels dictionary in output (default: 'user_variable_value_labels')
    col_quote_style : str, optional
        Quote style for column labels in output (default: triple quotes "'''")
    value_quote_style : str, optional
        Quote style for value labels in output (default: triple quotes "'''")
    indent : str, optional
        Indentation for dictionary items
    encoding : str, optional
        File encoding (default: 'utf-8')
    col_variable_column : str, optional
        Name of the column containing variable names in col_label sheet (default: 'variable')
    col_label_column : str, optional
        Name of the column containing labels in col_label sheet (default: 'label')
    value_variable_column : str, optional
        Name of the column containing variable names in value_label sheet (default: 'variable')
    value_value_column : str, optional
        Name of the column containing values in value_label sheet (default: 'value')
    value_label_column : str, optional
        Name of the column containing labels in value_label sheet (default: 'label')
    verbose : bool, optional
        Whether to print progress messages (default: False)

    Returns
    -------
    tuple[ColumnLabelsDict, ValueLabelsDict]
        Tuple containing (column_labels_dict, value_labels_dict)

    Raises
    ------
    FileNotFoundError
        If input_path does not exist.
    ValueError
        If the input is not an Excel file, a required sheet is missing, or a
        sheet lacks its required columns.

    Examples
    --------
    >>> # Basic usage
    >>> col_labels, val_labels = make_labels(
    ...     input_path="label_mapping.xlsx",
    ...     output_path="label_mapping.py"
    ... )

    >>> # Return only (no file output)
    >>> col_labels, val_labels = make_labels(
    ...     input_path="label_mapping.xlsx"
    ... )

    >>> # Custom configuration
    >>> col_labels, val_labels = make_labels(
    ...     input_path="mappings.xlsx",
    ...     output_path="all_labels.py",
    ...     col_label_sheet="columns",
    ...     value_label_sheet="values",
    ...     col_dict_name="column_labels",
    ...     value_dict_name="value_labels"
    ... )
    """
    # NOTE(review): the `indent` default is a single space here, while earlier
    # docs described it as 4 spaces — confirm the intended default.

    def _print(msg: str) -> None:
        """Conditional print helper."""
        if verbose:
            print(msg)

    try:
        _print("=" * 60)
        _print("COMBINED LABEL MAKER (v2 - Polars) - Starting Processing")
        _print("=" * 60)
        _print(f"Input file: {input_path}")
        _print(f"Output file: {output_path if output_path else 'None (return only)'}")

        # Validate input file existence and extension before touching engines.
        file_path = Path(input_path)
        if not file_path.exists():
            raise FileNotFoundError(f"Input file not found: {file_path}")

        if file_path.suffix.lower() not in ('.xlsx', '.xls'):
            raise ValueError(f"Input must be an Excel file, got: {file_path.suffix}")

        # Get available sheets
        available_sheets = _get_sheet_names(file_path)
        _print(f"\nAvailable sheets: {available_sheets}")

        # Validate required sheets up front so errors name the missing sheet.
        if col_label_sheet not in available_sheets:
            raise ValueError(f"Sheet '{col_label_sheet}' not found. Available: {available_sheets}")
        if value_label_sheet not in available_sheets:
            raise ValueError(f"Sheet '{value_label_sheet}' not found. Available: {available_sheets}")

        # Process column labels
        _print("\nšŸ“Š Processing Column Labels Sheet...")
        df_col = _read_excel_sheet(file_path, col_label_sheet)
        _validate_column_labels_schema(df_col, col_variable_column, col_label_column)

        column_labels, col_initial, col_removed, col_dups = _process_column_labels_df(
            df_col, col_variable_column, col_label_column
        )

        _print(f" Found {col_initial} rows")
        if col_removed > 0:
            _print(f" ⚠ Removed {col_removed} rows with null variable names")
        if col_dups > 0:
            _print(f" ⚠ Warning: Found {col_dups} duplicate variable names")
        empty_labels = sum(1 for label in column_labels.values() if label == "")
        _print(f" āœ“ Processed {len(column_labels)} column labels ({empty_labels} empty)")

        # Process value labels
        _print("\nšŸ“Š Processing Value Labels Sheet...")
        df_val = _read_excel_sheet(file_path, value_label_sheet)
        _validate_value_labels_schema(df_val, value_variable_column, value_value_column, value_label_column)

        value_labels, val_initial, val_removed, val_dups = _process_value_labels_df(
            df_val, value_variable_column, value_value_column, value_label_column
        )

        _print(f" Found {val_initial} rows")
        if val_removed > 0:
            _print(f" ⚠ Removed {val_removed} invalid rows")
        if val_dups > 0:
            _print(f" ⚠ Warning: Found {val_dups} duplicate variable-value pairs")
        total_labels = sum(len(labels) for labels in value_labels.values())
        _print(f" āœ“ Processed {len(value_labels)} variables with {total_labels} total value labels")

        # Save output if path provided
        if output_path:
            saved_path = _save_combined_output(
                column_labels, value_labels, output_path,
                col_dict_name, value_dict_name,
                col_quote_style, value_quote_style,
                indent, encoding
            )
            _print(f"\nāœ“ Combined dictionaries saved to: {saved_path}")

        # Print summary
        _print("\n" + "=" * 60)
        _print("PROCESSING SUMMARY")
        _print("=" * 60)
        _print(f"Column Labels Dictionary: {len(column_labels)} variables")
        _print(f"Value Labels Dictionary: {len(value_labels)} variables")

        # Show longest labels
        if column_labels:
            longest = _get_longest_labels(column_labels)
            if longest:
                # (was an f-string with no placeholders — plain literal suffices)
                _print("\nVariables with longest labels:")
                for i, (var, label, length) in enumerate(longest, 1):
                    preview = label[:50] + "..." if len(label) > 50 else label
                    _print(f" {i}. {var}: {length} chars - '{preview}'")

        # Show top value label counts
        if value_labels:
            top_counts = _get_top_value_label_counts(value_labels)
            _print("\nTop 5 variables by value label count:")
            for i, (var, count) in enumerate(top_counts, 1):
                _print(f" {i}. {var}: {count} labels")

        _print("=" * 60)
        if output_path:
            _print("šŸŽ‰ Combined label transformation and file save completed successfully!")
        else:
            _print("šŸŽ‰ Combined label transformation completed successfully! (No file saved)")

        return column_labels, value_labels

    except Exception as e:
        # Boundary handler: surface the failure to the user, then re-raise
        # so callers still see the original exception.
        _print(f"\nāŒ Error during transformation: {str(e)}")
        raise
831
+
832
+
833
# Public API: only the main entry point is exported via `from module import *`.
__all__ = ["make_labels"]