ultrasav 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
+ """
+ metaman - The Metadata Superhero 🦸
+ ===================================
+ Metadata inspection, extraction, and reporting tools for ultrasav.
+
+ This submodule handles all metadata-related operations including:
+ - Extracting metadata to Python files (get_meta)
+ - Creating label dictionaries from Excel (make_labels)
+ - Building validation datamaps (make_datamap)
+ - Exporting formatted Excel reports (map_to_excel)
+ - Detecting variable types (detect_variable_type)
+
+ All public functions are also available at the top-level ultrasav namespace.
+
+ Examples
+ --------
+ >>> import ultrasav as ul
+ >>>
+ >>> # Extract metadata to a Python file
+ >>> meta_dict = ul.get_meta(meta, output_path="labels.py")
+ >>>
+ >>> # Create labels from Excel template
+ >>> ul.make_labels("template.xlsx", "labels.py")
+ >>>
+ >>> # Build a validation datamap
+ >>> datamap = ul.make_datamap(df, meta)
+ >>>
+ >>> # Export to formatted Excel
+ >>> ul.map_to_excel(datamap, "validation.xlsx")
+
+ Or access via submodule:
+ >>> from ultrasav.metaman import make_datamap, get_color_scheme
+ """
+
+ # Metadata extraction
+ from .def_get_meta import get_meta
+
+ # Label creation from Excel
+ from .def_make_labels import make_labels
+
+ # Variable type detection
+ from .def_detect_variable_type import detect_variable_type, create_mr_set_lookup
+
+ # Datamap creation
+ from .def_map_engine import map_engine, precompute_value_maps
+ from .def_make_datamap import make_datamap
+
+ # Excel export
+ from .def_write_excel_engine import write_excel_engine
+ from .def_map_to_excel import map_to_excel
+
+ # Color schemes for Excel formatting
+ from .pastel_color_schemes import (
+     get_color_scheme,
+     COLOR_SCHEMES,
+     CLASSIC_GREY,
+     PASTEL_GREEN_MUTED,
+     PASTEL_BLUE_COOL,
+     PASTEL_PURPLE_WARM,
+     PASTEL_INDIGO,
+ )
+
+ __all__ = [
+     # Metadata extraction
+     "get_meta",
+
+     # Label creation
+     "make_labels",
+
+     # Variable type detection
+     "detect_variable_type",
+     # "create_mr_set_lookup",
+
+     # Datamap creation
+     "make_datamap",
+     # "map_engine",
+     "precompute_value_maps",
+
+     # Excel export
+     "map_to_excel",
+     # "write_excel_engine",
+
+     # Color schemes
+     "get_color_scheme",
+     "COLOR_SCHEMES",
+     "CLASSIC_GREY",
+     "PASTEL_GREEN_MUTED",
+     "PASTEL_BLUE_COOL",
+     "PASTEL_PURPLE_WARM",
+     "PASTEL_INDIGO",
+ ]
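
For orientation, here is a minimal end-to-end sketch of how these exports are typically combined. This is an editorial illustration, not part of the packaged file: it assumes an SPSS file (the hypothetical survey.sav) read with pyreadstat, which is where the meta object used throughout metaman comes from; the ultrasav calls themselves mirror the docstring examples above.

import pyreadstat
import ultrasav as ul

# Hypothetical input file; read_sav returns (dataframe, metadata)
df, meta = pyreadstat.read_sav("survey.sav")

# Extract metadata to a Python file, build a validation datamap,
# and export it as a formatted Excel report (per the docstring above)
meta_dict = ul.get_meta(meta, output_path="labels.py")
datamap = ul.make_datamap(df, meta)
ul.map_to_excel(datamap, "validation.xlsx")
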
@@ -0,0 +1,454 @@
+ """
+ Multi-Select Variable Detection Algorithm v3
+ ============================================
+
+ Returns one of:
+ - 'multi-select'
+ - 'single-select'
+ - 'numeric'
+ - 'text'
+ - 'date'
+
+ New in v3:
+ - Centralized MR set lookup via create_mr_set_lookup
+ - Safer binary detection (fixed operator precedence)
+ - Optional unique_value_map for performance
+ - Optional strict_multi flag to control how aggressive multi-select detection is
+ - Optional explain flag to return (var_type, reason) instead of just var_type
+ - Removed ALLOWED_EXTRA_MULTI_CODES filtering - actual data values are checked as-is
+ """
+
+ import narwhals as nw
+ from narwhals.typing import FrameT
+ import polars as pl
+ import pandas as pd
+ from typing import Any
+ import re
+
+ # ---------------------------------------------------------------------------
+ # Configurable "magic" lists and patterns
+ # ---------------------------------------------------------------------------
+
+ SELECTION_PAIRS = [
+     ("not selected", "selected"),
+     ("unchecked", "checked"),
+     ("no", "yes"),
+     ("0", "1"),
+     ("not mentioned", "mentioned"),
+     ("not chosen", "chosen"),
+     ("exclude", "include"),
+ ]
+
+ GENERIC_BINARY_LABELS = [
+     ("no", "yes"),
+     ("false", "true"),
+     ("disagree", "agree"),
+     ("male", "female"),
+     ("off", "on"),
+     ("absent", "present"),
+ ]
+
+ MULTI_SELECT_NAME_PATTERNS = [
+     r"[_\-]?\d+$",  # ends with number
+     r"Q\d+[A-Z]$",  # Q1A pattern
+     r"r\d+$",  # r1 pattern
+     r"_[A-Z]$",  # _A pattern
+     r"[A-Z]\d+[A-Z]\d+$",  # A1B1 pattern
+ ]
+
+
+ # ---------------------------------------------------------------------------
+ # Helpers
+ # ---------------------------------------------------------------------------
+
+ def create_mr_set_lookup(meta) -> set[str]:
+     """
+     Create a set of all variables that belong to multi-response sets.
+
+     Parameters
+     ----------
+     meta : pyreadstat metadata object
+
+     Returns
+     -------
+     set[str]
+         Set of variable names that are part of multi-response sets.
+     """
+     mr_set_variables: set[str] = set()
+
+     if hasattr(meta, "mr_sets") and meta.mr_sets:
+         for mr_set_name, mr_set_info in meta.mr_sets.items():
+             if "variable_list" in mr_set_info:
+                 mr_set_variables.update(mr_set_info["variable_list"])
+
+     return mr_set_variables
+
+
+ def _normalize_value_keys(keys: set[Any]) -> set[Any]:
+     """
+     Normalize value label keys so that 0/1, 0.0/1.0, and "0"/"1" all map to ints.
+     Other keys are left as-is.
+     """
+     normalized: set[Any] = set()
+     for k in keys:
+         if isinstance(k, (int, float)) and k in [0, 1, 0.0, 1.0]:
+             normalized.add(int(k))
+         elif isinstance(k, str) and k in {"0", "1"}:
+             normalized.add(int(k))
+         else:
+             normalized.add(k)
+     return normalized
+
+
+ def _is_binary_value_dict(value_dict: dict[Any, str]) -> bool:
+     """
+     Check if a value label dict represents a 0/1 binary variable.
+     """
+     if len(value_dict) != 2:
+         return False
+     keys = set(value_dict.keys())
+     normalized = _normalize_value_keys(keys)
+     return normalized <= {0, 1}
+
+
+ def _labels_lower_pair(value_dict: dict[Any, str]) -> tuple[str, str]:
+     """
+     Get labels for 0 and 1 (or their float/string equivalents), lowercased and stripped.
+     """
+     label_0 = str(
+         value_dict.get(0, value_dict.get(0.0, value_dict.get("0", "")))
+     ).lower().strip()
+     label_1 = str(
+         value_dict.get(1, value_dict.get(1.0, value_dict.get("1", "")))
+     ).lower().strip()
+     return label_0, label_1
+
+
+ def _is_generic_binary_labels(label_0: str, label_1: str) -> bool:
+     """
+     Check if a (label_0, label_1) pair matches generic binary patterns
+     like (no, yes), (false, true), (male, female), etc.
+     """
+     labels_set_lower = {label_0.lower(), label_1.lower()}
+     for pair in GENERIC_BINARY_LABELS:
+         if labels_set_lower == {p.lower() for p in pair}:
+             return True
+     return False
+
+
+ def _get_sibling_vars(meta, var_name: str) -> list[str]:
+     """
+     Find variables that share the same base as var_name (e.g. Q4A, Q4B...).
+     """
+     base_match = re.match(r"(.+?)([A-Z]|\d+)$", var_name, re.IGNORECASE)
+     if not base_match:
+         return []
+     base = base_match.group(1)
+     return [
+         v for v in getattr(meta, "column_names", [])
+         if v.startswith(base) and v != var_name
+     ]
+
+
+ def _match_multi_name_pattern(var_name: str) -> bool:
+     """
+     Check if variable name matches any multi-select-like naming patterns.
+     """
+     for pattern in MULTI_SELECT_NAME_PATTERNS:
+         if re.search(pattern, var_name, re.IGNORECASE):
+             return True
+     return False
+
+
+ def _get_unique_values_for_var(
+     df: FrameT,
+     var_name: str,
+     unique_value_map: dict[str, set[Any]] | None = None,
+ ) -> set[Any]:
+     """
+     Get unique values for a variable, using an optional precomputed map
+     to avoid recomputing on every call.
+     """
+     if unique_value_map is not None and var_name in unique_value_map:
+         return unique_value_map[var_name]
+
+     df_nw = nw.from_native(df)
+     unique_vals_df = df_nw.select(nw.col(var_name)).unique()
+     unique_vals_native = nw.to_native(unique_vals_df)
+
+     if isinstance(unique_vals_native, pl.DataFrame):
+         unique_set = set(unique_vals_native[var_name].to_list())
+     else:  # pandas
+         unique_set = set(unique_vals_native[var_name].tolist())
+
+     if unique_value_map is not None:
+         unique_value_map[var_name] = unique_set
+
+     return unique_set
+
+
+ # ---------------------------------------------------------------------------
+ # Main detection function
+ # ---------------------------------------------------------------------------
+
+ def detect_variable_type(
+     var_name: str,
+     meta,
+     mr_set_variables: set[str] | None = None,
+     df: FrameT | None = None,
+     *,
+     unique_value_map: dict[str, set[Any]] | None = None,
+     strict_multi: bool = True,
+     explain: bool = False,
+ ):
+     """
+     Detect the type of a variable: single-select, multi-select, numeric, text, or date.
+
+     Parameters
+     ----------
+     var_name : str
+         The variable name to classify.
+     meta : pyreadstat metadata object
+         The metadata object from pyreadstat.
+     mr_set_variables : set[str] | None
+         Pre-computed set of all variables that are part of multi-response sets.
+         If None, it will be computed via create_mr_set_lookup(meta).
+     df : FrameT | None
+         Optional dataframe for data-based type detection.
+     unique_value_map : dict[str, set[Any]] | None, keyword-only
+         Optional precomputed map of column -> unique values set, for performance.
+     strict_multi : bool, keyword-only
+         If True (default), require metadata evidence, series evidence, OR
+         unlabeled variable status before using data patterns to classify as multi-select.
+         If False, any variable with matching 0/1 data pattern can be classified as multi-select.
+     explain : bool, keyword-only
+         If False (default), return only the type string.
+         If True, return a tuple: (type_str, reason_str).
+
+     Returns
+     -------
+     str or (str, str)
+         If explain=False: one of 'multi-select', 'single-select', 'numeric', 'text', 'date'.
+         If explain=True: (type_str, reason_str).
+     """
+
+     def _ret(type_str: str, reason: str):
+         return (type_str, reason) if explain else type_str
+
+     # Build the mr_set_variables if not provided
+     if mr_set_variables is None:
+         mr_set_variables = create_mr_set_lookup(meta)
+
+     # Get variable value labels if available
+     variable_value_labels: dict[str, dict[Any, str]] = (
+         meta.variable_value_labels if hasattr(meta, "variable_value_labels") else {}
+     )
+
+     # Get readstat variable types
+     readstat_types: dict[str, str] = (
+         meta.readstat_variable_types if hasattr(meta, "readstat_variable_types") else {}
+     )
+     var_type: str | None = readstat_types.get(var_name, None)
+
+     # Get original variable types (SPSS format types)
+     original_types: dict[str, str] = (
+         meta.original_variable_types if hasattr(meta, "original_variable_types") else {}
+     )
+     original_type: str = original_types.get(var_name, "")
+
+     # Get variable measure (SPSS measurement level)
+     variable_measure: dict[str, str] = (
+         meta.variable_measure if hasattr(meta, "variable_measure") else {}
+     )
+     measure: str = variable_measure.get(var_name, "unknown")
+
+     # ------------------------------------------------------------------------
+     # STEP 1: String/Text Check (highest priority)
+     # ------------------------------------------------------------------------
+     if var_type == "string":
+         return _ret("text", "STEP 1: readstat type is 'string'")
+
+     # ------------------------------------------------------------------------
+     # STEP 2: SPSS Multi-Response Set Check
+     # ------------------------------------------------------------------------
+     if var_name in mr_set_variables:
+         return _ret("multi-select", "STEP 2: variable is in meta.mr_sets")
+
+     # ------------------------------------------------------------------------
+     # STEP 3: DataFrame-based Detection with Metadata Gating
+     # ------------------------------------------------------------------------
+     if df is not None:
+         try:
+             metadata_confirms_01_coding = False
+             series_confirms_01_coding = False
+
+             if var_name in variable_value_labels:
+                 keys = set(variable_value_labels[var_name].keys())
+                 normalized_keys = _normalize_value_keys(keys)
+                 if normalized_keys <= {0, 1}:
+                     metadata_confirms_01_coding = True
+
+             if not metadata_confirms_01_coding:
+                 # Check series context
+                 sibling_vars = _get_sibling_vars(meta, var_name)
+                 if len(sibling_vars) >= 2:
+                     siblings_with_01_coding = 0
+                     for sibling_var in sibling_vars[:5]:
+                         if sibling_var in variable_value_labels:
+                             sibling_keys = set(variable_value_labels[sibling_var].keys())
+                             sibling_norm = _normalize_value_keys(sibling_keys)
+                             if sibling_norm <= {0, 1}:
+                                 siblings_with_01_coding += 1
+                     if siblings_with_01_coding >= 2:
+                         series_confirms_01_coding = True
+
+             df_nw = nw.from_native(df)
+             schema = df_nw.schema
+
+             # Only attempt df-based pattern check on non-string columns
+             if schema.get(var_name) != nw.String:
+                 unique_set = _get_unique_values_for_var(
+                     df=df,
+                     var_name=var_name,
+                     unique_value_map=unique_value_map,
+                 )
+                 # Remove nulls
+                 unique_set_no_null = {
+                     v
+                     for v in unique_set
+                     if v is not None and not (isinstance(v, float) and pd.isna(v))
+                 }
+
+                 # Multi-select patterns (with both int and float variants)
+                 multi_select_patterns = [
+                     {0, 1},
+                     {0.0, 1.0},
+                     {1},
+                     {1.0},
+                     {0},
+                     {0.0},
+                 ]
+
+                 # Check actual data values as-is (no filtering)
+                 pattern_match = unique_set_no_null in multi_select_patterns
+
+                 # Gating logic:
+                 # - metadata_confirms_01_coding: variable's own labels use 0/1 coding
+                 # - series_confirms_01_coding: sibling variables use 0/1 coding
+                 # - var_name not in variable_value_labels: unlabeled variables always proceed
+                 #   (unlabeled 0/1 binary indicators are likely multi-select)
+                 # - strict_multi=False: allows labeled variables without 0/1 evidence to proceed
+                 gated_ok = (
+                     metadata_confirms_01_coding
+                     or series_confirms_01_coding
+                     or var_name not in variable_value_labels
+                     or not strict_multi
+                 )
+
+                 if gated_ok and pattern_match:
+                     reason_parts = []
+                     if metadata_confirms_01_coding:
+                         reason_parts.append("metadata confirms 0/1 coding")
+                     if series_confirms_01_coding:
+                         reason_parts.append("series context confirms 0/1 coding")
+                     if var_name not in variable_value_labels:
+                         reason_parts.append("unlabeled variable")
+                     if not strict_multi:
+                         reason_parts.append("strict_multi=False")
+
+                     return _ret(
+                         "multi-select",
+                         f"STEP 3: df values match 0/1 multi-select pattern ({', '.join(reason_parts)})",
+                     )
+         except Exception:
+             # Any error in df logic -> fall back to meta-based detection
+             pass
+
+     # ------------------------------------------------------------------------
+     # STEP 4: Date/DateTime Check
+     # ------------------------------------------------------------------------
+     if isinstance(original_type, str) and (
+         "DATETIME" in original_type.upper()
+         or "DATE" in original_type.upper()
+         or "TIME" in original_type.upper()
+     ):
+         return _ret("date", "STEP 4: original SPSS type is date/time/datetime")
+
+     # ------------------------------------------------------------------------
+     # STEP 5: Value Label Analysis (for categorical variables)
+     # ------------------------------------------------------------------------
+     has_value_labels: bool = (
+         var_name in variable_value_labels and bool(variable_value_labels[var_name])
+     )
+
+     if has_value_labels:
+         value_dict: dict[Any, str] = variable_value_labels[var_name]
+
+         is_binary: bool = _is_binary_value_dict(value_dict)
+
+         if is_binary:
+             # TIER 2: Label analysis
+             label_0, label_1 = _labels_lower_pair(value_dict)
+
+             if (not label_0 or label_0 in ["null", "none", "not selected", ""]):
+                 # The "1" label is the actual option text
+                 if label_1 and label_1 not in ["yes", "selected", "true", "1"]:
+                     return _ret(
+                         "multi-select",
+                         "STEP 5 TIER 2: 0 label empty/null, 1 label descriptive",
+                     )
+
+             # TIER 3: Selection pair labels + variable naming patterns
+             labels_set_lower = {label_0, label_1}
+             for pair in SELECTION_PAIRS:
+                 if labels_set_lower == {p.lower() for p in pair}:
+                     if _match_multi_name_pattern(var_name):
+                         return _ret(
+                             "multi-select",
+                             "STEP 5 TIER 3: selection pair labels + multi-select name pattern",
+                         )
+
+             # TIER 3b: series context (all binary siblings)
+             sibling_vars = _get_sibling_vars(meta, var_name)
+             if len(sibling_vars) >= 2:
+                 all_binary = True
+                 for similar_var in sibling_vars[:3]:
+                     if similar_var in variable_value_labels:
+                         similar_dict = variable_value_labels[similar_var]
+                         if not _is_binary_value_dict(similar_dict):
+                             all_binary = False
+                             break
+                 if all_binary:
+                     return _ret(
+                         "multi-select",
+                         "STEP 5 TIER 3b: part of binary-coded series",
+                     )
+
+             # TIER 4: generic binary (yes/no, male/female, etc.) -> single-select
+             if _is_generic_binary_labels(label_0, label_1):
+                 return _ret(
+                     "single-select",
+                     "STEP 5 TIER 4: generic binary labels (yes/no, etc.)",
+                 )
+
+         # Non-binary categorical -> single-select
+         return _ret("single-select", "STEP 5: non-binary categorical with value labels")
+
+     # ------------------------------------------------------------------------
+     # STEP 6: Numeric Type Fallback
+     # ------------------------------------------------------------------------
+     if var_type in ["double", "numeric", "integer", "long"]:
+         return _ret("numeric", "STEP 6: numeric readstat type without value labels")
+
+     # ------------------------------------------------------------------------
+     # STEP 7: Measurement Level Fallback
+     # ------------------------------------------------------------------------
+     if measure == "scale":
+         return _ret("numeric", "STEP 7: measurement level 'scale'")
+     elif measure in ["nominal", "ordinal"]:
+         return _ret("single-select", "STEP 7: measurement level nominal/ordinal")
+
+     # ------------------------------------------------------------------------
+     # STEP 8: Final Fallback
+     # ------------------------------------------------------------------------
+     return _ret("numeric", "STEP 8: final fallback to numeric")