ultrasav-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,529 @@
import narwhals as nw
from narwhals.typing import FrameT
import polars as pl
import pandas as pd
from typing import Any
from .def_detect_variable_type import detect_variable_type, create_mr_set_lookup

# version_9

def precompute_value_maps(
    df: FrameT,
) -> tuple[
    dict[str, dict[Any, int]],
    dict[str, int],
    dict[str, int],
    dict[str, set[Any]],
]:
    """
    Precompute value counts, null counts, non-null counts, and unique values
    for each column in the dataframe.

    Hybrid design:
    - If the underlying native frame is Polars or Pandas, use an optimized
      backend-specific implementation (_precompute_value_maps_native).
    - Otherwise, fall back to a generic Narwhals-based implementation
      (_precompute_value_maps_narwhals), which should still be efficient and
      automatically benefit from fast backends like Polars.

    Parameters
    ----------
    df : FrameT
        Any Narwhals-compatible dataframe (Polars, Pandas, etc.).

    Returns
    -------
    value_counts_map : dict[str, dict[Any, int]]
        For each column, a dict of {value -> count} (excluding nulls).
    null_count_map : dict[str, int]
        For each column, the count of null values.
    non_null_count_map : dict[str, int]
        For each column, the count of non-null values.
    unique_value_map : dict[str, set[Any]]
        For each column, the set of unique non-null values.
        This map is what you pass into detect_variable_type to avoid
        recomputing uniques.
    """
    # Normalize to a Narwhals frame
    df_nw = nw.from_native(df)
    native = nw.to_native(df_nw)

    # Fast path for Polars / Pandas
    if isinstance(native, (pl.DataFrame, pd.DataFrame)):
        return _precompute_value_maps_native(native)

    # Generic Narwhals path (for other backends)
    return _precompute_value_maps_narwhals(df_nw)


def _precompute_value_maps_native(
    df_native: pl.DataFrame | pd.DataFrame,
) -> tuple[
    dict[str, dict[Any, int]],
    dict[str, int],
    dict[str, int],
    dict[str, set[Any]],
]:
    """
    Backend-specific fast implementation for Polars and Pandas.

    For string/text columns, empty strings "" are treated as missing (like nulls).
    Whitespace-only strings are treated as valid non-missing data.
    """
    value_counts_map: dict[str, dict[Any, int]] = {}
    null_count_map: dict[str, int] = {}
    non_null_count_map: dict[str, int] = {}
    unique_value_map: dict[str, set[Any]] = {}

    if isinstance(df_native, pl.DataFrame):
        for col in df_native.columns:
            s = df_native[col]

            # Check if column is string type
            is_string_col = s.dtype == pl.Utf8 or s.dtype == pl.String

            if is_string_col:
                # For string columns: treat nulls AND empty strings as missing
                actual_null_count = int(s.null_count())

                # Count empty strings (only exact "", not whitespace)
                empty_string_count = int(
                    s.drop_nulls().eq("").sum()
                )

                # Total "missing" = nulls + empty strings
                null_count_map[col] = actual_null_count + empty_string_count

                # Value counts for non-null, non-empty values
                s_valid = s.filter(s.is_not_null() & (s != ""))

                if s_valid.len() > 0:
                    vc_df = s_valid.value_counts()
                    cols = vc_df.columns
                    value_col = cols[0]
                    count_col = cols[1] if len(cols) > 1 else None

                    values = vc_df[value_col].to_list()
                    counts = vc_df[count_col].to_list() if count_col else [1] * len(values)
                    vc_dict = dict(zip(values, counts))
                else:
                    values = []
                    counts = []
                    vc_dict = {}

                value_counts_map[col] = vc_dict
                unique_value_map[col] = set(values)
                non_null_count_map[col] = int(sum(counts))

            else:
                # Non-string columns: original logic
                null_count = int(s.null_count())
                null_count_map[col] = null_count

                vc_df = s.drop_nulls().value_counts()
                if vc_df.height > 0:
                    cols = vc_df.columns
                    value_col = cols[0]
                    count_col = cols[1] if len(cols) > 1 else None

                    values = vc_df[value_col].to_list()
                    counts = vc_df[count_col].to_list() if count_col else [1] * len(values)
                    vc_dict = dict(zip(values, counts))
                else:
                    values = []
                    counts = []
                    vc_dict = {}

                value_counts_map[col] = vc_dict
                unique_value_map[col] = set(values)
                non_null_count_map[col] = int(sum(counts))

    elif isinstance(df_native, pd.DataFrame):
        for col in df_native.columns:
            s = df_native[col]

            # Check if column is string/object type
            is_string_col = s.dtype == "object" or pd.api.types.is_string_dtype(s)

            if is_string_col:
                # For string columns: treat nulls AND empty strings as missing
                actual_null_count = int(s.isna().sum())

                # Count empty strings (only exact "", not whitespace)
                non_null_mask = s.notna()
                empty_string_count = int((s[non_null_mask] == "").sum())

                null_count_map[col] = actual_null_count + empty_string_count

                # Value counts for non-null, non-empty values
                valid_mask = non_null_mask & (s != "")
                s_valid = s[valid_mask]

                vc = s_valid.value_counts(dropna=True)
                vc_dict = vc.to_dict()
                value_counts_map[col] = vc_dict
                unique_value_map[col] = set(vc.index.tolist())
                non_null_count_map[col] = int(vc.sum())

            else:
                # Non-string columns: original logic
                null_count = int(s.isna().sum())
                null_count_map[col] = null_count

                vc = s.value_counts(dropna=True)
                vc_dict = vc.to_dict()
                value_counts_map[col] = vc_dict
                unique_value_map[col] = set(vc.index.tolist())
                non_null_count_map[col] = int(vc.sum())
    else:
        raise ValueError(f"Unsupported native dataframe type: {type(df_native)}")

    return value_counts_map, null_count_map, non_null_count_map, unique_value_map


def _precompute_value_maps_narwhals(
    df_nw: FrameT,
) -> tuple[
    dict[str, dict[Any, int]],
    dict[str, int],
    dict[str, int],
    dict[str, set[Any]],
]:
    """
    Generic Narwhals implementation.

    For string/text columns, empty strings "" are treated as missing (like nulls).
    Whitespace-only strings are treated as valid non-missing data.
    """
    value_counts_map: dict[str, dict[Any, int]] = {}
    null_count_map: dict[str, int] = {}
    non_null_count_map: dict[str, int] = {}
    unique_value_map: dict[str, set[Any]] = {}

    schema = df_nw.schema  # dict-like: {column_name: dtype}

    for col in schema.keys():
        col_expr = nw.col(col)
        col_dtype = schema[col]

        # Check if column is string type
        is_string_col = col_dtype == nw.String

        if is_string_col:
            # For string columns: treat nulls AND empty strings as missing

            # Count actual nulls
            actual_null_count = int(
                df_nw.select(col_expr.is_null().sum().alias("n_null")).item(0, "n_null")
            )

            # Count empty strings (only exact "", not whitespace)
            empty_string_count = int(
                df_nw.filter(~col_expr.is_null())
                .select((col_expr == "").sum().alias("n_empty"))
                .item(0, "n_empty")
            )

            null_count_map[col] = actual_null_count + empty_string_count

            # Value counts for non-null, non-empty values
            vc_nw = (
                df_nw.filter(~col_expr.is_null() & (col_expr != ""))
                .select(col_expr.alias(col))
                .group_by(col)
                .agg(nw.col(col).count().alias("count"))
            )

            vc_native = nw.to_native(vc_nw)

            if isinstance(vc_native, pl.DataFrame):
                values = vc_native[col].to_list()
                counts = vc_native["count"].to_list()
            else:  # assume pandas-like
                values = vc_native[col].tolist()
                counts = vc_native["count"].tolist()

            vc_dict = dict(zip(values, counts))
            value_counts_map[col] = vc_dict
            unique_value_map[col] = set(values)
            non_null_count_map[col] = int(sum(counts))

        else:
            # Non-string columns: original logic
            null_count = int(
                df_nw.select(col_expr.is_null().sum().alias("n_null")).item(0, "n_null")
            )
            null_count_map[col] = null_count

            vc_nw = (
                df_nw.filter(~col_expr.is_null())
                .select(col_expr.alias(col))
                .group_by(col)
                .agg(nw.col(col).count().alias("count"))
            )

            vc_native = nw.to_native(vc_nw)

            if isinstance(vc_native, pl.DataFrame):
                values = vc_native[col].to_list()
                counts = vc_native["count"].to_list()
            else:  # assume pandas-like
                values = vc_native[col].tolist()
                counts = vc_native["count"].tolist()

            vc_dict = dict(zip(values, counts))
            value_counts_map[col] = vc_dict
            unique_value_map[col] = set(values)
            non_null_count_map[col] = int(sum(counts))

    return value_counts_map, null_count_map, non_null_count_map, unique_value_map


def merge_meta_and_actual_values(
    meta_values: dict[Any, str],
    actual_value_counts: dict[Any, int]
) -> list[tuple[Any, Any, int, bool]]:
    """
    Merge meta value labels with actual values found in data.
    Insert unlabeled values in proper numeric/string order.

    Parameters
    ----------
    meta_values : dict[Any, str]
        Value labels from meta (code -> label).
    actual_value_counts : dict[Any, int]
        Actual values and their counts from data.

    Returns
    -------
    list[tuple[Any, Any, int, bool]]
        List of (value_code, value_label, count, is_missing_label),
        sorted by value_code.

    Notes
    -----
    - All values in meta_values are included (even if count=0) for integrity.
    - Values in actual data but not in meta are also included (unlabeled values).
    """
    result: list[tuple[Any, Any, int, bool]] = []

    # All codes from meta + actual data
    all_codes: set[Any] = set(meta_values.keys()) | set(actual_value_counts.keys())

    # Sort codes robustly (mixed types handled via (type_name, str(value)))
    try:
        sorted_codes = sorted(all_codes)
    except TypeError:
        sorted_codes = sorted(all_codes, key=lambda x: (type(x).__name__, str(x)))

    for code in sorted_codes:
        if code in meta_values or code in actual_value_counts:
            label: str | None = meta_values.get(code, None)
            count: int = actual_value_counts.get(code, 0)
            is_missing_label: bool = code not in meta_values
            result.append((code, label, count, is_missing_label))

    return result


def map_engine(
    df: pl.DataFrame | pd.DataFrame,
    meta,
    output_format: str | None = None
) -> pl.DataFrame | pd.DataFrame:
    """
    Create a data validation core map from dataframe and pyreadstat meta object.

    This function serves as the core map engine that analyzes both metadata and
    actual data to produce a comprehensive mapping for data validation and analysis.

    It identifies:
    - Missing data (nulls)
    - Unlabeled values (values in data but not in meta)
    - Value distributions (counts for each value)

    Parameters
    ----------
    df : pl.DataFrame | pd.DataFrame
        The data dataframe (Polars or Pandas).
    meta : pyreadstat metadata object
        The metadata object returned by pyreadstat when reading SPSS files.
    output_format : str | None
        Output format - either "polars" or "pandas".
        If None, will match the input dataframe type.

    Returns
    -------
    pl.DataFrame | pd.DataFrame
        A dataframe with columns:
        - variable: variable name
        - variable_label: variable label text from meta
        - variable_type: variable type (single-select, multi-select, numeric, text, date)
        - value_code: value code (None for missing-data row, codes for categories)
        - value_label: value label ("NULL" for missing-data row, labels or None for unlabeled)
        - value_n: count of occurrences
    """

    # Determine output format
    if output_format is None:
        if isinstance(df, pl.DataFrame):
            output_format = "polars"
        elif isinstance(df, pd.DataFrame):
            output_format = "pandas"
        else:
            raise ValueError(f"Unsupported dataframe type: {type(df)}")

    if output_format not in {"polars", "pandas"}:
        raise ValueError(f"output_format must be 'polars' or 'pandas', got '{output_format}'")

    # Precompute MR set variables for efficiency
    mr_set_variables: set[str] = create_mr_set_lookup(meta)

    # Precompute value counts, null counts, non-null counts, and unique sets
    (
        value_counts_map,
        null_count_map,
        non_null_count_map,
        unique_value_map,
    ) = precompute_value_maps(df)

    # Initialize lists to store final map rows
    variables: list[str] = []
    variable_labels: list[str] = []
    variable_types: list[str] = []
    value_codes: list[Any] = []
    value_labels: list[Any] = []
    value_ns: list[int] = []

    # Meta helpers
    col_names_to_labels: dict[str, str] = meta.column_names_to_labels
    variable_value_labels: dict[str, dict[Any, str]] = (
        meta.variable_value_labels if hasattr(meta, "variable_value_labels") else {}
    )

    # Iterate through variables defined in meta
    for var_name in meta.column_names:
        # Variable label
        variable_label: str = col_names_to_labels.get(var_name, "")

        # Detect variable type using cached uniques
        var_type: str = detect_variable_type(
            var_name,
            meta,
            mr_set_variables=mr_set_variables,
            df=df,
            unique_value_map=unique_value_map,
        )

        # Pull precomputed counts
        value_count_dict: dict[Any, int] = value_counts_map.get(var_name, {})
        null_count: int = null_count_map.get(var_name, 0)
        non_null_count: int = non_null_count_map.get(var_name, 0)

        is_categorical: bool = var_type in ["single-select", "multi-select"]

        # STEP 1: Add missing data row if nulls exist
        if null_count > 0:
            variables.append(var_name)
            variable_labels.append(variable_label)
            variable_types.append(var_type)
            value_codes.append(None)
            value_labels.append("NULL")
            value_ns.append(null_count)

        # STEP 2: Categorical vs non-categorical handling
        if is_categorical:
            # Meta value labels for this variable
            meta_values: dict[Any, str] = variable_value_labels.get(var_name, {})

            # Merge meta and actual values
            merged_values = merge_meta_and_actual_values(meta_values, value_count_dict)

            for code, label, count, _is_missing_label in merged_values:
                variables.append(var_name)
                variable_labels.append(variable_label)
                variable_types.append(var_type)
                value_codes.append(code)
                value_labels.append(label)
                value_ns.append(count)
        else:
            # Non-categorical (numeric, text, date)
            if non_null_count > 0:
                variables.append(var_name)
                variable_labels.append(variable_label)
                variable_types.append(var_type)
                value_codes.append(None)
                value_labels.append(None)
                value_ns.append(non_null_count)

    # Build final core map dataframe
    if output_format == "polars":
        # Decide dtype for value_code column based on non-None codes
        non_none_codes = [v for v in value_codes if v is not None]

        if non_none_codes:
            try:
                numeric_values = [float(v) for v in non_none_codes]
                if all(v.is_integer() for v in numeric_values):
                    value_code_dtype = pl.Int64
                    value_codes_typed = [
                        int(float(v)) if v is not None else None for v in value_codes
                    ]
                else:
                    value_code_dtype = pl.Float64
                    value_codes_typed = [
                        float(v) if v is not None else None for v in value_codes
                    ]
            except (ValueError, TypeError):
                value_code_dtype = pl.Utf8
                value_codes_typed = [
                    str(v) if v is not None else None for v in value_codes
                ]
        else:
            value_code_dtype = pl.Float64
            value_codes_typed = value_codes
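        # Dtype decision, by example (illustrative): codes like [1.0, 2.0, 3.0] are
        # stored as Int64, codes like [1.0, 1.5] as Float64, and non-numeric codes
        # like ["A", "B"] fall back to Utf8 via the ValueError handler above.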
483
+
484
+ core_map = pl.DataFrame(
485
+ {
486
+ "variable": variables,
487
+ "variable_label": variable_labels,
488
+ "variable_type": variable_types,
489
+ "value_code": pl.Series(value_codes_typed, dtype=value_code_dtype),
490
+ "value_label": value_labels,
491
+ "value_n": value_ns,
492
+ }
493
+ )
494
+ else: # "pandas"
495
+ core_map = pd.DataFrame(
496
+ {
497
+ "variable": variables,
498
+ "variable_label": variable_labels,
499
+ "variable_type": variable_types,
500
+ "value_code": value_codes,
501
+ "value_label": value_labels,
502
+ "value_n": value_ns,
503
+ }
504
+ )
505
+
506
+ return core_map
507
+
508
+
# Example usage:
if __name__ == "__main__":
    import pyreadstat

    # Read SPSS file as Polars or Pandas (pyreadstat gives Pandas by default)
    df_pd, meta = pyreadstat.read_sav("your_file.sav", user_missing=True)

    # If you prefer Polars:
    df_pl = pl.from_pandas(df_pd)

    # Create core map (Polars)
    core_map_pl = map_engine(df_pl, meta)
    print(core_map_pl.head())

    # Save to files
    core_map_pl.write_excel("data_core_map_polars.xlsx")
    core_map_pl.write_csv("data_core_map_polars.csv")

    # Or explicitly ask for Pandas output
    core_map_pd = map_engine(df_pl, meta, output_format="pandas")
    core_map_pd.to_excel("data_core_map_pandas.xlsx", index=False)