ultrasav 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ultrasav/__init__.py +280 -0
- ultrasav/_add_cases.py +227 -0
- ultrasav/_data.py +513 -0
- ultrasav/_make_dummy.py +137 -0
- ultrasav/_merge_data.py +435 -0
- ultrasav/_merge_meta.py +280 -0
- ultrasav/_metadata.py +570 -0
- ultrasav/_read_files.py +558 -0
- ultrasav/_write_files.py +111 -0
- ultrasav/metaman/__init__.py +91 -0
- ultrasav/metaman/def_detect_variable_type.py +454 -0
- ultrasav/metaman/def_get_meta.py +561 -0
- ultrasav/metaman/def_make_datamap.py +127 -0
- ultrasav/metaman/def_make_labels.py +833 -0
- ultrasav/metaman/def_map_engine.py +529 -0
- ultrasav/metaman/def_map_to_excel.py +294 -0
- ultrasav/metaman/def_write_excel_engine.py +298 -0
- ultrasav/metaman/pastel_color_schemes.py +185 -0
- ultrasav-0.1.4.dist-info/METADATA +550 -0
- ultrasav-0.1.4.dist-info/RECORD +21 -0
- ultrasav-0.1.4.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,561 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Extract Metadict Module (v2 - Flat Format)
|
|
3
|
+
==========================================
|
|
4
|
+
A utility module for extracting SPSS metadata from pyreadstat meta objects
|
|
5
|
+
and saving as importable Python files with flat, pyreadstat-ready variables.
|
|
6
|
+
|
|
7
|
+
Output variables are directly usable with pyreadstat.write_sav():
|
|
8
|
+
- column_labels: {var: label, ...}
|
|
9
|
+
- variable_value_labels: {var: {val: label}, ...}
|
|
10
|
+
|
|
11
|
+
Dependencies: pathlib, datetime
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from datetime import datetime
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# =============================================================================
|
|
19
|
+
# Type Aliases
|
|
20
|
+
# =============================================================================
|
|
21
|
+
# {variable_name: variable_label} — mirrors pyreadstat's column_names_to_labels.
ColumnLabelsDict = dict[str, str]
# {variable_name: {value_code: value_label}} — mirrors variable_value_labels.
ValueLabelsDict = dict[str, dict[int | float | str, str]]
# Scalar file-level metadata (label, encoding, row/column counts, timestamps).
FileInfoDict = dict[str, str | int | None]
# Top-level return shape of get_meta(): the three dicts above keyed by name.
MetaDictFlat = dict[str, ColumnLabelsDict | ValueLabelsDict | FileInfoDict]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# =============================================================================
|
|
28
|
+
# Value Conversion Helper
|
|
29
|
+
# =============================================================================
|
|
30
|
+
|
|
31
|
+
def _convert_value(value):
|
|
32
|
+
"""
|
|
33
|
+
Convert values to serializable types (handles datetime, dict, list).
|
|
34
|
+
|
|
35
|
+
Uses inner function for recursion to avoid Marimo's function renaming.
|
|
36
|
+
|
|
37
|
+
Parameters
|
|
38
|
+
----------
|
|
39
|
+
value : any
|
|
40
|
+
Value to convert
|
|
41
|
+
|
|
42
|
+
Returns
|
|
43
|
+
-------
|
|
44
|
+
any
|
|
45
|
+
Converted value (datetime -> str, others unchanged)
|
|
46
|
+
"""
|
|
47
|
+
def convert_nested(obj):
|
|
48
|
+
"""Iteratively convert nested structures."""
|
|
49
|
+
if isinstance(obj, datetime):
|
|
50
|
+
return obj.strftime("%Y-%m-%d %H:%M:%S")
|
|
51
|
+
|
|
52
|
+
if not isinstance(obj, (dict, list)):
|
|
53
|
+
return obj
|
|
54
|
+
|
|
55
|
+
if isinstance(obj, dict):
|
|
56
|
+
result = {}
|
|
57
|
+
for k, v in obj.items():
|
|
58
|
+
result[k] = convert_nested(v) if isinstance(v, (dict, list, datetime)) else v
|
|
59
|
+
return result
|
|
60
|
+
else: # list
|
|
61
|
+
result = []
|
|
62
|
+
for item in obj:
|
|
63
|
+
result.append(convert_nested(item) if isinstance(item, (dict, list, datetime)) else item)
|
|
64
|
+
return result
|
|
65
|
+
|
|
66
|
+
return convert_nested(value)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# =============================================================================
|
|
70
|
+
# Metadata Extraction (Pure Functions)
|
|
71
|
+
# =============================================================================
|
|
72
|
+
|
|
73
|
+
def _extract_column_labels(meta) -> ColumnLabelsDict:
    """
    Extract column labels dict directly usable with pyreadstat.write_sav().

    Parameters
    ----------
    meta : pyreadstat.metadata_container
        The metadata object from pyreadstat.read_sav()

    Returns
    -------
    ColumnLabelsDict
        {variable_name: label, ...}
    """
    # Shallow-copy into a fresh dict so callers can mutate it freely
    # without touching the meta object's own mapping.
    return {name: label for name, label in meta.column_names_to_labels.items()}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _extract_variable_value_labels(meta) -> ValueLabelsDict:
    """
    Extract variable value labels dict directly usable with pyreadstat.write_sav().

    Parameters
    ----------
    meta : pyreadstat.metadata_container
        The metadata object from pyreadstat.read_sav()

    Returns
    -------
    ValueLabelsDict
        {variable_name: {value: label, ...}, ...}
    """
    raw_labels = meta.variable_value_labels
    # Route through the converter so any datetime values become strings.
    return _convert_value(raw_labels)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _extract_file_info(meta) -> FileInfoDict:
    """
    Extract general file information for reference.

    Parameters
    ----------
    meta : pyreadstat.metadata_container
        The metadata object from pyreadstat.read_sav()

    Returns
    -------
    FileInfoDict
        General file metadata
    """
    # Scalar attributes copied straight off the meta object.
    info: FileInfoDict = {
        "file_label": meta.file_label,
        "table_name": meta.table_name,
        "file_encoding": meta.file_encoding,
        "number_rows": meta.number_rows,
        "number_columns": meta.number_columns,
    }
    # These fields may hold datetime objects or nested containers, so they
    # go through the converter to become serializable.
    for attr in ("creation_time", "modification_time", "notes"):
        info[attr] = _convert_value(getattr(meta, attr))
    return info
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _extract_extended_metadata(meta) -> dict:
    """
    Extract extended metadata fields (for include_all=True).

    Parameters
    ----------
    meta : pyreadstat.metadata_container
        The metadata object from pyreadstat.read_sav()

    Returns
    -------
    dict
        Extended metadata fields
    """
    extended = {}

    # Plain per-variable mappings — copied into fresh dicts.
    for attr in (
        "variable_measure",
        "variable_display_width",
        "variable_storage_width",
        "variable_alignment",
        "original_variable_types",
        "readstat_variable_types",
    ):
        extended[attr] = dict(getattr(meta, attr))

    # Missing-value definitions may nest containers/datetimes — convert them.
    extended["missing_ranges"] = _convert_value(meta.missing_ranges)
    extended["missing_user_values"] = _convert_value(meta.missing_user_values)

    # Additional fields for metadata integrity (less commonly used)
    extended["column_names"] = list(meta.column_names)
    extended["column_labels"] = list(meta.column_labels)
    extended["value_labels"] = _convert_value(meta.value_labels)
    extended["variable_to_label"] = dict(meta.variable_to_label)

    return extended
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# =============================================================================
|
|
165
|
+
# Python Code Formatting
|
|
166
|
+
# =============================================================================
|
|
167
|
+
|
|
168
|
+
def _format_value_as_python(value, indent_level: int = 0) -> str:
|
|
169
|
+
"""
|
|
170
|
+
Format a Python value as valid Python code string.
|
|
171
|
+
|
|
172
|
+
Uses inner function for recursion to avoid Marimo's function renaming.
|
|
173
|
+
|
|
174
|
+
Parameters
|
|
175
|
+
----------
|
|
176
|
+
value : any
|
|
177
|
+
Value to format
|
|
178
|
+
indent_level : int
|
|
179
|
+
Current indentation level
|
|
180
|
+
|
|
181
|
+
Returns
|
|
182
|
+
-------
|
|
183
|
+
str
|
|
184
|
+
Python code representation of the value
|
|
185
|
+
"""
|
|
186
|
+
def fmt(val, level: int) -> str:
|
|
187
|
+
"""Inner recursive formatter."""
|
|
188
|
+
indent = " " * level
|
|
189
|
+
next_indent = " " * (level + 1)
|
|
190
|
+
|
|
191
|
+
if val is None:
|
|
192
|
+
return "None"
|
|
193
|
+
elif isinstance(val, bool):
|
|
194
|
+
return "True" if val else "False"
|
|
195
|
+
elif isinstance(val, (int, float)):
|
|
196
|
+
return repr(val)
|
|
197
|
+
elif isinstance(val, str):
|
|
198
|
+
# Use triple quotes for strings with newlines or quotes
|
|
199
|
+
if '\n' in val or "'" in val or '"' in val:
|
|
200
|
+
escaped = val.replace("'''", "\\'\\'\\'")
|
|
201
|
+
return f"'''{escaped}'''"
|
|
202
|
+
return repr(val)
|
|
203
|
+
elif isinstance(val, list):
|
|
204
|
+
if not val:
|
|
205
|
+
return "[]"
|
|
206
|
+
if len(val) <= 3 and all(isinstance(v, (int, float, str, bool, type(None))) for v in val):
|
|
207
|
+
items = ", ".join(fmt(v, 0) for v in val)
|
|
208
|
+
return f"[{items}]"
|
|
209
|
+
lines = ["["]
|
|
210
|
+
for item in val:
|
|
211
|
+
formatted = fmt(item, level + 1)
|
|
212
|
+
lines.append(f"{next_indent}{formatted},")
|
|
213
|
+
lines.append(f"{indent}]")
|
|
214
|
+
return "\n".join(lines)
|
|
215
|
+
elif isinstance(val, dict):
|
|
216
|
+
if not val:
|
|
217
|
+
return "{}"
|
|
218
|
+
lines = ["{"]
|
|
219
|
+
for k, v in val.items():
|
|
220
|
+
key_repr = repr(k)
|
|
221
|
+
val_repr = fmt(v, level + 1)
|
|
222
|
+
lines.append(f"{next_indent}{key_repr}: {val_repr},")
|
|
223
|
+
lines.append(f"{indent}}}")
|
|
224
|
+
return "\n".join(lines)
|
|
225
|
+
else:
|
|
226
|
+
return repr(val)
|
|
227
|
+
|
|
228
|
+
return fmt(value, indent_level)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _format_variable_assignment(var_name: str, data, comment: str | None = None) -> str:
    """
    Format a variable assignment as Python code.

    Parameters
    ----------
    var_name : str
        Variable name
    data : any
        Data to assign
    comment : str, optional
        Comment to add above the assignment

    Returns
    -------
    str
        Python code for the assignment
    """
    formatted = _format_value_as_python(data, 0)
    assignment = f"{var_name} = {formatted}"
    # Prepend a single '#' comment line when one was supplied.
    if comment:
        return f"# {comment}\n{assignment}"
    return assignment
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
# =============================================================================
|
|
258
|
+
# File Generation
|
|
259
|
+
# =============================================================================
|
|
260
|
+
|
|
261
|
+
def _generate_python_file_content(
    column_labels: ColumnLabelsDict,
    variable_value_labels: ValueLabelsDict,
    file_info: FileInfoDict,
    extended_metadata: dict | None = None,
    col_labels_var: str = "column_names_to_labels",
    val_labels_var: str = "variable_value_labels",
    file_info_var: str = "file_info",
) -> str:
    """
    Generate complete Python file content with flat, pyreadstat-ready format.

    Parameters
    ----------
    column_labels : ColumnLabelsDict
        Column labels dictionary
    variable_value_labels : ValueLabelsDict
        Variable value labels dictionary
    file_info : FileInfoDict
        File information dictionary
    extended_metadata : dict, optional
        Extended metadata (if include_all=True)
    col_labels_var : str
        Variable name for column labels
    val_labels_var : str
        Variable name for value labels
    file_info_var : str
        Variable name for file info

    Returns
    -------
    str
        Complete Python file content
    """
    lines: list[str] = []

    # Module docstring
    lines.append('"""')
    lines.append("SPSS Metadata Dictionary")
    lines.append("========================")
    lines.append("Auto-generated metadata extracted from SPSS file.")
    lines.append("")
    lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append('"""')
    lines.append("")

    # File info first (high-level overview)
    lines.append(_format_variable_assignment(
        file_info_var,
        file_info,
        "File Information (high-level overview)"
    ))
    lines.append("")
    lines.append("")

    # Column labels (primary output)
    lines.append(_format_variable_assignment(
        col_labels_var,
        column_labels,
        "Column Names to Labels - use with column_labels= in pyreadstat.write_sav()"
    ))
    lines.append("")
    lines.append("")

    # Variable value labels (primary output)
    lines.append(_format_variable_assignment(
        val_labels_var,
        variable_value_labels,
        "Variable Value Labels - use with variable_value_labels= in pyreadstat.write_sav()"
    ))
    lines.append("")

    # Extended metadata if included
    if extended_metadata:
        lines.append("")
        lines.append(_format_variable_assignment(
            "extended_metadata",
            extended_metadata,
            "Extended Metadata (variable measures, widths, types, etc.)"
        ))
        lines.append("")

    return "\n".join(lines)
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def _save_python_file(content: str, output_path: Path, encoding: str = "utf-8") -> Path:
|
|
353
|
+
"""
|
|
354
|
+
Save Python content to file.
|
|
355
|
+
|
|
356
|
+
Parameters
|
|
357
|
+
----------
|
|
358
|
+
content : str
|
|
359
|
+
Python file content
|
|
360
|
+
output_path : Path
|
|
361
|
+
Output file path
|
|
362
|
+
encoding : str
|
|
363
|
+
File encoding
|
|
364
|
+
|
|
365
|
+
Returns
|
|
366
|
+
-------
|
|
367
|
+
Path
|
|
368
|
+
Path to saved file
|
|
369
|
+
"""
|
|
370
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
371
|
+
|
|
372
|
+
with open(output_path, "w", encoding=encoding) as f:
|
|
373
|
+
f.write(content)
|
|
374
|
+
|
|
375
|
+
return output_path
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def _get_default_output_path() -> Path:
|
|
379
|
+
"""
|
|
380
|
+
Get default output path in Downloads folder.
|
|
381
|
+
|
|
382
|
+
Returns
|
|
383
|
+
-------
|
|
384
|
+
Path
|
|
385
|
+
Default output path
|
|
386
|
+
"""
|
|
387
|
+
downloads_dir = Path.home() / "Downloads"
|
|
388
|
+
downloads_dir.mkdir(parents=True, exist_ok=True)
|
|
389
|
+
return downloads_dir / "spss_metadata.py"
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
# =============================================================================
|
|
393
|
+
# Main Function
|
|
394
|
+
# =============================================================================
|
|
395
|
+
|
|
396
|
+
def get_meta(
    meta,
    include_all: bool = False,
    output_path: str | None = None,
    col_labels_var: str = "column_names_to_labels",
    val_labels_var: str = "variable_value_labels",
    file_info_var: str = "file_info",
    encoding: str = "utf-8",
    verbose: bool = False
) -> MetaDictFlat:
    """
    Extract metadata from pyreadstat meta object in flat, pyreadstat-ready format.

    Output is directly usable with pyreadstat.write_sav() - no nested digging required.

    Parameters
    ----------
    meta : pyreadstat.metadata_container
        The metadata returned by pyreadstat.read_sav()
    include_all : bool, default False
        If True, also extract extended metadata (variable measures, widths, types, etc.)
    output_path : str, optional
        File path to save Python file. Options:
        - None: Don't save, just return the dict (default)
        - "downloads": Save to system Downloads folder as 'spss_metadata.py'
        - "path/to/file.py": Save to specific path (must end with .py)
    col_labels_var : str, default "column_names_to_labels"
        Variable name for column labels in output file
    val_labels_var : str, default "variable_value_labels"
        Variable name for value labels in output file
    file_info_var : str, default "file_info"
        Variable name for file info in output file
    encoding : str, default "utf-8"
        File encoding for output
    verbose : bool, default False
        Whether to print progress messages

    Returns
    -------
    MetaDictFlat
        Flat dictionary with keys:
        - "column_names_to_labels": {var: label, ...} - directly usable with pyreadstat
        - "variable_value_labels": {var: {val: label}, ...} - directly usable with pyreadstat
        - "file_info": General file metadata
        - "extended_metadata": (only if include_all=True) Additional metadata fields

    Examples
    --------
    >>> import pyreadstat
    >>> df, meta = pyreadstat.read_sav("survey.sav")

    >>> # Extract metadata (no file saving)
    >>> meta_dict = get_meta(meta)

    >>> # Use directly with pyreadstat.write_sav()
    >>> pyreadstat.write_sav(
    ...     df, "output.sav",
    ...     column_labels=meta_dict["column_names_to_labels"],
    ...     variable_value_labels=meta_dict["variable_value_labels"]
    ... )

    >>> # Or save to file and import later
    >>> get_meta(meta, output_path="my_metadata.py")
    >>> # Then in another script:
    >>> from my_metadata import column_names_to_labels, variable_value_labels
    >>> pyreadstat.write_sav(df, "output.sav",
    ...     column_labels=column_names_to_labels,
    ...     variable_value_labels=variable_value_labels
    ... )

    >>> # Save to Downloads folder
    >>> get_meta(meta, output_path="downloads")

    >>> # Include extended metadata
    >>> meta_dict = get_meta(meta, include_all=True)
    >>> print(meta_dict["extended_metadata"]["variable_measure"])
    """

    def _print(msg: str) -> None:
        # All progress output is gated on the verbose flag.
        if verbose:
            print(msg)

    try:
        _print("=" * 60)
        _print("GET METADATA SUMMARY")
        _print("=" * 60)

        # Extract the three main components
        column_labels = _extract_column_labels(meta)
        variable_value_labels = _extract_variable_value_labels(meta)
        file_info = _extract_file_info(meta)

        # Extended metadata if requested
        extended_metadata = _extract_extended_metadata(meta) if include_all else None

        # Display info
        _print(f"File: {file_info.get('file_label') or file_info.get('table_name') or 'Unknown'}")
        _print(f"Rows: {file_info.get('number_rows')}, Columns: {file_info.get('number_columns')}")
        _print(f"Variables with labels: {len(column_labels)}")
        _print(f"Variables with value labels: {len(variable_value_labels)}")
        if output_path is not None:
            _print(f"Output: {'Downloads folder' if output_path.lower() == 'downloads' else output_path}")

        # Build return dict
        result: MetaDictFlat = {
            "column_names_to_labels": column_labels,
            "variable_value_labels": variable_value_labels,
            "file_info": file_info,
        }
        if extended_metadata:
            result["extended_metadata"] = extended_metadata

        # Determine if we should save and where
        save_path: Path | None = None

        if output_path is not None:
            if output_path.lower() == "downloads":
                save_path = _get_default_output_path()
            elif not output_path.lower().endswith(".py"):
                # Fail fast on a bad destination before doing any file work.
                raise ValueError("output_path must end with .py or be 'downloads'")
            else:
                save_path = Path(output_path)

        # Save if path provided
        saved_path: Path | None = None
        if save_path is not None:
            content = _generate_python_file_content(
                column_labels,
                variable_value_labels,
                file_info,
                extended_metadata,
                col_labels_var,
                val_labels_var,
                file_info_var,
            )
            saved_path = _save_python_file(content, save_path, encoding)

        # Summary
        _print("\n" + "=" * 60)
        _print("EXTRACTION COMPLETE")
        _print("=" * 60)
        _print(f"  • column_names_to_labels: {len(column_labels)} variables")
        _print(f"  • variable_value_labels: {len(variable_value_labels)} variables")
        if extended_metadata:
            _print("  • extended_metadata:")
            for key, val in extended_metadata.items():
                if isinstance(val, (dict, list)):
                    _print(f"    - {key}: {len(val)} items")
                else:
                    _print(f"    - {key}")

        _print("")
        if saved_path:
            _print(f"✅ Saved to: {saved_path}")
        else:
            _print("✅ Extracted (no file saved)")
        _print("=" * 60)

        return result

    except Exception as e:
        # Surface the error in verbose mode, then propagate to the caller.
        _print(f"\n❌ Error during extraction: {e}")
        raise
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
# Public API: only get_meta is exported; all other helpers are module-private.
__all__ = ["get_meta"]
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import narwhals as nw
|
|
2
|
+
from narwhals.typing import FrameT
|
|
3
|
+
import polars as pl
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from .def_map_engine import map_engine
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def make_datamap(
    df: pl.DataFrame | pd.DataFrame,
    meta,
    output_format: str | None = None
) -> pl.DataFrame | pd.DataFrame:
    """
    Create a validation data map from dataframe and pyreadstat meta object.

    This wrapper function internally calls map_engine() to generate the core_map,
    then adds computed columns for missing value labels, missing data flags,
    and base_n calculations.

    Parameters:
    -----------
    df : pl.DataFrame | pd.DataFrame
        The data dataframe (Polars or Pandas)
    meta : pyreadstat metadata object
        The metadata object returned by pyreadstat when reading SPSS files
    output_format : str | None
        Output format - either "polars" or "pandas"
        If None, will match the input dataframe type

    Returns:
    --------
    pl.DataFrame | pd.DataFrame
        A data map dataframe with columns:
        - variable: variable name
        - variable_label: variable label text
        - variable_type: variable type (single-select, multi-select, numeric, text, date)
        - value_code: value code (None for missing data row, actual codes for values)
        - value_label: value label ("NULL" for missing data row, labels or None for unlabeled)
        - value_n: count of occurrences
        - base_n: total non-NULL count for the variable
        - base_pct: percentage of value_n over base_n (null if base_n is 0)
        - total_n: total count of value_n per variable
        - total_pct: percentage of value_n over total_n (null if total_n is 0)
        - missing_value_label: "Yes" if value exists in data but not in meta, else "No"
        - missing_data: "Yes" for NULL data rows only, else "No"

    Examples:
    ---------
    >>> import pyreadstat
    >>> df, meta = pyreadstat.read_sav('data.sav', user_missing=True)
    >>> data_map = make_datamap(df, meta)
    >>> data_map.write_excel('datamap.xlsx')  # For Polars
    >>> # or
    >>> data_map_pd = make_datamap(df, meta, output_format="pandas")
    >>> data_map_pd.to_excel('datamap.xlsx', index=False)  # For Pandas
    """

    # First, get the core_map from map_engine
    core_map = map_engine(df, meta, output_format)

    # Then apply the data map transformations.
    # narwhals is used as the backend-agnostic layer: the core_map (Polars or
    # Pandas) is wrapped, transformed, and unwrapped back to its native type.
    data_map = nw.from_native(core_map).with_columns(
        # missing_value_label: "Yes" if a value_code exists in the data but
        # has no label in the metadata (value_label is null).
        nw.when(
            (~nw.col("value_code").is_null()) &  # Changed from is_not_null()
            (nw.col("value_label").is_null())
        ).then(nw.lit("Yes"))
        .otherwise(nw.lit("No"))
        .alias("missing_value_label"),

        # missing_data: "Yes" for NULL data rows only (rows whose value_label
        # is the sentinel string "NULL", not a null value).
        nw.when(nw.col("value_label") == "NULL")
        .then(nw.lit("Yes"))
        .otherwise(nw.lit("No"))
        .alias("missing_data"),  # Added missing comma

        # base_n: for non-NULL rows, the windowed sum of value_n over all
        # non-NULL (labelled or unlabelled) rows of the same variable.
        # NOTE(review): NULL sentinel rows receive their OWN value_n as
        # base_n instead of the non-NULL total — this diverges from the
        # docstring's "total non-NULL count"; confirm this is intentional.
        nw
        .when(nw.col("value_label") == "NULL")
        .then(nw.col("value_n"))
        .otherwise(
            nw
            .when((nw.col("value_label") != "NULL") | (nw.col("value_label").is_null()))
            .then(nw.col("value_n"))
            .sum()
            .over("variable")
        )
        .alias("base_n")

    ).with_columns(
        # Calculate base_pct (might create value 'NaN' if base_n is 0)
        (nw.col("value_n") / nw.col("base_n")).alias("base_pct")
    ).with_columns(
        # Replace NaN with null so downstream consumers see missing, not NaN.
        nw.when(nw.col("base_pct").is_nan())
        .then(None)  # Convert NaN to null
        .otherwise(nw.col("base_pct"))
        .alias("base_pct")
    ).with_columns(
        # Calculate total_n per variable for total_pct (includes NULL rows).
        nw.col('value_n').sum().over("variable").alias('total_n')
    ).with_columns(
        # Calculate total_pct
        (nw.col('value_n')/nw.col('total_n')).alias('total_pct')
    ).with_columns(
        # Replace NaN with null (same treatment as base_pct).
        nw.when(nw.col("total_pct").is_nan())
        .then(None)  # Convert NaN to null
        .otherwise(nw.col("total_pct"))
        .alias("total_pct")
    ).select([
        # Reorder columns: variable info first, then everything else
        'variable',
        'variable_label',
        'variable_type',
        'value_code',
        'value_label',
        'value_n',
        'base_n',
        'base_pct',
        'total_n',
        'total_pct',
        'missing_value_label',
        'missing_data'
    ]).to_native()

    return data_map
|