ultrasav 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ultrasav/__init__.py +280 -0
- ultrasav/_add_cases.py +227 -0
- ultrasav/_data.py +513 -0
- ultrasav/_make_dummy.py +137 -0
- ultrasav/_merge_data.py +435 -0
- ultrasav/_merge_meta.py +280 -0
- ultrasav/_metadata.py +570 -0
- ultrasav/_read_files.py +558 -0
- ultrasav/_write_files.py +111 -0
- ultrasav/metaman/__init__.py +91 -0
- ultrasav/metaman/def_detect_variable_type.py +454 -0
- ultrasav/metaman/def_get_meta.py +561 -0
- ultrasav/metaman/def_make_datamap.py +127 -0
- ultrasav/metaman/def_make_labels.py +833 -0
- ultrasav/metaman/def_map_engine.py +529 -0
- ultrasav/metaman/def_map_to_excel.py +294 -0
- ultrasav/metaman/def_write_excel_engine.py +298 -0
- ultrasav/metaman/pastel_color_schemes.py +185 -0
- ultrasav-0.1.4.dist-info/METADATA +550 -0
- ultrasav-0.1.4.dist-info/RECORD +21 -0
- ultrasav-0.1.4.dist-info/WHEEL +4 -0
ultrasav/_merge_meta.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
"""
|
|
2
|
+
merge_meta.py
|
|
3
|
+
Metadata merging function for tidyspss 2.0
|
|
4
|
+
Following the two-track architecture where metadata is independent from data
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
from typing import Any
|
|
9
|
+
from copy import deepcopy
|
|
10
|
+
|
|
11
|
+
# Import Metadata class
|
|
12
|
+
from .class_metadata import Metadata
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _merge_dict_field(base_dict: dict, other_dict: dict, field_name: str) -> tuple[dict, list]:
|
|
18
|
+
"""
|
|
19
|
+
Merge two dictionary fields at column level - base wins for existing columns.
|
|
20
|
+
|
|
21
|
+
Parameters
|
|
22
|
+
----------
|
|
23
|
+
base_dict : dict
|
|
24
|
+
Base dictionary (takes precedence for existing keys)
|
|
25
|
+
other_dict : dict
|
|
26
|
+
Other dictionary (only new keys are added)
|
|
27
|
+
field_name : str
|
|
28
|
+
Name of the field being merged (for logging)
|
|
29
|
+
|
|
30
|
+
Returns
|
|
31
|
+
-------
|
|
32
|
+
tuple[dict, list]
|
|
33
|
+
Merged dictionary and list of new columns added
|
|
34
|
+
"""
|
|
35
|
+
merged = base_dict.copy() if base_dict else {}
|
|
36
|
+
new_columns = []
|
|
37
|
+
|
|
38
|
+
for col_name, col_value in other_dict.items():
|
|
39
|
+
if col_name not in merged:
|
|
40
|
+
# This is a new column - add entire column:value pair
|
|
41
|
+
merged[col_name] = deepcopy(col_value)
|
|
42
|
+
new_columns.append(col_name)
|
|
43
|
+
# If column exists in base, keep base's value entirely
|
|
44
|
+
|
|
45
|
+
if new_columns:
|
|
46
|
+
logger.debug(f" Added {len(new_columns)} new columns to {field_name}: {new_columns[:5]}{'...' if len(new_columns) > 5 else ''}")
|
|
47
|
+
|
|
48
|
+
return merged, new_columns
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _collect_column_labels(meta) -> dict:
|
|
52
|
+
"""
|
|
53
|
+
Extract column_labels from metadata, handling both list and dict formats.
|
|
54
|
+
|
|
55
|
+
Parameters
|
|
56
|
+
----------
|
|
57
|
+
meta : Metadata
|
|
58
|
+
Metadata object to extract column labels from
|
|
59
|
+
|
|
60
|
+
Returns
|
|
61
|
+
-------
|
|
62
|
+
dict
|
|
63
|
+
Column labels in dictionary format
|
|
64
|
+
"""
|
|
65
|
+
column_labels = {}
|
|
66
|
+
|
|
67
|
+
if hasattr(meta, 'column_labels') and meta.column_labels:
|
|
68
|
+
if isinstance(meta.column_labels, dict):
|
|
69
|
+
column_labels = meta.column_labels.copy()
|
|
70
|
+
elif isinstance(meta.column_labels, list) and hasattr(meta, 'column_names'):
|
|
71
|
+
# Convert list format to dict format
|
|
72
|
+
for name, label in zip(meta.column_names, meta.column_labels):
|
|
73
|
+
column_labels[name] = label
|
|
74
|
+
|
|
75
|
+
return column_labels
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def merge_meta(
|
|
79
|
+
metas: list[Any | None],
|
|
80
|
+
strategy: str = "first"
|
|
81
|
+
) -> Any:
|
|
82
|
+
"""
|
|
83
|
+
Merge multiple metadata objects with column-level preservation.
|
|
84
|
+
|
|
85
|
+
This function merges metadata from multiple sources following tidyspss's
|
|
86
|
+
principle that metadata is independent from data. The merge operates at
|
|
87
|
+
the column level - for each column, we take ALL metadata from one source,
|
|
88
|
+
never mixing metadata values within a column.
|
|
89
|
+
|
|
90
|
+
Parameters
|
|
91
|
+
----------
|
|
92
|
+
metas : list[Metadata | None]
|
|
93
|
+
List of Metadata objects or None values. Can include:
|
|
94
|
+
- Metadata objects from read_sav/pyreadstat
|
|
95
|
+
- None for missing metadata
|
|
96
|
+
- Metadata objects created manually
|
|
97
|
+
strategy : str, default "first"
|
|
98
|
+
Merge strategy for combining metadata:
|
|
99
|
+
- "first": Use first non-None meta as base, add new columns from others
|
|
100
|
+
- "last": Use last non-None meta as base, add new columns from others
|
|
101
|
+
|
|
102
|
+
Returns
|
|
103
|
+
-------
|
|
104
|
+
Metadata
|
|
105
|
+
Merged Metadata object with combined metadata from all sources
|
|
106
|
+
|
|
107
|
+
Notes
|
|
108
|
+
-----
|
|
109
|
+
The merge strategy works at the COLUMN level, not value level:
|
|
110
|
+
- If base meta has metadata for column "Q1", it keeps ALL of Q1's metadata
|
|
111
|
+
- Only columns NOT in base are added from subsequent metas
|
|
112
|
+
- No mixing of values within a column's metadata
|
|
113
|
+
|
|
114
|
+
Only these fields are merged:
|
|
115
|
+
- column_labels
|
|
116
|
+
- variable_value_labels
|
|
117
|
+
- variable_format
|
|
118
|
+
- variable_measure
|
|
119
|
+
- variable_display_width
|
|
120
|
+
- missing_ranges
|
|
121
|
+
|
|
122
|
+
File-level metadata (notes, file_label) are taken from base only.
|
|
123
|
+
|
|
124
|
+
Examples
|
|
125
|
+
--------
|
|
126
|
+
>>> # Merge metadata from multiple SAV files
|
|
127
|
+
>>> _, meta1 = read_sav("file1.sav")
|
|
128
|
+
>>> _, meta2 = read_sav("file2.sav")
|
|
129
|
+
>>> _, meta3 = read_sav("file3.sav")
|
|
130
|
+
>>> merged_meta = merge_meta([meta1, meta2, meta3])
|
|
131
|
+
|
|
132
|
+
>>> # Handle None values
|
|
133
|
+
>>> merged_meta = merge_meta([None, meta1, None, meta2])
|
|
134
|
+
|
|
135
|
+
>>> # Use last strategy
|
|
136
|
+
>>> merged_meta = merge_meta([meta1, meta2, meta3], strategy="last")
|
|
137
|
+
"""
|
|
138
|
+
|
|
139
|
+
# Filter out None values
|
|
140
|
+
valid_metas = [m for m in metas if m is not None]
|
|
141
|
+
|
|
142
|
+
if not valid_metas:
|
|
143
|
+
# Return empty metadata if all are None
|
|
144
|
+
logger.info("All metadata objects are None, returning empty Metadata")
|
|
145
|
+
return Metadata()
|
|
146
|
+
|
|
147
|
+
if len(valid_metas) == 1:
|
|
148
|
+
# Only one valid metadata, return it wrapped in Metadata class
|
|
149
|
+
logger.info("Only one valid metadata found, returning as Metadata object")
|
|
150
|
+
return Metadata(valid_metas[0])
|
|
151
|
+
|
|
152
|
+
# Select base metadata based on strategy
|
|
153
|
+
if strategy == "first":
|
|
154
|
+
base_meta = valid_metas[0]
|
|
155
|
+
others = valid_metas[1:]
|
|
156
|
+
logger.info(f"Using first non-None metadata as base")
|
|
157
|
+
elif strategy == "last":
|
|
158
|
+
base_meta = valid_metas[-1]
|
|
159
|
+
others = valid_metas[:-1]
|
|
160
|
+
logger.info(f"Using last non-None metadata as base")
|
|
161
|
+
else:
|
|
162
|
+
raise ValueError(f"Unknown merge strategy: {strategy}. Use 'first' or 'last'")
|
|
163
|
+
|
|
164
|
+
# Wrap base metadata in Metadata class to ensure we have proper methods
|
|
165
|
+
merged = Metadata(base_meta)
|
|
166
|
+
|
|
167
|
+
# Fields to merge (dict-based metadata that maps columns to values)
|
|
168
|
+
dict_fields = [
|
|
169
|
+
'variable_value_labels', # Dict with column keys
|
|
170
|
+
'variable_format', # Dict with column keys
|
|
171
|
+
'variable_measure', # Dict with column keys
|
|
172
|
+
'variable_display_width', # Dict with column keys
|
|
173
|
+
'missing_ranges', # Dict with column keys
|
|
174
|
+
]
|
|
175
|
+
|
|
176
|
+
# Collect all unique column names (for logging)
|
|
177
|
+
column_names_all = []
|
|
178
|
+
if hasattr(merged, 'column_names') and merged.column_names:
|
|
179
|
+
column_names_all.extend(merged.column_names)
|
|
180
|
+
|
|
181
|
+
# Collect and merge column_labels from all metadata
|
|
182
|
+
column_labels_dict = _collect_column_labels(merged)
|
|
183
|
+
|
|
184
|
+
# Process each subsequent metadata object
|
|
185
|
+
for i, other_meta in enumerate(others):
|
|
186
|
+
logger.debug(f"Merging metadata {i+1} of {len(others)}")
|
|
187
|
+
|
|
188
|
+
# Collect column names from this metadata
|
|
189
|
+
if hasattr(other_meta, 'column_names') and other_meta.column_names:
|
|
190
|
+
for col in other_meta.column_names:
|
|
191
|
+
if col not in column_names_all:
|
|
192
|
+
column_names_all.append(col)
|
|
193
|
+
|
|
194
|
+
# Merge column_labels
|
|
195
|
+
other_labels = _collect_column_labels(other_meta)
|
|
196
|
+
if other_labels:
|
|
197
|
+
merged_labels, new_cols = _merge_dict_field(column_labels_dict, other_labels, 'column_labels')
|
|
198
|
+
column_labels_dict = merged_labels
|
|
199
|
+
|
|
200
|
+
# Merge each dict field
|
|
201
|
+
for field_name in dict_fields:
|
|
202
|
+
# Get current and other field values
|
|
203
|
+
merged_field = getattr(merged, field_name, None)
|
|
204
|
+
other_field = getattr(other_meta, field_name, None)
|
|
205
|
+
|
|
206
|
+
# Skip if either is None or not a dict
|
|
207
|
+
if merged_field is None or other_field is None:
|
|
208
|
+
continue
|
|
209
|
+
if not isinstance(merged_field, dict) or not isinstance(other_field, dict):
|
|
210
|
+
logger.debug(f" Skipping {field_name} - not dict type")
|
|
211
|
+
continue
|
|
212
|
+
|
|
213
|
+
# Merge the field
|
|
214
|
+
merged_result, _ = _merge_dict_field(merged_field, other_field, field_name)
|
|
215
|
+
setattr(merged, field_name, merged_result)
|
|
216
|
+
|
|
217
|
+
# Update column_labels with the merged dictionary
|
|
218
|
+
if column_labels_dict:
|
|
219
|
+
merged.column_labels = column_labels_dict
|
|
220
|
+
logger.debug(f"Updated column_labels with {len(column_labels_dict)} entries")
|
|
221
|
+
|
|
222
|
+
# Log summary of unique columns found (column_names is read-only)
|
|
223
|
+
if column_names_all:
|
|
224
|
+
unique_count = len(set(column_names_all))
|
|
225
|
+
logger.debug(f"Found {unique_count} unique columns across all metadata")
|
|
226
|
+
|
|
227
|
+
# Log summary
|
|
228
|
+
total_columns = set()
|
|
229
|
+
for field_name in ['column_labels'] + dict_fields:
|
|
230
|
+
field_value = getattr(merged, field_name, None)
|
|
231
|
+
if field_value and isinstance(field_value, dict):
|
|
232
|
+
total_columns.update(field_value.keys())
|
|
233
|
+
|
|
234
|
+
logger.info(f"Merge complete: {len(valid_metas)} metadata objects merged, {len(total_columns)} unique columns in result")
|
|
235
|
+
|
|
236
|
+
return merged
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def get_meta_summary(meta: Any) -> dict:
    """
    Get a summary of metadata contents for debugging/logging.

    Parameters
    ----------
    meta : Metadata
        Metadata object to summarize (may be None)

    Returns
    -------
    dict
        ``{"status": "None"}`` when ``meta`` is None. Otherwise one entry
        count per known metadata attribute, where an attribute that is
        missing or set to None counts as 0. ``file_label`` is included when
        it is truthy, and ``has_notes`` is set to True when ``notes`` is
        truthy.
    """
    if meta is None:
        return {"status": "None"}

    # (summary key, attribute name) pairs, in output order
    counted_fields = (
        ("column_names", 'column_names'),
        ("column_labels", 'column_labels'),
        ("value_labels", 'variable_value_labels'),
        ("formats", 'variable_format'),
        ("measures", 'variable_measure'),
        ("display_widths", 'variable_display_width'),
        ("missing_ranges", 'missing_ranges'),
        ("missing_user_values", 'missing_user_values'),
        ("original_types", 'original_variable_types'),
        ("readstat_types", 'readstat_variable_types'),
        ("alignment", 'variable_alignment'),
        ("storage_width", 'variable_storage_width'),
        ("variable_to_label", 'variable_to_label'),
        ("value_label_defs", 'value_labels'),
        ("mr_sets", 'mr_sets'),
    )

    summary = {}
    for key, attr_name in counted_fields:
        value = getattr(meta, attr_name, None)
        # An attribute that exists but is None would make len() raise;
        # treat it the same as a missing attribute.
        summary[key] = 0 if value is None else len(value)

    # Add file-level metadata if present
    if hasattr(meta, 'file_label') and meta.file_label:
        summary['file_label'] = meta.file_label
    if hasattr(meta, 'notes') and meta.notes:
        summary['has_notes'] = True

    return summary
|