ultrasav 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,280 @@
1
+ """
2
+ merge_meta.py
3
+ Metadata merging function for tidyspss 2.0
4
+ Following the two-track architecture where metadata is independent from data
5
+ """
6
+
7
+ import logging
8
+ from typing import Any
9
+ from copy import deepcopy
10
+
11
+ # Import Metadata class
12
+ from .class_metadata import Metadata
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def _merge_dict_field(base_dict: dict, other_dict: dict, field_name: str) -> tuple[dict, list]:
    """
    Merge two column-keyed dictionaries at column level - base wins.

    Parameters
    ----------
    base_dict : dict
        Base dictionary. Every key it already holds keeps its value
        unchanged. ``None`` or an empty dict is treated as "no entries".
    other_dict : dict
        Other dictionary. Only keys missing from the base are taken from it.
        ``None`` or an empty dict contributes nothing.
    field_name : str
        Name of the field being merged (used in the debug log message only)

    Returns
    -------
    tuple[dict, list]
        The merged dictionary (a new object; neither input is modified) and
        the list of keys added from ``other_dict``, in iteration order.
    """
    merged = base_dict.copy() if base_dict else {}
    new_columns = []

    # Tolerate a missing/empty field on the other side the same way a
    # missing base is tolerated, instead of failing on ``None.items()``.
    if not other_dict:
        return merged, new_columns

    for col_name, col_value in other_dict.items():
        if col_name in merged:
            # Column exists in base: keep base's value entirely.
            continue
        # New column: take the whole value, deep-copied so later changes to
        # the merged result cannot leak back into the source metadata.
        merged[col_name] = deepcopy(col_value)
        new_columns.append(col_name)

    if new_columns:
        # Lazy %-style arguments: the message is only rendered when DEBUG
        # logging is actually enabled.
        logger.debug(
            " Added %d new columns to %s: %s%s",
            len(new_columns),
            field_name,
            new_columns[:5],
            '...' if len(new_columns) > 5 else '',
        )

    return merged, new_columns
49
+
50
+
51
def _collect_column_labels(meta) -> dict:
    """
    Extract column_labels from metadata, handling both list and dict formats.

    Parameters
    ----------
    meta : Metadata
        Metadata object to extract column labels from. Only its
        ``column_labels`` and ``column_names`` attributes are read.

    Returns
    -------
    dict
        Column labels keyed by column name. An empty dict is returned when
        ``column_labels`` is missing or empty, when it is neither a dict nor
        a list/tuple, or when it is a list/tuple but ``column_names`` is
        missing or empty (the labels cannot be paired with names then).
    """
    labels = getattr(meta, 'column_labels', None)
    if not labels:
        return {}

    if isinstance(labels, dict):
        # Copy so callers can extend the result without touching the source.
        return labels.copy()

    if isinstance(labels, (list, tuple)):
        # Positional format: pair each label with the column name at the
        # same index. getattr + truthiness also covers column_names being
        # present but None, which a bare hasattr() check would let through.
        names = getattr(meta, 'column_names', None)
        if names:
            return dict(zip(names, labels))

    return {}
76
+
77
+
78
def merge_meta(
    metas: list[Any | None],
    strategy: str = "first"
) -> Any:
    """
    Merge multiple metadata objects with column-level preservation.

    This function merges metadata from multiple sources following tidyspss's
    principle that metadata is independent from data. The merge operates at
    the column level - for each column, we take ALL metadata from one source,
    never mixing metadata values within a column.

    Parameters
    ----------
    metas : list[Metadata | None]
        List of Metadata objects or None values. Can include:
        - Metadata objects from read_sav/pyreadstat
        - None for missing metadata
        - Metadata objects created manually
    strategy : str, default "first"
        Merge strategy for combining metadata:
        - "first": Use first non-None meta as base, add new columns from others
        - "last": Use last non-None meta as base, add new columns from others

    Returns
    -------
    Metadata
        Merged Metadata object with combined metadata from all sources.
        With no non-None input an empty ``Metadata()`` is returned; with
        exactly one, that object wrapped in ``Metadata``.

    Raises
    ------
    ValueError
        If ``strategy`` is not "first" or "last". The check runs before any
        other work, so it also fires when fewer than two metas are usable.

    Notes
    -----
    The merge strategy works at the COLUMN level, not value level:
    - If base meta has metadata for column "Q1", it keeps ALL of Q1's metadata
    - Only columns NOT in base are added from subsequent metas
    - No mixing of values within a column's metadata

    The non-base metas are always processed in list order, for both
    strategies, so among them the earliest meta that defines a column wins.

    Only these fields are merged:
    - column_labels
    - variable_value_labels
    - variable_format
    - variable_measure
    - variable_display_width
    - missing_ranges

    A dict field that is None on the base is treated as empty, so values
    for it can still be picked up from the other metas.

    File-level metadata (notes, file_label) are taken from base only.

    Examples
    --------
    >>> # Merge metadata from multiple SAV files
    >>> _, meta1 = read_sav("file1.sav")
    >>> _, meta2 = read_sav("file2.sav")
    >>> _, meta3 = read_sav("file3.sav")
    >>> merged_meta = merge_meta([meta1, meta2, meta3])

    >>> # Handle None values
    >>> merged_meta = merge_meta([None, meta1, None, meta2])

    >>> # Use last strategy
    >>> merged_meta = merge_meta([meta1, meta2, meta3], strategy="last")
    """
    # Validate first: otherwise a misspelled strategy goes unnoticed whenever
    # the input happens to contain fewer than two usable metadata objects.
    if strategy not in ("first", "last"):
        raise ValueError(f"Unknown merge strategy: {strategy}. Use 'first' or 'last'")

    # Filter out None values
    valid_metas = [m for m in metas if m is not None]

    if not valid_metas:
        # Return empty metadata if all are None
        logger.info("All metadata objects are None, returning empty Metadata")
        return Metadata()

    if len(valid_metas) == 1:
        # Only one valid metadata, return it wrapped in Metadata class
        logger.info("Only one valid metadata found, returning as Metadata object")
        return Metadata(valid_metas[0])

    # Select base metadata based on strategy
    if strategy == "first":
        base_meta, others = valid_metas[0], valid_metas[1:]
        logger.info("Using first non-None metadata as base")
    else:
        base_meta, others = valid_metas[-1], valid_metas[:-1]
        logger.info("Using last non-None metadata as base")

    # Wrap base metadata in Metadata class to ensure we have proper methods
    merged = Metadata(base_meta)

    # Fields to merge (dict-based metadata that maps columns to values)
    dict_fields = [
        'variable_value_labels',
        'variable_format',
        'variable_measure',
        'variable_display_width',
        'missing_ranges',
    ]

    # Ordered, de-duplicated column names seen so far (used for logging
    # only). Dict keys give O(1) membership while keeping first-seen order.
    column_names_all = dict.fromkeys(getattr(merged, 'column_names', None) or ())

    # column_labels may be a list or a dict, so it is merged separately from
    # dict_fields through a normalized dict.
    column_labels_dict = _collect_column_labels(merged)

    # Process each subsequent metadata object
    for position, other_meta in enumerate(others, start=1):
        logger.debug("Merging metadata %d of %d", position, len(others))

        # Collect column names from this metadata
        column_names_all.update(
            dict.fromkeys(getattr(other_meta, 'column_names', None) or ())
        )

        # Merge column_labels
        other_labels = _collect_column_labels(other_meta)
        if other_labels:
            column_labels_dict, _ = _merge_dict_field(
                column_labels_dict, other_labels, 'column_labels'
            )

        # Merge each dict field
        for field_name in dict_fields:
            other_field = getattr(other_meta, field_name, None)
            if other_field is None:
                # Nothing to contribute for this field.
                continue

            merged_field = getattr(merged, field_name, None)
            if merged_field is None:
                if not other_field:
                    # Neither side holds anything; leave the base untouched.
                    continue
                # The base lacks this field altogether: start from empty so
                # the other meta's columns are added rather than dropped.
                merged_field = {}

            if not isinstance(merged_field, dict) or not isinstance(other_field, dict):
                logger.debug(" Skipping %s - not dict type", field_name)
                continue

            merged_result, _ = _merge_dict_field(merged_field, other_field, field_name)
            setattr(merged, field_name, merged_result)

    # Update column_labels with the merged dictionary
    if column_labels_dict:
        merged.column_labels = column_labels_dict
        logger.debug("Updated column_labels with %d entries", len(column_labels_dict))

    # Log summary of unique columns found (column_names is read-only)
    if column_names_all:
        logger.debug("Found %d unique columns across all metadata", len(column_names_all))

    # Log summary: every column that carries metadata in any merged field
    total_columns = set()
    for field_name in ['column_labels'] + dict_fields:
        field_value = getattr(merged, field_name, None)
        if field_value and isinstance(field_value, dict):
            total_columns.update(field_value)

    logger.info(
        "Merge complete: %d metadata objects merged, %d unique columns in result",
        len(valid_metas),
        len(total_columns),
    )

    return merged
237
+
238
+
239
def get_meta_summary(meta: Any) -> dict:
    """
    Get a summary of metadata contents for debugging/logging.

    Parameters
    ----------
    meta : Metadata
        Metadata object to summarize. May be None.

    Returns
    -------
    dict
        ``{"status": "None"}`` when ``meta`` is None. Otherwise one entry
        count per metadata field (0 when the attribute is missing or None),
        plus ``file_label`` and ``has_notes`` when those attributes are
        present and non-empty.
    """
    if meta is None:
        return {"status": "None"}

    def _count(attr_name: str) -> int:
        # An attribute can exist but be None (e.g. manually built metadata);
        # report that as zero entries instead of failing on len(None).
        value = getattr(meta, attr_name, None)
        return 0 if value is None else len(value)

    # (summary key, attribute name on the metadata object)
    counted_fields = (
        ("column_names", 'column_names'),
        ("column_labels", 'column_labels'),
        ("value_labels", 'variable_value_labels'),
        ("formats", 'variable_format'),
        ("measures", 'variable_measure'),
        ("display_widths", 'variable_display_width'),
        ("missing_ranges", 'missing_ranges'),
        ("missing_user_values", 'missing_user_values'),
        ("original_types", 'original_variable_types'),
        ("readstat_types", 'readstat_variable_types'),
        ("alignment", 'variable_alignment'),
        ("storage_width", 'variable_storage_width'),
        ("variable_to_label", 'variable_to_label'),
        ("value_label_defs", 'value_labels'),
        ("mr_sets", 'mr_sets'),
    )
    summary = {key: _count(attr_name) for key, attr_name in counted_fields}

    # Add file-level metadata if present
    file_label = getattr(meta, 'file_label', None)
    if file_label:
        summary['file_label'] = file_label
    if getattr(meta, 'notes', None):
        summary['has_notes'] = True

    return summary