ultrasav 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ultrasav/_metadata.py ADDED
@@ -0,0 +1,570 @@
1
+ #_v4_updated
2
+ import warnings
3
+ from typing import Any
4
+ from dataclasses import dataclass, field
5
+ from copy import deepcopy
6
+
7
+ @dataclass
8
+ class Metadata:
9
+ """
10
+ A class to handle SPSS metadata updates for writing SAV files.
11
+
12
+ This class takes the original pyreadstat metadata and allows explicit updates.
13
+ It does NOT track dataframe changes - users must explicitly update metadata
14
+ to match their dataframe transformations.
15
+
16
+ All metadata updates MERGE with original metadata - they don't replace it.
17
+
18
+ Parameters
19
+ ----------
20
+ meta_obj : pyreadstat metadata object, dict, or None
21
+ Can be:
22
+ - pyreadstat metadata object from read_sav()
23
+ - dict with metadata parameters to set
24
+ - None for empty metadata
25
+
26
+ Examples
27
+ --------
28
+ >>> # From pyreadstat
29
+ >>> df, meta_raw = pyreadstat.read_sav("file.sav")
30
+ >>> meta = Metadata(meta_raw)
31
+
32
+ >>> # Empty metadata
33
+ >>> meta = Metadata()
34
+
35
+ >>> # With initial values
36
+ >>> meta = Metadata({"column_labels": {"Q1": "Question 1"}})
37
+ """
38
+
39
+ # Store the original metadata object
40
+ _original_meta: Any | None = field(default=None, init=False)
41
+
42
+ # User updates - these will override original metadata when provided
43
+ _user_column_labels: dict[str, str] | None = field(default=None, init=False)
44
+ _user_variable_value_labels: dict[str, dict[int | float | str, str]] | None = field(default=None, init=False)
45
+ _user_variable_format: dict[str, str] | None = field(default=None, init=False)
46
+ _user_variable_measure: dict[str, str] | None = field(default=None, init=False)
47
+ _user_variable_display_width: dict[str, int] | None = field(default=None, init=False)
48
+ _user_missing_ranges: dict[str, list] | None = field(default=None, init=False)
49
+ _user_note: str | list[str] | None = field(default=None, init=False)
50
+ _user_file_label: str | None = field(default=None, init=False)
51
+ _user_compress: bool | None = field(default=None, init=False)
52
+ _user_row_compress: bool | None = field(default=None, init=False)
53
+
54
+ def __init__(self, meta_obj=None):
55
+ """
56
+ Initialize Metadata instance.
57
+
58
+ Parameters
59
+ ----------
60
+ meta_obj : pyreadstat metadata object, dict, or None
61
+ Can be pyreadstat metadata, a dict of parameters, or None for empty
62
+ """
63
+ # Initialize all fields
64
+ self._original_meta = None
65
+ self._user_column_labels = None
66
+ self._user_variable_value_labels = None
67
+ self._user_variable_format = None
68
+ self._user_variable_measure = None
69
+ self._user_variable_display_width = None
70
+ self._user_missing_ranges = None
71
+ self._user_note = None
72
+ self._user_file_label = None
73
+ self._user_compress = None
74
+ self._user_row_compress = None
75
+
76
+ if meta_obj is not None:
77
+ # Check if it's pyreadstat metadata (has specific attributes)
78
+ if hasattr(meta_obj, 'column_names') and hasattr(meta_obj, 'column_labels'):
79
+ # It's pyreadstat metadata
80
+ self._original_meta = meta_obj
81
+ elif isinstance(meta_obj, dict):
82
+ # It's user-provided dict of updates
83
+ self.update(**meta_obj)
84
+ else:
85
+ # Try to detect if it's pyreadstat metadata by other attributes
86
+ if hasattr(meta_obj, 'number_columns') or hasattr(meta_obj, 'file_label'):
87
+ self._original_meta = meta_obj
88
+ else:
89
+ raise TypeError(
90
+ f"Unsupported metadata type: {type(meta_obj)}. "
91
+ "Expected pyreadstat metadata object, dict, or None."
92
+ )
93
+
94
+ @classmethod
95
+ def from_pyreadstat(cls, meta_obj):
96
+ """
97
+ Create a Metadata instance from a pyreadstat metadata object.
98
+
99
+ DEPRECATED: Use Metadata(meta_obj) instead.
100
+
101
+ Parameters
102
+ ----------
103
+ meta_obj : pyreadstat metadata object or None
104
+ The metadata object returned by pyreadstat.read_sav()
105
+
106
+ Returns
107
+ -------
108
+ Metadata
109
+ A new Metadata instance
110
+ """
111
+ warnings.warn(
112
+ "Metadata.from_pyreadstat() is deprecated. Use Metadata(meta_obj) instead.",
113
+ DeprecationWarning,
114
+ stacklevel=2
115
+ )
116
+ return cls(meta_obj)
117
+
118
+ def _merge_with_original(self, user_dict: dict | None,
119
+ original_attr: str,
120
+ process_values: bool = False) -> dict:
121
+ """
122
+ Generic method to merge user updates with original metadata.
123
+
124
+ Parameters
125
+ ----------
126
+ user_dict : dict or None
127
+ User-provided updates
128
+ original_attr : str
129
+ Name of the attribute in original metadata
130
+ process_values : bool
131
+ If True, process value labels (convert keys to numbers)
132
+
133
+ Returns
134
+ -------
135
+ dict
136
+ Merged dictionary (original + updates)
137
+ """
138
+ # If no user updates, return original
139
+ if not user_dict:
140
+ if not self._original_meta or not hasattr(self._original_meta, original_attr):
141
+ return {}
142
+ original = getattr(self._original_meta, original_attr)
143
+ return original.copy() if original else {}
144
+
145
+ # If no original metadata, return user updates
146
+ if not self._original_meta or not hasattr(self._original_meta, original_attr):
147
+ if process_values:
148
+ # Convert keys to numbers if possible for value labels
149
+ converted = {}
150
+ for var, lbls in user_dict.items():
151
+ converted[var] = self._convert_keys_to_numbers_if_possible(lbls)
152
+ return converted
153
+ return user_dict.copy()
154
+
155
+ # Merge: start with original, then apply user updates
156
+ original = getattr(self._original_meta, original_attr)
157
+ existing = original.copy() if original else {}
158
+
159
+ # Apply user updates
160
+ for key, value in user_dict.items():
161
+ if process_values:
162
+ existing[key] = self._convert_keys_to_numbers_if_possible(value)
163
+ else:
164
+ existing[key] = value
165
+
166
+ return existing
167
+
168
+ # ===================================================================
169
+ # WRITABLE PROPERTIES (can be updated by user)
170
+ # ===================================================================
171
+
172
+ @property
173
+ def column_labels(self) -> dict[str, str]:
174
+ """Get current column labels (original + updates)."""
175
+ if not self._user_column_labels:
176
+ if not self._original_meta:
177
+ return {}
178
+ # Special handling for column_labels as it's stored differently
179
+ if hasattr(self._original_meta, 'column_names') and hasattr(self._original_meta, 'column_labels'):
180
+ return dict(zip(self._original_meta.column_names,
181
+ self._original_meta.column_labels))
182
+ return {}
183
+
184
+ if self._original_meta is None:
185
+ return self._user_column_labels
186
+
187
+ # Start with existing labels
188
+ existing = {}
189
+ if hasattr(self._original_meta, 'column_names') and hasattr(self._original_meta, 'column_labels'):
190
+ existing = dict(zip(self._original_meta.column_names,
191
+ self._original_meta.column_labels))
192
+
193
+ # Override with user updates
194
+ return {**existing, **self._user_column_labels}
195
+
196
+ @column_labels.setter
197
+ def column_labels(self, value: dict[str, str]):
198
+ """Set user column labels updates (merges with original)."""
199
+ self._user_column_labels = value
200
+
201
+ @property
202
+ def variable_value_labels(self) -> dict[str, dict[int | float | str, str]]:
203
+ """Get current variable value labels (original + updates)."""
204
+ return self._merge_with_original(
205
+ self._user_variable_value_labels,
206
+ 'variable_value_labels',
207
+ process_values=True
208
+ )
209
+
210
+ @variable_value_labels.setter
211
+ def variable_value_labels(self, value: dict[str, dict[int | float | str, str]]):
212
+ """Set user variable value labels updates (merges with original)."""
213
+ self._user_variable_value_labels = value
214
+
215
+ @property
216
+ def variable_format(self) -> dict[str, str]:
217
+ """Get current variable formats (original + updates)."""
218
+ # First try variable_format, then fall back to original_variable_types
219
+ if hasattr(self._original_meta, 'variable_format') and self._original_meta.variable_format:
220
+ return self._merge_with_original(
221
+ self._user_variable_format,
222
+ 'variable_format'
223
+ )
224
+ elif hasattr(self._original_meta, 'original_variable_types') and not self._user_variable_format:
225
+ # Use original_variable_types as fallback if no variable_format exists
226
+ return self._original_meta.original_variable_types.copy()
227
+ else:
228
+ # Merge user updates with original_variable_types if available
229
+ if self._user_variable_format:
230
+ if hasattr(self._original_meta, 'original_variable_types'):
231
+ existing = self._original_meta.original_variable_types.copy()
232
+ for key, value in self._user_variable_format.items():
233
+ existing[key] = value
234
+ return existing
235
+ return self._user_variable_format.copy()
236
+ return {}
237
+
238
+ @variable_format.setter
239
+ def variable_format(self, value: dict[str, str]):
240
+ """Set user variable format updates (merges with original)."""
241
+ self._user_variable_format = value
242
+
243
+ @property
244
+ def variable_measure(self) -> dict[str, str]:
245
+ """Get current variable measures (original + updates)."""
246
+ return self._merge_with_original(
247
+ self._user_variable_measure,
248
+ 'variable_measure'
249
+ )
250
+
251
+ @variable_measure.setter
252
+ def variable_measure(self, value: dict[str, str]):
253
+ """Set user variable measure updates (merges with original)."""
254
+ self._user_variable_measure = value
255
+
256
+ @property
257
+ def variable_display_width(self) -> dict[str, int]:
258
+ """Get current variable display widths (original + updates)."""
259
+ return self._merge_with_original(
260
+ self._user_variable_display_width,
261
+ 'variable_display_width'
262
+ )
263
+
264
+ @variable_display_width.setter
265
+ def variable_display_width(self, value: dict[str, int]):
266
+ """Set user variable display width updates (merges with original)."""
267
+ self._user_variable_display_width = value
268
+
269
+ @property
270
+ def missing_ranges(self) -> dict[str, list] | None:
271
+ """Get current missing ranges (original + updates)."""
272
+ # missing_ranges follows same merge pattern
273
+ if not self._user_missing_ranges:
274
+ return getattr(self._original_meta, "missing_ranges", None) if self._original_meta else None
275
+
276
+ if not self._original_meta or not hasattr(self._original_meta, "missing_ranges"):
277
+ return self._user_missing_ranges
278
+
279
+ # Merge: start with original, apply user updates
280
+ original = getattr(self._original_meta, "missing_ranges", {})
281
+ if original:
282
+ merged = original.copy()
283
+ for key, value in self._user_missing_ranges.items():
284
+ merged[key] = value
285
+ return merged
286
+ return self._user_missing_ranges
287
+
288
+ @missing_ranges.setter
289
+ def missing_ranges(self, value: dict[str, list]):
290
+ """Set user missing ranges (merges with original)."""
291
+ self._user_missing_ranges = value
292
+
293
+ @property
294
+ def note(self) -> str | list[str] | None:
295
+ """Get current note (user or original)."""
296
+ if self._user_note is not None:
297
+ return self._user_note
298
+ if self._original_meta and hasattr(self._original_meta, "notes") and self._original_meta.notes:
299
+ return self._original_meta.notes
300
+ return None
301
+
302
+ @note.setter
303
+ def note(self, value: str | list[str]):
304
+ """Set user note (replaces original)."""
305
+ self._user_note = value
306
+
307
+ @property
308
+ def file_label(self) -> str:
309
+ """Get current file label (user or original)."""
310
+ if self._user_file_label is not None:
311
+ return self._user_file_label
312
+ return getattr(self._original_meta, "file_label", "") if self._original_meta else ""
313
+
314
+ @file_label.setter
315
+ def file_label(self, value: str):
316
+ """Set user file label (replaces original)."""
317
+ self._user_file_label = value
318
+
319
+ @property
320
+ def compress(self) -> bool:
321
+ """Get compress setting."""
322
+ return self._user_compress if self._user_compress is not None else False
323
+
324
+ @compress.setter
325
+ def compress(self, value: bool):
326
+ """Set compress setting."""
327
+ self._user_compress = value
328
+
329
+ @property
330
+ def row_compress(self) -> bool:
331
+ """Get row_compress setting."""
332
+ return self._user_row_compress if self._user_row_compress is not None else False
333
+
334
+ @row_compress.setter
335
+ def row_compress(self, value: bool):
336
+ """Set row_compress setting."""
337
+ self._user_row_compress = value
338
+
339
+ # ===================================================================
340
+ # READ-ONLY PROPERTIES (from original metadata)
341
+ # ===================================================================
342
+
343
+ # Basic file information
344
+ @property
345
+ def notes(self) -> str | list[str] | None:
346
+ """Get notes from original metadata (same as note property)."""
347
+ return self.note
348
+
349
+ @property
350
+ def creation_time(self) -> str | None:
351
+ """Get creation time from original metadata."""
352
+ return getattr(self._original_meta, "creation_time", None) if self._original_meta else None
353
+
354
+ @property
355
+ def modification_time(self) -> str | None:
356
+ """Get modification time from original metadata."""
357
+ return getattr(self._original_meta, "modification_time", None) if self._original_meta else None
358
+
359
+ @property
360
+ def file_encoding(self) -> str | None:
361
+ """Get file encoding from original metadata."""
362
+ return getattr(self._original_meta, "file_encoding", None) if self._original_meta else None
363
+
364
+ @property
365
+ def table_name(self) -> str | None:
366
+ """Get table name from original metadata."""
367
+ return getattr(self._original_meta, "table_name", None) if self._original_meta else None
368
+
369
+ # Column/variable information
370
+ @property
371
+ def column_names(self) -> list[str]:
372
+ """Get column names from original metadata."""
373
+ if self._original_meta and hasattr(self._original_meta, 'column_names'):
374
+ return list(self._original_meta.column_names)
375
+ return []
376
+
377
+ @property
378
+ def column_names_to_labels(self) -> dict[str, str]:
379
+ """Get column names to labels mapping (same as column_labels property)."""
380
+ return self.column_labels
381
+
382
+ @property
383
+ def number_columns(self) -> int | None:
384
+ """Get number of columns from original metadata."""
385
+ return getattr(self._original_meta, "number_columns", None) if self._original_meta else None
386
+
387
+ @property
388
+ def number_rows(self) -> int | None:
389
+ """Get number of rows from original metadata."""
390
+ return getattr(self._original_meta, "number_rows", None) if self._original_meta else None
391
+
392
+ # Variable types and formats
393
+ @property
394
+ def original_variable_types(self) -> dict[str, str]:
395
+ """Get original variable types from metadata."""
396
+ if self._original_meta and hasattr(self._original_meta, 'original_variable_types'):
397
+ return self._original_meta.original_variable_types.copy()
398
+ return {}
399
+
400
+ @property
401
+ def readstat_variable_types(self) -> dict[str, str]:
402
+ """Get readstat variable types from metadata."""
403
+ if self._original_meta and hasattr(self._original_meta, 'readstat_variable_types'):
404
+ return self._original_meta.readstat_variable_types.copy()
405
+ return {}
406
+
407
+ # Value labels and mappings
408
+ @property
409
+ def value_labels(self) -> dict:
410
+ """Get value labels from original metadata."""
411
+ if self._original_meta and hasattr(self._original_meta, 'value_labels'):
412
+ return self._original_meta.value_labels.copy() if self._original_meta.value_labels else {}
413
+ return {}
414
+
415
+ @property
416
+ def variable_to_label(self) -> dict[str, str]:
417
+ """Get variable to label mapping from original metadata."""
418
+ if self._original_meta and hasattr(self._original_meta, 'variable_to_label'):
419
+ return self._original_meta.variable_to_label.copy() if self._original_meta.variable_to_label else {}
420
+ return {}
421
+
422
+ # Missing value information
423
+ @property
424
+ def missing_user_values(self) -> dict | None:
425
+ """Get missing user values from original metadata."""
426
+ return getattr(self._original_meta, "missing_user_values", None) if self._original_meta else None
427
+
428
+ # Display properties
429
+ @property
430
+ def variable_alignment(self) -> dict[str, str]:
431
+ """Get variable alignment from original metadata."""
432
+ if self._original_meta and hasattr(self._original_meta, 'variable_alignment'):
433
+ return self._original_meta.variable_alignment.copy() if self._original_meta.variable_alignment else {}
434
+ return {}
435
+
436
+ @property
437
+ def variable_storage_width(self) -> dict[str, int]:
438
+ """Get variable storage width from original metadata."""
439
+ if self._original_meta and hasattr(self._original_meta, 'variable_storage_width'):
440
+ return self._original_meta.variable_storage_width.copy() if self._original_meta.variable_storage_width else {}
441
+ return {}
442
+
443
+ # Multiple response sets
444
+ @property
445
+ def mr_sets(self) -> dict | None:
446
+ """Get multiple response sets from original metadata."""
447
+ return getattr(self._original_meta, "mr_sets", None) if self._original_meta else None
448
+
449
+ # ===================================================================
450
+ # METHODS
451
+ # ===================================================================
452
+
453
+ def update(self, **kwargs) -> 'Metadata':
454
+ """
455
+ Update metadata with user-provided values.
456
+
457
+ Parameters
458
+ ----------
459
+ **kwargs : dict
460
+ Any of the writable metadata attributes (column_labels, variable_value_labels, etc.)
461
+
462
+ Returns
463
+ -------
464
+ self
465
+ Returns self for method chaining
466
+
467
+ Examples
468
+ --------
469
+ >>> meta.update(
470
+ ... column_labels={"Q1": "Question 1"},
471
+ ... file_label="My Survey"
472
+ ... )
473
+ """
474
+ for key, value in kwargs.items():
475
+ if hasattr(self, key) and not key.startswith('_'):
476
+ setattr(self, key, value)
477
+ else:
478
+ warnings.warn(f"Unknown metadata attribute: {key}", UserWarning, stacklevel=2)
479
+
480
+ return self
481
+
482
+ def _convert_keys_to_numbers_if_possible(self, value_labels_dict):
483
+ """Convert string keys to numbers where possible (from v1.0 logic)."""
484
+ updated = {}
485
+ for k, v in value_labels_dict.items():
486
+ try:
487
+ temp = float(k)
488
+ if temp.is_integer():
489
+ temp = int(temp)
490
+ updated[temp] = v
491
+ except (ValueError, TypeError):
492
+ updated[k] = v
493
+ return updated
494
+
495
+ def _force_string_labels(self, labels_dict):
496
+ """Ensure all labels are strings (from v1.0 logic)."""
497
+ if not labels_dict:
498
+ return {}
499
+ fixed = {}
500
+ for col_name, lbl_val in labels_dict.items():
501
+ col_name_str = str(col_name)
502
+ label_str = str(lbl_val) if lbl_val is not None else ""
503
+ fixed[col_name_str] = label_str
504
+ return fixed
505
+
506
+ def _resolve_compress_settings(self):
507
+ """Resolve compression settings."""
508
+ final_compress = self.compress
509
+ final_row_compress = self.row_compress
510
+
511
+ if final_compress and final_row_compress:
512
+ warnings.warn(
513
+ "Both 'compress' and 'row_compress' are True; prioritizing 'compress' over 'row_compress'.",
514
+ UserWarning,
515
+ stacklevel=2
516
+ )
517
+ final_row_compress = False
518
+
519
+ return final_compress, final_row_compress
520
+
521
+ def get_write_params(self) -> dict[str, Any]:
522
+ """
523
+ Get parameters formatted for pyreadstat.write_sav().
524
+
525
+ Returns
526
+ -------
527
+ dict
528
+ Dictionary of parameters ready to pass to write_sav
529
+ """
530
+ # Ensure column labels are all strings
531
+ column_labels = self._force_string_labels(self.column_labels)
532
+
533
+ # Resolve note formatting
534
+ final_note = self.note
535
+ if isinstance(final_note, list):
536
+ final_note = "\n".join(final_note)
537
+
538
+ # Resolve compression settings
539
+ final_compress, final_row_compress = self._resolve_compress_settings()
540
+
541
+ params = {
542
+ 'file_label': self.file_label,
543
+ 'column_labels': column_labels if column_labels else None,
544
+ 'compress': final_compress,
545
+ 'row_compress': final_row_compress,
546
+ 'note': final_note,
547
+ 'variable_value_labels': self.variable_value_labels if self.variable_value_labels else None,
548
+ 'missing_ranges': self.missing_ranges,
549
+ 'variable_display_width': self.variable_display_width if self.variable_display_width else None,
550
+ 'variable_measure': self.variable_measure if self.variable_measure else None,
551
+ 'variable_format': self.variable_format if self.variable_format else None,
552
+ }
553
+
554
+ # Remove None values for cleaner params
555
+ return {k: v for k, v in params.items() if v is not None}
556
+
557
+ def copy(self) -> 'Metadata':
558
+ """Create a deep copy of the metadata."""
559
+ return deepcopy(self)
560
+
561
+ def __repr__(self) -> str:
562
+ info = []
563
+ if self._original_meta:
564
+ info.append(f"columns={self.number_columns}")
565
+ if self.column_labels:
566
+ info.append(f"labels={len(self.column_labels)}")
567
+ if self.variable_value_labels:
568
+ info.append(f"value_labels={len(self.variable_value_labels)}")
569
+
570
+ return f"Metadata({', '.join(info)})"