ultrasav 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ultrasav/__init__.py ADDED
@@ -0,0 +1,280 @@
1
+ """
2
+ ultrasav - Ultra-powerful Python package for SPSS/SAV file processing.
3
+
4
+ ultrasav separates data and metadata operations into independent tracks that only
5
+ converge at read/write time. This design provides explicit control, clean separation
6
+ of concerns, and flexibility when working with SPSS data files.
7
+
8
+ ⚡ Specium Ray for your data!
9
+
10
+ Basic usage:
11
+ >>> import ultrasav as ul
12
+ >>> # Read SPSS file - splits into two tracks
13
+ >>> data, meta = ul.read_sav("survey.sav")
14
+ >>>
15
+ >>> # Track 1: Process data independently
16
+ >>> data = ul.Data(data)
17
+ >>> data = data.move(first=["ID"]).rename({"Q1": "Question1"})
18
+ >>>
19
+ >>> # Track 2: Process metadata independently
20
+ >>> meta.column_labels = {"Question1": "Customer Satisfaction"}
21
+ >>>
22
+ >>> # Convergence: Write both tracks to SPSS
23
+ >>> ul.write_sav(data.to_native(), meta, "output.sav")
24
+
25
+ Main components:
26
+ - Data: Handle dataframe operations (move, rename, replace, drop, select)
27
+ - Metadata: Handle SPSS metadata (labels, formats, measures, missing values)
28
+ - read_sav/write_sav: File I/O for SPSS format
29
+ - add_cases: High-level function for merging files with metadata
30
+ - merge_data/merge_meta: Lower-level merge operations
31
+
32
+ Metadata tools (via metaman submodule):
33
+ - get_meta: Extract metadata to Python files
34
+ - make_labels: Create label dictionaries from Excel
35
+ - make_datamap: Build validation datamaps
36
+ - map_to_excel: Export formatted Excel reports
37
+ - detect_variable_type: Detect variable types (single/multi-select, etc.)
38
+ """
39
+
40
from importlib.metadata import PackageNotFoundError, version

try:
    # Single source of truth: the version recorded in the installed
    # distribution metadata for "ultrasav".
    __version__ = version("ultrasav")
except PackageNotFoundError:
    # No installed distribution metadata (e.g. imported straight from a
    # source tree). Fall back to a placeholder instead of failing the import.
    __version__ = "0+unknown"
__author__ = "Albert Li"
44
+
45
+ # =============================================================================
46
+ # Core classes
47
+ # =============================================================================
48
+ from ._data import Data
49
+ from ._metadata import Metadata
50
+
51
+ # =============================================================================
52
+ # I/O functions
53
+ # =============================================================================
54
+ from ._read_files import read_sav, read_csv, read_excel
55
+ from ._write_files import write_sav
56
+
57
+ # =============================================================================
58
+ # Merge functions
59
+ # =============================================================================
60
+ from ._merge_data import merge_data
61
+ from ._merge_meta import merge_meta, get_meta_summary
62
+
63
+ # =============================================================================
64
+ # High-level functions
65
+ # =============================================================================
66
+ from ._add_cases import add_cases
67
+ from ._make_dummy import make_dummies
68
+
69
+ # =============================================================================
70
+ # Metaman submodule - The Metadata Superhero 🦸
71
+ # =============================================================================
72
+ # 1.Submodule namespace access
73
+ from . import metaman
74
+ # ultrasav.metaman.get_meta(...)
75
+ # from ultrasav import metaman
76
+
77
+
78
+ # 2.Top-level function access
79
+ # Re-export metaman's public API at top level for convenience
80
+ from .metaman import (
81
+ # Metadata extraction
82
+ get_meta,
83
+
84
+ # Label creation
85
+ make_labels,
86
+
87
+ # Variable type detection
88
+ detect_variable_type,
89
+ # create_mr_set_lookup,
90
+
91
+ # Datamap creation
92
+ make_datamap,
93
+ # map_engine,
94
+
95
+ # Excel export
96
+ map_to_excel,
97
+ # write_excel_engine,
98
+
99
+ # Color schemes
100
+ get_color_scheme,
101
+ COLOR_SCHEMES,
102
+ )
103
+
104
+ # =============================================================================
105
+ # Public API
106
+ # =============================================================================
107
+ __all__ = [
108
+ # Version
109
+ "__version__",
110
+
111
+ # Core classes
112
+ "Data",
113
+ "Metadata",
114
+
115
+ # Read functions
116
+ "read_sav",
117
+ "read_csv",
118
+ "read_excel",
119
+
120
+ # Write function
121
+ "write_sav",
122
+
123
+ # Merge functions
124
+ "merge_data",
125
+ "merge_meta",
126
+ "get_meta_summary",
127
+
128
+ # High-level functions
129
+ "add_cases",
130
+ "make_dummies",
131
+
132
+ # Metaman submodule
133
+ "metaman",
134
+
135
+ # Metaman re-exports (top-level access)
136
+ "get_meta",
137
+ "make_labels",
138
+ "detect_variable_type",
139
+ # "create_mr_set_lookup",
140
+ "make_datamap",
141
+ # "map_engine",
142
+ "map_to_excel",
143
+ # "write_excel_engine",
144
+ "get_color_scheme",
145
+ "COLOR_SCHEMES",
146
+ ]
147
+
148
+
149
+ # =============================================================================
150
+ # Utility functions
151
+ # =============================================================================
152
+
153
def _show_architecture():
    """Print the ultrasav two-track architecture overview to stdout.

    The text is a fixed banner: an ASCII-art diagram of the data/metadata
    tracks followed by short notes on the workflow, benefits and the metaman
    helpers. Nothing is returned.
    """
    # The box-drawing rows are column-aligned by hand; keep every box 15
    # characters wide and the connectors centred when editing.
    architecture = """
    ⚡ ULTRASAV ARCHITECTURE ⚡
    ==========================

    The Two-Track System:

    ┌─────────────┐     ┌─────────────┐
    │    DATA     │     │  METADATA   │
    │             │     │             │
    │  DataFrame  │     │   Labels    │
    │   Columns   │     │   Formats   │
    │   Values    │     │  Measures   │
    └─────────────┘     └─────────────┘
           │                   │
           │ Independent Work  │
           │                   │
           └─────────┬─────────┘
                     │
                     ▼
              ┌─────────────┐
              │  WRITE SAV  │
              └─────────────┘

    Key Principle:
    --------------
    Data and Metadata are two independent layers that only come together at read/write time.
    Like Ultraman's form changes, data can transform independently of its metadata!

    Workflow:
    ---------
    1. READING (Splitting): read_sav() → (data, metadata)
    2. PROCESSING (Parallel):
       - Data operations via Data class
       - Metadata operations via Metadata class
    3. WRITING (Reunification): write_sav(data, metadata)

    Benefits:
    ---------
    • Explicit Control - No hidden magic or automatic transfers
    • Clean Separation - Each class has single responsibility
    • Flexibility - Mix and match data/metadata from different sources
    • Easy to Extend - Clear boundaries for new features

    Metaman Submodule:
    ------------------
    🦸 The metadata superhero for inspection & reporting:
    • get_meta() - Extract metadata to Python files
    • make_labels() - Create labels from Excel templates
    • make_datamap() - Build validation datamaps
    • map_to_excel() - Export formatted Excel reports
    """
    print(architecture)


def show_arch():
    """Public alias for ``_show_architecture()``: print the architecture overview."""
    return _show_architecture()
212
+
213
+
214
def _quick_merge(files, output_file=None, source_col="mrgsrc", **kwargs):
    """
    Merge several files in one call, optionally writing the result to disk.

    Thin convenience wrapper: the merge itself is delegated to ``add_cases``
    and, when requested, the write is delegated to ``write_sav``.

    Parameters
    ----------
    files : list
        Inputs to merge; forwarded unchanged to ``add_cases``.
    output_file : str, optional
        When truthy, the merged result is written to this path and a
        confirmation line is printed.
    source_col : str, default "mrgsrc"
        Name of the source-tracking column; forwarded to ``add_cases``.
    **kwargs
        Extra keyword arguments forwarded to ``add_cases``.

    Returns
    -------
    tuple or None
        ``(data, meta)`` when ``output_file`` is not given (or falsy);
        ``None`` after a write.

    Examples
    --------
    >>> import ultrasav as ul
    >>> # Merge and save
    >>> ul._quick_merge(["file1.sav", "file2.sav"], "merged.sav")

    >>> # Merge and get the result back
    >>> data, meta = ul._quick_merge(["file1.sav", "file2.sav"])
    """
    merged_frame, merged_meta = add_cases(files, source_col=source_col, **kwargs)

    # Nothing to write: hand the merged pair back to the caller.
    if not output_file:
        return merged_frame, merged_meta

    write_sav(merged_frame, merged_meta, output_file)
    print(f"⚡ Specium Ray successful! Merged {len(files)} files → {output_file}")
    return None
251
+
252
+
253
+ # Add utility functions to __all__
254
+ __all__.extend(["show_arch"])
255
+ # __all__.extend(["_show_architecture", "show_arch", "_quick_merge"])
256
+
257
+
258
+ # =============================================================================
259
+ # Package metadata
260
+ # =============================================================================
261
+
262
def about():
    """Print a short banner describing the package.

    The banner shows the package version (``__version__``), the author
    (``__author__``), the core components and a pointer to ``show_arch()``.
    Output goes to stdout; nothing is returned.
    """
    # The hint at the bottom names show_arch(), the publicly exported helper.
    info = f"""
    ⚡ ultrasav v{__version__} ⚡
    ========================
    Ultra-powerful Python package for SPSS/SAV file processing.

    Author: {__author__}

    "Specium Ray for your data!"

    Core components: Data, Metadata, read_sav, write_sav
    Metaman tools: get_meta, make_labels, make_datamap, map_to_excel

    Use ul.show_arch() to see the two-track design.
    """
    print(info)
279
+
280
+ __all__.append("about")
ultrasav/_add_cases.py ADDED
@@ -0,0 +1,227 @@
1
+ """
2
+ _add_cases.py
3
+ Top-level function for merging SAV files or dataframes with metadata
4
+ Following tidyspss 2.0's two-track architecture
5
+ """
6
+
7
+ import logging
8
+ from pathlib import Path
9
+ from typing import Any
10
+ from narwhals.typing import IntoFrame
11
+
12
+ from .def_merge_data import merge_data
13
+ from .def_merge_meta import merge_meta
14
+ from .def_read_files import read_sav, read_csv, read_excel
15
+ from .class_metadata import Metadata
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def add_cases(
22
+ # inputs: list[str | Path | IntoFrame | tuple[IntoFrame, Any]],
23
+ inputs: list[str | Path | Any],
24
+ meta: list[Any | None] | None = None,
25
+ output_format: str = "polars",
26
+ source_col: str = "mrgsrc",
27
+ meta_strategy: str = "first"
28
+ ) -> tuple[Any, Any | None]:
29
+ """
30
+ Merge multiple SAV/CSV/Excel files or dataframes with their metadata.
31
+
32
+ This is the main entry point for merging that combines both data and metadata
33
+ merging following tidyspss's two-track architecture. Data and metadata are
34
+ merged independently and returned as a tuple.
35
+
36
+ Parameters
37
+ ----------
38
+ inputs : list[str | Path | DataFrame | tuple[DataFrame, Metadata]]
39
+ List of inputs to merge. Each element can be:
40
+ - File path (str or Path) to:
41
+ * SAV/ZSAV files (metadata extracted automatically)
42
+ * CSV files (no metadata)
43
+ * Excel files (.xlsx, .xls, .xlsm, .xlsb, .ods) (no metadata)
44
+ - A dataframe (pandas, polars, or any narwhals-supported format) without metadata
45
+ - A combination of file paths (str/Path) and dataframes (pandas/polars/narwhals)
46
+ - A tuple of (dataframe, metadata) for explicit data-metadata pairs
47
+ meta : list[Metadata | None] | None, optional
48
+ Optional list of metadata objects to use for merging.
49
+ - If None (default): metadata is automatically extracted from SAV files
50
+ - If provided: uses these metadata objects for merging, ignoring any
51
+ metadata from SAV files. The list does NOT need to match input length.
52
+ Common usage: provide 1-2 metadata objects to merge, regardless of number of inputs
53
+ source_col : str, default "mrgsrc"
54
+ Name of the provenance column to add to track data sources.
55
+ This column will contain:
56
+ - For file paths: the base filename (e.g., "survey_2024.sav", "data.csv", "report.xlsx")
57
+ - For dataframes: "source_1", "source_2", etc.
58
+ output_format : str, default "polars"
59
+ Output dataframe format: "pandas", "polars", or "narwhals"
60
+ meta_strategy : str, default "first"
61
+ Strategy for merging metadata:
62
+ - "first": Use first non-None meta as base, add new columns from others
63
+ - "last": Use last non-None meta as base, add new columns from others
64
+
65
+ Returns
66
+ -------
67
+ tuple[DataFrame, Metadata | None]
68
+ - Merged dataframe in the specified format with provenance column
69
+ - Merged metadata (Metadata object) or None if no metadata available
70
+
71
+ Notes
72
+ -----
73
+ - Data and metadata are merged independently (two-track architecture)
74
+ - If meta is None: uses metadata from SAV files (if any)
75
+ - If meta is provided: uses ONLY those metadata objects, ignoring SAV metadata
76
+ - The source column appears as the last column in the merged dataframe
77
+ - Metadata merge follows column-level preservation (base wins for existing columns)
78
+ - CSV and Excel files don't have metadata, but can still be merged for data
79
+
80
+ File Format Support
81
+ -------------------
82
+ - SAV/ZSAV: Full support with automatic metadata extraction
83
+ - CSV: Data only, no metadata
84
+ - Excel: Data only (reads first sheet), no metadata
85
+ * Supported extensions: .xlsx, .xls, .xlsm, .xlsb, .ods
86
+
87
+ Examples
88
+ --------
89
+ >>> # Merge SAV files with automatic metadata extraction
90
+ >>> data, meta = add_cases(["survey1.sav", "survey2.sav", "survey3.sav"])
91
+
92
+ >>> # Mix different file types (SAV with metadata, CSV/Excel without)
93
+ >>> data, meta = add_cases(["survey.sav", "additional_data.csv", "report.xlsx"])
94
+
95
+ >>> # Provide specific metadata objects (ignores SAV metadata)
96
+ >>> data, meta = add_cases(
97
+ ... inputs=["data1.sav", "data2.csv", "data3.xlsx"],
98
+ ... meta=[meta1, meta2] # Only these two will be merged
99
+ ... )
100
+
101
+ >>> # Mix different input types
102
+ >>> df1 = pd.DataFrame({'Q1': [1, 2]})
103
+ >>> data, meta = add_cases([df1, "survey.sav", "data.csv", (df2, meta2)])
104
+
105
+ >>> # Single metadata for multiple files of any type
106
+ >>> data, meta = add_cases(
107
+ ... inputs=["file1.sav", "file2.csv", "file3.xlsx"], # Mixed file types
108
+ ... meta=[base_meta], # Just one metadata to use
109
+ ... meta_strategy="first"
110
+ ... )
111
+
112
+ >>> # Write merged result
113
+ >>> from tidyspss import write_sav
114
+ >>> write_sav(data, meta, "merged_output.sav")
115
+ """
116
+
117
+ if not inputs:
118
+ raise ValueError("inputs list cannot be empty")
119
+
120
+ # Separate data and metadata handling
121
+ dfs = []
122
+ metas_to_merge = []
123
+
124
+ # If meta parameter is provided, use ONLY those metadata objects
125
+ if meta is not None:
126
+ # User provided specific metadata - use only these
127
+ metas_to_merge = [Metadata(m) if m is not None and not isinstance(m, Metadata) else m
128
+ for m in meta]
129
+ logger.info(f"Using {len(meta)} provided metadata objects (ignoring any SAV metadata)")
130
+
131
+ # Process inputs for data extraction
132
+ for i, item in enumerate(inputs):
133
+ if isinstance(item, tuple) and len(item) == 2:
134
+ # It's a (dataframe, metadata) tuple
135
+ df, tuple_meta = item
136
+ dfs.append(df)
137
+
138
+ # Only use tuple metadata if meta parameter wasn't provided
139
+ if meta is None and tuple_meta is not None:
140
+ metas_to_merge.append(Metadata(tuple_meta) if not isinstance(tuple_meta, Metadata) else tuple_meta)
141
+ logger.debug(f"Using tuple metadata for input {i}")
142
+
143
+ elif isinstance(item, (str, Path)):
144
+ # It's a file path
145
+ file_path = Path(item)
146
+
147
+ if not file_path.exists():
148
+ raise FileNotFoundError(f"File not found: {file_path}")
149
+
150
+ ext = file_path.suffix.lower()
151
+
152
+ # Always pass file path to merge_data to preserve filename in source_col
153
+ dfs.append(str(file_path))
154
+
155
+ if ext in ['.sav', '.zsav']:
156
+ # SAV files - extract metadata separately if needed
157
+ # Only use SAV metadata if meta parameter wasn't provided
158
+ if meta is None:
159
+ _, meta_raw = read_sav(file_path, output_format="polars")
160
+ if meta_raw is not None:
161
+ metas_to_merge.append(Metadata(meta_raw))
162
+ logger.debug(f"Using SAV file metadata: {file_path.name}")
163
+ elif ext == '.csv':
164
+ # CSV files - no metadata available
165
+ logger.debug(f"Added CSV file: {file_path.name} (no metadata)")
166
+ elif ext in ['.xlsx', '.xls', '.xlsm', '.xlsb', '.ods']:
167
+ # Excel files - no metadata available
168
+ logger.debug(f"Added Excel file: {file_path.name} (no metadata)")
169
+ else:
170
+ # Other file types - log warning but try to process
171
+ logger.warning(f"Unknown file type: {ext} - will attempt to process: {file_path.name}")
172
+
173
+ else:
174
+ # It's a dataframe without metadata
175
+ dfs.append(item)
176
+ logger.debug(f"Added dataframe {i}")
177
+
178
+ # Log summary of inputs
179
+ logger.info(f"Processing {len(inputs)} inputs for data")
180
+ if meta is None:
181
+ logger.info(f"Found {len(metas_to_merge)} metadata objects from SAV files/tuples")
182
+ else:
183
+ logger.info(f"Using {len(metas_to_merge)} provided metadata objects")
184
+
185
+ # Count file types for logging
186
+ file_type_counts = {}
187
+ for item in inputs:
188
+ if isinstance(item, (str, Path)):
189
+ ext = Path(item).suffix.lower()
190
+ file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
191
+
192
+ if file_type_counts:
193
+ types_summary = ", ".join([f"{count} {ext}" for ext, count in file_type_counts.items()])
194
+ logger.info(f"File types: {types_summary}")
195
+
196
+ # Merge data using merge_data function
197
+ logger.info("Merging data...")
198
+ merged_data = merge_data(dfs, source_col=source_col, output_format=output_format)
199
+
200
+ # Merge metadata if any exists
201
+ merged_meta = None
202
+ if metas_to_merge and any(m is not None for m in metas_to_merge):
203
+ logger.info(f"Merging metadata with strategy='{meta_strategy}'...")
204
+ merged_meta = merge_meta(metas_to_merge, strategy=meta_strategy)
205
+
206
+ # Add label for the source column if not present
207
+ if merged_meta and source_col not in merged_meta.column_labels:
208
+ logger.debug(f"Adding label for source column '{source_col}'")
209
+ labels_update = {source_col: "Data Source"}
210
+ # Get existing labels and add new one
211
+ existing_labels = merged_meta.column_labels if merged_meta.column_labels else {}
212
+ merged_meta.column_labels = {**existing_labels, **labels_update}
213
+
214
+ # Set as nominal measure if not present
215
+ if source_col not in merged_meta.variable_measure:
216
+ measures_update = {source_col: "nominal"}
217
+ existing_measures = merged_meta.variable_measure if merged_meta.variable_measure else {}
218
+ merged_meta.variable_measure = {**existing_measures, **measures_update}
219
+ else:
220
+ logger.info("No metadata to merge (common when merging CSV/Excel files)")
221
+
222
+ # Log final summary
223
+ data_shape = merged_data.shape if hasattr(merged_data, 'shape') else "unknown"
224
+ meta_cols = len(merged_meta.column_labels) if merged_meta and merged_meta.column_labels else 0
225
+ logger.info(f"Merge complete: data shape {data_shape}, metadata for {meta_cols} columns")
226
+
227
+ return merged_data, merged_meta