ultrasav 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ultrasav/_data.py ADDED
@@ -0,0 +1,513 @@
1
+ #v2
2
+ from typing import Any
3
+ import narwhals as nw
4
+ from narwhals.typing import IntoFrame
5
+
6
+
7
class Data:
    """
    DataFrame handler for tidyspss 2.0 - manages all data transformations.

    The Data class is part of tidyspss's two-track architecture where Data and
    Metadata are completely independent until write time. This class handles all
    dataframe operations (renaming, selecting, filtering, transforming) while
    remaining completely agnostic to either pandas or polars dataframes.

    Key Design Principles
    ---------------------
    - **DataFrame Agnostic**: Works with any narwhals-supported dataframe
      (pandas, Polars, cuDF, Modin, PyArrow, DuckDB, etc.)
    - **No Metadata Awareness**: Never reads or modifies metadata - that's
      handled by the separate Metadata class
    - **Chainable Operations**: All methods return self for fluent API usage
    - **Explicit Control**: No automatic transfers or hidden magic - you control
      exactly what happens to your data

    Workflow
    --------
    1. Create from a dataframe: `data = Data(df)`
    2. Transform as needed: `data.rename(...).select(...).replace(...)`
    3. Combine with metadata only at write: `write_sav(data, meta, "output.sav")`

    Examples
    --------
    >>> import pandas as pd
    >>> from tidyspss import Data
    >>>
    >>> # Create from any supported dataframe
    >>> df = pd.DataFrame({'Q1': [1, 2, 3], 'Q2': [4, 5, 6]})
    >>> data = Data(df)
    >>>
    >>> # Chain operations
    >>> data = (data
    ...     .rename({'Q1': 'satisfaction'})
    ...     .select(['satisfaction'])
    ...     .replace({'satisfaction': {1: 10, 2: 20}})
    ... )
    >>>
    >>> # Convert back to native format when needed
    >>> result_df = data.to_native()

    Notes
    -----
    When you rename columns in Data, any associated metadata must be explicitly
    updated in the Metadata object. The two classes do not communicate - this is
    by design for explicit control and clean separation of concerns.
    """

    def __init__(self, df: IntoFrame) -> None:
        """
        Initialize with a DataFrame from any narwhals-supported library.

        Parameters
        ----------
        df : IntoFrame
            A DataFrame from any narwhals-supported library including:
            pandas, Polars, cuDF, Modin, PyArrow, DuckDB, and others.
        """
        self._nw_df = nw.from_native(df)

    def rename(self, mapping: dict[str, str]) -> 'Data':
        """
        Rename columns and return self for chaining.

        Parameters
        ----------
        mapping : dict[str, str]
            Mapping of old column names to new column names.

        Returns
        -------
        Data
            Self for method chaining.

        Notes
        -----
        Remember to update metadata for renamed columns in the Metadata object.
        Old column metadata does not automatically transfer to new names.
        """
        self._nw_df = self._nw_df.rename(mapping)
        return self

    def select(self, columns: str | list[str]) -> 'Data':
        """
        Select specified columns and return self for chaining.

        Parameters
        ----------
        columns : str or list[str]
            Column name(s) to select.

        Returns
        -------
        Data
            Self for method chaining.
        """
        self._nw_df = self._nw_df.select(columns)
        return self

    def drop(self, columns: str | list[str]) -> 'Data':
        """
        Drop columns and return self for chaining.

        Parameters
        ----------
        columns : str or list[str]
            Column name(s) to drop.

        Returns
        -------
        Data
            Self for method chaining.
        """
        self._nw_df = self._nw_df.drop(columns)
        return self

    def replace(self, replacements: dict[str, dict[Any, Any]]) -> 'Data':
        """
        Replace values in specified columns.

        Each old value is compared against the column's *original* contents
        (after null-filling, if ``None`` is a key), so mappings never cascade:
        ``{1: 2, 2: 3}`` maps original 1s to 2 and original 2s to 3 - an
        original 1 is never carried on to 3.

        Parameters
        ----------
        replacements : dict[str, dict[Any, Any]]
            Nested dictionary where keys are column names and values are
            dictionaries mapping old values to new values.
            Use None as a key to replace null values.

        Returns
        -------
        Data
            Self for method chaining.

        Examples
        --------
        >>> data.replace({
        ...     'column1': {1: 'one', 2: 'two', None: 'missing'},
        ...     'column2': {'old': 'new'}
        ... })
        """
        columns_to_update = []

        for col, mapping in replacements.items():
            # Snapshot of the source column; null replacement is folded in
            # first so the null-filled value can itself be matched below.
            base = nw.col(col)
            if None in mapping:
                base = base.fill_null(mapping[None])
                # Remaining entries are the regular (non-null) replacements.
                mapping = {k: v for k, v in mapping.items() if k is not None}

            # Build the result expression. Comparing against `base` (not the
            # accumulated result) prevents chained replacements such as
            # {1: 2, 2: 3} turning an original 1 into 3.
            result = base
            for old_val, new_val in mapping.items():
                result = nw.when(base == old_val).then(new_val).otherwise(result)

            columns_to_update.append(result.alias(col))

        self._nw_df = self._nw_df.with_columns(columns_to_update)
        return self

    def move(
        self,
        config: dict[str, Any] | None = None,
        first: list[str | list[str]] | None = None,
        last: list[str | list[str]] | None = None,
        before: dict[str, str | list[str]] | None = None,
        after: dict[str, str | list[str]] | None = None
    ) -> 'Data':
        """
        Move columns in the DataFrame using various positioning strategies.

        This method allows flexible column reordering using four positioning
        strategies that can be combined. Columns can be specified individually,
        as lists, or using slice notation (e.g., 'Q1_1:Q1_25' to select a range
        of columns).

        Parameters
        ----------
        config : dict[str, Any], optional
            Dictionary containing any of the positioning parameters below.
            If provided, overrides individual parameters.
        first : list[str | list[str]], optional
            Columns to position at the beginning of the DataFrame.
            Supports slice notation like 'Q1_1:Q1_5'.
        last : list[str | list[str]], optional
            Columns to position at the end of the DataFrame.
            Supports slice notation.
        before : dict[str, str | list[str]], optional
            Dictionary mapping anchor columns to columns that should be
            positioned before them. Keys are anchor column names, values
            are columns to insert before the anchor.
        after : dict[str, str | list[str]], optional
            Dictionary mapping anchor columns to columns that should be
            positioned after them. Keys are anchor column names, values
            are columns to insert after the anchor.

        Returns
        -------
        Data
            Self for method chaining.

        Raises
        ------
        ValueError
            If any referenced column does not exist.

        Examples
        --------
        >>> data.move(first=['id', 'name'])
        >>> data.move(last=['created_at', 'updated_at'])
        >>> data.move(
        ...     before={'age': ['birth_date', 'birth_year']},
        ...     after={'name': ['first_name', 'last_name']}
        ... )
        >>> data.move(first=['Q1_1:Q1_5'], last=['Q10_1:Q10_20'])

        Notes
        -----
        - Operations are applied in order: first -> before/after -> last
        - Columns can only appear in one positioning directive
        - Non-existent columns will raise a ValueError
        - Slice notation 'start:end' includes both endpoints
        """
        # A config dict, when given, supplies the positioning parameters.
        if config:
            first = config.get('first', first)
            last = config.get('last', last)
            before = config.get('before', before)
            after = config.get('after', after)

        current_cols = list(self._nw_df.columns)

        new_order = self._calculate_column_order(
            current_cols, first, last, before, after
        )

        # select() with the full column list performs the reorder.
        self._nw_df = self._nw_df.select(new_order)
        return self

    def _calculate_column_order(
        self,
        current_cols: list[str],
        first: list[str | list[str]] | None,
        last: list[str | list[str]] | None,
        before: dict[str, str | list[str]] | None,
        after: dict[str, str | list[str]] | None
    ) -> list[str]:
        """
        Calculate new column order based on positioning specifications.

        Parameters
        ----------
        current_cols : list[str]
            Current column order.
        first : list[str | list[str]] | None
            Columns to move to beginning.
        last : list[str | list[str]] | None
            Columns to move to end.
        before : dict[str, str | list[str]] | None
            Columns to position before anchors.
        after : dict[str, str | list[str]] | None
            Columns to position after anchors.

        Returns
        -------
        list[str]
            New column order.

        Raises
        ------
        ValueError
            If any specified columns don't exist.
        """
        # Phase 1: expand slice specs and collect every validation error so
        # the caller sees all problems in a single exception.
        errors: list[str] = []
        current_cols_set = set(current_cols)

        if first:
            expanded_first: list[str] = []
            for item in first:
                expanded_first.extend(self._expand_column_spec(item, current_cols))
            missing = [col for col in expanded_first if col not in current_cols_set]
            if missing:
                errors.append(f"'first' contains non-existent columns: {missing}")
            first = expanded_first

        if last:
            expanded_last: list[str] = []
            for item in last:
                expanded_last.extend(self._expand_column_spec(item, current_cols))
            missing = [col for col in expanded_last if col not in current_cols_set]
            if missing:
                errors.append(f"'last' contains non-existent columns: {missing}")
            last = expanded_last

        if before:
            missing_anchors = [col for col in before.keys() if col not in current_cols_set]
            if missing_anchors:
                errors.append(f"'before' references non-existent anchor columns: {missing_anchors}")
            for anchor, cols in before.items():
                expanded_cols = self._expand_column_spec(cols, current_cols)
                missing = [col for col in expanded_cols if col not in current_cols_set]
                if missing:
                    errors.append(f"'before[{anchor}]' contains non-existent columns: {missing}")

        if after:
            missing_anchors = [col for col in after.keys() if col not in current_cols_set]
            if missing_anchors:
                errors.append(f"'after' references non-existent anchor columns: {missing_anchors}")
            for anchor, cols in after.items():
                expanded_cols = self._expand_column_spec(cols, current_cols)
                missing = [col for col in expanded_cols if col not in current_cols_set]
                if missing:
                    errors.append(f"'after[{anchor}]' contains non-existent columns: {missing}")

        if errors:
            raise ValueError("Column positioning errors:\n" + "\n".join(errors))

        # Phase 2: build the new order.
        new_order: list[str] = []

        if first:
            new_order.extend(first)

        # Columns claimed by before/after are skipped in the main pass and
        # emitted next to their anchors instead.
        relatively_positioned: set[str] = set()
        if before:
            for cols in before.values():
                relatively_positioned.update(self._expand_column_spec(cols, current_cols))
        if after:
            for cols in after.values():
                relatively_positioned.update(self._expand_column_spec(cols, current_cols))

        last_set = set(last) if last else set()

        # Tracks columns already emitted so before/after insertions don't
        # duplicate anything placed by 'first'.
        positioned = set(first) if first else set()

        for col in current_cols:
            if col in positioned:
                continue
            # NOTE(review): an anchor column that is itself a before/after
            # *value* is skipped here, so it only appears via its own anchor.
            if col in relatively_positioned:
                continue
            if col in last_set:
                continue

            if before and col in before:
                for insert_col in self._expand_column_spec(before[col], current_cols):
                    if insert_col not in positioned:
                        new_order.append(insert_col)
                        positioned.add(insert_col)

            new_order.append(col)
            positioned.add(col)

            if after and col in after:
                for insert_col in self._expand_column_spec(after[col], current_cols):
                    if insert_col not in positioned:
                        new_order.append(insert_col)
                        positioned.add(insert_col)

        # 'last' columns are appended verbatim (already expanded above).
        # NOTE(review): 'last' is not deduplicated against `positioned`; a
        # column listed in both 'first' and 'last' would appear twice -
        # confirm whether that should raise instead.
        if last:
            new_order.extend(last)

        return new_order

    def _expand_column_spec(
        self,
        spec: str | list[str],
        current_cols: list[str]
    ) -> list[str]:
        """
        Expand column specifications including slice notation into list of columns.

        Parameters
        ----------
        spec : str or list[str]
            Column specification. Can be:
            - Single column name: 'col1'
            - List of column names: ['col1', 'col2']
            - Slice notation: 'col1:col5'
            - List with mixed notation: ['col1', 'col2:col5', 'col6']
        current_cols : list[str]
            Current column order in the DataFrame.

        Returns
        -------
        list[str]
            Expanded list of column names.

        Raises
        ------
        ValueError
            If any referenced columns don't exist.

        Notes
        -----
        A slice whose start column appears after its end column expands to an
        empty list (standard Python slicing semantics).
        """
        if isinstance(spec, list):
            # Expand each element (plain name or slice) and flatten.
            expanded: list[str] = []
            for item in spec:
                if isinstance(item, str) and ':' in item:
                    start_col, end_col = [col.strip() for col in item.split(':')]
                    try:
                        start_idx = current_cols.index(start_col)
                        end_idx = current_cols.index(end_col)
                        expanded.extend(current_cols[start_idx:end_idx + 1])
                    except ValueError:
                        if start_col not in current_cols:
                            raise ValueError(f"Start column '{start_col}' not found in current columns")
                        if end_col not in current_cols:
                            raise ValueError(f"End column '{end_col}' not found in current columns")
                        raise  # defensive: preserve the error if neither check matched
                else:
                    expanded.append(item)
            return expanded
        elif isinstance(spec, str) and ':' in spec:
            start_col, end_col = [col.strip() for col in spec.split(':')]
            try:
                start_idx = current_cols.index(start_col)
                end_idx = current_cols.index(end_col)
            except ValueError:
                if start_col not in current_cols:
                    raise ValueError(f"Start column '{start_col}' not found in current columns")
                if end_col not in current_cols:
                    raise ValueError(f"End column '{end_col}' not found in current columns")
                raise  # defensive: preserve the error if neither check matched
            return current_cols[start_idx:end_idx + 1]
        elif isinstance(spec, str):
            return [spec]
        else:
            # Unexpected type: pass lists through, wrap anything else.
            return spec if isinstance(spec, list) else [spec]

    def to_native(self) -> IntoFrame:
        """
        Convert back to original DataFrame type.

        Returns
        -------
        IntoFrame
            DataFrame in its original library format (pandas, polars, etc.).
        """
        return self._nw_df.to_native()

    @property
    def columns(self) -> list[str]:
        """Get list of column names."""
        return list(self._nw_df.columns)

    @property
    def shape(self) -> tuple[int, int]:
        """Get dataframe shape (rows, cols)."""
        return self._nw_df.shape

    def __repr__(self) -> str:
        """String representation for debugging."""
        return f"Data(shape={self.shape}, columns={len(self.columns)})"
@@ -0,0 +1,137 @@
1
+ import narwhals as nw
2
+ from narwhals.typing import IntoFrameT
3
+
4
+
5
def make_dummies(
    df: IntoFrameT,
    column: str,
    level: int | list[int | float] = 0,
    *,
    prefix: str | None = None,
    drop_original: bool = False,
    detect_special: bool = True
) -> IntoFrameT:
    """
    Create dummy variables for all specified categories.
    Works with both pandas and polars DataFrames via Narwhals.
    Convert single-select question into binary variables.

    Parameters:
    - df: pandas or polars DataFrame
    - column: name of column to create dummies from (must be numeric type)
    - level: Can be either:
        * int: number of categories to create (1 to level) [default: 0]
          If level=0, only values found in data will get dummy columns
        * list: explicit list of values to create dummies for (e.g., [1,2,3,4,5,99])
          Creates dummy columns for ALL listed values, even if not present in data
          This ensures consistent columns across datasets
    - prefix: prefix for dummy column names (default: same as column name)
    - drop_original: if True, drop the original column [default: False]
    - detect_special: if True, automatically create dummies for any values found in data
      that are outside the specified level range (e.g., 99 for "don't know")
      [default: True]

    Returns:
    - DataFrame (same type as input) with dummy columns added
    - Missing values: If the original column has null/NaN, all dummy columns for that
      row will be null

    Examples:
    >>> make_dummies(df, "q1", level=5)
    # Creates dummies for 1-5 + any special values found (like 99, -1)

    >>> make_dummies(df, "q1", level=[1,2,3,4,5])
    # Creates dummies for 1-5 + any special values found

    >>> make_dummies(df, "q1", level=[1,2,3,4,5], detect_special=False)
    # Creates dummies for ONLY 1-5, ignoring any other values
    """
    df_nw = nw.from_native(df)

    # The dummy logic relies on numeric equality, so reject non-numeric
    # columns up front with an actionable message.
    col_dtype = df_nw.schema[column]
    numeric_types = [
        nw.Int8, nw.Int16, nw.Int32, nw.Int64,
        nw.UInt8, nw.UInt16, nw.UInt32, nw.UInt64,
        nw.Float32, nw.Float64
    ]
    if col_dtype not in numeric_types:
        raise TypeError(
            f"Column '{column}' must be numeric (int or float). "
            f"Got {col_dtype}. Please convert string columns to numeric first."
        )

    if prefix is None:
        prefix = column

    is_list: bool = isinstance(level, (list, tuple))

    # Base categories: the values the caller explicitly asked for.
    base_categories: list[int | float]
    if is_list:
        base_categories = sorted(level)  # type: ignore
    elif level > 0:
        base_categories = list(range(1, level + 1))
    else:
        base_categories = []

    # Special values: values observed in the data but outside the requested
    # range/list (e.g. 99 for "don't know"). Disjoint from base_categories
    # by construction, so no dummy column is built twice.
    special_vals: list[int | float] = []
    if detect_special:
        unique_vals: list = (
            df_nw.select(nw.col(column))
            .filter(~nw.col(column).is_null())
            .unique()
            .to_native()[column]
            .to_list()
        )

        if is_list:
            special_vals = sorted(v for v in unique_vals if v not in level)
        elif level > 0:
            special_vals = sorted(v for v in unique_vals if v < 1 or v > level)
        else:
            # level=0: every observed value gets a dummy column.
            special_vals = sorted(unique_vals)

    def _dummy_expr(val: int | float):
        """Build the null-propagating 0/1 indicator expression for one value."""
        # Collapse whole-number floats (e.g. 5.0) so the column is named
        # 'prefix_5' rather than 'prefix_5.0'.
        if isinstance(val, float) and val == int(val):
            val = int(val)
        return (
            nw.when(nw.col(column).is_null())
            .then(None)  # null input row -> null in every dummy column
            .otherwise((nw.col(column) == val).cast(nw.Int8))
            .alias(f"{prefix}_{val}")
        )

    # One expression per category; applied in a single with_columns pass.
    expressions = [_dummy_expr(val) for val in base_categories + special_vals]
    if expressions:
        df_nw = df_nw.with_columns(*expressions)

    if drop_original:
        df_nw = df_nw.drop(column)

    return nw.to_native(df_nw)