ultrasav-0.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ultrasav/__init__.py +280 -0
- ultrasav/_add_cases.py +227 -0
- ultrasav/_data.py +513 -0
- ultrasav/_make_dummy.py +137 -0
- ultrasav/_merge_data.py +435 -0
- ultrasav/_merge_meta.py +280 -0
- ultrasav/_metadata.py +570 -0
- ultrasav/_read_files.py +558 -0
- ultrasav/_write_files.py +111 -0
- ultrasav/metaman/__init__.py +91 -0
- ultrasav/metaman/def_detect_variable_type.py +454 -0
- ultrasav/metaman/def_get_meta.py +561 -0
- ultrasav/metaman/def_make_datamap.py +127 -0
- ultrasav/metaman/def_make_labels.py +833 -0
- ultrasav/metaman/def_map_engine.py +529 -0
- ultrasav/metaman/def_map_to_excel.py +294 -0
- ultrasav/metaman/def_write_excel_engine.py +298 -0
- ultrasav/metaman/pastel_color_schemes.py +185 -0
- ultrasav-0.1.4.dist-info/METADATA +550 -0
- ultrasav-0.1.4.dist-info/RECORD +21 -0
- ultrasav-0.1.4.dist-info/WHEEL +4 -0
ultrasav/_data.py
ADDED
@@ -0,0 +1,513 @@
#v2
from typing import Any
import narwhals as nw
from narwhals.typing import IntoFrame


class Data:
    """
    DataFrame handler for tidyspss 2.0 - manages all data transformations.

    The Data class is part of tidyspss's two-track architecture where Data and
    Metadata are completely independent until write time. This class handles all
    dataframe operations (renaming, selecting, filtering, transforming) while
    remaining completely agnostic to the underlying dataframe library, whether
    pandas or Polars.

    Key Design Principles
    ---------------------
    - **DataFrame Agnostic**: Works with any narwhals-supported dataframe
      (pandas, Polars, cuDF, Modin, PyArrow, DuckDB, etc.)
    - **No Metadata Awareness**: Never reads or modifies metadata - that's
      handled by the separate Metadata class
    - **Chainable Operations**: All methods return self for fluent API usage
    - **Explicit Control**: No automatic transfers or hidden magic - you control
      exactly what happens to your data

    Workflow
    --------
    1. Create from a dataframe: `data = Data(df)`
    2. Transform as needed: `data.rename(...).select(...).replace(...)`
    3. Combine with metadata only at write: `write_sav(data, meta, "output.sav")`

    Examples
    --------
    >>> import pandas as pd
    >>> from tidyspss import Data
    >>>
    >>> # Create from any supported dataframe
    >>> df = pd.DataFrame({'Q1': [1, 2, 3], 'Q2': [4, 5, 6]})
    >>> data = Data(df)
    >>>
    >>> # Chain operations
    >>> data = (data
    ...     .rename({'Q1': 'satisfaction'})
    ...     .select(['satisfaction'])
    ...     .replace({'satisfaction': {1: 10, 2: 20}})
    ... )
    >>>
    >>> # Convert back to native format when needed
    >>> result_df = data.to_native()

    Notes
    -----
    When you rename columns in Data, any associated metadata must be explicitly
    updated in the Metadata object. The two classes do not communicate - this is
    by design for explicit control and clean separation of concerns.
    """

    def __init__(self, df: IntoFrame) -> None:
        """
        Initialize with a DataFrame from any narwhals-supported library.

        Parameters
        ----------
        df : IntoFrame
            A DataFrame from any narwhals-supported library including:
            pandas, Polars, cuDF, Modin, PyArrow, DuckDB, and others.
        """
        self._nw_df = nw.from_native(df)

    def rename(self, mapping: dict[str, str]) -> 'Data':
        """
        Rename columns and return self for chaining.

        Parameters
        ----------
        mapping : dict[str, str]
            Mapping of old column names to new column names.

        Returns
        -------
        Data
            Self for method chaining.

        Notes
        -----
        Remember to update metadata for renamed columns in the Metadata object.
        Old column metadata does not automatically transfer to new names.
        """
        self._nw_df = self._nw_df.rename(mapping)
        return self

    def select(self, columns: str | list[str]) -> 'Data':
        """
        Select specified columns and return self for chaining.

        Parameters
        ----------
        columns : str or list[str]
            Column name(s) to select.

        Returns
        -------
        Data
            Self for method chaining.
        """
        self._nw_df = self._nw_df.select(columns)
        return self

    def drop(self, columns: str | list[str]) -> 'Data':
        """
        Drop columns and return self for chaining.

        Parameters
        ----------
        columns : str or list[str]
            Column name(s) to drop.

        Returns
        -------
        Data
            Self for method chaining.
        """
        self._nw_df = self._nw_df.drop(columns)
        return self

    def replace(self, replacements: dict[str, dict[Any, Any]]) -> 'Data':
        """
        Replace values in specified columns.

        Parameters
        ----------
        replacements : dict[str, dict[Any, Any]]
            Nested dictionary where keys are column names and values are
            dictionaries mapping old values to new values.
            Use None as a key to replace null values.

        Returns
        -------
        Data
            Self for method chaining.

        Examples
        --------
        >>> data.replace({
        ...     'column1': {1: 'one', 2: 'two', None: 'missing'},
        ...     'column2': {'old': 'new'}
        ... })
        """
        columns_to_update = []

        for col, mapping in replacements.items():
            col_expr = nw.col(col)

            # Handle null replacements separately
            if None in mapping:
                null_replacement = mapping[None]
                col_expr = col_expr.fill_null(null_replacement)
                # Create new mapping without None
                mapping = {k: v for k, v in mapping.items() if k is not None}

            # Handle regular value replacements. Conditions test the original
            # (null-filled) values so that chained mappings cannot cascade,
            # e.g. {1: 2, 2: 3} must not turn an original 1 into 3.
            base_expr = col_expr
            for old_val, new_val in mapping.items():
                col_expr = nw.when(base_expr == old_val).then(new_val).otherwise(col_expr)

            columns_to_update.append(col_expr.alias(col))

        self._nw_df = self._nw_df.with_columns(columns_to_update)
        return self

    def move(
        self,
        config: dict[str, Any] | None = None,
        first: list[str | list[str]] | None = None,
        last: list[str | list[str]] | None = None,
        before: dict[str, str | list[str]] | None = None,
        after: dict[str, str | list[str]] | None = None
    ) -> 'Data':
        """
        Move columns in the DataFrame using various positioning strategies.

        This method allows flexible column reordering using four positioning strategies
        that can be combined. Columns can be specified individually, as lists, or using
        slice notation (e.g., 'Q1_1:Q1_25' to select a range of columns).

        Parameters
        ----------
        config : dict[str, Any], optional
            Dictionary containing any of the positioning parameters below.
            If provided, overrides individual parameters.
        first : list[str | list[str]], optional
            Columns to position at the beginning of the DataFrame.
            Supports slice notation like 'Q1_1:Q1_5'.
        last : list[str | list[str]], optional
            Columns to position at the end of the DataFrame.
            Supports slice notation.
        before : dict[str, str | list[str]], optional
            Dictionary mapping anchor columns to columns that should be
            positioned before them. Keys are anchor column names, values
            are columns to insert before the anchor.
        after : dict[str, str | list[str]], optional
            Dictionary mapping anchor columns to columns that should be
            positioned after them. Keys are anchor column names, values
            are columns to insert after the anchor.

        Returns
        -------
        Data
            Self for method chaining.

        Examples
        --------
        >>> # Move columns to the beginning
        >>> data.move(first=['id', 'name'])

        >>> # Move columns to the end
        >>> data.move(last=['created_at', 'updated_at'])

        >>> # Move specific columns before/after anchors
        >>> data.move(
        ...     before={'age': ['birth_date', 'birth_year']},
        ...     after={'name': ['first_name', 'last_name']}
        ... )

        >>> # Use slice notation for sequential columns
        >>> data.move(first=['Q1_1:Q1_5'], last=['Q10_1:Q10_20'])

        >>> # Complex combination
        >>> data.move(
        ...     first=['respondent_id'],
        ...     last=['timestamp'],
        ...     before={'Q2_1': 'Q1_1:Q1_10'},
        ...     after={'demographics': ['age', 'gender', 'income']}
        ... )

        Notes
        -----
        - Operations are applied in order: first → before/after → last
        - Columns can only appear in one positioning directive
        - Non-existent columns will raise a ValueError
        - Slice notation 'start:end' includes both endpoints
        """
        # Use config if provided, otherwise use individual parameters
        if config:
            first = config.get('first', first)
            last = config.get('last', last)
            before = config.get('before', before)
            after = config.get('after', after)

        # Get current column order
        current_cols = list(self._nw_df.columns)

        # Reorder columns based on specifications
        new_order = self._calculate_column_order(
            current_cols, first, last, before, after
        )

        # Apply the new order
        self._nw_df = self._nw_df.select(new_order)
        return self

    def _calculate_column_order(
        self,
        current_cols: list[str],
        first: list[str | list[str]] | None,
        last: list[str | list[str]] | None,
        before: dict[str, str | list[str]] | None,
        after: dict[str, str | list[str]] | None
    ) -> list[str]:
        """
        Calculate new column order based on positioning specifications.

        Parameters
        ----------
        current_cols : list[str]
            Current column order.
        first : list[str | list[str]] | None
            Columns to move to beginning.
        last : list[str | list[str]] | None
            Columns to move to end.
        before : dict[str, str | list[str]] | None
            Columns to position before anchors.
        after : dict[str, str | list[str]] | None
            Columns to position after anchors.

        Returns
        -------
        list[str]
            New column order.

        Raises
        ------
        ValueError
            If any specified columns don't exist or if there are conflicts.
        """
        # First, validate all columns exist and detect conflicts
        errors = []
        current_cols_set = set(current_cols)

        # Check 'first' columns exist
        if first:
            expanded_first = []
            for item in first:
                expanded_first.extend(self._expand_column_spec(item, current_cols))
            missing = [col for col in expanded_first if col not in current_cols_set]
            if missing:
                errors.append(f"'first' contains non-existent columns: {missing}")
            # Update first to be the expanded version
            first = expanded_first

        # Check 'last' columns exist
        if last:
            expanded_last = []
            for item in last:
                expanded_last.extend(self._expand_column_spec(item, current_cols))
            missing = [col for col in expanded_last if col not in current_cols_set]
            if missing:
                errors.append(f"'last' contains non-existent columns: {missing}")
            # Update last to be the expanded version
            last = expanded_last

        # Check 'before' anchor columns and values
        if before:
            # Check anchor columns (keys)
            missing_anchors = [col for col in before.keys() if col not in current_cols_set]
            if missing_anchors:
                errors.append(f"'before' references non-existent anchor columns: {missing_anchors}")

            # Check columns to position (values)
            for anchor, cols in before.items():
                expanded_cols = self._expand_column_spec(cols, current_cols)
                missing = [col for col in expanded_cols if col not in current_cols_set]
                if missing:
                    errors.append(f"'before[{anchor}]' contains non-existent columns: {missing}")

        # Check 'after' anchor columns and values
        if after:
            # Check anchor columns (keys)
            missing_anchors = [col for col in after.keys() if col not in current_cols_set]
            if missing_anchors:
                errors.append(f"'after' references non-existent anchor columns: {missing_anchors}")

            # Check columns to position (values)
            for anchor, cols in after.items():
                expanded_cols = self._expand_column_spec(cols, current_cols)
                missing = [col for col in expanded_cols if col not in current_cols_set]
                if missing:
                    errors.append(f"'after[{anchor}]' contains non-existent columns: {missing}")

        # Raise error if any issues found
        if errors:
            raise ValueError("Column positioning errors:\n" + "\n".join(errors))

        # Build the new column order
        new_order = []

        # Handle 'first' columns (already expanded above)
        first_set = set(first) if first else set()
        if first:
            new_order.extend(first)

        # Identify columns that should be positioned relative to anchors
        relatively_positioned = set()

        if before:
            for cols in before.values():
                expanded_cols = self._expand_column_spec(cols, current_cols)
                relatively_positioned.update(expanded_cols)

        if after:
            for cols in after.values():
                expanded_cols = self._expand_column_spec(cols, current_cols)
                relatively_positioned.update(expanded_cols)

        # Handle 'last' columns set (already expanded above)
        last_set = set(last) if last else set()

        # Track what we've already added to avoid duplicates
        positioned = set(first) if first else set()

        # Process columns in original order, handling before/after relationships
        for col in current_cols:
            # Skip if already positioned (in first)
            if col in positioned:
                continue

            # Skip if this column should be positioned relatively
            if col in relatively_positioned:
                continue

            # Skip if this column goes in 'last'
            if col in last_set:
                continue

            # Handle 'before' - insert columns before current anchor
            if before and col in before:
                cols_to_insert = self._expand_column_spec(before[col], current_cols)

                for insert_col in cols_to_insert:
                    if insert_col not in positioned:
                        new_order.append(insert_col)
                        positioned.add(insert_col)

            # Add current column (the anchor)
            new_order.append(col)
            positioned.add(col)

            # Handle 'after' - insert columns after current anchor
            if after and col in after:
                cols_to_insert = self._expand_column_spec(after[col], current_cols)

                for insert_col in cols_to_insert:
                    if insert_col not in positioned:
                        new_order.append(insert_col)
                        positioned.add(insert_col)

        # Handle 'last' columns (already expanded above)
        if last:
            new_order.extend(last)

        return new_order

    def _expand_column_spec(
        self,
        spec: str | list[str],
        current_cols: list[str]
    ) -> list[str]:
        """
        Expand column specifications including slice notation into list of columns.

        Parameters
        ----------
        spec : str or list[str]
            Column specification. Can be:
            - Single column name: 'col1'
            - List of column names: ['col1', 'col2']
            - Slice notation: 'col1:col5'
            - List with mixed notation: ['col1', 'col2:col5', 'col6']
        current_cols : list[str]
            Current column order in the DataFrame.

        Returns
        -------
        list[str]
            Expanded list of column names.

        Raises
        ------
        ValueError
            If any referenced columns don't exist.
        """
        if isinstance(spec, list):
            # If it's a list, expand each element and flatten
            expanded = []
            for item in spec:
                if isinstance(item, str) and ':' in item:
                    # This item is slice notation - split and strip whitespace
                    start_col, end_col = [col.strip() for col in item.split(':')]
                    try:
                        start_idx = current_cols.index(start_col)
                        end_idx = current_cols.index(end_col)
                        expanded.extend(current_cols[start_idx:end_idx + 1])
                    except ValueError:
                        if start_col not in current_cols:
                            raise ValueError(f"Start column '{start_col}' not found in current columns")
                        if end_col not in current_cols:
                            raise ValueError(f"End column '{end_col}' not found in current columns")
                else:
                    # Regular column name
                    expanded.append(item)
            return expanded
        elif isinstance(spec, str) and ':' in spec:
            # Single string with slice notation - split and strip whitespace
            start_col, end_col = [col.strip() for col in spec.split(':')]
            try:
                start_idx = current_cols.index(start_col)
                end_idx = current_cols.index(end_col)
            except ValueError:
                if start_col not in current_cols:
                    raise ValueError(f"Start column '{start_col}' not found in current columns")
                if end_col not in current_cols:
                    raise ValueError(f"End column '{end_col}' not found in current columns")
            return current_cols[start_idx:end_idx + 1]
        elif isinstance(spec, str):
            # Single column name
            return [spec]
        else:
            # Already a list without slice notation or other type
            return spec if isinstance(spec, list) else [spec]

    def to_native(self) -> IntoFrame:
        """
        Convert back to original DataFrame type.

        Returns
        -------
        IntoFrame
            DataFrame in its original library format (pandas, polars, etc.).
        """
        return self._nw_df.to_native()

    @property
    def columns(self) -> list[str]:
        """Get list of column names."""
        return list(self._nw_df.columns)

    @property
    def shape(self) -> tuple[int, int]:
        """Get dataframe shape (rows, cols)."""
        return self._nw_df.shape

    def __repr__(self) -> str:
        """String representation for debugging."""
        return f"Data(shape={self.shape}, columns={len(self.columns)})"
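
For orientation, here is a minimal usage sketch of the Data class above. It is not part of the wheel, and the import path is an assumption: the docstrings say `from tidyspss import Data`, while this wheel ships the module as `ultrasav`, so the sketch imports from `ultrasav`.

import pandas as pd

from ultrasav import Data  # assumed export; docstrings reference `tidyspss`

df = pd.DataFrame({
    "Q1_1": [1, 2], "Q1_2": [3, 4], "Q1_3": [5, 6],
    "resp_id": [10, 11],
    "score": [1, None],
})

data = (
    Data(df)
    .replace({"score": {None: -1}})               # a None key fills nulls
    .move(first=["resp_id"], last=["Q1_1:Q1_2"])  # slice notation: Q1_1 and Q1_2
)

print(data.columns)  # ['resp_id', 'Q1_3', 'score', 'Q1_1', 'Q1_2']
print(data.shape)    # (2, 5)
native_df = data.to_native()  # back to a pandas DataFrame

The move() call leans on _expand_column_spec to turn 'Q1_1:Q1_2' into the inclusive column range, which is why both endpoints appear at the end of the frame.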
ultrasav/_make_dummy.py
ADDED
@@ -0,0 +1,137 @@
import narwhals as nw
from narwhals.typing import IntoFrameT


def make_dummies(
    df: IntoFrameT,
    column: str,
    level: int | list[int | float] = 0,
    *,
    prefix: str | None = None,
    drop_original: bool = False,
    detect_special: bool = True
) -> IntoFrameT:
    """
    Create dummy variables for all specified categories.
    Works with both pandas and polars DataFrames via Narwhals.
    Converts a single-select question into binary variables.

    Parameters:
    - df: pandas or polars DataFrame
    - column: name of column to create dummies from (must be numeric type)
    - level: Can be either:
        * int: number of categories to create (1 to level) [default: 0]
          If level=0, only values found in data will get dummy columns
        * list: explicit list of values to create dummies for (e.g., [1,2,3,4,5,99])
          Creates dummy columns for ALL listed values, even if not present in data
          This ensures consistent columns across datasets
    - prefix: prefix for dummy column names (default: same as column name)
    - drop_original: if True, drop the original column [default: False]
    - detect_special: if True, automatically create dummies for any values found in data
      that are outside the specified level range (e.g., 99 for "don't know")
      [default: True]

    Returns:
    - DataFrame (same type as input) with dummy columns added
    - Missing values: If the original column has null/NaN, all dummy columns for that row will be null

    Examples:
    >>> make_dummies(df, "q1", level=5)
    # Creates dummies for 1-5 + any special values found (like 99, -1)

    >>> make_dummies(df, "q1", level=[1,2,3,4,5])
    # Creates dummies for 1-5 + any special values found

    >>> make_dummies(df, "q1", level=[1,2,3,4,5], detect_special=False)
    # Creates dummies for ONLY 1-5, ignoring any other values
    """
    df_nw = nw.from_native(df)

    # Check if column is numeric
    col_dtype = df_nw.schema[column]
    numeric_types = [
        nw.Int8, nw.Int16, nw.Int32, nw.Int64,
        nw.UInt8, nw.UInt16, nw.UInt32, nw.UInt64,
        nw.Float32, nw.Float64
    ]

    if col_dtype not in numeric_types:
        raise TypeError(
            f"Column '{column}' must be numeric (int or float). "
            f"Got {col_dtype}. Please convert string columns to numeric first."
        )

    if prefix is None:
        prefix = column

    is_list: bool = isinstance(level, (list, tuple))

    # Collect all expressions to apply at once
    expressions: list = []

    # Determine base categories to create
    base_categories: list[int | float]
    if is_list:
        base_categories = sorted(level)  # type: ignore
    elif level > 0:
        base_categories = list(range(1, level + 1))
    else:
        base_categories = []

    # Detect special values if requested
    special_vals: list[int | float] = []
    if detect_special:
        unique_vals: list = (
            df_nw.select(nw.col(column))
            .filter(~nw.col(column).is_null())
            .unique()
            .to_native()[column]
            .to_list()
        )

        if is_list:
            # For list mode: any value not in the list is special
            special_vals = sorted([v for v in unique_vals if v not in level])
        elif level > 0:
            # For integer mode: any value outside 1-level range is special
            special_vals = sorted([v for v in unique_vals if v < 1 or v > level])
        else:
            # For level=0: all values found are included
            special_vals = sorted(unique_vals)

    # Create expressions for base categories
    for val in base_categories:
        if isinstance(val, float) and val == int(val):
            val = int(val)
        col_name: str = f"{prefix}_{val}"

        expr = (
            nw.when(nw.col(column).is_null())
            .then(None)
            .otherwise((nw.col(column) == val).cast(nw.Int8))
            .alias(col_name)
        )
        expressions.append(expr)

    # Create expressions for special values
    for val in special_vals:
        if isinstance(val, float) and val == int(val):
            val = int(val)
        col_name = f"{prefix}_{val}"

        expr = (
            nw.when(nw.col(column).is_null())
            .then(None)
            .otherwise((nw.col(column) == val).cast(nw.Int8))
            .alias(col_name)
        )
        expressions.append(expr)

    # Apply all transformations in one go
    if expressions:
        df_nw = df_nw.with_columns(*expressions)

    if drop_original:
        df_nw = df_nw.drop(column)

    return nw.to_native(df_nw)
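
Likewise, a minimal sketch of make_dummies in action, under the same assumption that the function is re-exported from the `ultrasav` package:

import pandas as pd

from ultrasav import make_dummies  # assumed export path

df = pd.DataFrame({"q1": [1, 2, 2, 99, None]})

# level=2 requests dummies for 1 and 2; detect_special picks up the stray 99
out = make_dummies(df, "q1", level=2)
print(list(out.columns))  # ['q1', 'q1_1', 'q1_2', 'q1_99']
# The row where q1 is null gets null in every dummy column

# An explicit list pins the dummy set; detect_special=False ignores other values
out = make_dummies(df, "q1", level=[1, 2], detect_special=False, drop_original=True)
print(list(out.columns))  # ['q1_1', 'q1_2']

Passing an explicit list for level pins the dummy columns regardless of what occurs in the data, which is what keeps column layouts consistent across datasets.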