ultrasav 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,435 @@
1
+ """
2
+ merge_data.py
3
+ Data merging function for ultrasav
4
+ Following the dataframe-agnostic architecture using narwhals
5
+ """
6
+
7
+ import os
8
+ import logging
9
+ from pathlib import Path
10
+ from typing import Any
11
+ import narwhals as nw
12
+ from narwhals.typing import IntoFrame
13
+
14
+ # Import read functions
15
+ from .def_read_files import read_sav, read_csv, read_excel
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def _get_narwhals_dtype(dtype_str: str):
    """
    Map a dtype's string representation to the corresponding narwhals dtype.

    Parameters
    ----------
    dtype_str : str
        String representation of the dtype (e.g. ``str(df.schema[col])``).

    Returns
    -------
    narwhals dtype
        The matching narwhals dtype class. Unrecognized names fall back to
        ``nw.String`` (logged as a warning).
    """
    dtype_map = {
        "String": nw.String,
        "Int64": nw.Int64,
        "Int32": nw.Int32,
        "Int16": nw.Int16,
        "Int8": nw.Int8,
        "Float64": nw.Float64,
        "Float32": nw.Float32,
        "Boolean": nw.Boolean,
        "Datetime": nw.Datetime,
        "Date": nw.Date,
        "Object": nw.Object,
        "Unknown": nw.Unknown,
        # Add more mappings as needed
    }

    # Try exact match first. Note: "Unknown" resolves to nw.Unknown here, so
    # no later branch for the exact string "Unknown" is reachable (a previous
    # revision carried such dead code).
    if dtype_str in dtype_map:
        return dtype_map[dtype_str]

    # Handle common variations by substring matching.
    lowered = dtype_str.lower()
    if "Utf8" in dtype_str or "str" in lowered:
        return nw.String
    if "float" in lowered and "64" in dtype_str:
        return nw.Float64
    if "float" in lowered and "32" in dtype_str:
        return nw.Float32
    if "int" in lowered and "64" in dtype_str:
        return nw.Int64
    if "int" in lowered and "32" in dtype_str:
        return nw.Int32

    # Last resort: treat anything unrecognized as text.
    logger.warning("Unknown dtype mapping for '%s', defaulting to String", dtype_str)
    return nw.String
71
+
72
+
73
def _raise_str_cast_error(error: Exception, merged_schema: dict) -> None:
    """Raise an actionable ValueError for a string-to-other-type cast failure.

    Extracts the offending column name from the backend error text (when
    possible) and, for date-like column names, suggests the datetime
    pre-processing step the user most likely needs.
    """
    import re

    error_msg = str(error)

    # Extract column name from error message if possible.
    col_match = re.search(r"column '(\w+)'", error_msg)
    col_name = col_match.group(1) if col_match else "unknown"

    # Heuristic: date/time-ish column names usually hold unparsed date strings.
    if any(indicator in col_name.lower() for indicator in ['date', 'time', 'timestamp']):
        raise ValueError(
            f"\nType casting failed for column '{col_name}'.\n"
            f"Cannot convert string dates to numeric/datetime format.\n\n"
            f"The error suggests column '{col_name}' contains date strings that need parsing.\n"
            f"Please pre-process your data before merging. For example:\n\n"
            f"  # For date strings like '12/16/2024 10:14':\n"
            f"  df = df.with_columns(\n"
            f"      nw.col('{col_name}').str.to_datetime('%m/%d/%Y %H:%M')\n"
            f"  )\n\n"
            f"  # Or for other formats:\n"
            f"  # '%Y-%m-%d %H:%M:%S' for '2024-12-16 10:14:00'\n"
            f"  # '%d/%m/%Y' for '16/12/2024'\n\n"
            f"Original error: {error_msg}"
        ) from error

    # Report the failing column's actual target type from the merged schema
    # (previously this interpolated a stale loop variable, which could name
    # an unrelated column's type).
    target_type_str = merged_schema.get(col_name, "the target type")
    raise ValueError(
        f"\nType casting failed for column '{col_name}'.\n"
        f"Cannot convert string values to {target_type_str}.\n\n"
        f"This often happens when:\n"
        f"  1. String columns contain non-numeric text\n"
        f"  2. Date/time values are stored as strings\n"
        f"  3. Numbers have formatting (e.g., '$1,234.56' or '1.234,56')\n\n"
        f"Please check your data and pre-process if needed.\n"
        f"Original error: {error_msg}"
    ) from error


def align_schemas(nw_dfs: list) -> list:
    """
    Align schemas across multiple narwhals DataFrames for successful concatenation.

    This function harmonizes column types across all dataframes using these rules:
    1. Use df[0] as the baseline schema
    2. If a later df has a new column, append it to the merged schema
    3. If a column is Unknown/Null in merged schema but later appears with real type,
       upgrade to the non-null dtype
    4. For type conflicts, preserve the existing dtype (df[0] wins)

    Parameters
    ----------
    nw_dfs : list[narwhals.DataFrame]
        List of narwhals dataframes to align

    Returns
    -------
    list[narwhals.DataFrame]
        List of dataframes with aligned schemas

    Raises
    ------
    ValueError
        When a column cannot be cast from string to its target type; the
        message includes pre-processing guidance.
    """
    # Nothing to align for zero or one dataframe.
    if not nw_dfs:
        return nw_dfs
    if len(nw_dfs) == 1:
        return nw_dfs

    # Build the merged schema, seeded by the first dataframe (dtype names are
    # compared as strings because dataframes may come from different backends).
    merged_schema = {}

    for i, df in enumerate(nw_dfs):
        for col_name, dtype in df.schema.items():
            dtype_str = str(dtype)

            if col_name not in merged_schema:
                # New column - add to merged schema
                merged_schema[col_name] = dtype_str
                logger.debug(f"Added new column '{col_name}' with type {dtype_str} from df[{i}]")

            elif merged_schema[col_name] == "Unknown" and dtype_str != "Unknown":
                # Upgrade from Unknown/Null to real type
                merged_schema[col_name] = dtype_str
                logger.debug(f"Upgraded column '{col_name}' from Unknown to {dtype_str} from df[{i}]")

            elif merged_schema[col_name] != dtype_str and dtype_str != "Unknown":
                # Type conflict - log but keep existing (df[0] wins)
                logger.debug(f"Type conflict for column '{col_name}': keeping {merged_schema[col_name]}, ignoring {dtype_str} from df[{i}]")

    logger.info(f"Final merged schema: {merged_schema}")

    # Cast every dataframe's mismatched columns to the merged schema.
    aligned_dfs = []

    for i, df in enumerate(nw_dfs):
        cast_exprs = []

        for col_name, target_type_str in merged_schema.items():
            if col_name in df.columns:
                current_type_str = str(df.schema[col_name])

                # Only cast if types don't match.
                if current_type_str != target_type_str:
                    target_dtype = _get_narwhals_dtype(target_type_str)
                    if target_dtype:
                        cast_exprs.append(nw.col(col_name).cast(target_dtype).alias(col_name))
                        logger.debug(f"df[{i}]: casting column '{col_name}' from {current_type_str} to {target_type_str}")

        if cast_exprs:
            try:
                df = df.with_columns(cast_exprs)
            except Exception as e:
                # String-to-X conversion failures get a helpful, guided error;
                # anything else is re-raised untouched.
                if "conversion from `str`" in str(e):
                    _raise_str_cast_error(e, merged_schema)
                raise

        aligned_dfs.append(df)

    return aligned_dfs
190
+
191
+
192
def _normalize_to_common_backend(nw_dfs: list) -> list:
    """
    Normalize all narwhals dataframes to a common backend using Arrow interchange.

    This ensures all dataframes can be concatenated regardless of their original
    backend (pandas, polars, duckdb, ibis, etc.). Uses Arrow as the universal
    interchange format, then rebuilds each frame on polars.

    Parameters
    ----------
    nw_dfs : list[narwhals.DataFrame]
        List of narwhals dataframes possibly from different backends

    Returns
    -------
    list[narwhals.DataFrame]
        List of narwhals dataframes all using the same backend (polars)
    """
    import polars as pl
    import pyarrow as pa

    normalized_dfs = []

    for i, nw_df in enumerate(nw_dfs):
        # Unwrap to the native dataframe so we can pick a conversion path.
        native_df = nw_df.to_native()

        if isinstance(native_df, pl.DataFrame):
            # Polars to arrow
            arrow_table = native_df.to_arrow()
        elif hasattr(native_df, 'to_arrow'):
            # If it has a to_arrow method, use it
            arrow_table = native_df.to_arrow()
        elif hasattr(native_df, '__arrow_c_stream__'):
            # Use the Arrow C stream interface if available
            arrow_table = pa.table(native_df)
        else:
            # Fallback: pandas or other - use pyarrow conversion
            try:
                arrow_table = pa.Table.from_pandas(native_df)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # are not swallowed. Last resort: convert through a plain dict.
                data_dict = {}
                for col in native_df.columns:
                    column = native_df[col]
                    data_dict[col] = column.tolist() if hasattr(column, 'tolist') else list(column)
                arrow_table = pa.Table.from_pydict(data_dict)

        # Rebuild on polars (our common backend) and re-wrap in narwhals.
        pl_df = pl.from_arrow(arrow_table)
        normalized_dfs.append(nw.from_native(pl_df))

        logger.debug(f"Normalized df[{i}] to common backend via Arrow")

    return normalized_dfs
250
+
251
+
252
def _read_file_as_narwhals(file_path: Path):
    """Read a supported file (SAV/CSV/Excel) into a narwhals DataFrame.

    Dispatches on the file extension; raises ValueError for anything else.
    SAV metadata returned by read_sav is deliberately discarded here.
    """
    ext = file_path.suffix.lower()
    if ext in ('.sav', '.zsav'):
        df, _ = read_sav(file_path, output_format="narwhals")
        return df
    if ext == '.csv':
        return read_csv(file_path, output_format="narwhals")
    if ext in ('.xlsx', '.xls', '.xlsm', '.xlsb', '.ods'):
        return read_excel(file_path, output_format="narwhals")
    raise ValueError(f"Unsupported file type: {ext}")


def merge_data(
    dfs: list[str | Path | IntoFrame],
    source_col: str = "mrgsrc",
    output_format: str = "polars"
) -> Any:
    """
    Merge multiple dataframes vertically with provenance tracking.

    Performs vertical concatenation of dataframes while adding a provenance
    column recording the source of each row. All processing happens in
    narwhals (dataframe-agnostic); frames from mixed backends are normalized
    through the Arrow interchange format, schemas are aligned, and the result
    is converted to the requested output format only at the end.

    Parameters
    ----------
    dfs : list[str | Path | IntoFrame]
        List of inputs to merge. Each element can be:
        - File path (str or Path) to a SAV, CSV, or Excel file
        - A dataframe (pandas, polars, or any narwhals-supported format)
        Mixed lists are supported (e.g., [df1, "file.sav", df2])
    source_col : str, default "mrgsrc"
        Name of the provenance column to add. This column will contain:
        - For file paths: the base filename (e.g., "survey_2024.sav")
        - For dataframes: "source_1", "source_2", etc.
    output_format : str, default "polars"
        Output dataframe format: "pandas", "polars", or "narwhals"

    Returns
    -------
    DataFrame
        Merged dataframe in the specified format with the provenance column
        added as the last column. Uses narwhals.concat with how="diagonal"
        for column-union behavior (missing columns filled with nulls).

    Raises
    ------
    ValueError
        If `dfs` is empty, `output_format` is unrecognized, a file has an
        unsupported extension, or type casting fails during schema alignment
        (the latter with pre-processing guidance in the message).
    FileNotFoundError
        If a path element does not exist on disk.

    Notes
    -----
    Type casting assumes compatible formats: string dates ("12/16/2024 10:14"),
    formatted numbers ("$1,234.56"), or text booleans ("Yes"/"No") cannot be
    auto-converted — pre-process such columns before merging, e.g.::

        df = df.with_columns(
            nw.col('date').str.to_datetime('%m/%d/%Y %H:%M')
        )

    When a column appears with different types, the first dataframe's type
    takes precedence.

    Examples
    --------
    >>> merged = merge_data(["file1.sav", "file2.sav", "file3.sav"])
    >>> merged = merge_data([df_pd, df_pl])  # mixed backends are fine
    >>> merged = merge_data(dfs_list, source_col="file_source", output_format="pandas")
    """
    if not dfs:
        raise ValueError("dfs list cannot be empty")

    if output_format not in ("pandas", "polars", "narwhals"):
        raise ValueError(f"output_format must be 'pandas', 'polars', or 'narwhals', got {output_format}")

    # Collect narwhals dataframes WITHOUT the provenance column first; the
    # column is added only after backend normalization and schema alignment.
    nw_dfs = []
    source_names = []  # provenance label per dataframe, same order as nw_dfs
    df_counter = 0     # numbers in-memory dataframes: source_1, source_2, ...

    for item in dfs:
        if isinstance(item, (str, Path)):
            # File path input: provenance is the base filename.
            file_path = Path(item)

            if not file_path.exists():
                raise FileNotFoundError(f"File not found: {file_path}")

            source_name = file_path.name
            nw_df = _read_file_as_narwhals(file_path)
        else:
            # Dataframe input: wrap in narwhals and assign a synthetic name.
            nw_df = nw.from_native(item)
            df_counter += 1
            source_name = f"source_{df_counter}"

        nw_dfs.append(nw_df)
        source_names.append(source_name)
        logger.info(f"Added dataframe from {source_name} with shape {nw_df.shape}")

    # Normalize all dataframes to a common backend through Arrow.
    logger.info("Normalizing dataframes to common backend via Arrow interchange...")
    normalized_dfs = _normalize_to_common_backend(nw_dfs)

    # Align schemas across all dataframes before concatenation.
    logger.info("Aligning schemas across all dataframes...")
    aligned_dfs = align_schemas(normalized_dfs)

    # Stamp each aligned dataframe with its provenance label (always string).
    dfs_with_source = [
        df.with_columns(nw.lit(source_name).alias(source_col))
        for df, source_name in zip(aligned_dfs, source_names)
    ]

    # All frames now share a backend and aligned schemas; diagonal concat
    # unions the columns.
    merged_nw = nw.concat(dfs_with_source, how="diagonal")

    # Move the provenance column to the end if it is not already last.
    cols = list(merged_nw.columns)
    if source_col in cols and cols[-1] != source_col:
        cols.remove(source_col)
        cols.append(source_col)
        merged_nw = merged_nw.select(cols)

    logger.info(f"Merged {len(dfs)} dataframes into shape {merged_nw.shape}")

    # Convert to the requested output format; "narwhals" is the remaining
    # case after the validation above, so no unreachable fallback is needed.
    if output_format == "pandas":
        return merged_nw.to_pandas()
    if output_format == "polars":
        return merged_nw.to_polars()
    return merged_nw