ultrasav 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ultrasav/__init__.py +280 -0
- ultrasav/_add_cases.py +227 -0
- ultrasav/_data.py +513 -0
- ultrasav/_make_dummy.py +137 -0
- ultrasav/_merge_data.py +435 -0
- ultrasav/_merge_meta.py +280 -0
- ultrasav/_metadata.py +570 -0
- ultrasav/_read_files.py +558 -0
- ultrasav/_write_files.py +111 -0
- ultrasav/metaman/__init__.py +91 -0
- ultrasav/metaman/def_detect_variable_type.py +454 -0
- ultrasav/metaman/def_get_meta.py +561 -0
- ultrasav/metaman/def_make_datamap.py +127 -0
- ultrasav/metaman/def_make_labels.py +833 -0
- ultrasav/metaman/def_map_engine.py +529 -0
- ultrasav/metaman/def_map_to_excel.py +294 -0
- ultrasav/metaman/def_write_excel_engine.py +298 -0
- ultrasav/metaman/pastel_color_schemes.py +185 -0
- ultrasav-0.1.4.dist-info/METADATA +550 -0
- ultrasav-0.1.4.dist-info/RECORD +21 -0
- ultrasav-0.1.4.dist-info/WHEEL +4 -0
ultrasav/_read_files.py
ADDED
|
@@ -0,0 +1,558 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ultrasav: read functions
|
|
3
|
+
v_0.1.0
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
from collections.abc import Sequence
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any
|
|
11
|
+
import narwhals as nw
|
|
12
|
+
import pyreadstat
|
|
13
|
+
|
|
14
|
+
# Shared encoding list
# Encodings tried, in order, by the auto-detection loops in read_sav/read_csv.
# The leading None entry means "use the reader's own default encoding" and is
# always attempted first.
COMMON_ENCODINGS = [
    None,  # Default
    "utf-8",
    "utf-8-sig",
    "latin1",
    "cp1252",  # Windows Western European
    "iso-8859-1",
    "cp1251",  # Windows Cyrillic
    "cp1250",  # Windows Central European
    "gbk",  # Chinese Simplified
    "big5",  # Chinese Traditional
    "shift_jis",  # Japanese
    "euc-kr",  # Korean
]
|
|
29
|
+
|
|
30
|
+
def read_sav(
    file_path: str | os.PathLike,
    output_format: str = "polars",
    encoding: str | None = None,
    apply_value_formats: bool = False,
    formats_as_category: bool = True,
    formats_as_ordered_category: bool = False,
    auto_detect_encoding: bool = True,
    **kwargs
) -> tuple[Any, Any]:
    """
    Read SPSS SAV/ZSAV files with automatic encoding detection.

    Parameters
    ----------
    file_path : str or Path
        Path to the SAV/ZSAV file
    output_format : str, default "polars"
        Output format: "pandas", "polars", "narwhals", or "dict"
        - "pandas": returns pandas DataFrame
        - "polars": returns polars DataFrame
        - "narwhals": returns narwhals DataFrame (converted from polars)
        - "dict": returns dictionary
    encoding : str, optional
        File encoding. If None and auto_detect_encoding=True, will try multiple encodings
    apply_value_formats : bool, default False
        Apply value labels to the data
    formats_as_category : bool, default True
        Convert formatted variables to categories
    formats_as_ordered_category : bool, default False
        Convert formatted variables to ordered categories
    auto_detect_encoding : bool, default True
        If True and encoding is None, automatically tries multiple encodings
        from COMMON_ENCODINGS until one succeeds
    **kwargs : additional arguments passed to pyreadstat.read_sav()

    Returns
    -------
    df : DataFrame or dict
        Data in the specified output format
    meta : metadata object
        Metadata from the SPSS file

    Raises
    ------
    ValueError
        If output_format is invalid, or no attempted encoding could read the file.

    Examples
    --------
    >>> df, meta = read_sav("survey.sav")  # Returns polars DataFrame by default
    >>> df, meta = read_sav("survey.sav", output_format="pandas")
    >>> df, meta = read_sav("survey.sav", output_format="narwhals")
    """
    if output_format not in ["pandas", "polars", "narwhals", "dict"]:
        raise ValueError(f"output_format must be 'pandas', 'polars', 'narwhals', or 'dict', got {output_format}")

    # For narwhals output, read as polars first and convert after the read.
    read_format = "polars" if output_format == "narwhals" else output_format

    read_kwargs = {
        "apply_value_formats": apply_value_formats,
        "formats_as_category": formats_as_category,
        "formats_as_ordered_category": formats_as_ordered_category,
        "output_format": read_format,
        **kwargs
    }

    def _finish(df: Any, meta: Any) -> tuple[Any, Any]:
        # Single conversion point so explicit-encoding and auto-detect paths
        # cannot drift apart.
        if output_format == "narwhals":
            df = nw.from_native(df)
        return df, meta

    # Explicit encoding, or auto-detection disabled: single attempt.
    if encoding is not None or not auto_detect_encoding:
        if encoding:
            read_kwargs["encoding"] = encoding
        df, meta = pyreadstat.read_sav(file_path, **read_kwargs)
        return _finish(df, meta)

    # Auto-detect encoding: try each candidate in order, remembering the last
    # encoding-related failure for the final error message.
    last_error = None
    for enc in COMMON_ENCODINGS:
        try:
            if enc is not None:
                read_kwargs["encoding"] = enc
            else:
                # None means "use pyreadstat's default"; make sure no stale
                # encoding from a previous iteration is left behind.
                read_kwargs.pop("encoding", None)

            df, meta = pyreadstat.read_sav(file_path, **read_kwargs)

            logging.info("Successfully read SAV file with encoding: %s", enc or "default")
            return _finish(df, meta)

        except (UnicodeDecodeError, UnicodeError) as e:
            last_error = e
            continue
        except Exception as e:
            # pyreadstat wraps codec problems in its own exception types; only
            # keep trying when the message clearly points at an encoding issue.
            if any(term in str(e).lower() for term in ["decode", "encode", "codec", "utf", "unicode"]):
                last_error = e
                continue
            raise  # bare raise preserves the original traceback

    raise ValueError(
        f"Failed to read SAV file '{file_path}' with any attempted encoding.\n"
        f"Last error: {last_error}"
    )
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# Helper: non-string sequence check
|
|
134
|
+
def _is_non_str_seq(obj: Any) -> bool:
|
|
135
|
+
return isinstance(obj, Sequence) and not isinstance(obj, (str, bytes))
|
|
136
|
+
|
|
137
|
+
# Helper: choose a pandas fallback engine from file extension
|
|
138
|
+
def _pandas_engine_from_ext(ext: str) -> str | None:
|
|
139
|
+
# Best-effort guesser. Users can always override via `engine=...`.
|
|
140
|
+
if ext in {".xlsx", ".xlsm", ".xltx", ".xltm"}:
|
|
141
|
+
return "openpyxl"
|
|
142
|
+
if ext == ".xls":
|
|
143
|
+
return "xlrd"
|
|
144
|
+
if ext == ".xlsb":
|
|
145
|
+
return "pyxlsb"
|
|
146
|
+
if ext == ".ods":
|
|
147
|
+
return "odf"
|
|
148
|
+
return None
|
|
149
|
+
|
|
150
|
+
def read_excel(
    file_path: str | os.PathLike,
    output_format: str = "polars",
    sheet_name: str | int | Sequence[str | int] | None = 0,
    engine: str | None = None,
    **kwargs
) -> Any:
    """
    Read Excel files into the requested DataFrame format.

    Parameters
    ----------
    file_path : str or Path
        Path to the Excel file.
    output_format : {"pandas","polars","narwhals"}, default "polars"
        - "pandas": returns pandas DataFrame or dict[str, DataFrame]
        - "polars": returns polars DataFrame or dict[str, DataFrame]
        - "narwhals": reads via Polars with the same defaults, then converts
    sheet_name : str | int | list[str|int] | None, default 0
        Sheet selector(s).
        - str or list[str]: sheet names.
        - int or list[int]: **0-based indices** (mapped to Polars's 1-based sheet_id).
        - None: **read all sheets** (dict is returned).
    engine : str | None, default None
        Backend engine to use.
        - Polars: defaults to "calamine" with "openpyxl" then "xlsx2csv" fallback.
          Options: "calamine", "openpyxl", "xlsx2csv"
        - Pandas: auto-detected from the file extension when not given.
          Options: "openpyxl" (.xlsx), "xlrd" (.xls), "pyxlsb" (.xlsb), "odf" (.ods)
    **kwargs :
        Additional keyword args forwarded to the underlying read function.
        Available parameters differ between pandas and polars (e.g. pandas
        `header`/`nrows`/`usecols` vs. polars `has_header`/`columns`).
        Excel readers do not take an 'encoding' parameter.

    Returns
    -------
    DataFrame or dict[str, DataFrame]
        Depending on `output_format` and whether multiple sheets were requested.

    Raises
    ------
    ValueError
        If output_format is invalid, or `sheet_name` mixes ints and strings.

    Notes
    -----
    Integer sheet indices are **0-based** in this API, even though Polars uses
    1-based `sheet_id`; the adjustment happens automatically.
    """
    if output_format not in {"pandas", "polars", "narwhals"}:
        raise ValueError("output_format must be 'pandas', 'polars', or 'narwhals'.")

    # narwhals output is produced by reading through Polars, then converting.
    backend = "polars" if output_format == "narwhals" else output_format

    # Normalize sheet selection to a list for uniform handling below.
    sheet_is_seq = _is_non_str_seq(sheet_name)
    if sheet_is_seq:
        items = list(sheet_name)  # type: ignore[arg-type]
    else:
        items = [] if sheet_name is None else [sheet_name]

    # Mixed int/str sheet selectors are ambiguous -> reject early.
    if sheet_is_seq and items:
        first_type = type(items[0])
        if not all(isinstance(item, first_type) for item in items):
            raise ValueError(
                f"Mixed sheet types are not supported. Got: {[type(item).__name__ for item in items]}"
            )

    file_str = str(file_path)

    if backend == "polars":
        result = _read_excel_polars(file_str, sheet_name, sheet_is_seq, items, engine, kwargs)
    else:
        result = _read_excel_pandas(file_str, sheet_name, sheet_is_seq, items, engine, kwargs)

    # Convert to narwhals if requested (per-sheet when a dict came back).
    if output_format == "narwhals":
        if isinstance(result, dict):
            return {name: nw.from_native(df) for name, df in result.items()}
        return nw.from_native(result)
    return result


def _read_excel_polars(file_str, sheet_name, sheet_is_seq, items, engine, kwargs):
    """Read via polars.read_excel with calamine -> openpyxl -> xlsx2csv fallback."""
    import polars as pl

    polars_kwargs = dict(kwargs)

    # Polars has no `nrows` parameter; drop it rather than crash.
    polars_kwargs.pop("nrows", None)

    # Map sheet selection (0-based ints -> Polars's 1-based sheet_id).
    if sheet_name is None:
        polars_kwargs["sheet_name"] = None  # read all sheets -> dict
    elif sheet_is_seq:
        if all(isinstance(item, int) for item in items):
            polars_kwargs["sheet_id"] = [i + 1 for i in items]
        else:
            polars_kwargs["sheet_name"] = items
    elif isinstance(sheet_name, int):
        polars_kwargs["sheet_id"] = sheet_name + 1
    else:
        polars_kwargs["sheet_name"] = sheet_name

    # Engine handling: default to calamine; replace invalid choices with a warning.
    if engine is None:
        engine = "calamine"
    else:
        valid_polars_engines = {"calamine", "openpyxl", "xlsx2csv"}
        if engine not in valid_polars_engines:
            import warnings
            warnings.warn(
                f"Invalid engine '{engine}' for polars. "
                f"Valid options are: {valid_polars_engines}. "
                f"Defaulting to 'calamine'.",
                UserWarning
            )
            engine = "calamine"

    polars_kwargs["engine"] = engine

    try:
        return pl.read_excel(file_str, **polars_kwargs)
    except Exception as e:
        # Only the default calamine engine gets an automatic fallback chain.
        if engine == "calamine" and "calamine" in str(e).lower():
            logging.info("Calamine engine failed, falling back to openpyxl")
            polars_kwargs["engine"] = "openpyxl"
            try:
                return pl.read_excel(file_str, **polars_kwargs)
            except Exception:
                logging.info("Openpyxl engine failed, falling back to xlsx2csv")
                polars_kwargs["engine"] = "xlsx2csv"
                return pl.read_excel(file_str, **polars_kwargs)
        raise


def _read_excel_pandas(file_str, sheet_name, sheet_is_seq, items, engine, kwargs):
    """Read via pandas.read_excel with extension-based engine auto-detection."""
    import pandas as pd

    pandas_kwargs = dict(kwargs)

    # Sheet selection: pandas already uses 0-based indices, so names and
    # indices pass through unchanged.
    if sheet_name is None:
        pandas_kwargs["sheet_name"] = None
    elif sheet_is_seq:
        pandas_kwargs["sheet_name"] = items
    else:
        pandas_kwargs["sheet_name"] = sheet_name

    ext = os.path.splitext(file_str)[1].lower()
    if engine:
        pandas_kwargs["engine"] = engine
    else:
        detected = _pandas_engine_from_ext(ext)
        if detected:
            pandas_kwargs["engine"] = detected

    try:
        return pd.read_excel(file_str, **pandas_kwargs)
    except Exception as e:
        # One retry with the extension-derived engine when the failure looks
        # engine-related and a different engine is available.
        if "engine" in pandas_kwargs and "engine" in str(e).lower():
            fallback_engine = _pandas_engine_from_ext(ext)
            if fallback_engine and fallback_engine != pandas_kwargs.get("engine"):
                pandas_kwargs["engine"] = fallback_engine
                return pd.read_excel(file_str, **pandas_kwargs)
        raise
|
|
341
|
+
|
|
342
|
+
def _is_encoding_error(e: Exception) -> bool:
|
|
343
|
+
"""Check if an exception is encoding-related."""
|
|
344
|
+
error_str = str(e).lower()
|
|
345
|
+
error_type = type(e).__name__.lower()
|
|
346
|
+
|
|
347
|
+
encoding_indicators = [
|
|
348
|
+
"encode", "decode", "codec", "utf", "unicode",
|
|
349
|
+
"charmap", "ascii", "latin", "gbk", "big5"
|
|
350
|
+
]
|
|
351
|
+
|
|
352
|
+
return any(ind in error_str or ind in error_type for ind in encoding_indicators)
|
|
353
|
+
|
|
354
|
+
def _normalize_sep_kwargs(kwargs: dict[str, Any], backend: str) -> dict[str, Any]:
|
|
355
|
+
"""
|
|
356
|
+
Normalize delimiter-related kwargs for different backends.
|
|
357
|
+
- pandas: uses `sep`
|
|
358
|
+
- polars: uses `separator`
|
|
359
|
+
"""
|
|
360
|
+
out = dict(kwargs)
|
|
361
|
+
|
|
362
|
+
if backend == "polars":
|
|
363
|
+
# Prefer explicit 'separator' if provided, else map common pandas spellings
|
|
364
|
+
if "separator" not in out:
|
|
365
|
+
if "sep" in out:
|
|
366
|
+
out["separator"] = out.pop("sep")
|
|
367
|
+
elif "delimiter" in out:
|
|
368
|
+
out["separator"] = out.pop("delimiter")
|
|
369
|
+
else: # backend == "pandas"
|
|
370
|
+
# Prefer explicit 'sep' if provided, else map 'separator'/'delimiter'
|
|
371
|
+
if "sep" not in out:
|
|
372
|
+
if "separator" in out:
|
|
373
|
+
out["sep"] = out.pop("separator")
|
|
374
|
+
elif "delimiter" in out:
|
|
375
|
+
out["sep"] = out.pop("delimiter")
|
|
376
|
+
|
|
377
|
+
return out
|
|
378
|
+
|
|
379
|
+
def _pick_backend_for_narwhals_preference(prefer: str = "polars") -> str:
|
|
380
|
+
"""
|
|
381
|
+
Choose a backend module name for Narwhals ('polars' or 'pandas').
|
|
382
|
+
If the preferred backend isn't importable, fall back to the other.
|
|
383
|
+
"""
|
|
384
|
+
if prefer == "polars":
|
|
385
|
+
try:
|
|
386
|
+
import polars # noqa: F401
|
|
387
|
+
return "polars"
|
|
388
|
+
except Exception:
|
|
389
|
+
pass
|
|
390
|
+
try:
|
|
391
|
+
import pandas # noqa: F401
|
|
392
|
+
return "pandas"
|
|
393
|
+
except Exception:
|
|
394
|
+
raise RuntimeError(
|
|
395
|
+
"Neither Polars nor pandas is installed. Please install one of them."
|
|
396
|
+
)
|
|
397
|
+
else: # prefer pandas
|
|
398
|
+
try:
|
|
399
|
+
import pandas # noqa: F401
|
|
400
|
+
return "pandas"
|
|
401
|
+
except Exception:
|
|
402
|
+
pass
|
|
403
|
+
try:
|
|
404
|
+
import polars # noqa: F401
|
|
405
|
+
return "polars"
|
|
406
|
+
except Exception:
|
|
407
|
+
raise RuntimeError(
|
|
408
|
+
"Neither pandas nor Polars is installed. Please install one of them."
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
def read_csv(
    file_path: str | os.PathLike,
    output_format: str = "polars",
    encoding: str | None = None,
    auto_detect_encoding: bool = True,
    **kwargs
) -> Any:
    """
    Read CSV files into the requested DataFrame format using Narwhals under the hood.

    Parameters
    ----------
    file_path : str or Path
        Path to the CSV file.
    output_format : {"polars","pandas","narwhals"}, default "polars"
        - "polars": returns a polars.DataFrame
        - "pandas": returns a pandas.DataFrame
        - "narwhals": returns a narwhals.DataFrame (backed by a native df)
    encoding : str, optional
        Encoding hint passed to the native reader. If None and
        auto_detect_encoding=True, we'll try several encodings.
    auto_detect_encoding : bool, default True
        If True and `encoding` is None, attempt multiple encodings.
    **kwargs :
        Extra keywords forwarded to the native CSV reader (through Narwhals).
        These may be backend-specific (e.g., pandas uses `sep`, Polars uses
        `separator`). Only delimiter args (`sep`/`delimiter`<->`separator`)
        are normalized automatically.

    Returns
    -------
    DataFrame
        pandas.DataFrame, polars.DataFrame, or narwhals.DataFrame according to
        `output_format`.

    Raises
    ------
    ValueError
        If output_format is invalid, or no attempted encoding could read the file.

    Notes
    -----
    Polars's CSV reader is UTF-8-first; for non-UTF encodings pandas tends to
    be more permissive, so pandas is used as a fallback during autodetection
    (the result is converted to polars if that was the requested output).
    """
    if output_format not in {"polars", "pandas", "narwhals"}:
        raise ValueError("output_format must be 'polars', 'pandas', or 'narwhals'.")

    # Decide the first backend to try based on the requested output.
    if output_format == "narwhals":
        # Prefer polars for performance; fall back to pandas if not installed.
        preferred_backend = _pick_backend_for_narwhals_preference("polars")
    else:
        preferred_backend = output_format

    def _read_once(backend: str, enc: str | None) -> Any:
        # Normalize delimiter kwargs for this backend, then read via Narwhals.
        k = _normalize_sep_kwargs(kwargs, backend=backend)
        if enc:
            k["encoding"] = enc
        return nw.read_csv(str(file_path), backend=backend, **k)  # narwhals DF

    def _finalize(df_nw: Any) -> Any:
        # Single conversion point for all success paths (previously this logic
        # was duplicated four times).
        if output_format == "narwhals":
            return df_nw
        native = nw.to_native(df_nw)  # pandas or polars depending on backend
        if output_format == "polars" and not type(native).__module__.startswith("polars"):
            # A pandas fallback was used; convert to the requested polars frame.
            import polars as pl
            return pl.from_pandas(native)
        return native

    # Fast path: explicit encoding, or autodetect disabled -> a single attempt,
    # with one pandas retry when polars fails on an encoding problem.
    if (encoding is not None) or (not auto_detect_encoding):
        try:
            return _finalize(_read_once(preferred_backend, encoding))
        except Exception as e:
            if _is_encoding_error(e) and preferred_backend == "polars":
                return _finalize(_read_once("pandas", encoding))
            raise

    # Autodetect strategy:
    #   1) preferred backend with its native default encoding,
    #   2) each COMMON_ENCODINGS entry on the preferred backend, retrying on
    #      pandas (broader codec support) when polars fails on encoding,
    #   3) give up with a ValueError carrying the last error.
    last_err: Exception | None = None
    try:
        return _finalize(_read_once(preferred_backend, None))
    except Exception as e:
        last_err = e
        if not _is_encoding_error(e):
            # Not encoding-related -> propagate immediately.
            raise

    for enc in (e for e in COMMON_ENCODINGS if e is not None):
        try:
            df_nw = _read_once(preferred_backend, enc)
            logging.info(f"read_csv succeeded with backend={preferred_backend}, encoding={enc}")
            return _finalize(df_nw)
        except Exception as e:
            last_err = e
            if _is_encoding_error(e) and preferred_backend == "polars":
                try:
                    df_nw = _read_once("pandas", enc)
                    logging.info(f"read_csv succeeded with backend=pandas, encoding={enc}")
                    return _finalize(df_nw)
                except Exception as e2:
                    last_err = e2
                    continue
            if not _is_encoding_error(e):
                # Non-encoding error -> raise early.
                raise

    raise ValueError(
        f"Failed to read CSV file '{file_path}' with any attempted encoding. "
        f"Last error: {last_err}"
    )
|
ultrasav/_write_files.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""
|
|
2
|
+
write_functions.py
|
|
3
|
+
v0.1.0
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import pyreadstat
|
|
9
|
+
|
|
10
|
+
# Module-level logger (stdlib convention: one logger per module, named after it).
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def write_sav(data,
              meta,
              dst_path: str | Path,
              **overrides) -> None:
    """
    Write data and metadata to a SPSS SAV file.

    This is the convergence point where independent data and metadata objects
    reunite. Only columns that exist in the data will have their metadata
    written, regardless of what metadata exists for other columns.

    Parameters
    ----------
    data : The data to write, pandas.DataFrame, or polars.DataFrame
        A narwhals-wrapped frame (anything exposing ``to_native()``) is
        unwrapped automatically.
    meta : Metadata, metadata_container, or None
        Metadata object containing labels, formats, etc. Can be:
        - A Metadata object (from this package)
        - A pyreadstat metadata_container (will be auto-wrapped)
        - None for minimal metadata (no labels, formats, etc.)
    dst_path : str or Path
        Path where the SAV file will be written.
    **overrides : keyword arguments
        Optional overrides for metadata settings. Common overrides:
        - compress (bool): If True, creates a compressed ZSAV file
        - row_compress (bool): If True, uses row compression
        Note: compress and row_compress cannot both be True.
        Any override temporarily replaces the meta object's setting during write.

    Raises
    ------
    ValueError
        If both 'compress' and 'row_compress' are truthy.

    Notes
    -----
    The function follows the tidyspss two-track architecture where data and
    metadata work independently and only converge at write time. Metadata for
    columns that don't exist in the data will be silently ignored by pyreadstat.

    Examples
    --------
    >>> # Basic write with data and metadata
    >>> write_sav(data, meta, "output.sav")

    >>> # Write without metadata (minimal SPSS file)
    >>> write_sav(df, dst_path="minimal.sav")

    >>> # Write with compression override
    >>> write_sav(data, meta, dst_path="output.zsav", compress=True)
    """
    # Unwrap narwhals (or similar) frames to the native dataframe.
    if hasattr(data, 'to_native'):
        df = data.to_native()
    else:
        df = data

    # Handle metadata
    if meta is not None:
        # Check if it's already a Metadata object or needs wrapping
        if not hasattr(meta, 'get_write_params'):
            # It's likely a pyreadstat metadata_container, wrap it.
            # BUGFIX: the Metadata class lives in _metadata.py; the previous
            # import path ('.class_metadata') does not exist in this package.
            from ._metadata import Metadata
            meta = Metadata(meta)

        # Apply temporary overrides to the metadata object, remembering the
        # originals so the caller's meta is left untouched.
        originals = {}
        for key, value in overrides.items():
            if value is not None and hasattr(meta, key):
                originals[key] = getattr(meta, key)
                setattr(meta, key, value)

        try:
            # Get write parameters from metadata while overrides are in effect.
            write_params = meta.get_write_params()
        finally:
            # Restore original settings even if get_write_params() raised.
            for key, value in originals.items():
                setattr(meta, key, value)

        # Extract compression settings from write params
        final_compress = write_params.pop('compress', False)
        final_row_compress = write_params.pop('row_compress', False)
    else:
        # No metadata - use minimal parameters with overrides
        write_params = {}
        final_compress = overrides.get('compress', False)
        final_row_compress = overrides.get('row_compress', False)

    # Validate compression settings (mutually exclusive in pyreadstat).
    if final_compress and final_row_compress:
        raise ValueError("Both 'compress' and 'row_compress' cannot be True at the same time")

    # Convert path to string for pyreadstat.
    dst_path = str(dst_path)

    # Write the file
    pyreadstat.write_sav(
        df=df,
        dst_path=dst_path,
        compress=final_compress,
        row_compress=final_row_compress,
        **write_params
    )

    logger.info(f"SPSS file saved successfully to {dst_path}")
|