pyreadstat 1.2.9__tar.gz → 1.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyreadstat might be problematic. Click here for more details.
- {pyreadstat-1.2.9/pyreadstat.egg-info → pyreadstat-1.3.1}/PKG-INFO +18 -7
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/README.md +42 -35
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/__init__.py +1 -1
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/_readstat_parser.c +11089 -10246
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/_readstat_parser.pxd +5 -2
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/_readstat_parser.pyx +157 -51
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/_readstat_writer.c +15418 -15802
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/_readstat_writer.pxd +12 -5
- pyreadstat-1.3.1/pyreadstat/_readstat_writer.pyx +986 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/pyfunctions.py +38 -12
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/pyreadstat.c +9281 -8804
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/pyreadstat.pyx +71 -70
- {pyreadstat-1.2.9 → pyreadstat-1.3.1/pyreadstat.egg-info}/PKG-INFO +18 -7
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat.egg-info/SOURCES.txt +44 -0
- pyreadstat-1.3.1/pyreadstat.egg-info/requires.txt +2 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/setup.py +30 -8
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_sas.c +7 -2
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_sas7bcat_read.c +12 -2
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_sas7bcat_write.c +8 -2
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_sas7bdat_read.c +13 -5
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_por_read.c +1 -2
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_por_write.c +2 -3
- pyreadstat-1.3.1/src/spss/readstat_sav_parse.c +843 -0
- pyreadstat-1.3.1/src/spss/readstat_sav_parse_mr_name.c +546 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_sav_read.c +9 -12
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/tests/test_basic.py +3 -3
- pyreadstat-1.3.1/tests/test_narwhalified.py +1323 -0
- pyreadstat-1.2.9/pyreadstat.egg-info/requires.txt +0 -1
- pyreadstat-1.2.9/src/spss/readstat_sav_parse.c +0 -872
- pyreadstat-1.2.9/src/spss/readstat_sav_parse_mr_name.c +0 -468
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/LICENSE +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/MANIFEST.in +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyproject.toml +0 -0
- /pyreadstat-1.2.9/pyreadstat/_readstat_writer.pyx → /pyreadstat-1.3.1/pyreadstat/_readstat_writer_pandas.pyx +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/conditional_includes.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/pyreadstat.pxd +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/readstat_api.pxd +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat/worker.py +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat.egg-info/dependency_links.txt +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/pyreadstat.egg-info/top_level.txt +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/setup.cfg +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/CKHashTable.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/CKHashTable.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_bits.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_bits.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_convert.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_convert.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_error.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_iconv.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_io_unistd.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_io_unistd.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_malloc.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_malloc.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_metadata.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_parser.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_strings.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_value.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_variable.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_writer.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/readstat_writer.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/ieee.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/ieee.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_sas.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_sas7bdat_write.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_sas_rle.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_sas_rle.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_xport.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_xport.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_xport_parse_format.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_xport_parse_format.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_xport_read.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/sas/readstat_xport_write.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_por.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_por.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_por_parse.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_por_parse.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_sav.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_sav.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_sav_compress.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_sav_compress.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_sav_parse.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_sav_parse_mr_name.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_sav_parse_timestamp.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_sav_parse_timestamp.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_sav_write.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_spss.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_spss.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_spss_parse.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_spss_parse.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_zsav_compress.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_zsav_compress.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_zsav_read.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_zsav_read.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_zsav_write.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/spss/readstat_zsav_write.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/stata/readstat_dta.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/stata/readstat_dta.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/stata/readstat_dta_parse_timestamp.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/stata/readstat_dta_parse_timestamp.h +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/stata/readstat_dta_read.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/src/stata/readstat_dta_write.c +0 -0
- {pyreadstat-1.2.9 → pyreadstat-1.3.1}/tests/test_version.py +0 -0
|
@@ -1,25 +1,36 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: pyreadstat
|
|
3
|
-
Version: 1.2.9
|
|
4
|
-
Summary: Reads and Writes SAS, SPSS and Stata files into/from pandas data frames.
|
|
3
|
+
Version: 1.3.1
|
|
4
|
+
Summary: Reads and Writes SAS, SPSS and Stata files into/from pandas and polars data frames.
|
|
5
5
|
Home-page: https://github.com/Roche/pyreadstat
|
|
6
6
|
Download-URL: https://github.com/Roche/pyreadstat/dist
|
|
7
7
|
Author: Otto Fajardo
|
|
8
8
|
Author-email: pleasecontactviagithub@notvalid.com
|
|
9
|
-
License: Apache
|
|
9
|
+
License: Apache-2.0
|
|
10
10
|
Classifier: Programming Language :: Python
|
|
11
11
|
Classifier: Programming Language :: Cython
|
|
12
12
|
Classifier: Programming Language :: C
|
|
13
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
13
|
Classifier: Intended Audience :: Science/Research
|
|
15
14
|
Classifier: Topic :: Scientific/Engineering
|
|
16
15
|
Classifier: Environment :: Console
|
|
17
16
|
Description-Content-Type: text/markdown
|
|
18
17
|
License-File: LICENSE
|
|
19
|
-
Requires-Dist:
|
|
18
|
+
Requires-Dist: narwhals>=2.0
|
|
19
|
+
Requires-Dist: numpy
|
|
20
|
+
Dynamic: author
|
|
21
|
+
Dynamic: author-email
|
|
22
|
+
Dynamic: classifier
|
|
23
|
+
Dynamic: description
|
|
24
|
+
Dynamic: description-content-type
|
|
25
|
+
Dynamic: download-url
|
|
26
|
+
Dynamic: home-page
|
|
27
|
+
Dynamic: license
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
Dynamic: requires-dist
|
|
30
|
+
Dynamic: summary
|
|
20
31
|
|
|
21
32
|
A Python package to read and write SAS
|
|
22
|
-
(sas7bdat, sas7bcat, xport/xpt), SPSS (sav, zsav, por) and Stata (dta) files into/from pandas data frames. It is a wrapper
|
|
33
|
+
(sas7bdat, sas7bcat, xport/xpt), SPSS (sav, zsav, por) and Stata (dta) files into/from pandas and polars data frames. It is a wrapper
|
|
23
34
|
around the C library readstat.<br>
|
|
24
35
|
Please visit our project home page for more information:<br>
|
|
25
36
|
https://github.com/Roche/pyreadstat
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# pyreadstat
|
|
2
2
|
|
|
3
3
|
A python package to read and write sas (sas7bdat, sas7bcat, xport), spss (sav, zsav, por) and stata (dta) data files
|
|
4
|
-
into/from pandas dataframes.
|
|
4
|
+
into/from pandas and polars dataframes.
|
|
5
5
|
<br>
|
|
6
6
|
|
|
7
7
|
This module is a wrapper around the excellent [Readstat](https://github.com/WizardMac/ReadStat) C library by
|
|
@@ -133,7 +133,8 @@ brings a big hit in performance. The situation can be improved tough by reading
|
|
|
133
133
|
|
|
134
134
|
## Dependencies
|
|
135
135
|
|
|
136
|
-
The module depends on
|
|
136
|
+
The module depends on numpy and narwhals, a package to interface with pandas and polars. In addition you will need to have installed
|
|
137
|
+
either pandas or polars.
|
|
137
138
|
|
|
138
139
|
In order to compile from source you will need a C compiler (see installation).
|
|
139
140
|
Only if you want to do changes to the cython source code, you will need cython (normally not necessary).
|
|
@@ -222,7 +223,7 @@ the folder build, otherwise you may be installing the old compilation again).
|
|
|
222
223
|
|
|
223
224
|
#### Reading files
|
|
224
225
|
|
|
225
|
-
Pass the path to a file to any of the functions provided by pyreadstat. It will return a pandas data frame and a metadata
|
|
226
|
+
Pass the path to a file to any of the functions provided by pyreadstat. It will return a pandas or polars data frame and a metadata
|
|
226
227
|
object. <br>
|
|
227
228
|
The dataframe uses the column names. The metadata object contains the column names, column labels, number_rows,
|
|
228
229
|
number_columns, file label
|
|
@@ -234,7 +235,8 @@ For example, in order to read a sas7bdat file:
|
|
|
234
235
|
```python
|
|
235
236
|
import pyreadstat
|
|
236
237
|
|
|
237
|
-
|
|
238
|
+
# output format by default is pandas. You can use polars to get a polars dataframe.
|
|
239
|
+
df, meta = pyreadstat.read_sas7bdat('/path/to/a/file.sas7bdat', output_format="pandas")
|
|
238
240
|
|
|
239
241
|
# done! let's see what we got
|
|
240
242
|
print(df.head())
|
|
@@ -257,25 +259,38 @@ df.columns = meta.column_labels
|
|
|
257
259
|
df.columns = meta.column_names
|
|
258
260
|
```
|
|
259
261
|
|
|
262
|
+
As mentioned before you can very easily read into a polars dataframe by using the output_format argument:
|
|
263
|
+
|
|
264
|
+
```python
|
|
265
|
+
import pyreadstat
|
|
266
|
+
|
|
267
|
+
# this time df will be polars
|
|
268
|
+
df, meta = pyreadstat.read_sas7bdat('/path/to/a/file.sas7bdat', output_format="polars")
|
|
269
|
+
|
|
270
|
+
# done! let's see what we got
|
|
271
|
+
print(df.head())
|
|
272
|
+
```
|
|
273
|
+
|
|
260
274
|
#### Writing files
|
|
261
275
|
|
|
262
276
|
Pyreadstat can write STATA (dta), SPSS (sav and zsav, por currently not supported) and SAS (Xport, sas7bdat and sas7bcat
|
|
263
|
-
currently not supported) files from pandas
|
|
277
|
+
currently not supported) files from pandas or polars dataframes.
|
|
264
278
|
|
|
265
|
-
write functions take as first argument a pandas
|
|
279
|
+
write functions take as first argument a pandas or polars dataframe (other data structures are not supported), as a second argument
|
|
266
280
|
the path to the destination file. Optionally you can also pass a file label and a list with column labels.
|
|
267
281
|
|
|
268
282
|
```python
|
|
269
283
|
import pandas as pd
|
|
270
284
|
import pyreadstat
|
|
271
285
|
|
|
286
|
+
# this would work the same for a polars dataframe
|
|
272
287
|
df = pd.DataFrame([[1,2.0,"a"],[3,4.0,"b"]], columns=["v1", "v2", "v3"])
|
|
273
288
|
# column_labels can also be a dictionary with variable name as key and label as value
|
|
274
289
|
column_labels = ["Variable 1", "Variable 2", "Variable 3"]
|
|
275
290
|
pyreadstat.write_sav(df, "path/to/destination.sav", file_label="test", column_labels=column_labels)
|
|
276
291
|
```
|
|
277
292
|
|
|
278
|
-
Some special arguments are available depending on the function. write_sav can take also notes as string, whether to
|
|
293
|
+
Some special arguments are available depending on the function. write_sav can take also notes as string or list of strings, whether to
|
|
279
294
|
compress or not as zsav or apply row compression, variable display widths and variable measures. write_dta can take a stata version.
|
|
280
295
|
write_xport a name for the dataset. User defined missing values and value labels are also supported. See the
|
|
281
296
|
[Module documentation](https://ofajardo.github.io/pyreadstat_documentation/_build/html/index.html) for more details.
|
|
@@ -434,7 +449,7 @@ function. The original values will be replaced by the values in the catalog.
|
|
|
434
449
|
```python
|
|
435
450
|
import pyreadstat
|
|
436
451
|
|
|
437
|
-
# formats_as_category is by default True, and it means the replaced values will be transformed to a pandas category column. There is also formats_as_ordered_category to get an ordered category, this by default is False.
|
|
452
|
+
# formats_as_category is by default True, and it means the replaced values will be transformed to a pandas/polars category column. There is also formats_as_ordered_category to get an ordered category, this by default is False.
|
|
438
453
|
df, meta = pyreadstat.read_sas7bdat('/path/to/a/file.sas7bdat', catalog_file='/path/to/a/file.sas7bcat', formats_as_category=True, formats_as_ordered_category=False)
|
|
439
454
|
```
|
|
440
455
|
|
|
@@ -449,7 +464,7 @@ df, meta = pyreadstat.read_sas7bdat('/path/to/a/file.sas7bdat')
|
|
|
449
464
|
# read_sas7bdat returns an empty data frame and the catalog
|
|
450
465
|
df_empty, catalog = pyreadstat.read_sas7bdat('/path/to/a/file.sas7bcat')
|
|
451
466
|
# enrich the dataframe with the catalog
|
|
452
|
-
# formats_as_category is by default True, and it means the replaced values will be transformed to a pandas category column. formats_as_ordered_category is by default False meaning by default categories are not ordered.
|
|
467
|
+
# formats_as_category is by default True, and it means the replaced values will be transformed to a pandas/polars category column. formats_as_ordered_category is by default False meaning by default categories are not ordered.
|
|
453
468
|
df_enriched, meta_enriched = pyreadstat.set_catalog_to_sas(df, meta, catalog,
|
|
454
469
|
formats_as_category=True, formats_as_ordered_category=False)
|
|
455
470
|
```
|
|
@@ -461,7 +476,7 @@ when reading the file using the option apply_value_formats, ...
|
|
|
461
476
|
import pyreadstat
|
|
462
477
|
|
|
463
478
|
# apply_value_formats is by default False, so you have to set it to True manually if you want the labels
|
|
464
|
-
# formats_as_category is by default True, and it means the replaced values will be transformed to a pandas category column. formats_as_ordered_category is by default False meaning by default categories are not ordered.
|
|
479
|
+
# formats_as_category is by default True, and it means the replaced values will be transformed to a pandas/polars category column. formats_as_ordered_category is by default False meaning by default categories are not ordered.
|
|
465
480
|
df, meta = pyreadstat.read_sav("/path/to/sav/file.sav", apply_value_formats=True,
|
|
466
481
|
formats_as_category=True, formats_as_ordered_category=False)
|
|
467
482
|
```
|
|
@@ -530,9 +545,9 @@ example if one has a categorical variable representing if the person passed a te
|
|
|
530
545
|
1 for pass, and as user defined missing variables 2 for did not show up for the test, 3 for unable to process the results,
|
|
531
546
|
etc.
|
|
532
547
|
|
|
533
|
-
**By default both cases are represented by NaN when
|
|
548
|
+
**By default both cases are represented by NaN in pandas and null in polars when
|
|
534
549
|
read with pyreadstat**. Notice that the only possible missing value in pandas is NaN (Not a Number) for both string and numeric
|
|
535
|
-
variables, date, datetime and time variables have NaT (Not a Time).
|
|
550
|
+
variables, date, datetime and time variables have NaT (Not a Time). Polars uses null for all data types.
|
|
536
551
|
|
|
537
552
|
##### SPSS
|
|
538
553
|
|
|
@@ -599,16 +614,16 @@ translated as NaN by default and to the correspoding string value if
|
|
|
599
614
|
user_missing is set to True. meta.missing_ranges will show the string
|
|
600
615
|
value as well.
|
|
601
616
|
|
|
602
|
-
When writing a
|
|
617
|
+
When writing a dataframe to a sav file, if user defined missing values are not set, NaNs are translated to
|
|
603
618
|
empty strings, as there is no other possibility to represent those missing values and user defined missing values
|
|
604
619
|
are not set automatically.
|
|
605
620
|
|
|
606
|
-
When reading a sav into a
|
|
607
|
-
a character variable is an empty string (''), it will not be translated to NaN, but will stay as an empty string. This
|
|
621
|
+
When reading a sav into a dataframe, if the value in
|
|
622
|
+
a character variable is an empty string (''), it will not be translated to NaN/null, but will stay as an empty string. This
|
|
608
623
|
is because the empty string is a valid character value in SPSS and pyreadstat preserves that property.
|
|
609
624
|
|
|
610
625
|
This behaviour generates an asymmetrical situation that has to be managed by the user. You can convert
|
|
611
|
-
empty strings to nan very easily
|
|
626
|
+
empty strings to nan very easily if you think it is appropriate
|
|
612
627
|
for your dataset, or you can use defined missing values as described before.
|
|
613
628
|
|
|
614
629
|
|
|
@@ -700,7 +715,7 @@ df, meta = pyreadstat.read_sas7bdat('/path/to/a/file.sas7bdat', encoding="LATIN1
|
|
|
700
715
|
```
|
|
701
716
|
|
|
702
717
|
You can preserve the original pandas behavior regarding dates (meaning dates are converted to pandas datetime) with the
|
|
703
|
-
dates_as_pandas_datetime option
|
|
718
|
+
dates_as_pandas_datetime option. This option is effective for pandas only, not for polars.
|
|
704
719
|
|
|
705
720
|
```python
|
|
706
721
|
import pyreadstat
|
|
@@ -708,18 +723,10 @@ import pyreadstat
|
|
|
708
723
|
df, meta = pyreadstat.read_sas7bdat('/path/to/a/file.sas7bdat', dates_as_pandas_datetime=True)
|
|
709
724
|
```
|
|
710
725
|
|
|
711
|
-
You can get a dictionary of numpy arrays instead of a pandas dataframe when reading any file format.
|
|
712
|
-
In order to do that, set the parameter output_format='dict' (default is 'pandas'). This is useful if
|
|
713
|
-
you want to transform the data to some other format different to pandas, as transforming the data to pandas is a costly
|
|
714
|
-
process both in terms of speed and memory.
|
|
715
|
-
|
|
716
|
-
```python
|
|
717
|
-
import pyreadstat
|
|
718
|
-
import polars
|
|
719
|
-
|
|
720
|
-
dicdata, meta = pyreadstat.read_sav('/path/to/a/file.sav', output_format='dict')
|
|
721
|
-
df = polars.DataFrame(dicdata)
|
|
722
|
-
```
|
|
726
|
+
You can get a dictionary of numpy arrays instead of a pandas or polars dataframe when reading any file format.
|
|
727
|
+
In order to do that, set the parameter output_format='dict' (default is 'pandas', the other option is 'polars'). This is useful if
|
|
728
|
+
you want to transform the data to some other format different to pandas/polars, as transforming the data to pandas is a costly
|
|
729
|
+
process both in terms of speed and memory.
|
|
723
730
|
|
|
724
731
|
For more information, please check the [Module documentation](https://ofajardo.github.io/pyreadstat_documentation/_build/html/index.html).
|
|
725
732
|
|
|
@@ -727,7 +734,7 @@ For more information, please check the [Module documentation](https://ofajardo.g
|
|
|
727
734
|
|
|
728
735
|
#### File specific options
|
|
729
736
|
|
|
730
|
-
Some special arguments are available depending on the function. write_sav can take also notes as string, whether to
|
|
737
|
+
Some special arguments are available depending on the function. write_sav can take also notes as string or list of strings, whether to
|
|
731
738
|
compress or not as zsav or apply row compression, variable display widths and variable measures. write_dta can take a stata version.
|
|
732
739
|
write_xport a name for the dataset. See the
|
|
733
740
|
[Module documentation](https://ofajardo.github.io/pyreadstat_documentation/_build/html/index.html) for more details.
|
|
@@ -735,7 +742,7 @@ write_xport a name for the dataset. See the
|
|
|
735
742
|
#### Writing value labels
|
|
736
743
|
|
|
737
744
|
The argument variable_value_labels can be passed to write_sav and write_dta to write value labels. This argument must be a
|
|
738
|
-
dictionary where keys are variable names (names must match column names in the
|
|
745
|
+
dictionary where keys are variable names (names must match column names in the dataframe). Values are another dictionary where
|
|
739
746
|
keys are the value present in the dataframe and values are the labels (strings).
|
|
740
747
|
|
|
741
748
|
```python
|
|
@@ -812,7 +819,7 @@ for the documentation of the original application.
|
|
|
812
819
|
In the case of SPSS we have some presets for some formats:
|
|
813
820
|
* restricted_integer: with leading zeros, equivalent to N + variable width (e.g N4)
|
|
814
821
|
* integer: Numeric with no decimal places, equivalent to F + variable width + ".0" (0 decimal positions). A
|
|
815
|
-
|
|
822
|
+
column of type integer will also be translated into this format automatically.
|
|
816
823
|
|
|
817
824
|
```python
|
|
818
825
|
import pandas as pd
|
|
@@ -828,12 +835,12 @@ There is some information about the possible formats [here](https://www.gnu.org/
|
|
|
828
835
|
|
|
829
836
|
#### Variable type conversion
|
|
830
837
|
|
|
831
|
-
The following rules are used in order to convert from pandas/numpy/python types to the target file types:
|
|
838
|
+
The following rules are used in order to convert from pandas/polars/numpy/python types to the target file types:
|
|
832
839
|
|
|
833
840
|
| Python Type | Converted Type |
|
|
834
841
|
| ------------------- | --------- |
|
|
835
|
-
| np.int32 or lower | integer (stata), numeric (spss, sas) |
|
|
836
|
-
| int, np.int64, np.float | double (stata), numeric (spss, sas) |
|
|
842
|
+
| np.int32/pl.Int32 or lower | integer (stata), numeric (spss, sas) |
|
|
843
|
+
| int, np.int64, pl.Int64, np.float, pl.Float64 | double (stata), numeric (spss, sas) |
|
|
837
844
|
| str | character |
|
|
838
845
|
| bool | integer (stata), numeric (spss, sas) |
|
|
839
846
|
| datetime, date, time | numeric with datetime/date/time formatting |
|