sas2parquet 0.1.8__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sas2parquet-0.2.0/PKG-INFO +154 -0
- sas2parquet-0.2.0/README.md +122 -0
- sas2parquet-0.2.0/pyproject.toml +48 -0
- sas2parquet-0.2.0/src/sas2parquet/cli.py +114 -0
- {sas2parquet-0.1.8 → sas2parquet-0.2.0}/src/sas2parquet/convert.py +61 -53
- sas2parquet-0.1.8/PKG-INFO +0 -135
- sas2parquet-0.1.8/README.md +0 -104
- sas2parquet-0.1.8/pyproject.toml +0 -37
- sas2parquet-0.1.8/src/sas2parquet/cli.py +0 -36
- {sas2parquet-0.1.8 → sas2parquet-0.2.0}/LICENSE +0 -0
- {sas2parquet-0.1.8 → sas2parquet-0.2.0}/src/sas2parquet/__init__.py +0 -0
sas2parquet-0.2.0/PKG-INFO
ADDED
@@ -0,0 +1,154 @@
+Metadata-Version: 2.4
+Name: sas2parquet
+Version: 0.2.0
+Summary: SAS → Parquet Hybrid Converter & Validator
+License: MIT
+License-File: LICENSE
+Keywords: sas,parquet,etl,data,pyarrow
+Author: Zaman Ziabakhshganji
+Author-email: zaman.ganji@gmail.com
+Requires-Python: >=3.11,<4.0
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Requires-Dist: narwhals (==2.13.0)
+Requires-Dist: numpy (==2.3.5)
+Requires-Dist: pandas (==2.3.3)
+Requires-Dist: polars (==1.36.1)
+Requires-Dist: pyarrow (==22.0.0)
+Requires-Dist: pyreadstat (==1.3.2)
+Requires-Dist: python-dateutil (==2.9.0.post0)
+Requires-Dist: pytz (==2025.2)
+Requires-Dist: requests (>=2.32.5,<3.0.0)
+Requires-Dist: six (==1.17.0)
+Requires-Dist: tzdata (==2025.2)
+Project-URL: Homepage, https://github.com/<you>/<repo>
+Project-URL: Repository, https://github.com/<you>/<repo>
+Description-Content-Type: text/markdown
+
+# sas2parquet
+
+[](https://pypi.org/project/sas2parquet/)
+[](https://pypi.org/project/sas2parquet/)
+[](LICENCE)
+
+**The ultimate SAS (.sas7bdat) to Parquet converter** — built to handle files that fail with standard tools.
+
+`sas2parquet` automatically detects encodings, repairs schemas, infers correct data types, and performs **pixel-perfect validation** between SAS and Parquet outputs.
+
+---
+
+## ✨ Features
+
+| Feature | Description |
+|-------|-------------|
+| 🔄 **Auto Encoding** | Detects UTF-8, Latin1, CP1252 from metadata or fallback |
+| 🧠 **Smart Types** | Infers datetime, numeric, string with 20+ retry strategies |
+| ✅ **Validation** | Chunk-by-chunk comparison (metadata, counts, values) |
+| 📊 **Memory Safe** | Chunked processing (96GB RAM optimized, configurable) |
+| 💾 **ZSTD Compression** | Level-6 ZSTD for efficient Parquet storage |
+| 📝 **Detailed Logs** | Full conversion trace + mismatch reports |
+| 🎯 **Two Modes** | Single file or recursive directory processing |
+
+---
+
+## 🚀 Quick Start
+
+### Install
+```bash
+pip install sas2parquet
+```
+
+---
+
+## ✅ Usage
+
+### Convert a directory (recommended)
+
+```bash
+sas2parquet path/to/sasdata/
+```
+
+- Converts **all `.sas7bdat` files recursively**
+- Creates `parquetdata/` and `logging/` next to `sasdata/`
+
+---
+
+### Convert a single file
+
+```bash
+sas2parquet path/to/file.sas7bdat
+```
+
+Output (default):
+```text
+path/to/file.parquet
+```
+
+---
+
+### Specify output location
+
+#### Directory mode — custom output directory
+```bash
+sas2parquet path/to/sasdata/ --out path/to/parquetdata/
+```
+
+#### File mode — custom output file
+```bash
+sas2parquet path/to/file.sas7bdat --out path/to/output.parquet
+```
+
+---
+
+### Custom log directory (directory mode)
+
+```bash
+sas2parquet path/to/sasdata/ --log-dir path/to/logs/
+```
+
+---
+
+## 📁 Directory Mode Behavior
+
+```text
+your-project/
+├── sasdata/
+│   ├── file1.sas7bdat
+│   └── subfolder/
+│       └── nested.sas7bdat
+├── parquetdata/
+│   ├── file1.parquet
+│   └── subfolder_parquet/
+│       └── nested.parquet
+└── logging/
+    └── conversion_20260205_1145.log
+```
+
+---
+
+## 🛠️ CLI Reference
+
+```bash
+sas2parquet --help
+```
+
+---
+
+## ⚙️ Configuration (Advanced)
+
+Edit constants in:
+
+```text
+src/sas2parquet/convert.py
+```
+
+---
+
+## 📄 License
+
+MIT License

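The "Auto Encoding" feature listed above is driven by the SAS file's own metadata, as the `convert.py` diff further down shows (`read_sas7bdat(..., metadataonly=True)` plus the `file_encoding` attribute). A minimal sketch of that detection step; the file name is a placeholder and the Latin-1 fallback is an assumption, not necessarily the package's exact choice:

```python
# Minimal sketch of metadata-first encoding detection; "example.sas7bdat" is a
# placeholder and the "latin1" fallback is an assumption for illustration only.
import pyreadstat

_, meta = pyreadstat.read_sas7bdat("example.sas7bdat", metadataonly=True)
encoding = getattr(meta, "file_encoding", None) or "latin1"
print(f"Detected encoding: {encoding}")
```
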
sas2parquet-0.2.0/README.md
ADDED
@@ -0,0 +1,122 @@
+# sas2parquet
+
+[](https://pypi.org/project/sas2parquet/)
+[](https://pypi.org/project/sas2parquet/)
+[](LICENCE)
+
+**The ultimate SAS (.sas7bdat) to Parquet converter** — built to handle files that fail with standard tools.
+
+`sas2parquet` automatically detects encodings, repairs schemas, infers correct data types, and performs **pixel-perfect validation** between SAS and Parquet outputs.
+
+---
+
+## ✨ Features
+
+| Feature | Description |
+|-------|-------------|
+| 🔄 **Auto Encoding** | Detects UTF-8, Latin1, CP1252 from metadata or fallback |
+| 🧠 **Smart Types** | Infers datetime, numeric, string with 20+ retry strategies |
+| ✅ **Validation** | Chunk-by-chunk comparison (metadata, counts, values) |
+| 📊 **Memory Safe** | Chunked processing (96GB RAM optimized, configurable) |
+| 💾 **ZSTD Compression** | Level-6 ZSTD for efficient Parquet storage |
+| 📝 **Detailed Logs** | Full conversion trace + mismatch reports |
+| 🎯 **Two Modes** | Single file or recursive directory processing |
+
+---
+
+## 🚀 Quick Start
+
+### Install
+```bash
+pip install sas2parquet
+```
+
+---
+
+## ✅ Usage
+
+### Convert a directory (recommended)
+
+```bash
+sas2parquet path/to/sasdata/
+```
+
+- Converts **all `.sas7bdat` files recursively**
+- Creates `parquetdata/` and `logging/` next to `sasdata/`
+
+---
+
+### Convert a single file
+
+```bash
+sas2parquet path/to/file.sas7bdat
+```
+
+Output (default):
+```text
+path/to/file.parquet
+```
+
+---
+
+### Specify output location
+
+#### Directory mode — custom output directory
+```bash
+sas2parquet path/to/sasdata/ --out path/to/parquetdata/
+```
+
+#### File mode — custom output file
+```bash
+sas2parquet path/to/file.sas7bdat --out path/to/output.parquet
+```
+
+---
+
+### Custom log directory (directory mode)
+
+```bash
+sas2parquet path/to/sasdata/ --log-dir path/to/logs/
+```
+
+---
+
+## 📁 Directory Mode Behavior
+
+```text
+your-project/
+├── sasdata/
+│   ├── file1.sas7bdat
+│   └── subfolder/
+│       └── nested.sas7bdat
+├── parquetdata/
+│   ├── file1.parquet
+│   └── subfolder_parquet/
+│       └── nested.parquet
+└── logging/
+    └── conversion_20260205_1145.log
+```
+
+---
+
+## 🛠️ CLI Reference
+
+```bash
+sas2parquet --help
+```
+
+---
+
+## ⚙️ Configuration (Advanced)
+
+Edit constants in:
+
+```text
+src/sas2parquet/convert.py
+```
+
+---
+
+## 📄 License
+
+MIT License

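The "Configuration (Advanced)" section above only points at the module. For reference, these are the tunable constants that the 0.1.8 README (deleted further below) documented for `src/sas2parquet/convert.py`; the 0.2.0 values may differ:

```python
# Tunables as documented in the 0.1.8 README; treat them as illustrative
# defaults rather than the authoritative 0.2.0 values.
AVAILABLE_RAM_GB = 96
RAM_USAGE_FACTOR = 0.5
ZSTD_COMPRESSION_LEVEL = 6
MIN_CHUNK_SIZE = 100_000
MAX_CHUNK_SIZE = 10_000_000
```
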
sas2parquet-0.2.0/pyproject.toml
ADDED
@@ -0,0 +1,48 @@
+[tool.poetry]
+name = "sas2parquet"
+version = "0.2.0"
+description = "SAS → Parquet Hybrid Converter & Validator"
+readme = "README.md"
+authors = ["Zaman Ziabakhshganji <zaman.ganji@gmail.com>"]
+license = "MIT"
+packages = [{ include = "sas2parquet", from = "src" }]
+# Optional but nice for PyPI:
+repository = "https://github.com/<you>/<repo>"
+homepage = "https://github.com/<you>/<repo>"
+keywords = ["sas", "parquet", "etl", "data", "pyarrow"]
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+
+[tool.poetry.dependencies]
+python = ">=3.11,<4.0"
+requests = ">=2.32.5,<3.0.0"
+narwhals = "==2.13.0"
+numpy = "==2.3.5"
+pandas = "==2.3.3"
+polars = "==1.36.1"
+pyarrow = "==22.0.0"
+pyreadstat = "==1.3.2"
+python-dateutil = "==2.9.0.post0"
+pytz = "==2025.2"
+six = "==1.17.0"
+tzdata = "==2025.2"
+
+# 🚫 Strongly consider NOT shipping these as required deps:
+# pyspark, py4j, polars-runtime-32
+# They dramatically inflate installs and aren't required for your conversion script.
+# If you still want them, put them behind extras (see below).
+
+[tool.poetry.group.dev.dependencies]
+pytest = ">=9.0.2,<10.0.0"
+
+[tool.poetry.scripts]
+sas2parquet = "sas2parquet.cli:main"
+
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"

sas2parquet-0.2.0/src/sas2parquet/cli.py
ADDED
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+"""
+CLI entrypoint for sas2parquet.
+"""
+
+import argparse
+import sys
+from pathlib import Path
+import importlib.metadata
+
+try:
+    __version__ = importlib.metadata.version("sas2parquet")
+except importlib.metadata.PackageNotFoundError:
+    __version__ = "dev"
+
+from .convert import main as convert_dir, reconvert_file_ultimate
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        prog="sas2parquet",
+        description="SAS to Parquet converter with validation",
+    )
+
+    parser.add_argument(
+        "--version",
+        action="version",
+        version=f"%(prog)s {__version__}",
+    )
+
+    # Backward compatibility (optional)
+    parser.add_argument(
+        "--dir-mode",
+        "-d",
+        action="store_true",
+        help="(Backward compatible) Treat input as a directory. "
+             "If no path is provided, defaults to ./sasdata",
+    )
+
+    parser.add_argument(
+        "path",
+        nargs="?",
+        help="Path to a .sas7bdat file OR a directory containing SAS files (recursively).",
+    )
+
+    parser.add_argument(
+        "--out",
+        "-o",
+        default=None,
+        help="Output Parquet file (file mode) OR output directory (dir mode). "
+             "If omitted, dir mode uses sibling 'parquetdata/'.",
+    )
+
+    parser.add_argument(
+        "--log-dir",
+        default=None,
+        help="Directory where logs are written (dir mode). "
+             "If omitted, uses sibling 'logging/'.",
+    )
+
+    args = parser.parse_args()
+
+    # -----------------------------
+    # Resolve input path
+    # -----------------------------
+    if args.dir_mode and args.path is None:
+        # Old behavior: default to ./sasdata
+        p = Path("sasdata").expanduser().resolve()
+    else:
+        if args.path is None:
+            parser.print_help()
+            sys.exit(1)
+        p = Path(args.path).expanduser().resolve()
+
+    if not p.exists():
+        print(f"❌ Path not found: {p}")
+        sys.exit(2)
+
+    # -----------------------------
+    # Directory mode
+    # -----------------------------
+    if p.is_dir():
+        out_dir = Path(args.out).expanduser().resolve() if args.out else None
+        log_dir = Path(args.log_dir).expanduser().resolve() if args.log_dir else None
+
+        rc = convert_dir(
+            p,
+            parquet_output_dir=out_dir,
+            log_dir=log_dir,
+        )
+        sys.exit(rc)
+
+    # -----------------------------
+    # File mode
+    # -----------------------------
+    if p.is_file():
+        if p.suffix.lower() != ".sas7bdat":
+            print(f"❌ Not a .sas7bdat file: {p.name}")
+            sys.exit(2)
+
+        if args.out:
+            out_file = Path(args.out).expanduser().resolve()
+        else:
+            out_file = p.with_suffix(".parquet")
+
+        success = reconvert_file_ultimate(p, out_file)
+        sys.exit(0 if success else 1)
+
+    print(f"❌ Unsupported path type: {p}")
+    sys.exit(2)
+
+
+if __name__ == "__main__":
+    main()

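For callers who prefer to skip the argument parser, the same two entry points the CLI dispatches to can be used directly. This is a sketch based on the signatures shown in this diff; the paths are placeholders:

```python
# Sketch of programmatic use mirroring the CLI's two modes; function names and
# keyword arguments come from the cli.py/convert.py shown in this diff.
from pathlib import Path
from sas2parquet.convert import main as convert_dir, reconvert_file_ultimate

# Directory mode: equivalent to `sas2parquet sasdata/ --out parquetdata/ --log-dir logging/`
rc = convert_dir(
    Path("sasdata"),
    parquet_output_dir=Path("parquetdata"),
    log_dir=Path("logging"),
)

# File mode: equivalent to `sas2parquet file.sas7bdat --out file.parquet`
ok = reconvert_file_ultimate(Path("file.sas7bdat"), Path("file.parquet"))
print(f"directory mode exit code: {rc}, single-file success: {ok}")
```
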
{sas2parquet-0.1.8 → sas2parquet-0.2.0}/src/sas2parquet/convert.py
CHANGED
@@ -12,19 +12,8 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 import pyreadstat
 
-# --- Suppress pandas FutureWarnings ---
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
-# --- Configuration ---
-# Put your .sas7bdat files inside SAS_INPUT_DIR (including subfolders).
-SAS_INPUT_DIR = Path("sasdata")
-
-# IMPORTANT:
-# parquetdata/ and logging/ are created NEXT TO sasdata/ (i.e., in the same parent directory).
-PARQUET_INPUT_DIR = SAS_INPUT_DIR.parent / "parquetdata"
-LOG_DIR = SAS_INPUT_DIR.parent / "logging"
-LOG_FILE_PATH = LOG_DIR / f"conversion_{datetime.now():%Y%m%d_%H%M%S}.log"
-
 KNOWN_DATETIME_COLUMNS = [
     'RPNA_DATE_UTC','RPNA_TIME_UTC','RPA_DATE_UTC','TIMESTAMP_UTC',
     'EVENT_START_DATE_UTC','EVENT_END_DATE_UTC',
@@ -44,13 +33,10 @@ MAX_CHUNK_SIZE = 10_000_000
 
 # --- Logger ---
 class Logger:
-    def __init__(self, path):
+    def __init__(self, path: Path):
         self.terminal = sys.stdout
-
-        # Ensure log folder exists
         path = Path(path)
         path.parent.mkdir(parents=True, exist_ok=True)
-
         self.logfile = open(path, 'w', encoding='utf-8')
 
     def write(self, msg):
@@ -111,7 +97,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
     pq_it = pq.ParquetFile(parquet_path).iter_batches(batch_size=CH)
     chunk_i = 0
 
-    # SAS→UNIX epoch offset µs
     offset_us = int((pd.Timestamp("1970-01-01") -
                      pd.Timestamp("1960-01-01")).total_seconds() * 1e6)
 
@@ -146,7 +131,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
         pcol = ppq.get_column(col)
         ds, dp = scol.dtype, pcol.dtype
         if ds in num_types and dp in num_types:
-            # unify int<->float
             if ds in float_types and dp in int_types:
                 pcol = pcol.cast(ds)
             elif dp in float_types and ds in int_types:
@@ -162,7 +146,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
         sser = psas.get_column(col)
         pser = ppq.get_column(col)
 
-        # epoch check
         if sser.dtype == pl.Datetime("us") and pser.dtype == pl.Datetime("us"):
             raw = sas_chunk[col]
             if pd.api.types.is_datetime64_ns_dtype(raw):
@@ -180,7 +163,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
             ))
             continue
 
-        # string compare (with date-only normalization)
         s_str = sser.cast(pl.Utf8)
         p_str = pser.cast(pl.Utf8)
         mask = (s_str != p_str) | (s_str.is_null() != p_str.is_null())
@@ -209,7 +191,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
 def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
     print(f"🛠️ Fixing {sas_path.name}...")
 
-    # 1) metadata & encoding
     _, meta0 = pyreadstat.read_sas7bdat(sas_path, metadataonly=True)
     enc0 = getattr(meta0, 'file_encoding', None)
     if enc0:
@@ -232,13 +213,12 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
     cols = meta0.column_names
     read_types = getattr(meta0, 'readstat_variable_types', {}) or {}
 
-    # SAS formats if available
     fmt_map = {}
     if hasattr(meta0, 'formats'):
         for name, fmt in zip(meta0.column_names, meta0.formats):
             fmt_map[name] = fmt or ""
 
-    # infer content types
+    # infer content types
     content, inf, cnt = {}, {}, 0
     it = pd.read_sas(sas_path, chunksize=MIN_CHUNK_SIZE, encoding=encoding)
     for chunk in it:
@@ -267,17 +247,14 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
         print(f" Attempt {attempt}…")
         fields = []
         for c in cols:
-            # 1) SAS-declared numeric → float64
             if read_types.get(c) == 'double':
                 at = pa.float64()
             else:
                 cu = c.upper()
-                # 2) forced-string
                 if cu in {x.upper() for x in COLUMNS_TO_FORCE_AS_STRING}:
                     at = pa.string()
                 else:
                     fmt = fmt_map.get(c, "").upper()
-                    # 3) datetime/date/time
                     if (cu in {x.upper() for x in KNOWN_DATETIME_COLUMNS}
                             or any(x in fmt for x in ('DATE', 'TIME', 'DATETIME'))):
                         if 'DATE' in fmt and 'DATETIME' not in fmt:
@@ -286,14 +263,11 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
                             at = pa.time64('ms')
                         else:
                             at = pa.timestamp('ms')
-                    # 4) fallback
                     else:
                         at = pa.string()
 
-            # apply any dynamic override
             if c in overrides:
                 at = overrides[c]
-
             fields.append(pa.field(c, at))
 
         schema = pa.schema(fields)
@@ -317,27 +291,22 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
     writer.close()
     print(" ✅ Conversion succeeded")
 
-    # ===== FULL PARQUET VALIDATION (WORKING) =====
    print(" 🔍 Full Parquet validation...")
    try:
        pf = pq.ParquetFile(parquet_path)
        total_rows = 0
        num_groups = pf.metadata.num_row_groups
        batch_count = 0
-
        for batch in pf.iter_batches():
            total_rows += batch.num_rows
            batch_count += 1
-
        print(f" ✅ Parquet fully validated: {total_rows:,} rows across {num_groups} groups ({batch_count} batches)")
        pf.close()
    except Exception as e:
        print(f" ❌ Parquet validation failed: {e}")
        return False
-    # ===== END =====
 
    st, dt = compare_and_report_diffs(sas_path, parquet_path)
-
    print(f" 🔍 Validation: {st}")
    for d in dt:
        print(" -", d.replace("\n", "\n "))
@@ -369,39 +338,78 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
         return False
 
 
-
-
+def default_parquet_dir_for(sas_input_dir: Path) -> Path:
+    # sibling parquetdata/ next to sas_input_dir
+    return sas_input_dir.parent / "parquetdata"
+
+
+def default_log_dir_for(sas_input_dir: Path) -> Path:
+    # sibling logging/ next to sas_input_dir
+    return sas_input_dir.parent / "logging"
+
+
+def parquet_path_for_sas(sas_file: Path, sas_input_dir: Path, parquet_output_dir: Path) -> Path:
+    rel = sas_file.relative_to(sas_input_dir)
+
+    if rel.parent == Path("."):
+        return (parquet_output_dir / rel.name).with_suffix(".parquet")
+
+    parquet_dirs = [f"{p}_parquet" for p in rel.parent.parts]
+    return (parquet_output_dir.joinpath(*parquet_dirs) / rel.name).with_suffix(".parquet")
+
+
+# --- Main loop (directory mode) ---
+def main(
+    sas_input_dir: Path,
+    parquet_output_dir: Path | None = None,
+    log_dir: Path | None = None,
+) -> int:
+    sas_input_dir = Path(sas_input_dir).expanduser().resolve()
+    if not sas_input_dir.exists() or not sas_input_dir.is_dir():
+        print(f"❌ Input directory not found or not a directory: {sas_input_dir}")
+        return 2
+
+    parquet_output_dir = (Path(parquet_output_dir).expanduser().resolve()
+                          if parquet_output_dir else default_parquet_dir_for(sas_input_dir))
+    log_dir = (Path(log_dir).expanduser().resolve()
+               if log_dir else default_log_dir_for(sas_input_dir))
+
+    log_file_path = log_dir / f"conversion_{datetime.now():%Y%m%d_%H%M%S}.log"
+
     orig = sys.stdout
-    sys.stdout = Logger(
+    sys.stdout = Logger(log_file_path)
     try:
         print("🚀 SAS → Parquet Hybrid Fix & Validate (full folder)\n")
-
+        print(f"Input: {sas_input_dir}")
+        print(f"Output: {parquet_output_dir}")
+        print(f"Logs: {log_file_path}\n")
+
+        files = list(sas_input_dir.rglob("*.sas7bdat"))
         if not files:
             print("❌ No SAS files found. Exiting.")
-            return
+            return 1
 
         print(f"Found {len(files)} files.\n" + "="*60)
+        ok = 0
+        bad = 0
+
         for sas in files:
-            rel = sas.relative_to(
+            rel = sas.relative_to(sas_input_dir)
             print(f"\n🗂 Processing: {rel}")
 
-
+            pqf = parquet_path_for_sas(sas, sas_input_dir, parquet_output_dir)
+            success = reconvert_file_ultimate(sas, pqf)
 
-            if
-
+            if success:
+                ok += 1
             else:
-
-
-
-
-                reconvert_file_ultimate(sas, pqf)
+                bad += 1
+
             print("-"*60)
 
-        print("\n✅
+        print(f"\n✅ Done. Success={ok}, Failed={bad}. See log at: {log_file_path}")
+        return 0 if bad == 0 else 1
+
     finally:
         sys.stdout.close()
-        sys.stdout = orig
-
-
-if __name__ == "__main__":
-    main()
+        sys.stdout = orig

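The new `parquet_path_for_sas` helper above is what produces the `subfolder_parquet/` naming shown in the README's directory tree. A small illustration of the mapping, using hypothetical paths and the same logic as the helper:

```python
# Illustration of the path mapping implemented by parquet_path_for_sas above;
# the input and output locations here are hypothetical.
from pathlib import Path

sas_input_dir = Path("your-project/sasdata")
parquet_output_dir = Path("your-project/parquetdata")

for sas_file in [sas_input_dir / "file1.sas7bdat",
                 sas_input_dir / "subfolder" / "nested.sas7bdat"]:
    rel = sas_file.relative_to(sas_input_dir)
    if rel.parent == Path("."):
        # Top-level file: parquetdata/file1.parquet
        out = (parquet_output_dir / rel.name).with_suffix(".parquet")
    else:
        # Nested file: each subdirectory gets a "_parquet" suffix,
        # e.g. parquetdata/subfolder_parquet/nested.parquet
        parquet_dirs = [f"{p}_parquet" for p in rel.parent.parts]
        out = (parquet_output_dir.joinpath(*parquet_dirs) / rel.name).with_suffix(".parquet")
    print(f"{sas_file} -> {out}")
```
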
sas2parquet-0.1.8/PKG-INFO
DELETED
@@ -1,135 +0,0 @@
-Metadata-Version: 2.4
-Name: sas2parquet
-Version: 0.1.8
-Summary: SAS → Parquet Hybrid Converter & Validator
-License-File: LICENSE
-Author: Zaman Ziabakhshganji
-Author-email: zaman.ganji@gmail.com
-Requires-Python: >=3.11
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Classifier: Programming Language :: Python :: 3.14
-Requires-Dist: narwhals (==2.13.0)
-Requires-Dist: numpy (==2.3.5)
-Requires-Dist: pandas (==2.3.3)
-Requires-Dist: polars (==1.36.1)
-Requires-Dist: polars-runtime-32 (==1.36.1)
-Requires-Dist: py4j (==0.10.9.9)
-Requires-Dist: pyarrow (==22.0.0)
-Requires-Dist: pyreadstat (==1.3.2)
-Requires-Dist: pyspark (==4.0.1)
-Requires-Dist: pytest (>=9.0.2,<10.0.0)
-Requires-Dist: python-dateutil (==2.9.0.post0)
-Requires-Dist: pytz (==2025.2)
-Requires-Dist: requests (>=2.32.5,<3.0.0)
-Requires-Dist: six (==1.17.0)
-Requires-Dist: tzdata (==2025.2)
-Description-Content-Type: text/markdown
-
-# sas2parquet
-
-[](https://pypi.org/project/sas2parquet/)
-[](https://pypi.org/project/sas2parquet/)
-[](LICENCE)
-
-**The ultimate SAS (.sas7bdat) to Parquet converter** - handles problematic files that fail with standard tools. Automatic encoding detection, intelligent type inference, schema repair, and pixel-perfect validation.
-
-## ✨ Features
-
-| Feature | Description |
-|---------|-------------|
-| 🔄 **Auto Encoding** | Detects UTF-8, Latin1, CP1252 from metadata/fallback |
-| 🧠 **Smart Types** | Infers datetime, numeric, string with 20+ fallback attempts |
-| ✅ **Validation** | Compares SAS vs Parquet chunk-by-chunk (numeric + string) |
-| 📊 **Memory Safe** | Chunked processing (96GB RAM optimized, configurable) |
-| 💾 **ZSTD** | Level 6 compression for maximum efficiency |
-| 📝 **Detailed Logs** | Mismatch reports + full conversion trace |
-| 🎯 **Two Modes** | Single file OR recursive directory processing |
-
-## Quick Start
-
-### Install
-```bash
-pip install sas2parquet
-```
-
-### Single File
-```bash
-sas2parquet input.sas output.parquet
-```
-
-### Batch Directory (Recommended)
-```bash
-sas2parquet --dir-mode
-```
-
-## 📁 Directory Mode (Default Workflow)
-
-### How it works
-
-- You provide a `sasdata/` directory containing all `.sas7bdat` files (including nested subfolders).
-- The tool automatically creates a `parquetdata/` directory in the same parent folder as `sasdata/`.
-- All files are converted to Parquet and written into `parquetdata/`, mirroring the original folder structure.
-
-```text
-your-project/
-├── sasdata/          # ← Put your .sas7bdat files here
-│   ├── file1.sas7bdat
-│   └── subfolder/
-│       └── nested.sas7bdat
-├── parquetdata/      # ← AUTO-CREATED (mirrors sasdata/)
-│   ├── file1.parquet
-│   └── subfolder/
-│       └── nested.parquet
-└── logging/          # ← AUTO-CREATED (detailed logs)
-    └── conversion_20260205_1145.log
-```
-
-Just run:
-```bash
-sas2parquet --dir-mode
-```
-
-## 🛠️ CLI Reference
-```bash
-sas2parquet --help
-```
-
-```text
-usage: sas2parquet [-h] [--dir-mode] [sas_file] [parquet_file]
-
-Robust SAS to Parquet converter with validation
-```
-
-## 📊 Example Output
-```text
-🚀 SAS → Parquet Hybrid Fix & Validate (full folder)
-Found 3 files.
-============================================================
-...
-```
-
-## ⚙️ Configuration (Advanced)
-
-Edit `src/sas2parquet/convert.py` constants:
-
-```python
-AVAILABLE_RAM_GB = 96
-RAM_USAGE_FACTOR = 0.5
-ZSTD_COMPRESSION_LEVEL = 6
-MIN_CHUNK_SIZE = 100_000
-MAX_CHUNK_SIZE = 10_000_000
-```
-
-## 🧪 Validation Details
-Each file undergoes 4-stage validation:
-1. Metadata
-2. Exact counts
-3. Column order
-4. Value comparison
-
-## 📄 License
-MIT License
-

sas2parquet-0.1.8/README.md
DELETED
@@ -1,104 +0,0 @@
-# sas2parquet
-
-[](https://pypi.org/project/sas2parquet/)
-[](https://pypi.org/project/sas2parquet/)
-[](LICENCE)
-
-**The ultimate SAS (.sas7bdat) to Parquet converter** - handles problematic files that fail with standard tools. Automatic encoding detection, intelligent type inference, schema repair, and pixel-perfect validation.
-
-## ✨ Features
-
-| Feature | Description |
-|---------|-------------|
-| 🔄 **Auto Encoding** | Detects UTF-8, Latin1, CP1252 from metadata/fallback |
-| 🧠 **Smart Types** | Infers datetime, numeric, string with 20+ fallback attempts |
-| ✅ **Validation** | Compares SAS vs Parquet chunk-by-chunk (numeric + string) |
-| 📊 **Memory Safe** | Chunked processing (96GB RAM optimized, configurable) |
-| 💾 **ZSTD** | Level 6 compression for maximum efficiency |
-| 📝 **Detailed Logs** | Mismatch reports + full conversion trace |
-| 🎯 **Two Modes** | Single file OR recursive directory processing |
-
-## Quick Start
-
-### Install
-```bash
-pip install sas2parquet
-```
-
-### Single File
-```bash
-sas2parquet input.sas output.parquet
-```
-
-### Batch Directory (Recommended)
-```bash
-sas2parquet --dir-mode
-```
-
-## 📁 Directory Mode (Default Workflow)
-
-### How it works
-
-- You provide a `sasdata/` directory containing all `.sas7bdat` files (including nested subfolders).
-- The tool automatically creates a `parquetdata/` directory in the same parent folder as `sasdata/`.
-- All files are converted to Parquet and written into `parquetdata/`, mirroring the original folder structure.
-
-```text
-your-project/
-├── sasdata/          # ← Put your .sas7bdat files here
-│   ├── file1.sas7bdat
-│   └── subfolder/
-│       └── nested.sas7bdat
-├── parquetdata/      # ← AUTO-CREATED (mirrors sasdata/)
-│   ├── file1.parquet
-│   └── subfolder/
-│       └── nested.parquet
-└── logging/          # ← AUTO-CREATED (detailed logs)
-    └── conversion_20260205_1145.log
-```
-
-Just run:
-```bash
-sas2parquet --dir-mode
-```
-
-## 🛠️ CLI Reference
-```bash
-sas2parquet --help
-```
-
-```text
-usage: sas2parquet [-h] [--dir-mode] [sas_file] [parquet_file]
-
-Robust SAS to Parquet converter with validation
-```
-
-## 📊 Example Output
-```text
-🚀 SAS → Parquet Hybrid Fix & Validate (full folder)
-Found 3 files.
-============================================================
-...
-```
-
-## ⚙️ Configuration (Advanced)
-
-Edit `src/sas2parquet/convert.py` constants:
-
-```python
-AVAILABLE_RAM_GB = 96
-RAM_USAGE_FACTOR = 0.5
-ZSTD_COMPRESSION_LEVEL = 6
-MIN_CHUNK_SIZE = 100_000
-MAX_CHUNK_SIZE = 10_000_000
-```
-
-## 🧪 Validation Details
-Each file undergoes 4-stage validation:
-1. Metadata
-2. Exact counts
-3. Column order
-4. Value comparison
-
-## 📄 License
-MIT License

sas2parquet-0.1.8/pyproject.toml
DELETED
@@ -1,37 +0,0 @@
-[project]
-name = "sas2parquet"
-version = "0.1.8"
-description = "SAS → Parquet Hybrid Converter & Validator"
-authors = [
-    {name = "Zaman Ziabakhshganji",email = "zaman.ganji@gmail.com"}
-]
-readme = "README.md"
-requires-python = ">=3.11"
-dependencies = [
-    "pytest (>=9.0.2,<10.0.0)",
-    "requests (>=2.32.5,<3.0.0)",
-    "narwhals (==2.13.0)",
-    "numpy (==2.3.5)",
-    "pandas (==2.3.3)",
-    "polars (==1.36.1)",
-    "polars-runtime-32 (==1.36.1)",
-    "py4j (==0.10.9.9)",
-    "pyarrow (==22.0.0)",
-    "pyreadstat (==1.3.2)",
-    "pyspark (==4.0.1)",
-    "python-dateutil (==2.9.0.post0)",
-    "pytz (==2025.2)",
-    "six (==1.17.0)",
-    "tzdata (==2025.2)",
-]
-
-[tool.poetry]
-packages = [{include = "sas2parquet", from = "src"}]
-
-[tool.poetry.scripts]
-sas2parquet = "sas2parquet.cli:main"
-
-[build-system]
-requires = ["poetry-core>=2.0.0,<3.0.0"]
-build-backend = "poetry.core.masonry.api"
-

sas2parquet-0.1.8/src/sas2parquet/cli.py
DELETED
@@ -1,36 +0,0 @@
-#!/usr/bin/env python
-"""CLI entrypoint for sas2parquet."""
-import argparse
-import sys
-from pathlib import Path
-import importlib.metadata
-
-# Get version from installed package metadata (works everywhere)
-try:
-    __version__ = importlib.metadata.version("sas2parquet")
-except importlib.metadata.PackageNotFoundError:
-    __version__ = "dev"  # During development
-
-from .convert import main as _convert_main, reconvert_file_ultimate
-
-def main():
-    parser = argparse.ArgumentParser(description="SAS to Parquet converter")
-    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
-    parser.add_argument("sas_file", nargs="?", help="Single SAS file to convert")
-    parser.add_argument("parquet_file", nargs="?", help="Output Parquet file")
-    parser.add_argument("--dir-mode", "-d", action="store_true",
-                        help="Process entire SAS_INPUT_DIR (ignores file args)")
-
-    args = parser.parse_args()
-
-    if args.dir_mode:
-        _convert_main()
-    elif args.sas_file and args.parquet_file:
-        success = reconvert_file_ultimate(Path(args.sas_file), Path(args.parquet_file))
-        sys.exit(0 if success else 1)
-    else:
-        parser.print_help()
-        sys.exit(1)
-
-if __name__ == "__main__":
-    main()

File without changes
|
|
File without changes
|