sas2parquet 0.1.9__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sas2parquet/cli.py CHANGED
@@ -1,5 +1,8 @@
1
1
  #!/usr/bin/env python
2
- """CLI entrypoint for sas2parquet."""
2
+ """
3
+ CLI entrypoint for sas2parquet.
4
+ """
5
+
3
6
  import argparse
4
7
  import sys
5
8
  from pathlib import Path
@@ -13,41 +16,83 @@ except importlib.metadata.PackageNotFoundError:
13
16
  from .convert import main as convert_dir, reconvert_file_ultimate
14
17
 
15
18
 
16
- def main():
17
- parser = argparse.ArgumentParser(prog="sas2parquet", description="SAS to Parquet converter")
18
- parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
19
+ def main() -> None:
20
+ parser = argparse.ArgumentParser(
21
+ prog="sas2parquet",
22
+ description="SAS to Parquet converter with validation",
23
+ )
24
+
25
+ parser.add_argument(
26
+ "--version",
27
+ action="version",
28
+ version=f"%(prog)s {__version__}",
29
+ )
30
+
31
+ # Backward compatibility (optional)
32
+ parser.add_argument(
33
+ "--dir-mode",
34
+ "-d",
35
+ action="store_true",
36
+ help="(Backward compatible) Treat input as a directory. "
37
+ "If no path is provided, defaults to ./sasdata",
38
+ )
19
39
 
20
40
  parser.add_argument(
21
41
  "path",
22
- help="Path to a .sas7bdat file OR a directory containing SAS files (recursively)."
42
+ nargs="?",
43
+ help="Path to a .sas7bdat file OR a directory containing SAS files (recursively).",
23
44
  )
45
+
24
46
  parser.add_argument(
25
- "--out", "-o",
47
+ "--out",
48
+ "-o",
49
+ default=None,
26
50
  help="Output Parquet file (file mode) OR output directory (dir mode). "
27
51
  "If omitted, dir mode uses sibling 'parquetdata/'.",
28
- default=None
29
52
  )
53
+
30
54
  parser.add_argument(
31
55
  "--log-dir",
32
- help="Directory where logs are written (dir mode). If omitted, uses sibling 'logging/'.",
33
- default=None
56
+ default=None,
57
+ help="Directory where logs are written (dir mode). "
58
+ "If omitted, uses sibling 'logging/'.",
34
59
  )
35
60
 
36
61
  args = parser.parse_args()
37
62
 
38
- p = Path(args.path).expanduser().resolve()
63
+ # -----------------------------
64
+ # Resolve input path
65
+ # -----------------------------
66
+ if args.dir_mode and args.path is None:
67
+ # Old behavior: default to ./sasdata
68
+ p = Path("sasdata").expanduser().resolve()
69
+ else:
70
+ if args.path is None:
71
+ parser.print_help()
72
+ sys.exit(1)
73
+ p = Path(args.path).expanduser().resolve()
74
+
39
75
  if not p.exists():
40
76
  print(f"❌ Path not found: {p}")
41
77
  sys.exit(2)
42
78
 
79
+ # -----------------------------
43
80
  # Directory mode
81
+ # -----------------------------
44
82
  if p.is_dir():
45
83
  out_dir = Path(args.out).expanduser().resolve() if args.out else None
46
84
  log_dir = Path(args.log_dir).expanduser().resolve() if args.log_dir else None
47
- rc = convert_dir(p, parquet_output_dir=out_dir, log_dir=log_dir)
85
+
86
+ rc = convert_dir(
87
+ p,
88
+ parquet_output_dir=out_dir,
89
+ log_dir=log_dir,
90
+ )
48
91
  sys.exit(rc)
49
92
 
93
+ # -----------------------------
50
94
  # File mode
95
+ # -----------------------------
51
96
  if p.is_file():
52
97
  if p.suffix.lower() != ".sas7bdat":
53
98
  print(f"❌ Not a .sas7bdat file: {p.name}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sas2parquet
3
- Version: 0.1.9
3
+ Version: 0.2.1
4
4
  Summary: SAS → Parquet Hybrid Converter & Validator
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -26,8 +26,8 @@ Requires-Dist: pytz (==2025.2)
26
26
  Requires-Dist: requests (>=2.32.5,<3.0.0)
27
27
  Requires-Dist: six (==1.17.0)
28
28
  Requires-Dist: tzdata (==2025.2)
29
- Project-URL: Homepage, https://github.com/<you>/<repo>
30
- Project-URL: Repository, https://github.com/<you>/<repo>
29
+ Project-URL: Homepage, https://ecda.eur.nl/erasmus-data-collaboratory/
30
+ Project-URL: Repository, https://git.ecda.ai/EDC-IXLab/sas2parquet
31
31
  Description-Content-Type: text/markdown
32
32
 
33
33
  # sas2parquet
@@ -36,102 +36,119 @@ Description-Content-Type: text/markdown
36
36
  [![Python versions](https://img.shields.io/pypi/pyversions/sas2parquet.svg)](https://pypi.org/project/sas2parquet/)
37
37
  [![License](https://img.shields.io/pypi/l/sas2parquet.svg)](LICENSE)
38
38
 
39
- **The ultimate SAS (.sas7bdat) to Parquet converter** - handles problematic files that fail with standard tools. Automatic encoding detection, intelligent type inference, schema repair, and pixel-perfect validation.
39
+ **The ultimate SAS (.sas7bdat) to Parquet converter**, built to handle files that fail with standard tools.
40
+
41
+ `sas2parquet` automatically detects encodings, repairs schemas, infers correct data types, and performs **pixel-perfect validation** between SAS and Parquet outputs.
42
+
43
+ ---
40
44
 
41
45
  ## ✨ Features
42
46
 
43
47
  | Feature | Description |
44
- |---------|-------------|
45
- | 🔄 **Auto Encoding** | Detects UTF-8, Latin1, CP1252 from metadata/fallback |
46
- | 🧠 **Smart Types** | Infers datetime, numeric, string with 20+ fallback attempts |
47
- | ✅ **Validation** | Compares SAS vs Parquet chunk-by-chunk (numeric + string) |
48
+ |-------|-------------|
49
+ | 🔄 **Auto Encoding** | Detects UTF-8, Latin1, CP1252 from metadata or fallback |
50
+ | 🧠 **Smart Types** | Infers datetime, numeric, string with 20+ retry strategies |
51
+ | ✅ **Validation** | Chunk-by-chunk comparison (metadata, counts, values) |
48
52
  | 📊 **Memory Safe** | Chunked processing (96GB RAM optimized, configurable) |
49
- | 💾 **ZSTD** | Level 6 compression for maximum efficiency |
50
- | 📝 **Detailed Logs** | Mismatch reports + full conversion trace |
51
- | 🎯 **Two Modes** | Single file OR recursive directory processing |
53
+ | 💾 **ZSTD Compression** | Level-6 ZSTD for efficient Parquet storage |
54
+ | 📝 **Detailed Logs** | Full conversion trace + mismatch reports |
55
+ | 🎯 **Two Modes** | Single file or recursive directory processing |
56
+
57
+ ---
52
58
 
53
- ## Quick Start
59
+ ## 🚀 Quick Start
54
60
 
55
61
  ### Install
56
62
  ```bash
57
63
  pip install sas2parquet
58
64
  ```
59
65
 
60
- ### Single File
66
+ ---
67
+
68
+ ## ✅ Usage
69
+
70
+ ### Convert a directory (recommended)
71
+
61
72
  ```bash
62
- sas2parquet input.sas output.parquet
73
+ sas2parquet path/to/sasdata/
63
74
  ```
64
75
 
65
- ### Batch Directory (Recommended)
76
+ - Converts **all `.sas7bdat` files recursively**
77
+ - Creates `parquetdata/` and `logging/` next to `sasdata/`
78
+
79
+ ---
80
+
81
+ ### Convert a single file
82
+
66
83
  ```bash
67
- sas2parquet --dir-mode
84
+ sas2parquet path/to/file.sas7bdat
68
85
  ```
69
86
 
70
- ## 📁 Directory Mode (Default Workflow)
87
+ Output (default):
88
+ ```text
89
+ path/to/file.parquet
90
+ ```
91
+
92
+ ---
93
+
94
+ ### Specify output location
95
+
96
+ #### Directory mode — custom output directory
97
+ ```bash
98
+ sas2parquet path/to/sasdata/ --out path/to/parquetdata/
99
+ ```
100
+
101
+ #### File mode — custom output file
102
+ ```bash
103
+ sas2parquet path/to/file.sas7bdat --out path/to/output.parquet
104
+ ```
71
105
 
72
- ### How it works
106
+ ---
73
107
 
74
- - You provide a `sasdata/` directory containing all `.sas7bdat` files (including nested subfolders).
75
- - The tool automatically creates a `parquetdata/` directory in the same parent folder as `sasdata/`.
76
- - All files are converted to Parquet and written into `parquetdata/`, mirroring the original folder structure.
108
+ ### Custom log directory (directory mode)
109
+
110
+ ```bash
111
+ sas2parquet path/to/sasdata/ --log-dir path/to/logs/
112
+ ```
113
+
114
+ ---
115
+
116
+ ## 📁 Directory Mode Behavior
77
117
 
78
118
  ```text
79
119
  your-project/
80
- ├── sasdata/ # ← Put your .sas7bdat files here
120
+ ├── sasdata/
81
121
  │ ├── file1.sas7bdat
82
122
  │ └── subfolder/
83
123
  │ └── nested.sas7bdat
84
- ├── parquetdata/ # ← AUTO-CREATED (mirrors sasdata/)
124
+ ├── parquetdata/
85
125
  │ ├── file1.parquet
86
- │ └── subfolder/
126
+ │ └── subfolder_parquet/
87
127
  │ └── nested.parquet
88
- └── logging/ # ← AUTO-CREATED (detailed logs)
128
+ └── logging/
89
129
  └── conversion_20260205_1145.log
90
130
  ```
91
131
 
92
- Just run:
93
- ```bash
94
- sas2parquet --dir-mode
95
- ```
132
+ ---
96
133
 
97
134
  ## 🛠️ CLI Reference
135
+
98
136
  ```bash
99
137
  sas2parquet --help
100
138
  ```
101
139
 
102
- ```text
103
- usage: sas2parquet [-h] [--dir-mode] [sas_file] [parquet_file]
104
-
105
- Robust SAS to Parquet converter with validation
106
- ```
107
-
108
- ## 📊 Example Output
109
- ```text
110
- 🚀 SAS → Parquet Hybrid Fix & Validate (full folder)
111
- Found 3 files.
112
- ============================================================
113
- ...
114
- ```
140
+ ---
115
141
 
116
142
  ## ⚙️ Configuration (Advanced)
117
143
 
118
- Edit `src/sas2parquet/convert.py` constants:
144
+ Edit constants in:
119
145
 
120
- ```python
121
- AVAILABLE_RAM_GB = 96
122
- RAM_USAGE_FACTOR = 0.5
123
- ZSTD_COMPRESSION_LEVEL = 6
124
- MIN_CHUNK_SIZE = 100_000
125
- MAX_CHUNK_SIZE = 10_000_000
146
+ ```text
147
+ src/sas2parquet/convert.py
126
148
  ```
127
149
 
128
- ## 🧪 Validation Details
129
- Each file undergoes 4-stage validation:
130
- 1. Metadata
131
- 2. Exact counts
132
- 3. Column order
133
- 4. Value comparison
150
+ ---
134
151
 
135
152
  ## 📄 License
136
- MIT License
137
153
 
154
+ MIT License
@@ -0,0 +1,8 @@
1
+ sas2parquet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ sas2parquet/cli.py,sha256=w0HvEiGvN8duuMye7-Vq7IXJfiYGLkvpV0ZsWWXgoBc,2967
3
+ sas2parquet/convert.py,sha256=YbCU-oYa6s6qy5dMp8ZElNDlBA0gVMLqUQrwl4YcfYo,15321
4
+ sas2parquet-0.2.1.dist-info/METADATA,sha256=P6pezvdfkYBsljSRD4ohvRycQohl_kH-KERmsNONNdU,3743
5
+ sas2parquet-0.2.1.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
6
+ sas2parquet-0.2.1.dist-info/entry_points.txt,sha256=pg57h0xD_3R9ZC_YfxLLfu_2p1JNhF8xDNS6v7kiSBY,52
7
+ sas2parquet-0.2.1.dist-info/licenses/LICENSE,sha256=ouRycIMUGF1zCj49-ijn1wIlTNknZEoLwAHUp0ifH-g,1066
8
+ sas2parquet-0.2.1.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- sas2parquet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- sas2parquet/cli.py,sha256=n9poi1xxOw75EtZWCIRbkBQCoVLnWE0kh99dU38gBWE,2042
3
- sas2parquet/convert.py,sha256=YbCU-oYa6s6qy5dMp8ZElNDlBA0gVMLqUQrwl4YcfYo,15321
4
- sas2parquet-0.1.9.dist-info/METADATA,sha256=Jef3orIQzWYcTYkIhP1hK-7UynSMt0PvN-6CU9rQQLs,4108
5
- sas2parquet-0.1.9.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
6
- sas2parquet-0.1.9.dist-info/entry_points.txt,sha256=pg57h0xD_3R9ZC_YfxLLfu_2p1JNhF8xDNS6v7kiSBY,52
7
- sas2parquet-0.1.9.dist-info/licenses/LICENSE,sha256=ouRycIMUGF1zCj49-ijn1wIlTNknZEoLwAHUp0ifH-g,1066
8
- sas2parquet-0.1.9.dist-info/RECORD,,