sas2parquet 0.1.8__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,154 @@
1
+ Metadata-Version: 2.4
2
+ Name: sas2parquet
3
+ Version: 0.2.0
4
+ Summary: SAS → Parquet Hybrid Converter & Validator
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Keywords: sas,parquet,etl,data,pyarrow
8
+ Author: Zaman Ziabakhshganji
9
+ Author-email: zaman.ganji@gmail.com
10
+ Requires-Python: >=3.11,<4.0
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Programming Language :: Python :: 3.14
18
+ Requires-Dist: narwhals (==2.13.0)
19
+ Requires-Dist: numpy (==2.3.5)
20
+ Requires-Dist: pandas (==2.3.3)
21
+ Requires-Dist: polars (==1.36.1)
22
+ Requires-Dist: pyarrow (==22.0.0)
23
+ Requires-Dist: pyreadstat (==1.3.2)
24
+ Requires-Dist: python-dateutil (==2.9.0.post0)
25
+ Requires-Dist: pytz (==2025.2)
26
+ Requires-Dist: requests (>=2.32.5,<3.0.0)
27
+ Requires-Dist: six (==1.17.0)
28
+ Requires-Dist: tzdata (==2025.2)
29
+ Project-URL: Homepage, https://github.com/<you>/<repo>
30
+ Project-URL: Repository, https://github.com/<you>/<repo>
31
+ Description-Content-Type: text/markdown
32
+
33
+ # sas2parquet
34
+
35
+ [![PyPI version](https://badge.fury.io/py/sas2parquet.svg)](https://pypi.org/project/sas2parquet/)
36
+ [![Python versions](https://img.shields.io/pypi/pyversions/sas2parquet.svg)](https://pypi.org/project/sas2parquet/)
37
+ [![License](https://img.shields.io/pypi/l/sas2parquet.svg)](LICENSE)
38
+
39
+ **The ultimate SAS (.sas7bdat) to Parquet converter** — built to handle files that fail with standard tools.
40
+
41
+ `sas2parquet` automatically detects encodings, repairs schemas, infers correct data types, and performs **value-for-value validation** of the Parquet output against the SAS source.
42
+
43
+ ---
44
+
45
+ ## ✨ Features
46
+
47
+ | Feature | Description |
48
+ |-------|-------------|
49
+ | 🔄 **Auto Encoding** | Detects UTF-8, Latin1, CP1252 from metadata or fallback |
50
+ | 🧠 **Smart Types** | Infers datetime, numeric, string with 20+ retry strategies |
51
+ | ✅ **Validation** | Chunk-by-chunk comparison (metadata, counts, values) |
52
+ | 📊 **Memory Safe** | Chunked processing (96GB RAM optimized, configurable) |
53
+ | 💾 **ZSTD Compression** | Level-6 ZSTD for efficient Parquet storage |
54
+ | 📝 **Detailed Logs** | Full conversion trace + mismatch reports |
55
+ | 🎯 **Two Modes** | Single file or recursive directory processing |
56
+
57
+ ---
58
+
59
+ ## 🚀 Quick Start
60
+
61
+ ### Install
62
+ ```bash
63
+ pip install sas2parquet
64
+ ```
65
+
66
+ ---
67
+
68
+ ## ✅ Usage
69
+
70
+ ### Convert a directory (recommended)
71
+
72
+ ```bash
73
+ sas2parquet path/to/sasdata/
74
+ ```
75
+
76
+ - Converts **all `.sas7bdat` files recursively**
77
+ - Creates `parquetdata/` and `logging/` next to `sasdata/`
78
+
79
+ ---
80
+
81
+ ### Convert a single file
82
+
83
+ ```bash
84
+ sas2parquet path/to/file.sas7bdat
85
+ ```
86
+
87
+ Output (default):
88
+ ```text
89
+ path/to/file.parquet
90
+ ```
91
+
92
+ ---
93
+
94
+ ### Specify output location
95
+
96
+ #### Directory mode — custom output directory
97
+ ```bash
98
+ sas2parquet path/to/sasdata/ --out path/to/parquetdata/
99
+ ```
100
+
101
+ #### File mode — custom output file
102
+ ```bash
103
+ sas2parquet path/to/file.sas7bdat --out path/to/output.parquet
104
+ ```
105
+
106
+ ---
107
+
108
+ ### Custom log directory (directory mode)
109
+
110
+ ```bash
111
+ sas2parquet path/to/sasdata/ --log-dir path/to/logs/
112
+ ```
113
+
114
+ ---
115
+
116
+ ## 📁 Directory Mode Behavior
117
+
118
+ ```text
119
+ your-project/
120
+ ├── sasdata/
121
+ │ ├── file1.sas7bdat
122
+ │ └── subfolder/
123
+ │ └── nested.sas7bdat
124
+ ├── parquetdata/
125
+ │ ├── file1.parquet
126
+ │ └── subfolder_parquet/
127
+ │ └── nested.parquet
128
+ └── logging/
129
+ └── conversion_20260205_114500.log
130
+ ```
131
+
132
+ ---
133
+
134
+ ## 🛠️ CLI Reference
135
+
136
+ ```bash
137
+ sas2parquet --help
138
+ ```
139
+
140
+ ---
141
+
142
+ ## ⚙️ Configuration (Advanced)
143
+
144
+ Edit constants in:
145
+
146
+ ```text
147
+ src/sas2parquet/convert.py
148
+ ```
149
+
150
+ ---
151
+
152
+ ## 📄 License
153
+
154
+ MIT License
@@ -0,0 +1,122 @@
1
+ # sas2parquet
2
+
3
+ [![PyPI version](https://badge.fury.io/py/sas2parquet.svg)](https://pypi.org/project/sas2parquet/)
4
+ [![Python versions](https://img.shields.io/pypi/pyversions/sas2parquet.svg)](https://pypi.org/project/sas2parquet/)
5
+ [![License](https://img.shields.io/pypi/l/sas2parquet.svg)](LICENSE)
6
+
7
+ **The ultimate SAS (.sas7bdat) to Parquet converter** — built to handle files that fail with standard tools.
8
+
9
+ `sas2parquet` automatically detects encodings, repairs schemas, infers correct data types, and performs **value-for-value validation** of the Parquet output against the SAS source.
10
+
11
+ ---
12
+
13
+ ## ✨ Features
14
+
15
+ | Feature | Description |
16
+ |-------|-------------|
17
+ | 🔄 **Auto Encoding** | Detects UTF-8, Latin1, CP1252 from metadata or fallback |
18
+ | 🧠 **Smart Types** | Infers datetime, numeric, string with 20+ retry strategies |
19
+ | ✅ **Validation** | Chunk-by-chunk comparison (metadata, counts, values) |
20
+ | 📊 **Memory Safe** | Chunked processing (96GB RAM optimized, configurable) |
21
+ | 💾 **ZSTD Compression** | Level-6 ZSTD for efficient Parquet storage |
22
+ | 📝 **Detailed Logs** | Full conversion trace + mismatch reports |
23
+ | 🎯 **Two Modes** | Single file or recursive directory processing |
24
+
25
+ ---
26
+
27
+ ## 🚀 Quick Start
28
+
29
+ ### Install
30
+ ```bash
31
+ pip install sas2parquet
32
+ ```
33
+
34
+ ---
35
+
36
+ ## ✅ Usage
37
+
38
+ ### Convert a directory (recommended)
39
+
40
+ ```bash
41
+ sas2parquet path/to/sasdata/
42
+ ```
43
+
44
+ - Converts **all `.sas7bdat` files recursively**
45
+ - Creates `parquetdata/` and `logging/` next to `sasdata/`
46
+
47
+ ---
48
+
49
+ ### Convert a single file
50
+
51
+ ```bash
52
+ sas2parquet path/to/file.sas7bdat
53
+ ```
54
+
55
+ Output (default):
56
+ ```text
57
+ path/to/file.parquet
58
+ ```
59
+
60
+ ---
61
+
62
+ ### Specify output location
63
+
64
+ #### Directory mode — custom output directory
65
+ ```bash
66
+ sas2parquet path/to/sasdata/ --out path/to/parquetdata/
67
+ ```
68
+
69
+ #### File mode — custom output file
70
+ ```bash
71
+ sas2parquet path/to/file.sas7bdat --out path/to/output.parquet
72
+ ```
73
+
74
+ ---
75
+
76
+ ### Custom log directory (directory mode)
77
+
78
+ ```bash
79
+ sas2parquet path/to/sasdata/ --log-dir path/to/logs/
80
+ ```
81
+
82
+ ---
83
+
84
+ ## 📁 Directory Mode Behavior
85
+
86
+ ```text
87
+ your-project/
88
+ ├── sasdata/
89
+ │ ├── file1.sas7bdat
90
+ │ └── subfolder/
91
+ │ └── nested.sas7bdat
92
+ ├── parquetdata/
93
+ │ ├── file1.parquet
94
+ │ └── subfolder_parquet/
95
+ │ └── nested.parquet
96
+ └── logging/
97
+ └── conversion_20260205_114500.log
98
+ ```
99
+
100
+ ---
101
+
102
+ ## 🛠️ CLI Reference
103
+
104
+ ```bash
105
+ sas2parquet --help
106
+ ```
107
+
108
+ ---
109
+
110
+ ## ⚙️ Configuration (Advanced)
111
+
112
+ Edit constants in:
113
+
114
+ ```text
115
+ src/sas2parquet/convert.py
116
+ ```
117
+
118
+ ---
119
+
120
+ ## 📄 License
121
+
122
+ MIT License
@@ -0,0 +1,48 @@
1
+ [tool.poetry]
2
+ name = "sas2parquet"
3
+ version = "0.2.0"
4
+ description = "SAS → Parquet Hybrid Converter & Validator"
5
+ readme = "README.md"
6
+ authors = ["Zaman Ziabakhshganji <zaman.ganji@gmail.com>"]
7
+ license = "MIT"
8
+ packages = [{ include = "sas2parquet", from = "src" }]
9
+ # Optional but nice for PyPI:
10
+ repository = "https://github.com/<you>/<repo>"
11
+ homepage = "https://github.com/<you>/<repo>"
12
+ keywords = ["sas", "parquet", "etl", "data", "pyarrow"]
13
+ classifiers = [
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.11",
16
+ "Programming Language :: Python :: 3.12",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ ]
20
+
21
+ [tool.poetry.dependencies]
22
+ python = ">=3.11,<4.0"
23
+ requests = ">=2.32.5,<3.0.0"
24
+ narwhals = "==2.13.0"
25
+ numpy = "==2.3.5"
26
+ pandas = "==2.3.3"
27
+ polars = "==1.36.1"
28
+ pyarrow = "==22.0.0"
29
+ pyreadstat = "==1.3.2"
30
+ python-dateutil = "==2.9.0.post0"
31
+ pytz = "==2025.2"
32
+ six = "==1.17.0"
33
+ tzdata = "==2025.2"
34
+
35
+ # 🚫 Strongly consider NOT shipping these as required deps:
36
+ # pyspark, py4j, polars-runtime-32
37
+ # They dramatically inflate installs and aren't required for your conversion script.
38
+ # If you still want them, declare them as optional dependencies behind extras ([tool.poetry.extras]).
39
+
40
+ [tool.poetry.group.dev.dependencies]
41
+ pytest = ">=9.0.2,<10.0.0"
42
+
43
+ [tool.poetry.scripts]
44
+ sas2parquet = "sas2parquet.cli:main"
45
+
46
+ [build-system]
47
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
48
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ CLI entrypoint for sas2parquet.
4
+ """
5
+
6
+ import argparse
7
+ import sys
8
+ from pathlib import Path
9
+ import importlib.metadata
10
+
11
+ try:
12
+ __version__ = importlib.metadata.version("sas2parquet")
13
+ except importlib.metadata.PackageNotFoundError:
14
+ __version__ = "dev"
15
+
16
+ from .convert import main as convert_dir, reconvert_file_ultimate
17
+
18
+
19
+ def main() -> None:
20
+ parser = argparse.ArgumentParser(
21
+ prog="sas2parquet",
22
+ description="SAS to Parquet converter with validation",
23
+ )
24
+
25
+ parser.add_argument(
26
+ "--version",
27
+ action="version",
28
+ version=f"%(prog)s {__version__}",
29
+ )
30
+
31
+ # Backward compatibility (optional)
32
+ parser.add_argument(
33
+ "--dir-mode",
34
+ "-d",
35
+ action="store_true",
36
+ help="(Backward compatible) Treat input as a directory. "
37
+ "If no path is provided, defaults to ./sasdata",
38
+ )
39
+
40
+ parser.add_argument(
41
+ "path",
42
+ nargs="?",
43
+ help="Path to a .sas7bdat file OR a directory containing SAS files (recursively).",
44
+ )
45
+
46
+ parser.add_argument(
47
+ "--out",
48
+ "-o",
49
+ default=None,
50
+ help="Output Parquet file (file mode) OR output directory (dir mode). "
51
+ "If omitted, dir mode uses sibling 'parquetdata/'.",
52
+ )
53
+
54
+ parser.add_argument(
55
+ "--log-dir",
56
+ default=None,
57
+ help="Directory where logs are written (dir mode). "
58
+ "If omitted, uses sibling 'logging/'.",
59
+ )
60
+
61
+ args = parser.parse_args()
62
+
63
+ # -----------------------------
64
+ # Resolve input path
65
+ # -----------------------------
66
+ if args.dir_mode and args.path is None:
67
+ # Old behavior: default to ./sasdata
68
+ p = Path("sasdata").expanduser().resolve()
69
+ else:
70
+ if args.path is None:
71
+ parser.print_help()
72
+ sys.exit(1)
73
+ p = Path(args.path).expanduser().resolve()
74
+
75
+ if not p.exists():
76
+ print(f"❌ Path not found: {p}")
77
+ sys.exit(2)
78
+
79
+ # -----------------------------
80
+ # Directory mode
81
+ # -----------------------------
82
+ if p.is_dir():
83
+ out_dir = Path(args.out).expanduser().resolve() if args.out else None
84
+ log_dir = Path(args.log_dir).expanduser().resolve() if args.log_dir else None
85
+
86
+ rc = convert_dir(
87
+ p,
88
+ parquet_output_dir=out_dir,
89
+ log_dir=log_dir,
90
+ )
91
+ sys.exit(rc)
92
+
93
+ # -----------------------------
94
+ # File mode
95
+ # -----------------------------
96
+ if p.is_file():
97
+ if p.suffix.lower() != ".sas7bdat":
98
+ print(f"❌ Not a .sas7bdat file: {p.name}")
99
+ sys.exit(2)
100
+
101
+ if args.out:
102
+ out_file = Path(args.out).expanduser().resolve()
103
+ else:
104
+ out_file = p.with_suffix(".parquet")
105
+
106
+ success = reconvert_file_ultimate(p, out_file)
107
+ sys.exit(0 if success else 1)
108
+
109
+ print(f"❌ Unsupported path type: {p}")
110
+ sys.exit(2)
111
+
112
+
113
+ if __name__ == "__main__":
114
+ main()
@@ -12,19 +12,8 @@ import pyarrow as pa
12
12
  import pyarrow.parquet as pq
13
13
  import pyreadstat
14
14
 
15
- # --- Suppress pandas FutureWarnings ---
16
15
  warnings.simplefilter(action='ignore', category=FutureWarning)
17
16
 
18
- # --- Configuration ---
19
- # Put your .sas7bdat files inside SAS_INPUT_DIR (including subfolders).
20
- SAS_INPUT_DIR = Path("sasdata")
21
-
22
- # IMPORTANT:
23
- # parquetdata/ and logging/ are created NEXT TO sasdata/ (i.e., in the same parent directory).
24
- PARQUET_INPUT_DIR = SAS_INPUT_DIR.parent / "parquetdata"
25
- LOG_DIR = SAS_INPUT_DIR.parent / "logging"
26
- LOG_FILE_PATH = LOG_DIR / f"conversion_{datetime.now():%Y%m%d_%H%M%S}.log"
27
-
28
17
  KNOWN_DATETIME_COLUMNS = [
29
18
  'RPNA_DATE_UTC','RPNA_TIME_UTC','RPA_DATE_UTC','TIMESTAMP_UTC',
30
19
  'EVENT_START_DATE_UTC','EVENT_END_DATE_UTC',
@@ -44,13 +33,10 @@ MAX_CHUNK_SIZE = 10_000_000
44
33
 
45
34
  # --- Logger ---
46
35
  class Logger:
47
- def __init__(self, path):
36
+ def __init__(self, path: Path):
48
37
  self.terminal = sys.stdout
49
-
50
- # Ensure log folder exists
51
38
  path = Path(path)
52
39
  path.parent.mkdir(parents=True, exist_ok=True)
53
-
54
40
  self.logfile = open(path, 'w', encoding='utf-8')
55
41
 
56
42
  def write(self, msg):
@@ -111,7 +97,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
111
97
  pq_it = pq.ParquetFile(parquet_path).iter_batches(batch_size=CH)
112
98
  chunk_i = 0
113
99
 
114
- # SAS→UNIX epoch offset µs
115
100
  offset_us = int((pd.Timestamp("1970-01-01") -
116
101
  pd.Timestamp("1960-01-01")).total_seconds() * 1e6)
117
102
 
@@ -146,7 +131,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
146
131
  pcol = ppq.get_column(col)
147
132
  ds, dp = scol.dtype, pcol.dtype
148
133
  if ds in num_types and dp in num_types:
149
- # unify int<->float
150
134
  if ds in float_types and dp in int_types:
151
135
  pcol = pcol.cast(ds)
152
136
  elif dp in float_types and ds in int_types:
@@ -162,7 +146,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
162
146
  sser = psas.get_column(col)
163
147
  pser = ppq.get_column(col)
164
148
 
165
- # epoch check
166
149
  if sser.dtype == pl.Datetime("us") and pser.dtype == pl.Datetime("us"):
167
150
  raw = sas_chunk[col]
168
151
  if pd.api.types.is_datetime64_ns_dtype(raw):
@@ -180,7 +163,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
180
163
  ))
181
164
  continue
182
165
 
183
- # string compare (with date-only normalization)
184
166
  s_str = sser.cast(pl.Utf8)
185
167
  p_str = pser.cast(pl.Utf8)
186
168
  mask = (s_str != p_str) | (s_str.is_null() != p_str.is_null())
@@ -209,7 +191,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
209
191
  def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
210
192
  print(f"🛠️ Fixing {sas_path.name}...")
211
193
 
212
- # 1) metadata & encoding
213
194
  _, meta0 = pyreadstat.read_sas7bdat(sas_path, metadataonly=True)
214
195
  enc0 = getattr(meta0, 'file_encoding', None)
215
196
  if enc0:
@@ -232,13 +213,12 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
232
213
  cols = meta0.column_names
233
214
  read_types = getattr(meta0, 'readstat_variable_types', {}) or {}
234
215
 
235
- # SAS formats if available
236
216
  fmt_map = {}
237
217
  if hasattr(meta0, 'formats'):
238
218
  for name, fmt in zip(meta0.column_names, meta0.formats):
239
219
  fmt_map[name] = fmt or ""
240
220
 
241
- # infer content types from first few chunks
221
+ # infer content types
242
222
  content, inf, cnt = {}, {}, 0
243
223
  it = pd.read_sas(sas_path, chunksize=MIN_CHUNK_SIZE, encoding=encoding)
244
224
  for chunk in it:
@@ -267,17 +247,14 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
267
247
  print(f" Attempt {attempt}…")
268
248
  fields = []
269
249
  for c in cols:
270
- # 1) SAS-declared numeric → float64
271
250
  if read_types.get(c) == 'double':
272
251
  at = pa.float64()
273
252
  else:
274
253
  cu = c.upper()
275
- # 2) forced-string
276
254
  if cu in {x.upper() for x in COLUMNS_TO_FORCE_AS_STRING}:
277
255
  at = pa.string()
278
256
  else:
279
257
  fmt = fmt_map.get(c, "").upper()
280
- # 3) datetime/date/time
281
258
  if (cu in {x.upper() for x in KNOWN_DATETIME_COLUMNS}
282
259
  or any(x in fmt for x in ('DATE', 'TIME', 'DATETIME'))):
283
260
  if 'DATE' in fmt and 'DATETIME' not in fmt:
@@ -286,14 +263,11 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
286
263
  at = pa.time64('ms')
287
264
  else:
288
265
  at = pa.timestamp('ms')
289
- # 4) fallback
290
266
  else:
291
267
  at = pa.string()
292
268
 
293
- # apply any dynamic override
294
269
  if c in overrides:
295
270
  at = overrides[c]
296
-
297
271
  fields.append(pa.field(c, at))
298
272
 
299
273
  schema = pa.schema(fields)
@@ -317,27 +291,22 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
317
291
  writer.close()
318
292
  print(" ✅ Conversion succeeded")
319
293
 
320
- # ===== FULL PARQUET VALIDATION (WORKING) =====
321
294
  print(" 🔍 Full Parquet validation...")
322
295
  try:
323
296
  pf = pq.ParquetFile(parquet_path)
324
297
  total_rows = 0
325
298
  num_groups = pf.metadata.num_row_groups
326
299
  batch_count = 0
327
-
328
300
  for batch in pf.iter_batches():
329
301
  total_rows += batch.num_rows
330
302
  batch_count += 1
331
-
332
303
  print(f" ✅ Parquet fully validated: {total_rows:,} rows across {num_groups} groups ({batch_count} batches)")
333
304
  pf.close()
334
305
  except Exception as e:
335
306
  print(f" ❌ Parquet validation failed: {e}")
336
307
  return False
337
- # ===== END =====
338
308
 
339
309
  st, dt = compare_and_report_diffs(sas_path, parquet_path)
340
-
341
310
  print(f" 🔍 Validation: {st}")
342
311
  for d in dt:
343
312
  print(" -", d.replace("\n", "\n "))
@@ -369,39 +338,78 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
369
338
  return False
370
339
 
371
340
 
372
- # --- Main loop ---
373
- def main():
341
+ def default_parquet_dir_for(sas_input_dir: Path) -> Path:
342
+ # sibling parquetdata/ next to sas_input_dir
343
+ return sas_input_dir.parent / "parquetdata"
344
+
345
+
346
+ def default_log_dir_for(sas_input_dir: Path) -> Path:
347
+ # sibling logging/ next to sas_input_dir
348
+ return sas_input_dir.parent / "logging"
349
+
350
+
351
+ def parquet_path_for_sas(sas_file: Path, sas_input_dir: Path, parquet_output_dir: Path) -> Path:
352
+ rel = sas_file.relative_to(sas_input_dir)
353
+
354
+ if rel.parent == Path("."):
355
+ return (parquet_output_dir / rel.name).with_suffix(".parquet")
356
+
357
+ parquet_dirs = [f"{p}_parquet" for p in rel.parent.parts]
358
+ return (parquet_output_dir.joinpath(*parquet_dirs) / rel.name).with_suffix(".parquet")
359
+
360
+
361
+ # --- Main loop (directory mode) ---
362
+ def main(
363
+ sas_input_dir: Path,
364
+ parquet_output_dir: Path | None = None,
365
+ log_dir: Path | None = None,
366
+ ) -> int:
367
+ sas_input_dir = Path(sas_input_dir).expanduser().resolve()
368
+ if not sas_input_dir.exists() or not sas_input_dir.is_dir():
369
+ print(f"❌ Input directory not found or not a directory: {sas_input_dir}")
370
+ return 2
371
+
372
+ parquet_output_dir = (Path(parquet_output_dir).expanduser().resolve()
373
+ if parquet_output_dir else default_parquet_dir_for(sas_input_dir))
374
+ log_dir = (Path(log_dir).expanduser().resolve()
375
+ if log_dir else default_log_dir_for(sas_input_dir))
376
+
377
+ log_file_path = log_dir / f"conversion_{datetime.now():%Y%m%d_%H%M%S}.log"
378
+
374
379
  orig = sys.stdout
375
- sys.stdout = Logger(LOG_FILE_PATH)
380
+ sys.stdout = Logger(log_file_path)
376
381
  try:
377
382
  print("🚀 SAS → Parquet Hybrid Fix & Validate (full folder)\n")
378
- files = list(SAS_INPUT_DIR.rglob("*.sas7bdat"))
383
+ print(f"Input: {sas_input_dir}")
384
+ print(f"Output: {parquet_output_dir}")
385
+ print(f"Logs: {log_file_path}\n")
386
+
387
+ files = list(sas_input_dir.rglob("*.sas7bdat"))
379
388
  if not files:
380
389
  print("❌ No SAS files found. Exiting.")
381
- return
390
+ return 1
382
391
 
383
392
  print(f"Found {len(files)} files.\n" + "="*60)
393
+ ok = 0
394
+ bad = 0
395
+
384
396
  for sas in files:
385
- rel = sas.relative_to(SAS_INPUT_DIR)
397
+ rel = sas.relative_to(sas_input_dir)
386
398
  print(f"\n🗂 Processing: {rel}")
387
399
 
388
- # Mirror structure under parquetdata/ (which lives next to sasdata/)
400
+ pqf = parquet_path_for_sas(sas, sas_input_dir, parquet_output_dir)
401
+ success = reconvert_file_ultimate(sas, pqf)
389
402
 
390
- if rel.parent == Path("."):
391
- pqf = (PARQUET_INPUT_DIR / rel.name).with_suffix(".parquet")
403
+ if success:
404
+ ok += 1
392
405
  else:
393
- parquet_dirs = [f"{p}_parquet" for p in rel.parent.parts]
394
- pqf = (PARQUET_INPUT_DIR.joinpath(*parquet_dirs) / rel.name).with_suffix(".parquet")
395
-
396
-
397
- reconvert_file_ultimate(sas, pqf)
406
+ bad += 1
407
+
398
408
  print("-"*60)
399
409
 
400
- print("\n✅ All done. See log at:", LOG_FILE_PATH)
410
+ print(f"\n✅ Done. Success={ok}, Failed={bad}. See log at: {log_file_path}")
411
+ return 0 if bad == 0 else 1
412
+
401
413
  finally:
402
414
  sys.stdout.close()
403
- sys.stdout = orig
404
-
405
-
406
- if __name__ == "__main__":
407
- main()
415
+ sys.stdout = orig
@@ -1,135 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: sas2parquet
3
- Version: 0.1.8
4
- Summary: SAS → Parquet Hybrid Converter & Validator
5
- License-File: LICENSE
6
- Author: Zaman Ziabakhshganji
7
- Author-email: zaman.ganji@gmail.com
8
- Requires-Python: >=3.11
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: Programming Language :: Python :: 3.11
11
- Classifier: Programming Language :: Python :: 3.12
12
- Classifier: Programming Language :: Python :: 3.13
13
- Classifier: Programming Language :: Python :: 3.14
14
- Requires-Dist: narwhals (==2.13.0)
15
- Requires-Dist: numpy (==2.3.5)
16
- Requires-Dist: pandas (==2.3.3)
17
- Requires-Dist: polars (==1.36.1)
18
- Requires-Dist: polars-runtime-32 (==1.36.1)
19
- Requires-Dist: py4j (==0.10.9.9)
20
- Requires-Dist: pyarrow (==22.0.0)
21
- Requires-Dist: pyreadstat (==1.3.2)
22
- Requires-Dist: pyspark (==4.0.1)
23
- Requires-Dist: pytest (>=9.0.2,<10.0.0)
24
- Requires-Dist: python-dateutil (==2.9.0.post0)
25
- Requires-Dist: pytz (==2025.2)
26
- Requires-Dist: requests (>=2.32.5,<3.0.0)
27
- Requires-Dist: six (==1.17.0)
28
- Requires-Dist: tzdata (==2025.2)
29
- Description-Content-Type: text/markdown
30
-
31
- # sas2parquet
32
-
33
- [![PyPI version](https://badge.fury.io/py/sas2parquet.svg)](https://pypi.org/project/sas2parquet/)
34
- [![Python versions](https://img.shields.io/pypi/pyversions/sas2parquet.svg)](https://pypi.org/project/sas2parquet/)
35
- [![License](https://img.shields.io/pypi/l/sas2parquet.svg)](LICENCE)
36
-
37
- **The ultimate SAS (.sas7bdat) to Parquet converter** - handles problematic files that fail with standard tools. Automatic encoding detection, intelligent type inference, schema repair, and pixel-perfect validation.
38
-
39
- ## ✨ Features
40
-
41
- | Feature | Description |
42
- |---------|-------------|
43
- | 🔄 **Auto Encoding** | Detects UTF-8, Latin1, CP1252 from metadata/fallback |
44
- | 🧠 **Smart Types** | Infers datetime, numeric, string with 20+ fallback attempts |
45
- | ✅ **Validation** | Compares SAS vs Parquet chunk-by-chunk (numeric + string) |
46
- | 📊 **Memory Safe** | Chunked processing (96GB RAM optimized, configurable) |
47
- | 💾 **ZSTD** | Level 6 compression for maximum efficiency |
48
- | 📝 **Detailed Logs** | Mismatch reports + full conversion trace |
49
- | 🎯 **Two Modes** | Single file OR recursive directory processing |
50
-
51
- ## Quick Start
52
-
53
- ### Install
54
- ```bash
55
- pip install sas2parquet
56
- ```
57
-
58
- ### Single File
59
- ```bash
60
- sas2parquet input.sas output.parquet
61
- ```
62
-
63
- ### Batch Directory (Recommended)
64
- ```bash
65
- sas2parquet --dir-mode
66
- ```
67
-
68
- ## 📁 Directory Mode (Default Workflow)
69
-
70
- ### How it works
71
-
72
- - You provide a `sasdata/` directory containing all `.sas7bdat` files (including nested subfolders).
73
- - The tool automatically creates a `parquetdata/` directory in the same parent folder as `sasdata/`.
74
- - All files are converted to Parquet and written into `parquetdata/`, mirroring the original folder structure.
75
-
76
- ```text
77
- your-project/
78
- ├── sasdata/ # ← Put your .sas7bdat files here
79
- │ ├── file1.sas7bdat
80
- │ └── subfolder/
81
- │ └── nested.sas7bdat
82
- ├── parquetdata/ # ← AUTO-CREATED (mirrors sasdata/)
83
- │ ├── file1.parquet
84
- │ └── subfolder/
85
- │ └── nested.parquet
86
- └── logging/ # ← AUTO-CREATED (detailed logs)
87
- └── conversion_20260205_1145.log
88
- ```
89
-
90
- Just run:
91
- ```bash
92
- sas2parquet --dir-mode
93
- ```
94
-
95
- ## 🛠️ CLI Reference
96
- ```bash
97
- sas2parquet --help
98
- ```
99
-
100
- ```text
101
- usage: sas2parquet [-h] [--dir-mode] [sas_file] [parquet_file]
102
-
103
- Robust SAS to Parquet converter with validation
104
- ```
105
-
106
- ## 📊 Example Output
107
- ```text
108
- 🚀 SAS → Parquet Hybrid Fix & Validate (full folder)
109
- Found 3 files.
110
- ============================================================
111
- ...
112
- ```
113
-
114
- ## ⚙️ Configuration (Advanced)
115
-
116
- Edit `src/sas2parquet/convert.py` constants:
117
-
118
- ```python
119
- AVAILABLE_RAM_GB = 96
120
- RAM_USAGE_FACTOR = 0.5
121
- ZSTD_COMPRESSION_LEVEL = 6
122
- MIN_CHUNK_SIZE = 100_000
123
- MAX_CHUNK_SIZE = 10_000_000
124
- ```
125
-
126
- ## 🧪 Validation Details
127
- Each file undergoes 4-stage validation:
128
- 1. Metadata
129
- 2. Exact counts
130
- 3. Column order
131
- 4. Value comparison
132
-
133
- ## 📄 License
134
- MIT License
135
-
@@ -1,104 +0,0 @@
1
- # sas2parquet
2
-
3
- [![PyPI version](https://badge.fury.io/py/sas2parquet.svg)](https://pypi.org/project/sas2parquet/)
4
- [![Python versions](https://img.shields.io/pypi/pyversions/sas2parquet.svg)](https://pypi.org/project/sas2parquet/)
5
- [![License](https://img.shields.io/pypi/l/sas2parquet.svg)](LICENCE)
6
-
7
- **The ultimate SAS (.sas7bdat) to Parquet converter** - handles problematic files that fail with standard tools. Automatic encoding detection, intelligent type inference, schema repair, and pixel-perfect validation.
8
-
9
- ## ✨ Features
10
-
11
- | Feature | Description |
12
- |---------|-------------|
13
- | 🔄 **Auto Encoding** | Detects UTF-8, Latin1, CP1252 from metadata/fallback |
14
- | 🧠 **Smart Types** | Infers datetime, numeric, string with 20+ fallback attempts |
15
- | ✅ **Validation** | Compares SAS vs Parquet chunk-by-chunk (numeric + string) |
16
- | 📊 **Memory Safe** | Chunked processing (96GB RAM optimized, configurable) |
17
- | 💾 **ZSTD** | Level 6 compression for maximum efficiency |
18
- | 📝 **Detailed Logs** | Mismatch reports + full conversion trace |
19
- | 🎯 **Two Modes** | Single file OR recursive directory processing |
20
-
21
- ## Quick Start
22
-
23
- ### Install
24
- ```bash
25
- pip install sas2parquet
26
- ```
27
-
28
- ### Single File
29
- ```bash
30
- sas2parquet input.sas output.parquet
31
- ```
32
-
33
- ### Batch Directory (Recommended)
34
- ```bash
35
- sas2parquet --dir-mode
36
- ```
37
-
38
- ## 📁 Directory Mode (Default Workflow)
39
-
40
- ### How it works
41
-
42
- - You provide a `sasdata/` directory containing all `.sas7bdat` files (including nested subfolders).
43
- - The tool automatically creates a `parquetdata/` directory in the same parent folder as `sasdata/`.
44
- - All files are converted to Parquet and written into `parquetdata/`, mirroring the original folder structure.
45
-
46
- ```text
47
- your-project/
48
- ├── sasdata/ # ← Put your .sas7bdat files here
49
- │ ├── file1.sas7bdat
50
- │ └── subfolder/
51
- │ └── nested.sas7bdat
52
- ├── parquetdata/ # ← AUTO-CREATED (mirrors sasdata/)
53
- │ ├── file1.parquet
54
- │ └── subfolder/
55
- │ └── nested.parquet
56
- └── logging/ # ← AUTO-CREATED (detailed logs)
57
- └── conversion_20260205_1145.log
58
- ```
59
-
60
- Just run:
61
- ```bash
62
- sas2parquet --dir-mode
63
- ```
64
-
65
- ## 🛠️ CLI Reference
66
- ```bash
67
- sas2parquet --help
68
- ```
69
-
70
- ```text
71
- usage: sas2parquet [-h] [--dir-mode] [sas_file] [parquet_file]
72
-
73
- Robust SAS to Parquet converter with validation
74
- ```
75
-
76
- ## 📊 Example Output
77
- ```text
78
- 🚀 SAS → Parquet Hybrid Fix & Validate (full folder)
79
- Found 3 files.
80
- ============================================================
81
- ...
82
- ```
83
-
84
- ## ⚙️ Configuration (Advanced)
85
-
86
- Edit `src/sas2parquet/convert.py` constants:
87
-
88
- ```python
89
- AVAILABLE_RAM_GB = 96
90
- RAM_USAGE_FACTOR = 0.5
91
- ZSTD_COMPRESSION_LEVEL = 6
92
- MIN_CHUNK_SIZE = 100_000
93
- MAX_CHUNK_SIZE = 10_000_000
94
- ```
95
-
96
- ## 🧪 Validation Details
97
- Each file undergoes 4-stage validation:
98
- 1. Metadata
99
- 2. Exact counts
100
- 3. Column order
101
- 4. Value comparison
102
-
103
- ## 📄 License
104
- MIT License
@@ -1,37 +0,0 @@
1
- [project]
2
- name = "sas2parquet"
3
- version = "0.1.8"
4
- description = "SAS → Parquet Hybrid Converter & Validator"
5
- authors = [
6
- {name = "Zaman Ziabakhshganji",email = "zaman.ganji@gmail.com"}
7
- ]
8
- readme = "README.md"
9
- requires-python = ">=3.11"
10
- dependencies = [
11
- "pytest (>=9.0.2,<10.0.0)",
12
- "requests (>=2.32.5,<3.0.0)",
13
- "narwhals (==2.13.0)",
14
- "numpy (==2.3.5)",
15
- "pandas (==2.3.3)",
16
- "polars (==1.36.1)",
17
- "polars-runtime-32 (==1.36.1)",
18
- "py4j (==0.10.9.9)",
19
- "pyarrow (==22.0.0)",
20
- "pyreadstat (==1.3.2)",
21
- "pyspark (==4.0.1)",
22
- "python-dateutil (==2.9.0.post0)",
23
- "pytz (==2025.2)",
24
- "six (==1.17.0)",
25
- "tzdata (==2025.2)",
26
- ]
27
-
28
- [tool.poetry]
29
- packages = [{include = "sas2parquet", from = "src"}]
30
-
31
- [tool.poetry.scripts]
32
- sas2parquet = "sas2parquet.cli:main"
33
-
34
- [build-system]
35
- requires = ["poetry-core>=2.0.0,<3.0.0"]
36
- build-backend = "poetry.core.masonry.api"
37
-
@@ -1,36 +0,0 @@
1
- #!/usr/bin/env python
2
- """CLI entrypoint for sas2parquet."""
3
- import argparse
4
- import sys
5
- from pathlib import Path
6
- import importlib.metadata
7
-
8
- # Get version from installed package metadata (works everywhere)
9
- try:
10
- __version__ = importlib.metadata.version("sas2parquet")
11
- except importlib.metadata.PackageNotFoundError:
12
- __version__ = "dev" # During development
13
-
14
- from .convert import main as _convert_main, reconvert_file_ultimate
15
-
16
- def main():
17
- parser = argparse.ArgumentParser(description="SAS to Parquet converter")
18
- parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
19
- parser.add_argument("sas_file", nargs="?", help="Single SAS file to convert")
20
- parser.add_argument("parquet_file", nargs="?", help="Output Parquet file")
21
- parser.add_argument("--dir-mode", "-d", action="store_true",
22
- help="Process entire SAS_INPUT_DIR (ignores file args)")
23
-
24
- args = parser.parse_args()
25
-
26
- if args.dir_mode:
27
- _convert_main()
28
- elif args.sas_file and args.parquet_file:
29
- success = reconvert_file_ultimate(Path(args.sas_file), Path(args.parquet_file))
30
- sys.exit(0 if success else 1)
31
- else:
32
- parser.print_help()
33
- sys.exit(1)
34
-
35
- if __name__ == "__main__":
36
- main()
File without changes