sas2parquet 0.1.6__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,15 @@
  Metadata-Version: 2.4
  Name: sas2parquet
- Version: 0.1.6
+ Version: 0.1.9
  Summary: SAS → Parquet Hybrid Converter & Validator
+ License: MIT
  License-File: LICENSE
+ Keywords: sas,parquet,etl,data,pyarrow
  Author: Zaman Ziabakhshganji
  Author-email: zaman.ganji@gmail.com
- Requires-Python: >=3.11
+ Requires-Python: >=3.11,<4.0
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
@@ -15,17 +19,15 @@ Requires-Dist: narwhals (==2.13.0)
  Requires-Dist: numpy (==2.3.5)
  Requires-Dist: pandas (==2.3.3)
  Requires-Dist: polars (==1.36.1)
- Requires-Dist: polars-runtime-32 (==1.36.1)
- Requires-Dist: py4j (==0.10.9.9)
  Requires-Dist: pyarrow (==22.0.0)
  Requires-Dist: pyreadstat (==1.3.2)
- Requires-Dist: pyspark (==4.0.1)
- Requires-Dist: pytest (>=9.0.2,<10.0.0)
  Requires-Dist: python-dateutil (==2.9.0.post0)
  Requires-Dist: pytz (==2025.2)
  Requires-Dist: requests (>=2.32.5,<3.0.0)
  Requires-Dist: six (==1.17.0)
  Requires-Dist: tzdata (==2025.2)
+ Project-URL: Homepage, https://github.com/<you>/<repo>
+ Project-URL: Repository, https://github.com/<you>/<repo>
  Description-Content-Type: text/markdown

  # sas2parquet
@@ -75,13 +77,15 @@ sas2parquet --dir-mode

  ```text
  your-project/
- ├── sasdata/
+ ├── sasdata/                  # ← Put your .sas7bdat files here
  │   ├── file1.sas7bdat
- │   └── nested.sas7bdat
- ├── parquetdata/
+ │   └── subfolder/
+ │       └── nested.sas7bdat
+ ├── parquetdata/              # ← AUTO-CREATED (mirrors sasdata/)
  │   ├── file1.parquet
- │   └── nested.parquet
- └── logging/
+ │   └── subfolder/
+ │       └── nested.parquet
+ └── logging/                  # ← AUTO-CREATED (detailed logs)
      └── conversion_20260205_1145.log
  ```

@@ -45,13 +45,15 @@ sas2parquet --dir-mode

  ```text
  your-project/
- ├── sasdata/
+ ├── sasdata/                  # ← Put your .sas7bdat files here
  │   ├── file1.sas7bdat
- │   └── nested.sas7bdat
- ├── parquetdata/
+ │   └── subfolder/
+ │       └── nested.sas7bdat
+ ├── parquetdata/              # ← AUTO-CREATED (mirrors sasdata/)
  │   ├── file1.parquet
- │   └── nested.parquet
- └── logging/
+ │   └── subfolder/
+ │       └── nested.parquet
+ └── logging/                  # ← AUTO-CREATED (detailed logs)
      └── conversion_20260205_1145.log
  ```

@@ -0,0 +1,48 @@
+ [tool.poetry]
+ name = "sas2parquet"
+ version = "0.1.9"
+ description = "SAS → Parquet Hybrid Converter & Validator"
+ readme = "README.md"
+ authors = ["Zaman Ziabakhshganji <zaman.ganji@gmail.com>"]
+ license = "MIT"
+ packages = [{ include = "sas2parquet", from = "src" }]
+ # Optional but nice for PyPI:
+ repository = "https://github.com/<you>/<repo>"
+ homepage = "https://github.com/<you>/<repo>"
+ keywords = ["sas", "parquet", "etl", "data", "pyarrow"]
+ classifiers = [
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "License :: OSI Approved :: MIT License",
+     "Operating System :: OS Independent",
+ ]
+
+ [tool.poetry.dependencies]
+ python = ">=3.11,<4.0"
+ requests = ">=2.32.5,<3.0.0"
+ narwhals = "==2.13.0"
+ numpy = "==2.3.5"
+ pandas = "==2.3.3"
+ polars = "==1.36.1"
+ pyarrow = "==22.0.0"
+ pyreadstat = "==1.3.2"
+ python-dateutil = "==2.9.0.post0"
+ pytz = "==2025.2"
+ six = "==1.17.0"
+ tzdata = "==2025.2"
+
+ # 🚫 Strongly consider NOT shipping these as required deps:
+ #    pyspark, py4j, polars-runtime-32
+ # They dramatically inflate installs and aren't required for your conversion script.
+ # If you still want them, put them behind extras (see below).
+
+ [tool.poetry.group.dev.dependencies]
+ pytest = ">=9.0.2,<10.0.0"
+
+ [tool.poetry.scripts]
+ sas2parquet = "sas2parquet.cli:main"
+
+ [build-system]
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
+ build-backend = "poetry.core.masonry.api"
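The extras table that the comment block above points to ("see below") is not part of the released file. A minimal sketch of what it could look like using Poetry's optional-dependency syntax; the `spark` extra name is hypothetical:

```toml
[tool.poetry.dependencies]
# Declared optional so they are only pulled in via the extra below.
pyspark = { version = "==4.0.1", optional = true }
py4j = { version = "==0.10.9.9", optional = true }

[tool.poetry.extras]
spark = ["pyspark", "py4j"]
```

Users who want the Spark path could then run `pip install "sas2parquet[spark]"`; everyone else gets the slim install.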
@@ -0,0 +1,69 @@
+ #!/usr/bin/env python
+ """CLI entrypoint for sas2parquet."""
+ import argparse
+ import sys
+ from pathlib import Path
+ import importlib.metadata
+
+ try:
+     __version__ = importlib.metadata.version("sas2parquet")
+ except importlib.metadata.PackageNotFoundError:
+     __version__ = "dev"
+
+ from .convert import main as convert_dir, reconvert_file_ultimate
+
+
+ def main():
+     parser = argparse.ArgumentParser(prog="sas2parquet", description="SAS to Parquet converter")
+     parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
+
+     parser.add_argument(
+         "path",
+         help="Path to a .sas7bdat file OR a directory containing SAS files (recursively)."
+     )
+     parser.add_argument(
+         "--out", "-o",
+         help="Output Parquet file (file mode) OR output directory (dir mode). "
+              "If omitted, dir mode uses sibling 'parquetdata/'.",
+         default=None
+     )
+     parser.add_argument(
+         "--log-dir",
+         help="Directory where logs are written (dir mode). If omitted, uses sibling 'logging/'.",
+         default=None
+     )
+
+     args = parser.parse_args()
+
+     p = Path(args.path).expanduser().resolve()
+     if not p.exists():
+         print(f"❌ Path not found: {p}")
+         sys.exit(2)
+
+     # Directory mode
+     if p.is_dir():
+         out_dir = Path(args.out).expanduser().resolve() if args.out else None
+         log_dir = Path(args.log_dir).expanduser().resolve() if args.log_dir else None
+         rc = convert_dir(p, parquet_output_dir=out_dir, log_dir=log_dir)
+         sys.exit(rc)
+
+     # File mode
+     if p.is_file():
+         if p.suffix.lower() != ".sas7bdat":
+             print(f"❌ Not a .sas7bdat file: {p.name}")
+             sys.exit(2)
+
+         if args.out:
+             out_file = Path(args.out).expanduser().resolve()
+         else:
+             out_file = p.with_suffix(".parquet")
+
+         success = reconvert_file_ultimate(p, out_file)
+         sys.exit(0 if success else 1)
+
+     print(f"❌ Unsupported path type: {p}")
+     sys.exit(2)
+
+
+ if __name__ == "__main__":
+     main()
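For orientation, the new entrypoint takes a single positional path plus optional flags; invocations like the following (paths illustrative) exercise both modes:

```text
sas2parquet --version                      # print the installed version
sas2parquet data/file1.sas7bdat            # file mode → data/file1.parquet
sas2parquet data/file1.sas7bdat -o out/file1.parquet
sas2parquet sasdata/                       # dir mode → sibling parquetdata/ and logging/
sas2parquet sasdata/ --out parquetdata/ --log-dir logging/
```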
@@ -12,19 +12,8 @@ import pyarrow as pa
  import pyarrow.parquet as pq
  import pyreadstat

- # --- Suppress pandas FutureWarnings ---
  warnings.simplefilter(action='ignore', category=FutureWarning)

- # --- Configuration ---
- # Put your .sas7bdat files inside SAS_INPUT_DIR (including subfolders).
- SAS_INPUT_DIR = Path("sasdata")
-
- # IMPORTANT:
- # parquetdata/ and logging/ are created NEXT TO sasdata/ (i.e., in the same parent directory).
- PARQUET_INPUT_DIR = SAS_INPUT_DIR.parent / "parquetdata"
- LOG_DIR = SAS_INPUT_DIR.parent / "logging"
- LOG_FILE_PATH = LOG_DIR / f"conversion_{datetime.now():%Y%m%d_%H%M%S}.log"
-
  KNOWN_DATETIME_COLUMNS = [
      'RPNA_DATE_UTC','RPNA_TIME_UTC','RPA_DATE_UTC','TIMESTAMP_UTC',
      'EVENT_START_DATE_UTC','EVENT_END_DATE_UTC',
@@ -44,13 +33,10 @@ MAX_CHUNK_SIZE = 10_000_000

  # --- Logger ---
  class Logger:
-     def __init__(self, path):
+     def __init__(self, path: Path):
          self.terminal = sys.stdout
-
-         # Ensure log folder exists
          path = Path(path)
          path.parent.mkdir(parents=True, exist_ok=True)
-
          self.logfile = open(path, 'w', encoding='utf-8')

      def write(self, msg):
@@ -111,7 +97,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
      pq_it = pq.ParquetFile(parquet_path).iter_batches(batch_size=CH)
      chunk_i = 0

-     # SAS→UNIX epoch offset µs
      offset_us = int((pd.Timestamp("1970-01-01") -
                       pd.Timestamp("1960-01-01")).total_seconds() * 1e6)

@@ -146,7 +131,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
              pcol = ppq.get_column(col)
              ds, dp = scol.dtype, pcol.dtype
              if ds in num_types and dp in num_types:
-                 # unify int<->float
                  if ds in float_types and dp in int_types:
                      pcol = pcol.cast(ds)
                  elif dp in float_types and ds in int_types:
@@ -162,7 +146,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
              sser = psas.get_column(col)
              pser = ppq.get_column(col)

-             # epoch check
              if sser.dtype == pl.Datetime("us") and pser.dtype == pl.Datetime("us"):
                  raw = sas_chunk[col]
                  if pd.api.types.is_datetime64_ns_dtype(raw):
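The `offset_us` constant this comparison relies on (defined a few hunks up) is the SAS→Unix epoch shift. A quick sanity check of the arithmetic, not part of the package:

```python
import pandas as pd

# SAS datetimes count from 1960-01-01, Unix from 1970-01-01.
# The decade in between spans 3653 days (leap years: 1960, 1964, 1968).
delta = pd.Timestamp("1970-01-01") - pd.Timestamp("1960-01-01")
assert delta.days == 3653
assert int(delta.total_seconds() * 1e6) == 315_619_200_000_000  # microseconds
```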
@@ -180,7 +163,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
                  ))
                  continue

-             # string compare (with date-only normalization)
              s_str = sser.cast(pl.Utf8)
              p_str = pser.cast(pl.Utf8)
              mask = (s_str != p_str) | (s_str.is_null() != p_str.is_null())
@@ -209,7 +191,6 @@ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
  def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
      print(f"🛠️ Fixing {sas_path.name}...")

-     # 1) metadata & encoding
      _, meta0 = pyreadstat.read_sas7bdat(sas_path, metadataonly=True)
      enc0 = getattr(meta0, 'file_encoding', None)
      if enc0:
@@ -232,13 +213,12 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
      cols = meta0.column_names
      read_types = getattr(meta0, 'readstat_variable_types', {}) or {}

-     # SAS formats if available
      fmt_map = {}
      if hasattr(meta0, 'formats'):
          for name, fmt in zip(meta0.column_names, meta0.formats):
              fmt_map[name] = fmt or ""

-     # infer content types from first few chunks
+     # infer content types
      content, inf, cnt = {}, {}, 0
      it = pd.read_sas(sas_path, chunksize=MIN_CHUNK_SIZE, encoding=encoding)
      for chunk in it:
@@ -267,17 +247,14 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
          print(f" Attempt {attempt}…")
          fields = []
          for c in cols:
-             # 1) SAS-declared numeric → float64
              if read_types.get(c) == 'double':
                  at = pa.float64()
              else:
                  cu = c.upper()
-                 # 2) forced-string
                  if cu in {x.upper() for x in COLUMNS_TO_FORCE_AS_STRING}:
                      at = pa.string()
                  else:
                      fmt = fmt_map.get(c, "").upper()
-                     # 3) datetime/date/time
                      if (cu in {x.upper() for x in KNOWN_DATETIME_COLUMNS}
                              or any(x in fmt for x in ('DATE', 'TIME', 'DATETIME'))):
@@ -286,14 +263,11 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
                              at = pa.time64('ms')
                          else:
                              at = pa.timestamp('ms')
-                 # 4) fallback
                      else:
                          at = pa.string()

-             # apply any dynamic override
              if c in overrides:
                  at = overrides[c]
-
              fields.append(pa.field(c, at))

      schema = pa.schema(fields)
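The numbered comments removed here documented a four-step precedence that the loop still follows. A condensed restatement for reference, as a sketch rather than the package's code; the date-only branch sits in elided context between the two hunks above, so `pa.date32()` is an assumption:

```python
import pyarrow as pa

def arrow_type_for(col: str, read_type: str, fmt: str,
                   force_string: set[str], known_dt: set[str]) -> pa.DataType:
    if read_type == 'double':                        # 1) SAS-declared numeric → float64
        return pa.float64()
    if col.upper() in force_string:                  # 2) forced-string override
        return pa.string()
    fmt = fmt.upper()
    if col.upper() in known_dt or any(x in fmt for x in ('DATE', 'TIME', 'DATETIME')):
        if 'DATE' in fmt and 'DATETIME' not in fmt:  # 3) date / time / datetime by format
            return pa.date32()                       #    (assumed from elided context)
        if 'TIME' in fmt and 'DATETIME' not in fmt:
            return pa.time64('ms')
        return pa.timestamp('ms')
    return pa.string()                               # 4) fallback
```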
@@ -317,27 +291,22 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
      writer.close()
      print(" ✅ Conversion succeeded")

-     # ===== FULL PARQUET VALIDATION (WORKING) =====
      print(" 🔍 Full Parquet validation...")
      try:
          pf = pq.ParquetFile(parquet_path)
          total_rows = 0
          num_groups = pf.metadata.num_row_groups
          batch_count = 0
-
          for batch in pf.iter_batches():
              total_rows += batch.num_rows
              batch_count += 1
-
          print(f" ✅ Parquet fully validated: {total_rows:,} rows across {num_groups} groups ({batch_count} batches)")
          pf.close()
      except Exception as e:
          print(f" ❌ Parquet validation failed: {e}")
          return False
-     # ===== END =====

      st, dt = compare_and_report_diffs(sas_path, parquet_path)
-
      print(f" 🔍 Validation: {st}")
      for d in dt:
          print(" -", d.replace("\n", "\n "))
@@ -369,39 +338,78 @@ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
          return False


- # --- Main loop ---
- def main():
+ def default_parquet_dir_for(sas_input_dir: Path) -> Path:
+     # sibling parquetdata/ next to sas_input_dir
+     return sas_input_dir.parent / "parquetdata"
+
+
+ def default_log_dir_for(sas_input_dir: Path) -> Path:
+     # sibling logging/ next to sas_input_dir
+     return sas_input_dir.parent / "logging"
+
+
+ def parquet_path_for_sas(sas_file: Path, sas_input_dir: Path, parquet_output_dir: Path) -> Path:
+     rel = sas_file.relative_to(sas_input_dir)
+
+     if rel.parent == Path("."):
+         return (parquet_output_dir / rel.name).with_suffix(".parquet")
+
+     parquet_dirs = [f"{p}_parquet" for p in rel.parent.parts]
+     return (parquet_output_dir.joinpath(*parquet_dirs) / rel.name).with_suffix(".parquet")
+
+
+ # --- Main loop (directory mode) ---
+ def main(
+     sas_input_dir: Path,
+     parquet_output_dir: Path | None = None,
+     log_dir: Path | None = None,
+ ) -> int:
+     sas_input_dir = Path(sas_input_dir).expanduser().resolve()
+     if not sas_input_dir.exists() or not sas_input_dir.is_dir():
+         print(f"❌ Input directory not found or not a directory: {sas_input_dir}")
+         return 2
+
+     parquet_output_dir = (Path(parquet_output_dir).expanduser().resolve()
+                           if parquet_output_dir else default_parquet_dir_for(sas_input_dir))
+     log_dir = (Path(log_dir).expanduser().resolve()
+                if log_dir else default_log_dir_for(sas_input_dir))
+
+     log_file_path = log_dir / f"conversion_{datetime.now():%Y%m%d_%H%M%S}.log"
+
      orig = sys.stdout
-     sys.stdout = Logger(LOG_FILE_PATH)
+     sys.stdout = Logger(log_file_path)
      try:
          print("🚀 SAS → Parquet Hybrid Fix & Validate (full folder)\n")
-         files = list(SAS_INPUT_DIR.rglob("*.sas7bdat"))
+         print(f"Input: {sas_input_dir}")
+         print(f"Output: {parquet_output_dir}")
+         print(f"Logs: {log_file_path}\n")
+
+         files = list(sas_input_dir.rglob("*.sas7bdat"))
          if not files:
              print("❌ No SAS files found. Exiting.")
-             return
+             return 1

          print(f"Found {len(files)} files.\n" + "="*60)
+         ok = 0
+         bad = 0
+
          for sas in files:
-             rel = sas.relative_to(SAS_INPUT_DIR)
+             rel = sas.relative_to(sas_input_dir)
              print(f"\n🗂 Processing: {rel}")

-             # Mirror structure under parquetdata/ (which lives next to sasdata/)
+             pqf = parquet_path_for_sas(sas, sas_input_dir, parquet_output_dir)
+             success = reconvert_file_ultimate(sas, pqf)

-             if rel.parent == Path("."):
-                 pqf = (PARQUET_INPUT_DIR / rel.name).with_suffix(".parquet")
+             if success:
+                 ok += 1
              else:
-                 parquet_dirs = [f"{p}_parquet" for p in rel.parent.parts]
-                 pqf = (PARQUET_INPUT_DIR.joinpath(*parquet_dirs) / rel.name).with_suffix(".parquet")
-
-
-             reconvert_file_ultimate(sas, pqf)
+                 bad += 1
+
              print("-"*60)

-         print("\n✅ All done. See log at:", LOG_FILE_PATH)
+         print(f"\n✅ Done. Success={ok}, Failed={bad}. See log at: {log_file_path}")
+         return 0 if bad == 0 else 1
+
      finally:
          sys.stdout.close()
-         sys.stdout = orig
-
-
- if __name__ == "__main__":
-     main()
+         sys.stdout = orig
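One detail of the new mirroring helper worth seeing concretely: nested input folders are suffixed with `_parquet` on the output side, while top-level files map straight across. An illustrative check with hypothetical paths:

```python
from pathlib import Path
from sas2parquet.convert import parquet_path_for_sas

# Top-level file: name carries over, only the suffix changes.
assert parquet_path_for_sas(
    Path("/data/sasdata/file1.sas7bdat"),
    Path("/data/sasdata"), Path("/data/parquetdata"),
) == Path("/data/parquetdata/file1.parquet")

# Nested file: each intermediate folder gains a "_parquet" suffix.
assert parquet_path_for_sas(
    Path("/data/sasdata/subfolder/nested.sas7bdat"),
    Path("/data/sasdata"), Path("/data/parquetdata"),
) == Path("/data/parquetdata/subfolder_parquet/nested.parquet")
```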
@@ -1,37 +0,0 @@
- [project]
- name = "sas2parquet"
- version = "0.1.6"
- description = "SAS → Parquet Hybrid Converter & Validator"
- authors = [
-     {name = "Zaman Ziabakhshganji",email = "zaman.ganji@gmail.com"}
- ]
- readme = "README.md"
- requires-python = ">=3.11"
- dependencies = [
-     "pytest (>=9.0.2,<10.0.0)",
-     "requests (>=2.32.5,<3.0.0)",
-     "narwhals (==2.13.0)",
-     "numpy (==2.3.5)",
-     "pandas (==2.3.3)",
-     "polars (==1.36.1)",
-     "polars-runtime-32 (==1.36.1)",
-     "py4j (==0.10.9.9)",
-     "pyarrow (==22.0.0)",
-     "pyreadstat (==1.3.2)",
-     "pyspark (==4.0.1)",
-     "python-dateutil (==2.9.0.post0)",
-     "pytz (==2025.2)",
-     "six (==1.17.0)",
-     "tzdata (==2025.2)",
- ]
-
- [tool.poetry]
- packages = [{include = "sas2parquet", from = "src"}]
-
- [tool.poetry.scripts]
- sas2parquet = "sas2parquet.cli:main"
-
- [build-system]
- requires = ["poetry-core>=2.0.0,<3.0.0"]
- build-backend = "poetry.core.masonry.api"
-
@@ -1,24 +0,0 @@
- #!/usr/bin/env python
- """CLI entrypoint for sas2parquet."""
- import argparse
- import sys
- from pathlib import Path
- from .convert import main as _convert_main  # Import your existing main()
-
- def main():
-     parser = argparse.ArgumentParser(description="SAS to Parquet converter")
-     parser.add_argument("sas_file", nargs="?", help="Single SAS file to convert")
-     parser.add_argument("parquet_file", nargs="?", help="Output Parquet file")
-     parser.add_argument("--dir-mode", action="store_true",
-                         help="Process entire SAS_INPUT_DIR (ignores file args)")
-
-     args = parser.parse_args()
-
-     # Patch sys.argv for your convert.main() if single file mode
-     if args.sas_file and not args.dir_mode:
-         sys.argv = [sys.argv[0], str(Path(args.sas_file)), str(Path(args.parquet_file))]
-
-     _convert_main()
-
- if __name__ == "__main__":
-     main()