sas2parquet 0.1.0__py3-none-any.whl
- sas2parquet/__init__.py +0 -0
- sas2parquet/cli.py +24 -0
- sas2parquet/convert.py +381 -0
- sas2parquet-0.1.0.dist-info/METADATA +164 -0
- sas2parquet-0.1.0.dist-info/RECORD +8 -0
- sas2parquet-0.1.0.dist-info/WHEEL +4 -0
- sas2parquet-0.1.0.dist-info/entry_points.txt +3 -0
- sas2parquet-0.1.0.dist-info/licenses/LICENSE +9 -0
sas2parquet/__init__.py
ADDED
(empty file)
sas2parquet/cli.py
ADDED
@@ -0,0 +1,24 @@
#!/usr/bin/env python
"""CLI entrypoint for sas2parquet."""
import argparse
import sys
from pathlib import Path
from .convert import main as _convert_main  # Import your existing main()

def main():
    parser = argparse.ArgumentParser(description="Robust SAS to Parquet converter with validation")
    parser.add_argument("sas_file", nargs="?", help="Single SAS file to convert")
    parser.add_argument("parquet_file", nargs="?", help="Output Parquet file")
    parser.add_argument("--dir-mode", action="store_true",
                        help="Process entire SAS_INPUT_DIR (ignores file args)")

    args = parser.parse_args()

    # Patch sys.argv for your convert.main() if single file mode
    if args.sas_file and not args.dir_mode:
        if not args.parquet_file:
            parser.error("parquet_file is required when converting a single file")
        sys.argv = [sys.argv[0], str(Path(args.sas_file)), str(Path(args.parquet_file))]

    _convert_main()

if __name__ == "__main__":
    main()
sas2parquet/convert.py
ADDED
@@ -0,0 +1,381 @@
import os
import gc
import re
import warnings
import traceback
import sys
from pathlib import Path
from datetime import datetime
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.parquet as pq
import pyreadstat

# --- Suppress pandas FutureWarnings ---
warnings.simplefilter(action='ignore', category=FutureWarning)

# --- Configuration ---
# Put your .sas7bdat files inside SAS_INPUT_DIR (including subfolders).
SAS_INPUT_DIR = Path("sasdata")

# IMPORTANT:
# parquetdata/ and logging/ are created NEXT TO sasdata/ (i.e., in the same parent directory).
PARQUET_INPUT_DIR = SAS_INPUT_DIR.parent / "parquetdata"
LOG_DIR = SAS_INPUT_DIR.parent / "logging"
LOG_FILE_PATH = LOG_DIR / f"conversion_{datetime.now():%Y%m%d_%H%M%S}.log"

KNOWN_DATETIME_COLUMNS = [
    'RPNA_DATE_UTC', 'RPNA_TIME_UTC', 'RPA_DATE_UTC', 'TIMESTAMP_UTC',
    'EVENT_START_DATE_UTC', 'EVENT_END_DATE_UTC',
    'REPORTING_START_DATE_UTC', 'REPORTING_END_DATE_UTC',
    'RANGE_START', 'RANGE_END', 'TTIMESTAMP_UTC', 'RRANGE_START'
]
COLUMNS_TO_FORCE_AS_STRING = ['MATURITY', 'EARNINGS_TYPE']
ENCODING_TEST_ORDER = ['utf-8', 'latin1', 'cp1252']
ZSTD_COMPRESSION_LEVEL = 6

AVAILABLE_RAM_GB = 96
RAM_USAGE_FACTOR = 0.5
ESTIMATED_BYTES_PER_CELL = 10
MIN_CHUNK_SIZE = 100_000
MAX_CHUNK_SIZE = 10_000_000


# --- Logger ---
class Logger:
    def __init__(self, path):
        self.terminal = sys.stdout

        # Ensure log folder exists
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        self.logfile = open(path, 'w', encoding='utf-8')

    def write(self, msg):
        self.terminal.write(msg)
        self.logfile.write(msg)

    def flush(self):
        self.terminal.flush()
        self.logfile.flush()

    def close(self):
        self.logfile.close()


# --- Mismatch report ---
def create_diff_report(sas_s: pl.Series, par_s: pl.Series, mask: pl.Series) -> str:
    cnt = int(mask.sum())
    sas_ex = sas_s.filter(mask).head(5).alias(f"{sas_s.name}_SAS")
    par_ex = par_s.filter(mask).head(5).alias(f"{par_s.name}_PARQ")
    df_ex = pl.DataFrame([sas_ex, par_ex])
    return f"Mismatch in '{sas_s.name}' ({cnt} rows):\n{df_ex}"


# --- Deep comparison ---
def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
    print(f"\n🔍 Comparing {sas_path.name} → {parquet_path.name}")
    issues = []
    try:
        _, sas_meta = pyreadstat.read_sas7bdat(sas_path, metadataonly=True)
        pq_meta = pq.read_metadata(parquet_path)

        # 1) metadata dims
        if (sas_meta.number_rows != pq_meta.num_rows
                or sas_meta.number_columns != pq_meta.num_columns):
            return "❌ MISMATCH", [
                f"Dimension mismatch: SAS=({sas_meta.number_rows},{sas_meta.number_columns}) "
                f"vs Parquet=({pq_meta.num_rows},{pq_meta.num_columns})"
            ]

        # 2) actual row-count
        CH = MAX_CHUNK_SIZE
        sas_cnt = sum(c.shape[0] for c in pd.read_sas(sas_path, chunksize=CH))
        pq_cnt = sum(b.num_rows for b in
                     pq.ParquetFile(parquet_path).iter_batches(batch_size=CH))
        if sas_cnt != pq_cnt:
            return "❌ MISMATCH", [f"Row count mismatch: SAS={sas_cnt} vs Parquet={pq_cnt}"]

        # 3) column order
        sas_cols = sas_meta.column_names
        pq_cols = pq_meta.schema.names
        if sas_cols != pq_cols:
            return "❌ MISMATCH", [
                f"Column/order mismatch:\n  SAS: {sas_cols}\n  Parquet: {pq_cols}"
            ]

        # 4) chunked compare
        sas_it = pd.read_sas(sas_path, chunksize=CH)
        pq_it = pq.ParquetFile(parquet_path).iter_batches(batch_size=CH)
        chunk_i = 0

        # SAS→UNIX epoch offset in µs
        offset_us = int((pd.Timestamp("1970-01-01") -
                         pd.Timestamp("1960-01-01")).total_seconds() * 1e6)

        int_types = {pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                     pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64}
        float_types = {pl.Float32, pl.Float64}
        num_types = int_types | float_types

        while True:
            chunk_i += 1
            sas_chunk = next(sas_it, None)
            pq_batch = next(pq_it, None)
            if sas_chunk is None and pq_batch is None:
                break
            if sas_chunk is None or pq_batch is None:
                issues.append("Chunk count mismatch")
                break

            psas = pl.from_pandas(sas_chunk)
            ppq_raw = pl.from_arrow(pq_batch)

            # normalize empty strings -> nulls in parquet side
            ppq = ppq_raw.with_columns([
                pl.when(pl.col(c) == "").then(None).otherwise(pl.col(c)).alias(c)
                for c, d in zip(ppq_raw.columns, ppq_raw.dtypes)
                if d == pl.Utf8
            ])
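            # (pd.read_sas typically surfaces missing SAS strings as nulls,
            #  while the written Parquet may hold "" for the same cells, so
            #  blanks are nulled to keep both sides comparable.)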

            # 4a) numeric comparison
            for col in psas.columns:
                scol = psas.get_column(col)
                pcol = ppq.get_column(col)
                ds, dp = scol.dtype, pcol.dtype
                if ds in num_types and dp in num_types:
                    # unify int<->float
                    if ds in float_types and dp in int_types:
                        pcol = pcol.cast(ds)
                    elif dp in float_types and ds in int_types:
                        scol = scol.cast(dp)
                    mask = (scol != pcol) | (scol.is_null() != pcol.is_null())
                    if mask.any():
                        issues.append(f"Chunk {chunk_i} numeric mismatch '{col}'")
                        issues.append(create_diff_report(scol, pcol, mask))
                    continue

            # 4b) datetime & string
            for col in psas.columns:
                sser = psas.get_column(col)
                pser = ppq.get_column(col)

                # epoch check
                if sser.dtype == pl.Datetime("us") and pser.dtype == pl.Datetime("us"):
                    raw = sas_chunk[col]
                    if pd.api.types.is_datetime64_ns_dtype(raw):
                        raw_us = raw.astype('int64') // 1000
                    else:
                        raw_us = (raw * 1e6 + offset_us).astype('Int64')
                    pus = pser.cast(pl.Int64)
                    if not raw_us.equals(pus.to_pandas()):
                        msk = pus.to_pandas() != raw_us
                        issues.append(f"Chunk {chunk_i} epoch mismatch '{col}'")
                        issues.append(create_diff_report(
                            pl.Series(raw_us, name=col + "_SAS_us"),
                            pus.alias(col + "_PARQ_us"),
                            pl.Series(msk)
                        ))
                    continue

                # string compare (with date-only normalization)
                s_str = sser.cast(pl.Utf8)
                p_str = pser.cast(pl.Utf8)
                mask = (s_str != p_str) | (s_str.is_null() != p_str.is_null())
                m0 = s_str.str.ends_with(" 00:00:00.000")
                d0 = p_str.str.contains(r'^\d{4}-\d{2}-\d{2}$')
                sd = s_str.str.slice(0, 10) == p_str
                mask &= ~(m0 & d0 & sd)
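                # (drops rows where SAS renders "YYYY-MM-DD 00:00:00.000" and
                #  Parquet renders the same day as "YYYY-MM-DD": same value,
                #  different rendering of a timestamp vs. a date32.)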
                if mask.any():
                    issues.append(
                        f"Chunk {chunk_i} value mismatch '{col}' ({int(mask.sum())} rows)"
                    )
                    issues.append(create_diff_report(s_str, p_str, mask))

            if issues:
                break

        return ("✔️ IDENTICAL", []) if not issues else ("❌ MISMATCH", issues)

    except Exception as e:
        return "❌ CRITICAL ERROR", [str(e), traceback.format_exc()]
    finally:
        gc.collect()


# --- Reconversion + validation ---
def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
    print(f"🛠️ Fixing {sas_path.name}...")

    # 1) metadata & encoding
    _, meta0 = pyreadstat.read_sas7bdat(sas_path, metadataonly=True)
    enc0 = getattr(meta0, 'file_encoding', None)
    if enc0:
        encoding = enc0
        print(f"  Detected encoding from metadata: {encoding}")
    else:
        encoding = None
        for e in ENCODING_TEST_ORDER:
            try:
                pyreadstat.read_sas7bdat(sas_path, metadataonly=True, encoding=e)
                encoding = e
                break
            except Exception:
                pass
        if not encoding:
            print("  ❌ Unable to detect encoding; skipping")
            return False
        print(f"  Fallback encoding: {encoding}")

    cols = meta0.column_names
    read_types = getattr(meta0, 'readstat_variable_types', {}) or {}

    # SAS formats if available
    fmt_map = {}
    if hasattr(meta0, 'formats'):
        for name, fmt in zip(meta0.column_names, meta0.formats):
            fmt_map[name] = fmt or ""

    # infer content types from first few chunks
    content, inf, cnt = {}, {}, 0
    it = pd.read_sas(sas_path, chunksize=MIN_CHUNK_SIZE, encoding=encoding)
    for chunk in it:
        tbl = pa.Table.from_pandas(chunk.convert_dtypes(), preserve_index=False)
        for f in tbl.schema:
            inf.setdefault(f.name, []).append(f.type)
        cnt += 1
        if cnt >= 5:
            break
    for c in cols:
        ts = inf.get(c, [])
        content[c] = ts[0] if len(set(ts)) == 1 else pa.string()

    meta_types = {
        c: (pa.float64() if read_types.get(c) == 'double' else pa.string())
        for c in cols
    }

    special_up = {x.upper() for x in KNOWN_DATETIME_COLUMNS + COLUMNS_TO_FORCE_AS_STRING}
    conflicts = [c for c in cols if c.upper() not in special_up and meta_types[c] != content[c]]
    strat = 'metadata-first' if len(conflicts) <= 5 else 'content-first'
    print(f"  Strategy={strat}, conflicts={len(conflicts)}")

    overrides = {}
    for attempt in range(1, 21):
        print(f"  Attempt {attempt}…")
        fields = []
        for c in cols:
            # 1) SAS-declared numeric → float64
            if read_types.get(c) == 'double':
                at = pa.float64()
            else:
                cu = c.upper()
                # 2) forced-string
                if cu in {x.upper() for x in COLUMNS_TO_FORCE_AS_STRING}:
                    at = pa.string()
                else:
                    fmt = fmt_map.get(c, "").upper()
                    # 3) datetime/date/time
                    if (cu in {x.upper() for x in KNOWN_DATETIME_COLUMNS}
                            or any(x in fmt for x in ('DATE', 'TIME', 'DATETIME'))):
                        if 'DATE' in fmt and 'DATETIME' not in fmt:
                            at = pa.date32()
                        elif 'TIME' in fmt and 'DATETIME' not in fmt:
                            at = pa.time64('us')  # pyarrow time64 only accepts 'us'/'ns'
                        else:
                            at = pa.timestamp('ms')
                    # 4) fallback
                    else:
                        at = pa.string()

            # apply any dynamic override
            if c in overrides:
                at = overrides[c]

            fields.append(pa.field(c, at))

        schema = pa.schema(fields)

        tgt = int(AVAILABLE_RAM_GB * (1024**3) * RAM_USAGE_FACTOR)
        row_bytes = meta0.number_columns * ESTIMATED_BYTES_PER_CELL or 1
        cs = max(MIN_CHUNK_SIZE, min(MAX_CHUNK_SIZE, int(tgt / row_bytes)))
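        # (RAM budget / estimated row width, clamped to the bounds above: e.g.
        #  96 GB * 0.5 = 48 GiB over a 50-column table at ~500 B/row would
        #  allow ~103M rows, so the 10M-row cap applies.)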

        parquet_path.parent.mkdir(parents=True, exist_ok=True)
        writer = pq.ParquetWriter(
            parquet_path,
            schema,
            compression='zstd',
            compression_level=ZSTD_COMPRESSION_LEVEL
        )
        try:
            for chunk in pd.read_sas(sas_path, chunksize=cs, encoding=encoding):
                writer.write_table(
                    pa.Table.from_pandas(chunk, schema=schema, preserve_index=False)
                )
            writer.close()
            print("  ✅ Conversion succeeded")

            st, dt = compare_and_report_diffs(sas_path, parquet_path)
            print(f"  📊 Validation: {st}")
            for d in dt:
                print("   -", d.replace("\n", "\n     "))
            return True

        except (pa.lib.ArrowTypeError, pa.lib.ArrowNotImplementedError) as e:
            writer.close()
            mm = re.search(r"column\s+`?(\w+)`?", str(e))
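            # (Arrow's cast error message names the offending column; extract
            #  it, re-infer that column's type from a sample chunk, and retry
            #  the whole file with the override applied.)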
            if mm:
                bad = mm.group(1)
                print(f"  ⚠️ Override '{bad}' and retry…")
                try:
                    df0 = next(pd.read_sas(sas_path, chunksize=MIN_CHUNK_SIZE, encoding=encoding))
                    itype = pa.Table.from_pandas(df0[[bad]]).schema.field(bad).type
                    overrides[bad] = itype
                except Exception:
                    overrides[bad] = pa.string()
                continue
            else:
                print("  ❌ Unrecoverable:", e)
                break

        except Exception:
            writer.close()
            print("  ❌ Conversion failed:", traceback.format_exc())
            break

    print(f"  ❌ All attempts failed for {sas_path.name}")
    return False


# --- Main loop ---
def main():
    orig = sys.stdout
    sys.stdout = Logger(LOG_FILE_PATH)
    try:
        print("🚀 SAS → Parquet Hybrid Fix & Validate (full folder)\n")
        files = list(SAS_INPUT_DIR.rglob("*.sas7bdat"))
        if not files:
            print("❌ No SAS files found. Exiting.")
            return

        print(f"Found {len(files)} files.\n" + "=" * 60)
        for sas in files:
            rel = sas.relative_to(SAS_INPUT_DIR)
            print(f"\n📄 Processing: {rel}")

            # Mirror structure under parquetdata/ (which lives next to sasdata/)
            pqf = (PARQUET_INPUT_DIR / rel).with_suffix('.parquet')

            reconvert_file_ultimate(sas, pqf)
            print("-" * 60)

        print("\n✅ All done. See log at:", LOG_FILE_PATH)
    finally:
        sys.stdout.close()
        sys.stdout = orig


if __name__ == "__main__":
    main()
sas2parquet-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,164 @@
Metadata-Version: 2.4
Name: sas2parquet
Version: 0.1.0
Summary: SAS → Parquet Hybrid Converter & Validator
License-File: LICENSE
Author: Zaman Ziabakhshganji
Author-email: zaman.ganji@gmail.com
Requires-Python: >=3.11
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Requires-Dist: narwhals (==2.13.0)
Requires-Dist: numpy (==2.3.5)
Requires-Dist: pandas (==2.3.3)
Requires-Dist: polars (==1.36.1)
Requires-Dist: polars-runtime-32 (==1.36.1)
Requires-Dist: py4j (==0.10.9.9)
Requires-Dist: pyarrow (==22.0.0)
Requires-Dist: pyreadstat (==1.3.2)
Requires-Dist: pyspark (==4.0.1)
Requires-Dist: pytest (>=9.0.2,<10.0.0)
Requires-Dist: python-dateutil (==2.9.0.post0)
Requires-Dist: pytz (==2025.2)
Requires-Dist: requests (>=2.32.5,<3.0.0)
Requires-Dist: six (==1.17.0)
Requires-Dist: tzdata (==2025.2)
Description-Content-Type: text/markdown

# sas2parquet

[PyPI](https://pypi.org/project/sas2parquet/) · [Python versions](https://pypi.org/project/sas2parquet/) · [License](https://github.com/yourusername/sas2parquet/blob/main/LICENSE)

**The ultimate SAS (.sas7bdat) to Parquet converter** - handles problematic files that fail with standard tools. Automatic encoding detection, intelligent type inference, schema repair, and cell-by-cell validation.

## ✨ Features

| Feature | Description |
|---------|-------------|
| 🔍 **Auto Encoding** | Detects UTF-8, Latin1, CP1252 from metadata/fallback |
| 🧠 **Smart Types** | Infers datetime, numeric, string with 20+ fallback attempts |
| ✅ **Validation** | Compares SAS vs Parquet chunk-by-chunk (numeric + string) |
| 📊 **Memory Safe** | Chunked processing (96 GB RAM optimized, configurable) |
| 💾 **ZSTD** | Level 6 compression for maximum efficiency |
| 📋 **Detailed Logs** | Mismatch reports + full conversion trace |
| 🎯 **Two Modes** | Single file OR recursive directory processing |

## Quick Start

### Install
```bash
pip install sas2parquet
```

### Single File
```bash
sas2parquet input.sas7bdat output.parquet
```
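
To spot-check the output, any Parquet reader will do; a minimal sketch using pyarrow (the file name comes from the example above):

```python
import pyarrow.parquet as pq

table = pq.read_table("output.parquet")   # file written by the command above
print(table.num_rows, table.num_columns)  # should match the source .sas7bdat
```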

### Batch Directory (Recommended)
```bash
sas2parquet --dir-mode
```

## 📁 Directory Mode (Default Workflow)

### How it works

- You provide a `sasdata/` directory containing all `.sas7bdat` files (including nested subfolders).
- The tool automatically creates a `parquetdata/` directory in the same parent folder as `sasdata/`.
- All files are converted to Parquet and written into `parquetdata/`, mirroring the original folder structure.

```text
your-project/
├── sasdata/
│   ├── file1.sas7bdat
│   └── nested.sas7bdat
├── parquetdata/
│   ├── file1.parquet
│   └── nested.parquet
└── logging/
    └── conversion_20260205_114500.log
```

Just run:
```bash
sas2parquet --dir-mode
```
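
The `sas2parquet` console script is a thin wrapper around `convert.main()`, so directory mode can also be driven from Python; a minimal sketch (run it from the folder that contains `sasdata/`):

```python
# Equivalent to `sas2parquet --dir-mode`: converts every .sas7bdat found
# under sasdata/ and mirrors the tree into parquetdata/.
from sas2parquet.convert import main

main()
```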

## 🛠️ CLI Reference
```bash
sas2parquet --help
```

```text
usage: sas2parquet [-h] [--dir-mode] [sas_file] [parquet_file]

Robust SAS to Parquet converter with validation
```

## 📋 Example Output
```text
🚀 SAS → Parquet Hybrid Fix & Validate (full folder)
Found 3 files.
============================================================
...
```

## ⚙️ Configuration (Advanced)

Edit `src/sas2parquet/convert.py` constants:

```python
AVAILABLE_RAM_GB = 96
RAM_USAGE_FACTOR = 0.5
ZSTD_COMPRESSION_LEVEL = 6
MIN_CHUNK_SIZE = 100_000
MAX_CHUNK_SIZE = 10_000_000
```
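
These constants jointly set the rows-per-chunk used for reading: a RAM budget divided by an estimated row width, clamped between the two bounds. A worked example of the same arithmetic `convert.py` uses, with a hypothetical 50-column table:

```python
AVAILABLE_RAM_GB = 96
RAM_USAGE_FACTOR = 0.5
ESTIMATED_BYTES_PER_CELL = 10
MIN_CHUNK_SIZE, MAX_CHUNK_SIZE = 100_000, 10_000_000

n_columns = 50  # hypothetical table width
target = int(AVAILABLE_RAM_GB * (1024 ** 3) * RAM_USAGE_FACTOR)  # ~48 GiB budget
row_bytes = n_columns * ESTIMATED_BYTES_PER_CELL                 # 500 bytes/row
chunk_rows = max(MIN_CHUNK_SIZE, min(MAX_CHUNK_SIZE, target // row_bytes))
print(chunk_rows)  # 10_000_000 -- the budget allows ~103M rows, so the cap applies
```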

## 🧪 Validation Details
Each file undergoes 4-stage validation:
1. Metadata dimensions (row and column counts)
2. Exact row counts from a full chunked re-read
3. Column names and order
4. Chunk-by-chunk value comparison (numeric, datetime, string); see the sketch below
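
The same comparator that runs after each conversion can be invoked directly on an existing SAS/Parquet pair; a minimal sketch (the two paths are hypothetical):

```python
from pathlib import Path
from sas2parquet.convert import compare_and_report_diffs

status, details = compare_and_report_diffs(
    Path("sasdata/file1.sas7bdat"),
    Path("parquetdata/file1.parquet"),
)
print(status)        # "✔️ IDENTICAL" or "❌ MISMATCH"
for line in details:
    print(line)      # per-column mismatch reports, if any
```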

## 💻 Development
```bash
git clone <repo>
cd sas2parquet
pip install -e .
```


## 🧪 Testing

This project includes a comprehensive **pytest** test suite covering:

- Schema inference and overrides
- SAS → Parquet validation logic
- Error handling and retry behavior
- Directory mirroring (`sasdata/` → `parquetdata/`)
- Logging and CLI execution paths

### Run tests

```bash
pip install -e .[dev]
pytest -q
```

## 📦 Dependencies
- pandas>=2.0
- polars>=0.20
- pyarrow>=15.0
- pyreadstat>=1.4

## 📄 License
MIT License

sas2parquet-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
sas2parquet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
sas2parquet/cli.py,sha256=Gap4-lrHE3a-tOTRljqmRP6uX-epLFHnOYN0pdffU-g,879
sas2parquet/convert.py,sha256=36hsDLM0uQF2tuwd9U8k3U5zEZtwQcEJWta3v4SkoXc,13950
sas2parquet-0.1.0.dist-info/METADATA,sha256=unNq979clzDKKLnxRFr5tebDI9L-i81LiNiPSJW-Nx8,4347
sas2parquet-0.1.0.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
sas2parquet-0.1.0.dist-info/entry_points.txt,sha256=pg57h0xD_3R9ZC_YfxLLfu_2p1JNhF8xDNS6v7kiSBY,52
sas2parquet-0.1.0.dist-info/licenses/LICENSE,sha256=ouRycIMUGF1zCj49-ijn1wIlTNknZEoLwAHUp0ifH-g,1066
sas2parquet-0.1.0.dist-info/RECORD,,
sas2parquet-0.1.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,9 @@
MIT License

Copyright (c) 2025 EDC-IXLab

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.