sas2parquet-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sas2parquet/__init__.py (file without changes)
sas2parquet/cli.py ADDED
@@ -0,0 +1,26 @@
+ #!/usr/bin/env python
+ """CLI entrypoint for sas2parquet."""
+ import argparse
+ import sys
+ from pathlib import Path
+ from .convert import main as _convert_main  # Reuse the existing main() from convert
+
+ def main():
+     parser = argparse.ArgumentParser(description="SAS to Parquet converter")
+     parser.add_argument("sas_file", nargs="?", help="Single SAS file to convert")
+     parser.add_argument("parquet_file", nargs="?", help="Output Parquet file")
+     parser.add_argument("--dir-mode", action="store_true",
+                         help="Process entire SAS_INPUT_DIR (ignores file args)")
+
+     args = parser.parse_args()
+
+     # Patch sys.argv for convert.main() in single-file mode
+     if args.sas_file and not args.dir_mode:
+         if not args.parquet_file:
+             parser.error("parquet_file is required when sas_file is given")
+         sys.argv = [sys.argv[0], str(Path(args.sas_file)), str(Path(args.parquet_file))]
+
+     _convert_main()
+
+ if __name__ == "__main__":
+     main()
sas2parquet/convert.py ADDED
@@ -0,0 +1,381 @@
+ import gc
+ import re
+ import warnings
+ import traceback
+ import sys
+ from pathlib import Path
+ from datetime import datetime
+ import pandas as pd
+ import polars as pl
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ import pyreadstat
+
+ # --- Suppress pandas FutureWarnings ---
+ warnings.simplefilter(action='ignore', category=FutureWarning)
+
+ # --- Configuration ---
+ # Put your .sas7bdat files inside SAS_INPUT_DIR (including subfolders).
+ SAS_INPUT_DIR = Path("sasdata")
+
+ # IMPORTANT:
+ # parquetdata/ and logging/ are created NEXT TO sasdata/ (i.e., in the same parent directory).
+ PARQUET_INPUT_DIR = SAS_INPUT_DIR.parent / "parquetdata"
+ LOG_DIR = SAS_INPUT_DIR.parent / "logging"
+ LOG_FILE_PATH = LOG_DIR / f"conversion_{datetime.now():%Y%m%d_%H%M%S}.log"
+
+ KNOWN_DATETIME_COLUMNS = [
+     'RPNA_DATE_UTC','RPNA_TIME_UTC','RPA_DATE_UTC','TIMESTAMP_UTC',
+     'EVENT_START_DATE_UTC','EVENT_END_DATE_UTC',
+     'REPORTING_START_DATE_UTC','REPORTING_END_DATE_UTC',
+     'RANGE_START','RANGE_END','TTIMESTAMP_UTC','RRANGE_START'
+ ]
+ COLUMNS_TO_FORCE_AS_STRING = ['MATURITY','EARNINGS_TYPE']
+ ENCODING_TEST_ORDER = ['utf-8','latin1','cp1252']
+ ZSTD_COMPRESSION_LEVEL = 6
+
+ AVAILABLE_RAM_GB = 96
+ RAM_USAGE_FACTOR = 0.5
+ ESTIMATED_BYTES_PER_CELL = 10
+ MIN_CHUNK_SIZE = 100_000
+ MAX_CHUNK_SIZE = 10_000_000
+
+
+ # --- Logger ---
+ class Logger:
+     def __init__(self, path):
+         self.terminal = sys.stdout
+
+         # Ensure log folder exists
+         path = Path(path)
+         path.parent.mkdir(parents=True, exist_ok=True)
+
+         self.logfile = open(path, 'w', encoding='utf-8')
+
+     def write(self, msg):
+         self.terminal.write(msg)
+         self.logfile.write(msg)
+
+     def flush(self):
+         self.terminal.flush()
+         self.logfile.flush()
+
+     def close(self):
+         self.logfile.close()
+
+
+ # --- Mismatch report ---
+ def create_diff_report(sas_s: pl.Series, par_s: pl.Series, mask: pl.Series) -> str:
+     cnt = int(mask.sum())
+     sas_ex = sas_s.filter(mask).head(5).alias(f"{sas_s.name}_SAS")
+     par_ex = par_s.filter(mask).head(5).alias(f"{par_s.name}_PARQ")
+     df_ex = pl.DataFrame([sas_ex, par_ex])
+     return f"Mismatch in '{sas_s.name}' ({cnt} rows):\n{df_ex}"
+
+
+ # --- Deep comparison ---
+ def compare_and_report_diffs(sas_path: Path, parquet_path: Path):
+     print(f"\n🔎 Comparing {sas_path.name} ↔ {parquet_path.name}")
+     issues = []
+     try:
+         _, sas_meta = pyreadstat.read_sas7bdat(sas_path, metadataonly=True)
+         pq_meta = pq.read_metadata(parquet_path)
+
+         # 1) metadata dims
+         if (sas_meta.number_rows != pq_meta.num_rows
+                 or sas_meta.number_columns != pq_meta.num_columns):
+             return "❌ MISMATCH", [
+                 f"Dimension mismatch: SAS=({sas_meta.number_rows},{sas_meta.number_columns}) "
+                 f"vs Parquet=({pq_meta.num_rows},{pq_meta.num_columns})"
+             ]
+
+         # 2) actual row-count
+         CH = MAX_CHUNK_SIZE
+         sas_cnt = sum(c.shape[0] for c in pd.read_sas(sas_path, chunksize=CH))
+         pq_cnt = sum(b.num_rows for b in
+                      pq.ParquetFile(parquet_path).iter_batches(batch_size=CH))
+         if sas_cnt != pq_cnt:
+             return "❌ MISMATCH", [f"Row count mismatch: SAS={sas_cnt} vs Parquet={pq_cnt}"]
+
+         # 3) column order
+         sas_cols = sas_meta.column_names
+         pq_cols = pq_meta.schema.names
+         if sas_cols != pq_cols:
+             return "❌ MISMATCH", [
+                 f"Column/order mismatch:\n SAS: {sas_cols}\n Parquet: {pq_cols}"
+             ]
+
+         # 4) chunked compare
+         sas_it = pd.read_sas(sas_path, chunksize=CH)
+         pq_it = pq.ParquetFile(parquet_path).iter_batches(batch_size=CH)
+         chunk_i = 0
+
+         # SAS→UNIX epoch offset in µs (SAS datetimes count from 1960-01-01)
+         offset_us = int((pd.Timestamp("1970-01-01") -
+                          pd.Timestamp("1960-01-01")).total_seconds() * 1e6)
+
+         int_types = {pl.Int8, pl.Int16, pl.Int32, pl.Int64,
+                      pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64}
+         float_types = {pl.Float32, pl.Float64}
+         num_types = int_types | float_types
+
+         while True:
+             chunk_i += 1
+             sas_chunk = next(sas_it, None)
+             pq_batch = next(pq_it, None)
+             if sas_chunk is None and pq_batch is None:
+                 break
+             if sas_chunk is None or pq_batch is None:
+                 issues.append("Chunk count mismatch")
+                 break
+
+             psas = pl.from_pandas(sas_chunk)
+             ppq_raw = pl.from_arrow(pq_batch)
+
+             # normalize empty strings -> nulls in parquet side
+             ppq = ppq_raw.with_columns([
+                 pl.when(pl.col(c) == "").then(None).otherwise(pl.col(c)).alias(c)
+                 for c, d in zip(ppq_raw.columns, ppq_raw.dtypes)
+                 if d == pl.Utf8
+             ])
+
+             # 4a) numeric comparison
+             for col in psas.columns:
+                 scol = psas.get_column(col)
+                 pcol = ppq.get_column(col)
+                 ds, dp = scol.dtype, pcol.dtype
+                 if ds in num_types and dp in num_types:
+                     # unify int<->float
+                     if ds in float_types and dp in int_types:
+                         pcol = pcol.cast(ds)
+                     elif dp in float_types and ds in int_types:
+                         scol = scol.cast(dp)
+                     mask = (scol != pcol) | (scol.is_null() != pcol.is_null())
+                     if mask.any():
+                         issues.append(f"Chunk {chunk_i} numeric mismatch '{col}'")
+                         issues.append(create_diff_report(scol, pcol, mask))
+                     continue
+
+             # 4b) datetime & string
+             for col in psas.columns:
+                 sser = psas.get_column(col)
+                 pser = ppq.get_column(col)
+
+                 # epoch check
+                 if sser.dtype == pl.Datetime("us") and pser.dtype == pl.Datetime("us"):
+                     raw = sas_chunk[col]
+                     if pd.api.types.is_datetime64_ns_dtype(raw):
+                         raw_us = raw.astype('int64') // 1000
+                     else:
+                         raw_us = (raw * 1e6 + offset_us).astype('Int64')
+                     pus = pser.cast(pl.Int64)
+                     if not raw_us.equals(pus.to_pandas()):
+                         msk = pus.to_pandas() != raw_us
+                         issues.append(f"Chunk {chunk_i} epoch mismatch '{col}'")
+                         issues.append(create_diff_report(
+                             pl.Series(col + "_SAS_us", raw_us),
+                             pus.alias(col + "_PARQ_us"),
+                             pl.Series(msk)
+                         ))
+                     continue
+
+                 # string compare (with date-only normalization)
+                 s_str = sser.cast(pl.Utf8)
+                 p_str = pser.cast(pl.Utf8)
+                 mask = (s_str != p_str) | (s_str.is_null() != p_str.is_null())
+                 # ignore rows where SAS has a midnight timestamp and Parquet a bare date
+                 m0 = s_str.str.ends_with(" 00:00:00.000")
+                 d0 = p_str.str.contains(r'^\d{4}-\d{2}-\d{2}$')
+                 sd = s_str.str.slice(0, 10) == p_str
+                 mask &= ~(m0 & d0 & sd)
+                 if mask.any():
+                     issues.append(
+                         f"Chunk {chunk_i} value mismatch '{col}' ({int(mask.sum())} rows)"
+                     )
+                     issues.append(create_diff_report(s_str, p_str, mask))
+
+             if issues:
+                 break
+
+         return ("✔️ IDENTICAL", []) if not issues else ("❌ MISMATCH", issues)
+
+     except Exception as e:
+         return "❗ CRITICAL ERROR", [str(e), traceback.format_exc()]
+     finally:
+         gc.collect()
+
+
+ # --- Reconversion + validation ---
+ def reconvert_file_ultimate(sas_path: Path, parquet_path: Path) -> bool:
+     print(f"🛠️ Fixing {sas_path.name}...")
+
+     # 1) metadata & encoding
+     _, meta0 = pyreadstat.read_sas7bdat(sas_path, metadataonly=True)
+     enc0 = getattr(meta0, 'file_encoding', None)
+     if enc0:
+         encoding = enc0
+         print(f" Detected encoding from metadata: {encoding}")
+     else:
+         encoding = None
+         for e in ENCODING_TEST_ORDER:
+             try:
+                 pyreadstat.read_sas7bdat(sas_path, metadataonly=True, encoding=e)
+                 encoding = e
+                 break
+             except Exception:
+                 pass
+         if not encoding:
+             print(" ❌ Unable to detect encoding; skipping")
+             return False
+         print(f" Fallback encoding: {encoding}")
+
+     cols = meta0.column_names
+     read_types = getattr(meta0, 'readstat_variable_types', {}) or {}
+
+     # SAS formats if available
+     fmt_map = {}
+     if hasattr(meta0, 'formats'):
+         for name, fmt in zip(meta0.column_names, meta0.formats):
+             fmt_map[name] = fmt or ""
+
+     # infer content types from first few chunks
+     content, inf, cnt = {}, {}, 0
+     it = pd.read_sas(sas_path, chunksize=MIN_CHUNK_SIZE, encoding=encoding)
+     for chunk in it:
+         tbl = pa.Table.from_pandas(chunk.convert_dtypes(), preserve_index=False)
+         for f in tbl.schema:
+             inf.setdefault(f.name, []).append(f.type)
+         cnt += 1
+         if cnt >= 5:
+             break
+     for c in cols:
+         ts = inf.get(c, [])
+         content[c] = ts[0] if len(set(ts)) == 1 else pa.string()
+
+     meta_types = {
+         c: (pa.float64() if read_types.get(c) == 'double' else pa.string())
+         for c in cols
+     }
+
+     special_up = {x.upper() for x in KNOWN_DATETIME_COLUMNS + COLUMNS_TO_FORCE_AS_STRING}
+     conflicts = [c for c in cols if c.upper() not in special_up and meta_types[c] != content[c]]
+     strat = 'metadata-first' if len(conflicts) <= 5 else 'content-first'
+     print(f" Strategy={strat}, conflicts={len(conflicts)}")
+
+     overrides = {}
+     for attempt in range(1, 21):
+         print(f" Attempt {attempt}…")
+         fields = []
+         for c in cols:
+             # 1) SAS-declared numeric → float64
+             if read_types.get(c) == 'double':
+                 at = pa.float64()
+             else:
+                 cu = c.upper()
+                 # 2) forced-string
+                 if cu in {x.upper() for x in COLUMNS_TO_FORCE_AS_STRING}:
+                     at = pa.string()
+                 else:
+                     fmt = fmt_map.get(c, "").upper()
+                     # 3) datetime/date/time
+                     if (cu in {x.upper() for x in KNOWN_DATETIME_COLUMNS}
+                             or any(x in fmt for x in ('DATE', 'TIME', 'DATETIME'))):
+                         if 'DATE' in fmt and 'DATETIME' not in fmt:
+                             at = pa.date32()
+                         elif 'TIME' in fmt and 'DATETIME' not in fmt:
+                             at = pa.time32('ms')  # time32 for ms; time64 only supports us/ns
+                         else:
+                             at = pa.timestamp('ms')
+                     # 4) fallback
+                     else:
+                         at = pa.string()
+
+             # apply any dynamic override
+             if c in overrides:
+                 at = overrides[c]
+
+             fields.append(pa.field(c, at))
+
+         schema = pa.schema(fields)
+
+         tgt = int(AVAILABLE_RAM_GB * (1024**3) * RAM_USAGE_FACTOR)
+         row_bytes = meta0.number_columns * ESTIMATED_BYTES_PER_CELL or 1
+         cs = max(MIN_CHUNK_SIZE, min(MAX_CHUNK_SIZE, int(tgt / row_bytes)))
+
+         parquet_path.parent.mkdir(parents=True, exist_ok=True)
+         writer = pq.ParquetWriter(
+             parquet_path,
+             schema,
+             compression='zstd',
+             compression_level=ZSTD_COMPRESSION_LEVEL
+         )
+         try:
+             for chunk in pd.read_sas(sas_path, chunksize=cs, encoding=encoding):
+                 writer.write_table(
+                     pa.Table.from_pandas(chunk, schema=schema, preserve_index=False)
+                 )
+             writer.close()
+             print(" ✅ Conversion succeeded")
+
+             st, dt = compare_and_report_diffs(sas_path, parquet_path)
+             print(f" 🔍 Validation: {st}")
+             for d in dt:
+                 print(" -", d.replace("\n", "\n "))
+             return True
+
+         except (pa.lib.ArrowTypeError, pa.lib.ArrowNotImplementedError) as e:
+             writer.close()
+             mm = re.search(r"column\s+`?(\w+)`?", str(e))
+             if mm:
+                 bad = mm.group(1)
+                 print(f" ⚠️ Override '{bad}' and retry…")
+                 try:
+                     df0 = next(pd.read_sas(sas_path, chunksize=MIN_CHUNK_SIZE, encoding=encoding))
+                     itype = pa.Table.from_pandas(df0[[bad]]).schema.field(bad).type
+                     overrides[bad] = itype
+                 except Exception:
+                     overrides[bad] = pa.string()
+                 continue
+             else:
+                 print(" ❌ Unrecoverable:", e)
+                 break
+
+         except Exception:
+             writer.close()
+             print(" ❌ Conversion failed:", traceback.format_exc())
+             break
+
+     print(f" ❌ All attempts failed for {sas_path.name}")
+     return False
+
+
+ # --- Main loop ---
+ def main():
+     orig = sys.stdout
+     sys.stdout = Logger(LOG_FILE_PATH)
+     try:
+         print("🚀 SAS → Parquet Hybrid Fix & Validate (full folder)\n")
+         files = list(SAS_INPUT_DIR.rglob("*.sas7bdat"))
+         if not files:
+             print("❌ No SAS files found. Exiting.")
+             return
+
+         print(f"Found {len(files)} files.\n" + "="*60)
+         for sas in files:
+             rel = sas.relative_to(SAS_INPUT_DIR)
+             print(f"\n🗂 Processing: {rel}")
+
+             # Mirror structure under parquetdata/ (which lives next to sasdata/)
+             pqf = (PARQUET_INPUT_DIR / rel).with_suffix('.parquet')
+
+             reconvert_file_ultimate(sas, pqf)
+             print("-"*60)
+
+         print("\n✅ All done. See log at:", LOG_FILE_PATH)
+     finally:
+         sys.stdout.close()
+         sys.stdout = orig
+
+
+ if __name__ == "__main__":
+     main()
sas2parquet-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,164 @@
+ Metadata-Version: 2.4
+ Name: sas2parquet
+ Version: 0.1.0
+ Summary: SAS → Parquet Hybrid Converter & Validator
+ License-File: LICENSE
+ Author: Zaman Ziabakhshganji
+ Author-email: zaman.ganji@gmail.com
+ Requires-Python: >=3.11
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Requires-Dist: narwhals (==2.13.0)
+ Requires-Dist: numpy (==2.3.5)
+ Requires-Dist: pandas (==2.3.3)
+ Requires-Dist: polars (==1.36.1)
+ Requires-Dist: polars-runtime-32 (==1.36.1)
+ Requires-Dist: py4j (==0.10.9.9)
+ Requires-Dist: pyarrow (==22.0.0)
+ Requires-Dist: pyreadstat (==1.3.2)
+ Requires-Dist: pyspark (==4.0.1)
+ Requires-Dist: pytest (>=9.0.2,<10.0.0)
+ Requires-Dist: python-dateutil (==2.9.0.post0)
+ Requires-Dist: pytz (==2025.2)
+ Requires-Dist: requests (>=2.32.5,<3.0.0)
+ Requires-Dist: six (==1.17.0)
+ Requires-Dist: tzdata (==2025.2)
+ Description-Content-Type: text/markdown
+
+ # sas2parquet
+
+ [![PyPI version](https://badge.fury.io/py/sas2parquet.svg)](https://pypi.org/project/sas2parquet/)
+ [![Python versions](https://img.shields.io/pypi/pyversions/sas2parquet.svg)](https://pypi.org/project/sas2parquet/)
+ [![License](https://img.shields.io/pypi/l/sas2parquet.svg)](https://github.com/yourusername/sas2parquet/blob/main/LICENSE)
+
+ **The ultimate SAS (.sas7bdat) to Parquet converter**: handles problematic files that fail with standard tools. Automatic encoding detection, intelligent type inference, schema repair, and value-level validation.
+
+ ## ✨ Features
+
+ | Feature | Description |
+ |---------|-------------|
+ | 🔄 **Auto Encoding** | Reads encoding from file metadata, falls back to UTF-8, Latin1, CP1252 |
+ | 🧠 **Smart Types** | Infers datetime, numeric, string with up to 20 fallback attempts |
+ | ✅ **Validation** | Compares SAS vs Parquet chunk-by-chunk (numeric + string) |
+ | 📊 **Memory Safe** | Chunked processing (default 96 GB RAM budget, configurable) |
+ | 💾 **ZSTD** | Parquet written with level-6 Zstandard compression |
+ | 📝 **Detailed Logs** | Mismatch reports + full conversion trace |
+ | 🎯 **Two Modes** | Single file OR recursive directory processing |
+
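+ Encoding handling mirrors the fallback loop in `convert.py`: the encoding recorded in the file's own metadata wins, otherwise each candidate in `ENCODING_TEST_ORDER` is probed until one parses. A condensed sketch of that logic (`detect_encoding` is an illustrative name, not part of the package API):
+
+ ```python
+ import pyreadstat
+
+ ENCODING_TEST_ORDER = ['utf-8', 'latin1', 'cp1252']
+
+ def detect_encoding(path):
+     # Prefer the encoding recorded in the SAS file's own metadata.
+     _, meta = pyreadstat.read_sas7bdat(path, metadataonly=True)
+     if getattr(meta, 'file_encoding', None):
+         return meta.file_encoding
+     # Otherwise probe each candidate until one parses cleanly.
+     for enc in ENCODING_TEST_ORDER:
+         try:
+             pyreadstat.read_sas7bdat(path, metadataonly=True, encoding=enc)
+             return enc
+         except Exception:
+             continue
+     return None  # caller skips the file
+ ```
+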
+ ## Quick Start
+
+ ### Install
+ ```bash
+ pip install sas2parquet
+ ```
+
+ ### Single File
+ ```bash
+ sas2parquet input.sas7bdat output.parquet
+ ```
+
+ ### Batch Directory (Recommended)
+ ```bash
+ sas2parquet --dir-mode
+ ```
+
+ ## 📁 Directory Mode (Default Workflow)
+
+ ### How it works
+
+ - You provide a `sasdata/` directory containing all `.sas7bdat` files (including nested subfolders).
+ - The tool automatically creates a `parquetdata/` directory in the same parent folder as `sasdata/`.
+ - All files are converted to Parquet and written into `parquetdata/`, mirroring the original folder structure (see the sketch after the tree below).
+
+ ```text
+ your-project/
+ ├── sasdata/
+ │   ├── file1.sas7bdat
+ │   └── nested.sas7bdat
+ ├── parquetdata/
+ │   ├── file1.parquet
+ │   └── nested.parquet
+ └── logging/
+     └── conversion_20260205_114500.log
+ ```
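+
+ The mirroring itself is a single `pathlib` expression, shown here as extracted from `convert.py`:
+
+ ```python
+ from pathlib import Path
+
+ SAS_INPUT_DIR = Path("sasdata")
+ PARQUET_INPUT_DIR = SAS_INPUT_DIR.parent / "parquetdata"
+
+ for sas in SAS_INPUT_DIR.rglob("*.sas7bdat"):
+     rel = sas.relative_to(SAS_INPUT_DIR)                     # e.g. file1.sas7bdat
+     pqf = (PARQUET_INPUT_DIR / rel).with_suffix(".parquet")  # parquetdata/file1.parquet
+     pqf.parent.mkdir(parents=True, exist_ok=True)            # recreate nested folders
+ ```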
+
+ Just run:
+ ```bash
+ sas2parquet --dir-mode
+ ```
+
+ ## 🛠️ CLI Reference
+ ```bash
+ sas2parquet --help
+ ```
+
+ ```text
+ usage: sas2parquet [-h] [--dir-mode] [sas_file] [parquet_file]
+
+ SAS to Parquet converter
+ ```
+
+ ## 📊 Example Output
+ ```text
+ 🚀 SAS → Parquet Hybrid Fix & Validate (full folder)
+ Found 3 files.
+ ============================================================
+ ...
+ ```
+
+ ## ⚙️ Configuration (Advanced)
+
+ Edit the constants at the top of `sas2parquet/convert.py`:
+
+ ```python
+ AVAILABLE_RAM_GB = 96
+ RAM_USAGE_FACTOR = 0.5
+ ZSTD_COMPRESSION_LEVEL = 6
+ MIN_CHUNK_SIZE = 100_000
+ MAX_CHUNK_SIZE = 10_000_000
+ ```
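+
+ These constants feed the chunk-size formula in `convert.py`: target bytes = RAM budget × usage factor, and rows per chunk = target / (columns × `ESTIMATED_BYTES_PER_CELL`), clamped to the min/max bounds. A worked example for a hypothetical 50-column file:
+
+ ```python
+ AVAILABLE_RAM_GB = 96
+ RAM_USAGE_FACTOR = 0.5
+ ESTIMATED_BYTES_PER_CELL = 10
+ MIN_CHUNK_SIZE = 100_000
+ MAX_CHUNK_SIZE = 10_000_000
+
+ n_cols = 50  # hypothetical file width
+ target_bytes = int(AVAILABLE_RAM_GB * (1024**3) * RAM_USAGE_FACTOR)  # ~51.5 GB
+ row_bytes = n_cols * ESTIMATED_BYTES_PER_CELL                        # 500 bytes/row
+ chunk_rows = max(MIN_CHUNK_SIZE, min(MAX_CHUNK_SIZE, target_bytes // row_bytes))
+ print(chunk_rows)  # 10000000, capped at MAX_CHUNK_SIZE
+ ```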
+
+ ## 🧪 Validation Details
+ Each file undergoes four-stage validation (a programmatic example follows):
+ 1. Metadata dimensions (row/column counts from file headers)
+ 2. Exact row counts (full chunked scan of both files)
+ 3. Column names and order
+ 4. Value comparison (numeric, datetime, and string, chunk by chunk)
+
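+ The validator is importable, so any converted file can be re-checked on its own. A minimal sketch (the paths are placeholders):
+
+ ```python
+ from pathlib import Path
+ from sas2parquet.convert import compare_and_report_diffs
+
+ status, details = compare_and_report_diffs(
+     Path("sasdata/file1.sas7bdat"),     # placeholder input
+     Path("parquetdata/file1.parquet"),  # placeholder output
+ )
+ print(status)         # "✔️ IDENTICAL" or "❌ MISMATCH"
+ for line in details:  # mismatch reports; empty when identical
+     print(line)
+ ```
+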
+ ## 💻 Development
+ ```bash
+ git clone <repo>
+ cd sas2parquet
+ pip install -e .
+ ```
+
+
+ ## 🧪 Testing
+
+ This project includes a comprehensive **pytest** test suite covering:
+
+ - Schema inference and overrides
+ - SAS ↔ Parquet validation logic
+ - Error handling and retry behavior
+ - Directory mirroring (`sasdata/` → `parquetdata/`)
+ - Logging and CLI execution paths
+
+ ### Run tests
+
+ ```bash
+ pip install -e .[dev]
+ pytest -q
+ ```
+
+ ## 📦 Dependencies
+ - pandas>=2.0
+ - polars>=0.20
+ - pyarrow>=15.0
+ - pyreadstat>=1.4
+
+ ## 📄 License
+ MIT License
+
sas2parquet-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+ sas2parquet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sas2parquet/cli.py,sha256=Gap4-lrHE3a-tOTRljqmRP6uX-epLFHnOYN0pdffU-g,879
+ sas2parquet/convert.py,sha256=36hsDLM0uQF2tuwd9U8k3U5zEZtwQcEJWta3v4SkoXc,13950
+ sas2parquet-0.1.0.dist-info/METADATA,sha256=unNq979clzDKKLnxRFr5tebDI9L-i81LiNiPSJW-Nx8,4347
+ sas2parquet-0.1.0.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
+ sas2parquet-0.1.0.dist-info/entry_points.txt,sha256=pg57h0xD_3R9ZC_YfxLLfu_2p1JNhF8xDNS6v7kiSBY,52
+ sas2parquet-0.1.0.dist-info/licenses/LICENSE,sha256=ouRycIMUGF1zCj49-ijn1wIlTNknZEoLwAHUp0ifH-g,1066
+ sas2parquet-0.1.0.dist-info/RECORD,,
sas2parquet-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: poetry-core 2.3.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
sas2parquet-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
+ [console_scripts]
+ sas2parquet=sas2parquet.cli:main
+
sas2parquet-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,9 @@
+ MIT License
+
+ Copyright (c) 2025 EDC-IXLab
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.