specatwrap14 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.3
2
+ Name: specatwrap14
3
+ Version: 0.2.0
4
+ Summary: A simple wrapper
5
+ Author: Casper Lauge Nørup Koch
6
+ Author-email: Casper Lauge Nørup Koch <kochcasper@gmail.com>
7
+ Requires-Dist: click>=8.3.1
8
+ Requires-Dist: pm4py>=2.7.19.8
9
+ Requires-Dist: polars>=1.38.1
10
+ Requires-Dist: pyreadstat>=1.3.3
11
+ Requires-Python: >=3.12
12
+ Description-Content-Type: text/markdown
13
+
File without changes
@@ -0,0 +1,33 @@
1
+ [project]
2
+ name = "specatwrap14"
3
+ version = "0.2.0"
4
+ description = "A simple wrapper"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Casper Lauge Nørup Koch", email = "kochcasper@gmail.com" }
8
+ ]
9
+ requires-python = ">=3.12"
10
+ dependencies = [
11
+ "click>=8.3.1",
12
+ "pm4py>=2.7.19.8",
13
+ "polars>=1.38.1",
14
+ "pyreadstat>=1.3.3",
15
+ ]
16
+
17
+ [project.scripts]
18
+ specatwrap = "specatwrap14:main"
19
+
20
+ [build-system]
21
+ requires = ["uv_build>=0.10.3,<0.11.0"]
22
+ build-backend = "uv_build"
23
+
24
+
25
+ [tool.pyright]
26
+ reportUnannotatedClassAttribute = "none"
27
+ reportImplicitOverride = "none"
28
+ reportUnknownParameterType = "none"
29
+ reportMissingParameterType = "none"
30
+ reportUnknownVariableType = "none"
31
+ reportUnknownMemberType = "none"
32
+ reportUnknownArgumentType = "none"
33
+ reportAny = "none"
@@ -0,0 +1,129 @@
1
+ """
2
+ Specatwrap - A wrapper for processing healthcare data.
3
+
4
+ This module provides a CLI for converting and processing healthcare data files.
5
+ """
6
+
7
+ import click
8
+ from pathlib import Path
9
+ import sys
10
+
11
+ from .sas_converter import convert_sas_to_parquet
12
+ from .prep import prep
13
+ from .sas_preview import preview
14
+ from .print_update import print_update_script
15
+
16
+
17
@click.group()
# No literal version here: with neither ``version`` nor ``package_name`` given,
# Click looks up the installed distribution that matches this module's package
# name, so ``--version`` always reports what the package metadata declares
# instead of a hand-maintained string that can fall behind a release.
@click.version_option()
def cli():
    """
    Specatwrap - A wrapper for processing healthcare data.

    A command-line tool for processing and converting healthcare data files.
    """
    # Intentionally empty: the group only dispatches to its sub-commands.
26
+
27
+
28
# Attach the commands implemented in the sibling modules to the root group
for _command in (prep, preview, print_update_script):
    cli.add_command(_command)
32
+
33
+
34
@cli.command()
@click.argument("input_file", type=click.Path(exists=True, path_type=Path))
@click.argument("output_path", type=click.Path(path_type=Path))
@click.option(
    "-c",
    "--chunk-size",
    # Reject zero and negative sizes while parsing the command line; a
    # non-positive chunk size is never a meaningful request for a chunked read.
    type=click.IntRange(min=1),
    default=100000,
    help="Number of rows to read per chunk (default: 100000)",
)
@click.option(
    "--compression",
    type=click.Choice(["zstd", "lz4", "snappy", "gzip"], case_sensitive=False),
    default="zstd",
    help="Parquet compression algorithm (default: zstd)",
)
@click.option(
    "--encoding",
    type=str,
    default=None,
    help="File encoding (default: auto-detect)",
)
@click.option(
    "--overwrite",
    is_flag=True,
    default=False,
    help="Overwrite existing output directory if it exists",
)
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output")
def sas2parquet(
    input_file, output_path, chunk_size, compression, encoding, overwrite, verbose
):
    """
    Convert a SAS file to Parquet format in chunks.

    INPUT_FILE: Path to the SAS file (.sas7bdat)

    OUTPUT_PATH: Path to output directory for Parquet files

    This command processes large SAS files in chunks to minimize memory usage.
    Each chunk is written to a separate Parquet file in the output directory.

    Example usage:

        specatwrap sas2parquet input.sas7bdat output_dir/

        specatwrap sas2parquet input.sas7bdat output_dir/ --chunk-size 50000 -v

        specatwrap sas2parquet input.sas7bdat output_dir/ --compression lz4

    Reading the data later with Polars:

        import polars as pl

        df = pl.read_parquet("output_dir/*.parquet")
    """
    # The conversion itself lives in convert_sas_to_parquet; this wrapper only
    # turns the exceptions it may raise into a red message on stderr and a
    # non-zero exit status.
    try:
        convert_sas_to_parquet(
            input_file=input_file,
            output_path=output_path,
            chunk_size=chunk_size,
            compression=compression,
            encoding=encoding,
            overwrite=overwrite,
            verbose=verbose,
        )
    except FileNotFoundError as e:
        click.secho(f"✗ Error: File not found - {e}", fg="red", err=True)
        sys.exit(1)
    except PermissionError as e:
        click.secho(f"✗ Error: Permission denied - {e}", fg="red", err=True)
        sys.exit(1)
    except ValueError as e:
        click.secho(f"✗ Error: Invalid input - {e}", fg="red", err=True)
        sys.exit(1)
    except MemoryError:
        click.secho(
            "✗ Error: Out of memory. Try a smaller --chunk-size", fg="red", err=True
        )
        sys.exit(1)
    except Exception as e:
        # Last-resort boundary: report anything unexpected, with the traceback
        # only when the user asked for verbose output.
        click.secho(f"✗ Error: {e}", fg="red", err=True)
        if verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)
121
+
122
+
123
def main():
    """Run the root ``cli`` command group (target of the console script)."""
    cli()


# Allow the module to be executed directly as well as through the entry point
if __name__ == "__main__":
    main()
@@ -0,0 +1,124 @@
1
+ """
2
+ Prep command group for preprocessing healthcare data files.
3
+
4
+ This module provides commands for filtering and preprocessing parquet files
5
+ before converting them to XES format.
6
+ """
7
+
8
+ import click
9
+ from pathlib import Path
10
+ import sys
11
+
12
+ from .io_handler import process_parquet_files
13
+
14
+
15
@click.group()
def prep():
    """
    Preprocess and filter healthcare data files.

    Commands in this group help prepare raw data files by filtering,
    cleaning, and transforming them before further processing.
    """
    # Intentionally empty: the group only namespaces its sub-commands.
24
+
25
+
26
@prep.command()
@click.option(
    "-i",
    "--input",
    "input_dir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
    required=True,
    help="Path to directory containing parquet files to process.",
)
@click.option(
    "-o",
    "--output",
    "output_file",
    type=click.Path(dir_okay=False, file_okay=True, path_type=Path),
    required=True,
    help="Path to output parquet file.",
)
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
def diagnosis(input_dir, output_file, verbose):
    """
    Filter and preprocess diagnosis parquet files.

    This command lazily reads all parquet files from a directory, applies
    filtering and preprocessing transformations, and writes the results to
    a single output parquet file.

    INPUT: Directory path containing parquet files to process.

    OUTPUT: Path to output parquet file for processed data.

    Example usage:

        specatwrap prep diagnosis -i ./data/parquet_files/ -o ./processed/diagnosis.parquet

        specatwrap prep diagnosis --input ./raw_data/ --output ./clean_data.parquet -v
    """

    def abort(message):
        # Shared failure path: red message on stderr, then exit status 1.
        click.secho(message, fg="red", err=True)
        sys.exit(1)

    try:
        # Echo what is about to be processed
        click.echo(f"Input directory: {input_dir}")
        click.echo(f"Output file: {output_file}")
        click.echo()

        # In verbose mode, list the parquet files that will be picked up
        if verbose:
            click.echo(f"Searching for parquet files: {input_dir / '*.parquet'}")
            discovered = list(input_dir.glob("*.parquet"))
            if discovered:
                click.echo(f"Found {len(discovered)} parquet file(s):")
                for parquet_path in discovered:
                    click.echo(f" - {parquet_path.name}")
                click.echo()

        # Hand the actual work to the I/O layer; the bar is only advanced once,
        # after the whole run has finished.
        click.echo("Loading and applying filters...")
        with click.progressbar(
            length=100, label="Processing", show_eta=False, show_percent=True
        ) as progress:
            process_parquet_files(
                input_dir=input_dir,
                output_file=output_file,
                filter_type="diagnosis",
                verbose=verbose,
            )
            progress.update(100)

        # Report success and, when the file is there, its size
        click.secho("✓ Processing completed successfully!", fg="green", bold=True)
        click.echo(f"Output file: {output_file}")

        if output_file.exists():
            megabytes = output_file.stat().st_size / (1024 * 1024)
            click.echo(f"File size: {megabytes:.2f} MB")

    except FileNotFoundError as e:
        abort(f"✗ Error: File or directory not found - {e}")
    except PermissionError as e:
        abort(f"✗ Error: Permission denied - {e}")
    except ValueError as e:
        abort(f"✗ Error: Invalid input - {e}")
    except MemoryError:
        abort("✗ Error: Out of memory. Try processing smaller batches.")
    except Exception as e:
        click.secho(f"✗ Error: {e}", fg="red", err=True)
        if verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)
@@ -0,0 +1,265 @@
1
+ """
2
+ Diagnosis filter classes.
3
+
4
+ This module provides filter classes for preprocessing diagnosis parquet files.
5
+ """
6
+
7
+ from abc import ABC, abstractmethod
8
+ import polars as pl
9
+
10
+
11
class BaseFilter(ABC):
    """
    Abstract base class for parquet file filters.

    Each filter type (diagnosis, procedure, medication, etc.) should inherit
    from this class and implement the apply() method with their specific
    filtering and transformation logic.
    """

    @abstractmethod
    def apply(self, lf: pl.LazyFrame) -> pl.LazyFrame:
        """
        Apply filtering and preprocessing logic to a LazyFrame.

        Args:
            lf: Input Polars LazyFrame to filter/transform

        Returns:
            Transformed Polars LazyFrame
        """

    def get_name(self) -> str:
        """
        Get the name of this filter type.

        Returns:
            The lower-cased class name with a trailing 'Filter' removed when
            present (e.g. ``DiagnosisFilter`` -> ``"diagnosis"``)
        """
        return type(self).__name__.removesuffix("Filter").lower()
44
+
45
+
46
class DiagnosisFilter(BaseFilter):
    """
    Filter for diagnosis parquet files.

    Applies the diagnosis-specific column transformations:
    - Keeps only the source columns named in CASE_ATTR / EVENT_ATTR plus ADIAG
    - Derives TORRA_DIAG: the labels of every diagnosis group in
      DIAGNOSIS_GROUPS whose prefixes match the row's ADIAG code, joined
      with ", "
    - Renames columns to the standardized event log names

    No rows are removed: this class does not filter by birthdate or region.
    """

    # Source column -> case-level attribute name (all carry the "case:" prefix)
    CASE_ATTR = {
        "BORGER_FOEDSELSDATO": "case:patient:birthdate",
        "PNR": "case:concept:name",
    }

    # Source column -> event-level attribute name. TORRA_DIAG is not a source
    # column; it is created by _add_torra_diag before the rename happens.
    EVENT_ATTR = {
        "TORRA_DIAG": "concept:name",
        "ADIAG_TEKST": "medical:diag",
        "KONT_ANS_GEO_REG_TEKST": "org:group",
        "KONT_LPR_ENTITY_ID": "org:resource",
        "KONT_INST_EJERTYPE": "org:role",
        "KONT_STARTTIDSPUNKT": "start_timestamp",
        "KONT_SLUTTIDSPUNKT": "time:timestamp",
        # "BORGER_ALDER_AAR_IND": "patient:age",
        "PRIORITET_TEKST": "medical:priority",
        "KONT_TYPE_TEKST": "medical:contact_type",
    }

    # One entry per diagnosis group:
    #   (label, ADIAG prefixes that select the group, ADIAG prefixes excluded)
    # The order is significant: it is the order of the labels in TORRA_DIAG.
    # A code may fall into several groups (e.g. DE10 is both ENDOCRINE and
    # DIABETES). Prefixes are spliced into a regular expression unescaped, so
    # they must not contain regex metacharacters.
    # NOTE(review): "GENITURINARY" looks like a misspelling of "GENITOURINARY";
    # it is left untouched because the label ends up in the output data.
    DIAGNOSIS_GROUPS: tuple[tuple[str, tuple[str, ...], tuple[str, ...]], ...] = (
        ("LUNG", ("DJ41", "DJ42", "DJ43", "DJ44", "DJ45", "DJ46"), ()),
        (
            "MUSCULOSKELETAL",
            (
                "DL405",
                "DM05",
                "DM06",
                "DM07",
                "DM15",
                "DM16",
                "DM17",
                "DM45",
                "DM47",
                "DM50",
                "DM51",
                "DM53",
                "DM54",
                "DM80",
                "DM81",
                "DM82",
            ),
            (),
        ),
        (
            "ENDOCRINE",
            ("DE03", "DE05", "DE10", "DE11", "DE12", "DE13", "DE14"),
            (),
        ),
        ("THYROIDEA", ("DE03", "DE05"), ()),
        ("DIABETES", ("DE10", "DE11", "DE12", "DE13", "DE14"), ()),
        ("MENTAL", ("DG30", "DG318", "DG319", "DF"), ()),
        # CANCER: matches DC but explicitly excludes DC44
        ("CANCER", ("DC",), ("DC44",)),
        (
            "NEUROLOGICAL",
            (
                "DI60",
                "DI61",
                "DI62",
                "DI63",
                "DI64",
                "DI69",
                "DG20",
                "DG35",
                "DG40",
                "DG43",
            ),
            (),
        ),
        (
            "GASTROINTESTINAL",
            (
                "DK30",
                "DK50",
                "DK51",
                "DK58",
                "DK70",
                "DK71",
                "DK72",
                "DK73",
                "DK74",
                "DK75",
                "DK76",
                "DK860",
                "DK861",
            ),
            (),
        ),
        (
            "CARDIOVASCULAR",
            (
                "DI20",
                "DI21",
                "DI22",
                "DI23",
                "DI24",
                "DI25",
                "DI47",
                "DI48",
                "DI49",
                "DI50",
                "DI05",
                "DI06",
                "DI07",
                "DI08",
                "DI34",
                "DI35",
                "DI36",
                "DI37",
                "DI441",
                "DI442",
                "DI443",
                "DI444",
                "DI445",
                "DI446",
                "DI447",
                "DI452",
                "DI453",
                "DI454",
                "DI455",
                "DI456",
                "DI457",
                "DI458",
                "DI459",
            ),
            (),
        ),
        (
            "GENITURINARY",
            (
                "DN03",
                "DN04",
                "DN05",
                "DN11",
                "DN12",
                "DN18",
                "DN19",
                "DZ49",
                "DN80",
                "DZ992",
                "DN393",
                "DN394",
            ),
            (),
        ),
        (
            "SENSORY ORGANS",
            (
                "DH40",
                "DH91",
                "DL40",
                "DH540",
                "DH541",
                "DH542",
                "DH543",
                "DH547",
                "DH900",
                "DH902",
                "DH903",
                "DH905",
                "DH906",
                "DH908",
            ),
            (),
        ),
        ("ALLERGY", ("DL23", "DL24", "DL25", "DJ30", "DL500", "DJ450"), ()),
    )

    def _columns_to_select(self) -> list[str]:
        """
        List the source columns to read from the input frame.

        Returns:
            Every key of CASE_ATTR and EVENT_ATTR except the derived
            TORRA_DIAG, followed by ADIAG (the input of the TORRA_DIAG
            derivation)
        """
        columns = [
            name
            for name in (self.CASE_ATTR | self.EVENT_ATTR)
            if name != "TORRA_DIAG"  # created later by _add_torra_diag
        ]
        columns.append("ADIAG")  # needed for the TORRA_DIAG transformation
        return columns

    def apply(self, lf: pl.LazyFrame) -> pl.LazyFrame:
        """
        Apply diagnosis-specific transformations.

        Args:
            lf: Input LazyFrame containing diagnosis data

        Returns:
            LazyFrame restricted to the selected columns, with TORRA_DIAG
            added as a ", "-joined string and all mapped columns renamed
        """
        lf = self._add_torra_diag(lf.select(self._columns_to_select()))

        return lf.with_columns(pl.col("TORRA_DIAG").list.join(", ")).rename(
            self.CASE_ATTR | self.EVENT_ATTR
        )

    def _add_torra_diag(self, lazy_frame: pl.LazyFrame) -> pl.LazyFrame:
        """
        Add the list column TORRA_DIAG derived from ADIAG.

        Args:
            lazy_frame: LazyFrame that contains an ADIAG column

        Returns:
            The frame with TORRA_DIAG holding, per row, the labels of all
            groups in DIAGNOSIS_GROUPS that the ADIAG value falls into
        """
        code = pl.col("ADIAG")

        def anchored(prefixes: tuple[str, ...]) -> str:
            # Regex that matches when the value starts with any of the prefixes
            return f"^(?:{'|'.join(prefixes)})"

        def group_label(
            label: str, prefixes: tuple[str, ...], excluded: tuple[str, ...]
        ) -> pl.Expr:
            # Yields the label where ADIAG belongs to the group, null otherwise
            belongs = code.str.contains(anchored(prefixes))
            if excluded:
                belongs = belongs & ~code.str.contains(anchored(excluded))
            return pl.when(belongs).then(pl.lit(label)).otherwise(pl.lit(None))

        # One candidate label per group, then drop the nulls of the groups
        # that did not match.
        return lazy_frame.with_columns(
            TORRA_DIAG=pl.concat_list(
                [group_label(*group) for group in self.DIAGNOSIS_GROUPS]
            ).list.drop_nulls()
        )
@@ -0,0 +1,86 @@
1
+ """
2
+ I/O handler for parquet file processing.
3
+
4
+ This module provides generic I/O functionality for reading, filtering,
5
+ and writing parquet files. It uses a filter registry pattern to support
6
+ different filter types (diagnosis, procedure, medication, etc.).
7
+ """
8
+
9
+ from pathlib import Path
10
+ from typing import Type
11
+ import polars as pl
12
+
13
+ from .diagnosis import BaseFilter, DiagnosisFilter
14
+
15
+
16
# Lookup table from a filter-type string to the BaseFilter subclass implementing it
FILTER_REGISTRY: dict[str, Type[BaseFilter]] = dict(
    diagnosis=DiagnosisFilter,
    # Future additions:
    # procedure=ProcedureFilter,
    # medication=MedicationFilter,
)
23
+
24
+
25
def process_parquet_files(
    input_dir: Path,
    output_file: Path,
    filter_type: str,
    verbose: bool = False,
) -> None:
    """
    Process parquet files with a specified filter type.

    This function handles all I/O operations:
    - Checks that the input directory contains ``*.parquet`` files
    - Lazily scans them using Polars
    - Applies the filter registered under ``filter_type``
    - Writes the result to ``output_file`` via ``sink_parquet``

    Args:
        input_dir: Directory containing input parquet files
        output_file: Path to output parquet file; missing parent directories
            are created
        filter_type: Key into FILTER_REGISTRY (e.g., "diagnosis")
        verbose: Accepted for interface symmetry; currently unused here

    Returns:
        None. The result is only written to ``output_file``.

    Raises:
        ValueError: If filter_type is not registered
        FileNotFoundError: If no parquet files found in input directory
    """
    # Resolve the filter first so that a bad filter name fails before any I/O
    try:
        filter_class = FILTER_REGISTRY[filter_type]
    except KeyError:
        available_filters = ", ".join(FILTER_REGISTRY.keys())
        raise ValueError(
            f"Unknown filter type '{filter_type}'. "
            f"Available filters: {available_filters}"
        ) from None

    # Only emptiness matters here, so stop at the first match instead of
    # materialising the whole directory listing.
    if not any(input_dir.glob("*.parquet")):
        raise FileNotFoundError(f"No parquet files found in {input_dir}")

    # Lazily scan every parquet file in the directory through a glob pattern
    lazy_frame = pl.scan_parquet(str(input_dir / "*.parquet"))

    # Build the transformed (still lazy) query
    filtered_lazy_frame = filter_class().apply(lazy_frame)

    # Ensure output directory exists
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Execute the query and write the result straight to the output file
    filtered_lazy_frame.sink_parquet(output_file)
@@ -0,0 +1,161 @@
1
+ """
2
+ CLI command to print the standalone update script for the restricted environment.
3
+ """
4
+
5
+ import click
6
+
7
+
8
@click.command("print-update-script")
def print_update_script():
    """
    Print a standalone Python script for updating specatwrap on the restricted environment.

    The script finds the currently installed version (e.g., specatwrap10),
    uninstalls it, and installs the next version (specatwrap11).

    Usage on development machine:

        specatwrap print-update-script > update_specatwrap.py

    Then transfer update_specatwrap.py to the restricted environment and run:

        python update_specatwrap.py
    """
    # The literal below is the complete text of the generated script and is
    # emitted verbatim. It is a regular (non-raw) string, so every doubled
    # backslash in it reaches stdout as a single backslash, which the generated
    # script then interprets itself (newline escapes, regex classes).
    #
    # What the generated script does, in order: runs `pip list`, requires exactly
    # one package whose name is "specatwrap" followed by digits, runs
    # `pip uninstall -y` on it, then `pip install` for the same name with the
    # number incremented by one.
    #
    # NOTE(review): the generated script invokes whatever `pip` the shell
    # resolves; confirm this is the pip of the interpreter that has specatwrap
    # installed on the restricted machine.
    # NOTE(review): the old package is uninstalled before the new one is
    # installed, so a failed install leaves no specatwrap installed - confirm
    # this ordering is intended.
    script = '''#!/usr/bin/env python3
"""
Standalone update script for specatwrap package.

This script:
1. Finds the currently installed specatwrap version (e.g., specatwrap10)
2. Uninstalls it without confirmation
3. Installs the next version (e.g., specatwrap11)

Usage:
    python update_specatwrap.py
"""

import subprocess
import sys
import re


def run_command(cmd, capture=True):
    """Run a shell command and return the result."""
    try:
        result = subprocess.run(
            cmd,
            shell=True,
            capture_output=capture,
            text=True,
            check=False
        )
        return result
    except Exception as e:
        print(f"Error running command: {e}")
        sys.exit(1)


def find_installed_version():
    """Find the currently installed specatwrap package."""
    result = run_command("pip list")

    if result.returncode != 0:
        print("Error: Failed to run 'pip list'")
        print(result.stderr)
        sys.exit(1)

    # Look for specatwrap packages in pip list output
    lines = result.stdout.split('\\n')
    specatwrap_packages = []

    for line in lines:
        # Match lines starting with specatwrap (with or without version number)
        match = re.match(r'^(specatwrap\\d*)\\s+', line.strip())
        if match:
            specatwrap_packages.append(match.group(1))

    if not specatwrap_packages:
        print("Error: No specatwrap package found.")
        print("Please install specatwrap manually first.")
        sys.exit(1)

    if len(specatwrap_packages) > 1:
        print(f"Error: Multiple specatwrap packages found: {specatwrap_packages}")
        print("Please uninstall extra versions manually.")
        sys.exit(1)

    return specatwrap_packages[0]


def parse_version(package_name):
    """Extract version number from package name."""
    # Match specatwrap followed by digits
    match = re.match(r'^specatwrap(\\d+)$', package_name)

    if not match:
        # Package is just "specatwrap" without a number
        print(f"Error: Found '{package_name}' without version number.")
        print("This should not exist in the mirror.")
        sys.exit(1)

    return int(match.group(1))


def main():
    print("=" * 60)
    print("specatwrap Update Script")
    print("=" * 60)

    # Step 1: Find currently installed version
    print("\\n[1/3] Finding currently installed version...")
    current_package = find_installed_version()
    print(f"Found: {current_package}")

    # Parse version number and calculate next version
    current_version = parse_version(current_package)
    next_version = current_version + 1
    next_package = f"specatwrap{next_version}"

    print(f"Will update: {current_package} -> {next_package}")

    # Step 2: Uninstall old version
    print(f"\\n[2/3] Uninstalling {current_package}...")
    result = run_command(f"pip uninstall -y {current_package}", capture=False)

    if result.returncode != 0:
        print(f"\\nError: Failed to uninstall {current_package}")
        sys.exit(1)

    print(f"Successfully uninstalled {current_package}")

    # Step 3: Install new version
    print(f"\\n[3/3] Installing {next_package}...")
    result = run_command(f"pip install {next_package}", capture=False)

    if result.returncode != 0:
        print(f"\\nError: Failed to install {next_package}")
        print("\\nPossible reasons:")
        print(" - The new version may not be available yet in the PyPI mirror")
        print(" - The mirror updates once daily")
        print(" - Please try again later or check if the package was published")
        sys.exit(1)

    print(f"\\nSuccessfully installed {next_package}")

    print("\\n" + "=" * 60)
    print("Update complete!")
    print("=" * 60)
    return 0


if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        print("\\nUpdate cancelled.")
        sys.exit(1)
    except Exception as e:
        print(f"\\nUnexpected error: {e}")
        sys.exit(1)
'''
    # Write the script text to stdout so it can be redirected into a file
    click.echo(script)
@@ -0,0 +1,171 @@
1
+ """
2
+ SAS to Parquet converter module.
3
+
4
+ This module provides functionality to convert large SAS7BDAT files to Parquet format
5
+ in a memory-efficient way by processing the data in chunks.
6
+ """
7
+
8
+ import sys
9
+ import time
10
+ from pathlib import Path
11
+ from typing import Optional
12
+ import shutil
13
+ import click
14
+ import pyreadstat
15
+ import polars as pl
16
+
17
+
18
def _prepare_output_dir(output_path: Path, overwrite: bool, verbose: bool) -> None:
    """Create, replace or (after confirmation) reuse the chunk output directory."""
    if not output_path.exists():
        output_path.mkdir(parents=True, exist_ok=True)
        return

    if not output_path.is_dir():
        raise ValueError(f"Output path exists and is not a directory: {output_path}")

    if overwrite:
        if verbose:
            click.echo(f"Removing existing directory: {output_path}")
        # Start from an empty directory so old chunks cannot mix with new ones
        shutil.rmtree(output_path)
        output_path.mkdir(parents=True, exist_ok=True)
        return

    # Existing directory without --overwrite: warn before mixing chunk files
    existing_parquet = list(output_path.glob("*.parquet"))
    if existing_parquet:
        click.secho(
            f"⚠ Warning: Output directory already contains {len(existing_parquet)} .parquet file(s)",
            fg="yellow",
        )
        if not click.confirm("Do you want to continue and potentially mix files?"):
            # A declined prompt is a deliberate user choice, not a failure
            click.echo("Aborted.")
            sys.exit(0)


def convert_sas_to_parquet(
    input_file: Path,
    output_path: Path,
    chunk_size: int = 100000,
    compression: str = "zstd",
    encoding: Optional[str] = None,
    overwrite: bool = False,
    verbose: bool = False,
):
    """
    Convert a SAS7BDAT file to Parquet format in chunks.

    Each chunk read from the input is written to its own file named
    ``chunk_NNNN.parquet`` inside ``output_path``; a summary is printed at
    the end.

    Parameters
    ----------
    input_file : Path
        Path to the SAS file (.sas7bdat)
    output_path : Path
        Path to output directory for Parquet files
    chunk_size : int, optional
        Number of rows to read per chunk (default: 100000); must be positive
    compression : str, optional
        Parquet compression algorithm (default: 'zstd')
    encoding : str, optional
        File encoding (default: auto-detect)
    overwrite : bool, optional
        Whether to overwrite existing output directory (default: False)
    verbose : bool, optional
        Enable verbose output (default: False)

    Raises
    ------
    FileNotFoundError
        If the input file does not exist
    ValueError
        If the input file is not a .sas7bdat file, ``chunk_size`` is not
        positive, or the output path exists and is not a directory
    SystemExit
        With status 0 if the user declines to write into a directory that
        already contains .parquet files
    """
    # Validate the arguments by raising, as documented above, so that callers
    # decide how a failure is reported instead of the process being terminated
    # from inside this function.
    if not input_file.exists():
        raise FileNotFoundError(str(input_file))

    if input_file.suffix.lower() != ".sas7bdat":
        raise ValueError("Input file must be a .sas7bdat file")

    # Checked before the output directory is touched (it may be deleted below)
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Handle output directory
    output_path = Path(output_path)
    _prepare_output_dir(output_path, overwrite, verbose)

    # Display processing information
    click.echo(f"Processing: {input_file}")
    click.echo(f"Output directory: {output_path}")
    if verbose:
        click.echo(f"Chunk size: {chunk_size:,} rows")
        click.echo(f"Compression: {compression}")
        click.echo(f"Encoding: {encoding if encoding else 'auto-detect'}")
    click.echo()

    # Process file in chunks
    click.echo("Converting SAS to Parquet...")

    reader = pyreadstat.read_file_in_chunks(
        pyreadstat.read_sas7bdat,
        str(input_file),
        chunksize=chunk_size,
        encoding=encoding,
        output_format="polars",
    )

    total_rows = 0
    num_columns = 0

    for chunk_index, (df_polars, meta) in enumerate(reader):
        # Time only the write of this chunk
        chunk_start_time = time.time()

        # File names are zero-based and zero-padded: chunk_0000.parquet, ...
        chunk_filename = f"chunk_{chunk_index:04d}.parquet"
        df_polars.write_parquet(output_path / chunk_filename, compression=compression)

        chunk_elapsed_time = time.time() - chunk_start_time

        # Update statistics
        chunk_rows = len(df_polars)
        total_rows += chunk_rows
        num_columns = len(meta.column_names)

        # The progress line counts chunks from 1 and is printed regardless of
        # the verbose flag.
        click.echo(
            f" Wrote chunk {chunk_index + 1}: {chunk_rows:,} rows → {chunk_filename} (took {chunk_elapsed_time:.2f}s)"
        )

    # Calculate file sizes. Every .parquet file in the directory is counted,
    # including files that were already there when the user chose to continue.
    input_size = input_file.stat().st_size / (1024 * 1024)  # MB
    output_files = list(output_path.glob("*.parquet"))
    output_size = sum(f.stat().st_size for f in output_files) / (1024 * 1024)  # MB
    compression_ratio = input_size / output_size if output_size > 0 else 0

    # Display success message
    click.echo()
    click.secho("✓ Conversion completed successfully!", fg="green", bold=True)
    click.echo(f"Input file: {input_size:.2f} MB ({input_file.name})")
    click.echo(f"Output files: {output_size:.2f} MB ({len(output_files)} chunks)")
    click.echo(f"Compression: {compression_ratio:.2f}x")
    click.echo(f"Total rows: {total_rows:,}")
    click.echo(f"Columns: {num_columns}")
    click.echo(f"Chunk size: {chunk_size:,} rows/chunk")
    click.echo()
    click.echo("To read the data:")
    click.echo(" import polars as pl")
    click.echo(f" df = pl.read_parquet('{output_path}/*.parquet')")
@@ -0,0 +1,93 @@
1
+ """
2
+ SAS file preview module.
3
+
4
+ This module provides functionality to quickly preview the first N rows
5
+ of a SAS7BDAT file for inspection purposes.
6
+ """
7
+
8
+ import click
9
+ from pathlib import Path
10
+ import sys
11
+ import pyreadstat
12
+
13
+
14
@click.command()
@click.argument("input_file", type=click.Path(exists=True, path_type=Path))
@click.option(
    "-n",
    "--num-rows",
    # The value is handed to the reader as its row limit; reject zero and
    # negative values here so that a "preview" always asks for a bounded,
    # positive number of rows.
    type=click.IntRange(min=1),
    default=10,
    help="Number of rows to display (default: 10)",
)
@click.option(
    "--encoding",
    type=str,
    default=None,
    help="File encoding (default: auto-detect)",
)
def preview(input_file, num_rows, encoding):
    """
    Preview the first N rows of a SAS file.

    INPUT_FILE: Path to the SAS file (.sas7bdat)

    This command quickly reads and displays the first few rows of a SAS file
    to help you inspect the data structure and content without converting
    the entire file.

    Example usage:

        specatwrap preview input.sas7bdat

        specatwrap preview input.sas7bdat --num-rows 20

        specatwrap preview input.sas7bdat --encoding latin1
    """
    try:
        # Existence is already enforced by click.Path(exists=True) on the
        # argument, so only the file type needs checking here.
        if input_file.suffix.lower() != ".sas7bdat":
            click.secho(
                "✗ Error: Input file must be a .sas7bdat file", fg="red", err=True
            )
            sys.exit(1)

        click.echo(f"Reading first {num_rows} rows from: {input_file.name}")
        click.echo()

        # Read first N rows using pyreadstat
        df, meta = pyreadstat.read_sas7bdat(
            str(input_file),
            row_limit=num_rows,
            encoding=encoding,
            output_format="polars",
        )

        # Display metadata
        click.echo(f"Total columns: {len(meta.column_names)}")
        click.echo(f"Column names: {', '.join(meta.column_names)}")
        if encoding:
            click.echo(f"Encoding: {encoding}")
        click.echo()

        # Display the data using Polars' pretty print
        click.echo(str(df))
        click.echo()
        click.secho(f"✓ Showing {len(df)} row(s)", fg="green")

    except FileNotFoundError as e:
        click.secho(f"✗ Error: File not found - {e}", fg="red", err=True)
        sys.exit(1)
    except PermissionError as e:
        click.secho(f"✗ Error: Permission denied - {e}", fg="red", err=True)
        sys.exit(1)
    except ValueError as e:
        click.secho(f"✗ Error: Invalid input - {e}", fg="red", err=True)
        sys.exit(1)
    except Exception as e:
        # Last-resort boundary for anything the reader raises unexpectedly
        click.secho(f"✗ Error: {e}", fg="red", err=True)
        sys.exit(1)