specatwrap37 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- specatwrap37-0.2.0/PKG-INFO +13 -0
- specatwrap37-0.2.0/README.md +0 -0
- specatwrap37-0.2.0/pyproject.toml +33 -0
- specatwrap37-0.2.0/src/specatwrap37/__init__.py +39 -0
- specatwrap37-0.2.0/src/specatwrap37/prep/__init__.py +124 -0
- specatwrap37-0.2.0/src/specatwrap37/prep/diagnosis.py +425 -0
- specatwrap37-0.2.0/src/specatwrap37/prep/io_handler.py +86 -0
- specatwrap37-0.2.0/src/specatwrap37/printable/__init__.py +103 -0
- specatwrap37-0.2.0/src/specatwrap37/printable/combine_death_contact.py +14 -0
- specatwrap37-0.2.0/src/specatwrap37/printable/summary.py +68 -0
- specatwrap37-0.2.0/src/specatwrap37/printable/zoom_in.py +41 -0
- specatwrap37-0.2.0/src/specatwrap37/sas_converter.py +299 -0
- specatwrap37-0.2.0/src/specatwrap37/sas_preview.py +93 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: specatwrap37
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A simple wrapper
|
|
5
|
+
Author: Casper Lauge Nørup Koch
|
|
6
|
+
Author-email: Casper Lauge Nørup Koch <kochcasper@gmail.com>
|
|
7
|
+
Requires-Dist: click>=8.3.1
|
|
8
|
+
Requires-Dist: pm4py>=2.7.19.8
|
|
9
|
+
Requires-Dist: polars>=1.38.1
|
|
10
|
+
Requires-Dist: pyreadstat>=1.3.3
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
File without changes
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "specatwrap37"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "A simple wrapper"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Casper Lauge Nørup Koch", email = "kochcasper@gmail.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.12"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"click>=8.3.1",
|
|
12
|
+
"pm4py>=2.7.19.8",
|
|
13
|
+
"polars>=1.38.1",
|
|
14
|
+
"pyreadstat>=1.3.3",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.scripts]
|
|
18
|
+
specatwrap = "specatwrap37:main"
|
|
19
|
+
|
|
20
|
+
[build-system]
|
|
21
|
+
requires = ["uv_build>=0.10.3,<0.11.0"]
|
|
22
|
+
build-backend = "uv_build"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
[tool.pyright]
|
|
26
|
+
reportUnannotatedClassAttribute = "none"
|
|
27
|
+
reportImplicitOverride = "none"
|
|
28
|
+
reportUnknownParameterType = "none"
|
|
29
|
+
reportMissingParameterType = "none"
|
|
30
|
+
reportUnknownVariableType = "none"
|
|
31
|
+
reportUnknownMemberType = "none"
|
|
32
|
+
reportUnknownArgumentType = "none"
|
|
33
|
+
reportAny = "none"
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
Specatwrap - A wrapper for processing healthcare data.

This module provides a CLI for converting and processing healthcare data files.
"""

import click

from .sas_converter import sas2parquet
from .prep import prep
from .sas_preview import preview
from .printable import print_file


@click.group()
# NOTE: keep in sync with the package version in pyproject.toml / PKG-INFO
# (was "0.1.0" while the package itself shipped as 0.2.0).
@click.version_option(version="0.2.0")
def cli():
    """
    Specatwrap - A wrapper for processing healthcare data.

    A command-line tool for processing and converting healthcare data files.
    """


# Register command groups
cli.add_command(sas2parquet)
cli.add_command(prep)
cli.add_command(preview)
cli.add_command(print_file)


def main():
    """Entry point for the CLI application (console script `specatwrap`)."""
    cli()


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""
Prep command group for preprocessing healthcare data files.

This module provides commands for filtering and preprocessing parquet files
before converting them to XES format.
"""

import click
from pathlib import Path
import sys

from .io_handler import process_parquet_files


@click.group()
def prep():
    """
    Preprocess and filter healthcare data files.

    Commands in this group help prepare raw data files by filtering,
    cleaning, and transforming them before further processing.
    """


@prep.command()
@click.option(
    "-i",
    "--input",
    "input_dir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
    required=True,
    help="Path to directory containing parquet files to process.",
)
@click.option(
    "-o",
    "--output",
    "output_file",
    type=click.Path(dir_okay=False, file_okay=True, path_type=Path),
    required=True,
    help="Path to output parquet file.",
)
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
def diagnosis(input_dir: Path, output_file: Path, verbose: bool) -> None:
    """
    Filter and preprocess diagnosis parquet files.

    This command lazily reads all parquet files from a directory, applies
    filtering and preprocessing transformations, and writes the results to
    a single output parquet file.

    INPUT: Directory path containing parquet files to process.

    OUTPUT: Path to output parquet file for processed data.

    Example usage:

        specatwrap prep diagnosis -i ./data/parquet_files/ -o ./processed/diagnosis.parquet

        specatwrap prep diagnosis --input ./raw_data/ --output ./clean_data.parquet -v
    """
    try:
        # Display processing information
        click.echo(f"Input directory: {input_dir}")
        click.echo(f"Output file: {output_file}")
        click.echo()

        # In verbose mode, list the parquet files that will be picked up.
        if verbose:
            parquet_pattern = str(input_dir / "*.parquet")
            click.echo(f"Searching for parquet files: {parquet_pattern}")
            parquet_files = list(input_dir.glob("*.parquet"))
            if parquet_files:
                click.echo(f"Found {len(parquet_files)} parquet file(s):")
                for f in parquet_files:
                    click.echo(f"  - {f.name}")
                click.echo()

        # The whole pipeline is a single streamed Polars query, so there is
        # no incremental progress to report. The previous progress bar only
        # jumped from 0 to 100 after the work had already finished, which was
        # misleading — a plain status message is honest about what happens.
        click.echo("Loading and applying filters...")
        process_parquet_files(
            input_dir=input_dir,
            output_file=output_file,
            filter_type="diagnosis",
            verbose=verbose,
        )

        # Display success message
        click.secho("✓ Processing completed successfully!", fg="green", bold=True)
        click.echo(f"Output file: {output_file}")

        # Display file statistics
        if output_file.exists():
            size_mb = output_file.stat().st_size / (1024 * 1024)
            click.echo(f"File size: {size_mb:.2f} MB")

    except FileNotFoundError as e:
        click.secho(f"✗ Error: File or directory not found - {e}", fg="red", err=True)
        sys.exit(1)
    except PermissionError as e:
        click.secho(f"✗ Error: Permission denied - {e}", fg="red", err=True)
        sys.exit(1)
    except ValueError as e:
        click.secho(f"✗ Error: Invalid input - {e}", fg="red", err=True)
        sys.exit(1)
    except MemoryError:
        click.secho(
            "✗ Error: Out of memory. Try processing smaller batches.",
            fg="red",
            err=True,
        )
        sys.exit(1)
    except Exception as e:
        # Last-resort handler at the CLI boundary: report, optionally show
        # the traceback, and exit non-zero rather than dumping a raw trace.
        click.secho(f"✗ Error: {e}", fg="red", err=True)
        if verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)
|
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Diagnosis filter classes.
|
|
3
|
+
|
|
4
|
+
This module provides filter classes for preprocessing diagnosis parquet files.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
import polars as pl
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseFilter(ABC):
    """
    Abstract base class for parquet file filters.

    Concrete filter types (diagnosis, procedure, medication, etc.) subclass
    this and implement ``apply()`` with their specific filtering and
    transformation logic.
    """

    @abstractmethod
    def apply(self, lf: pl.LazyFrame) -> pl.LazyFrame:
        """
        Apply filtering and preprocessing logic to a LazyFrame.

        Args:
            lf: Input Polars LazyFrame to filter/transform.

        Returns:
            Transformed Polars LazyFrame.
        """

    def get_name(self) -> str:
        """
        Get the name of this filter type.

        Returns:
            Filter type name: the class name, lower-cased, with a trailing
            'Filter' suffix stripped when present.
        """
        return type(self).__name__.removesuffix("Filter").lower()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class DiagnosisFilter(BaseFilter):
    """
    Filter for diagnosis parquet files.

    Derives a TORRA_DIAG disease-category list column from the ADIAG code,
    drops rows whose code matched no category, and renames the remaining
    columns to a standardized event-log format (``case:`` / event attributes).

    NOTE: an earlier docstring claimed birthdate (year > 1980) and region
    (Region Sjælland) filtering; the implementation performs neither, so
    those claims have been removed.
    """

    # Case-level attributes, renamed with a "case:" prefix.
    CASE_ATTR = {
        "BORGER_FOEDSELSDATO": "case:BDay",
        "PNR": "case:PNR",
    }

    # Event-level attributes, renamed to event-log names.
    # "BORGER_ALDER_AAR_IND" ("patient:age") is deliberately excluded.
    EVENT_ATTR = {
        "TORRA_DIAG": "TDiag",
        "ADIAG": "ADiag",
        "ADIAG_TEKST": "diagText",
        "KONT_ANS_GEO_REG_TEKST": "region",
        "KONT_LPR_ENTITY_ID": "org:id",
        "KONT_INST_EJERTYPE": "org:type",
        "KONT_STARTTIDSPUNKT": "startTime",
        "KONT_SLUTTIDSPUNKT": "endTime",
        "PRIORITET_TEKST": "priority",
        "KONT_TYPE_TEKST": "contact_type",
    }

    def apply(self, lf: pl.LazyFrame) -> pl.LazyFrame:
        """
        Apply diagnosis-specific filtering and transformations.

        Args:
            lf: Input LazyFrame containing diagnosis data.

        Returns:
            Filtered and transformed LazyFrame: rows with at least one
            matched disease category, the category list joined into a
            comma-separated string, and columns selected/renamed per
            CASE_ATTR and EVENT_ATTR.
        """
        lf = self._add_torra_diag(lf)

        return (
            lf.filter(pl.col("TORRA_DIAG").list.len() > 0)
            .with_columns(pl.col("TORRA_DIAG").list.join(", "))
            .select(list((self.CASE_ATTR | self.EVENT_ATTR).keys()))
            .rename(self.CASE_ATTR | self.EVENT_ATTR)
        )

    def _add_torra_diag(self, lazy_frame: pl.LazyFrame) -> pl.LazyFrame:
        """
        Add a TORRA_DIAG list column of disease categories matched by ADIAG.

        A single ADIAG code can match several categories (e.g. DE10 is both
        ENDOCRINE and DIABETES), so the result is a list column; category
        expressions that do not match contribute nulls, which are dropped
        at the end.
        """
        col = pl.col("ADIAG")

        def starts_with(prefixes: list[str], label: str) -> pl.Expr:
            # Anchored alternation: label rows whose ADIAG begins with any
            # of the given ICD-10 prefixes, null otherwise.
            pattern = f"^(?:{'|'.join(prefixes)})"
            return (
                pl.when(col.str.contains(pattern))
                .then(pl.lit(label))
                .otherwise(pl.lit(None))
            )

        return lazy_frame.with_columns(
            TORRA_DIAG=pl.concat_list(
                [
                    starts_with(
                        ["DJ41", "DJ42", "DJ43", "DJ44", "DJ45", "DJ46"], "LUNG"
                    ),
                    starts_with(
                        [
                            "DL405", "DM05", "DM06", "DM07", "DM15", "DM16",
                            "DM17", "DM45", "DM47", "DM50", "DM51", "DM53",
                            "DM54", "DM80", "DM81", "DM82",
                        ],
                        "MUSCULOSKELETAL",
                    ),
                    starts_with(
                        ["DE03", "DE05", "DE10", "DE11", "DE12", "DE13", "DE14"],
                        "ENDOCRINE",
                    ),
                    starts_with(["DE03", "DE05"], "THYROIDEA"),
                    starts_with(["DE10", "DE11", "DE12", "DE13", "DE14"], "DIABETES"),
                    starts_with(["DG30", "DG318", "DG319", "DF"], "MENTAL"),
                    # CANCER: Matches DC but explicitly excludes DC44
                    pl.when(col.str.contains("^DC") & ~col.str.contains("^DC44"))
                    .then(pl.lit("CANCER"))
                    .otherwise(pl.lit(None)),
                    starts_with(
                        [
                            "DI60", "DI61", "DI62", "DI63", "DI64", "DI69",
                            "DG20", "DG35", "DG40", "DG43",
                        ],
                        "NEUROLOGICAL",
                    ),
                    starts_with(
                        [
                            "DK30", "DK50", "DK51", "DK58", "DK70", "DK71",
                            "DK72", "DK73", "DK74", "DK75", "DK76", "DK860",
                            "DK861",
                        ],
                        "GASTROINTESTINAL",
                    ),
                    starts_with(
                        [
                            "DI20", "DI21", "DI22", "DI23", "DI24", "DI25",
                            "DI47", "DI48", "DI49", "DI50", "DI05", "DI06",
                            "DI07", "DI08", "DI34", "DI35", "DI36", "DI37",
                            "DI441", "DI442", "DI443", "DI444", "DI445",
                            "DI446", "DI447", "DI452", "DI453", "DI454",
                            "DI455", "DI456", "DI457", "DI458", "DI459",
                        ],
                        "CARDIOVASCULAR",
                    ),
                    starts_with(
                        [
                            "DN03", "DN04", "DN05", "DN11", "DN12", "DN18",
                            "DN19", "DZ49", "DN80", "DZ992", "DN393", "DN394",
                        ],
                        "GENITURINARY",
                    ),
                    starts_with(
                        [
                            "DH40", "DH91", "DL40", "DH540", "DH541", "DH542",
                            "DH543", "DH547", "DH900", "DH902", "DH903",
                            "DH905", "DH906", "DH908",
                        ],
                        "SENSORY ORGANS",
                    ),
                    starts_with(
                        ["DL23", "DL24", "DL25", "DJ30", "DL500", "DJ450"],
                        "ALLERGY",
                    ),
                ]
            ).list.drop_nulls()
        )

    def print_torra_diag_method(self) -> None:
        """
        Print the source of ``_add_torra_diag`` to stdout.

        Previously this held a hand-maintained ~160-line copy of the method
        body, which would silently drift out of sync with the real
        implementation. ``inspect.getsource`` always reflects the actual
        code.
        """
        import inspect

        print(inspect.getsource(self._add_torra_diag), end="")
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""
I/O handler for parquet file processing.

This module provides generic I/O functionality for reading, filtering,
and writing parquet files. It uses a filter registry pattern to support
different filter types (diagnosis, procedure, medication, etc.).
"""

from pathlib import Path

import polars as pl

from .diagnosis import BaseFilter, DiagnosisFilter


# Registry mapping filter type strings to filter classes
FILTER_REGISTRY: dict[str, type[BaseFilter]] = {
    "diagnosis": DiagnosisFilter,
    # Future additions:
    # "procedure": ProcedureFilter,
    # "medication": MedicationFilter,
}


def process_parquet_files(
    input_dir: Path,
    output_file: Path,
    filter_type: str,
    verbose: bool = False,
) -> None:
    """
    Process parquet files with a specified filter type.

    This function handles all I/O operations:
    - Discovers parquet files in the input directory
    - Lazily loads them using Polars
    - Applies the specified filter transformation
    - Streams the filtered result to the output file via ``sink_parquet``,
      so the full dataset is never materialized in memory at once

    Args:
        input_dir: Directory containing input parquet files
        output_file: Path to output parquet file
        filter_type: Type of filter to apply (e.g., "diagnosis", "procedure")
        verbose: Accepted for interface symmetry with the CLI layer;
            currently unused in the I/O layer

    Returns:
        None. (An earlier docstring claimed a tuple of row/column counts;
        the function writes its result to ``output_file`` instead.)

    Raises:
        FileNotFoundError: If no parquet files found in input directory
        ValueError: If filter_type is not registered
        PermissionError: If unable to read/write files
        MemoryError: If insufficient memory to process data
    """
    # Validate the filter type up front so the caller gets a clear message.
    if filter_type not in FILTER_REGISTRY:
        available_filters = ", ".join(FILTER_REGISTRY)
        raise ValueError(
            f"Unknown filter type '{filter_type}'. "
            f"Available filters: {available_filters}"
        )

    # Fail early on an empty directory: scanning an empty glob would
    # otherwise surface as a confusing error deep inside Polars.
    if not list(input_dir.glob("*.parquet")):
        raise FileNotFoundError(f"No parquet files found in {input_dir}")

    # Lazily load all parquet files via a glob pattern.
    lazy_frame = pl.scan_parquet(str(input_dir / "*.parquet"))

    # Instantiate the registered filter and apply its transformation.
    filter_instance = FILTER_REGISTRY[filter_type]()
    filtered_lazy_frame = filter_instance.apply(lazy_frame)

    # Ensure the output directory exists, then stream the result to disk.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    filtered_lazy_frame.sink_parquet(output_file)
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""
CLI command to copy Python files from the printable folder to clipboard.

This allows easy transfer of scripts on the remote protected machine.
"""

import subprocess
from pathlib import Path

import click


@click.command("print")
@click.argument("filename", required=False)
# Explicit destination "list_files" avoids binding the value to a parameter
# named "list", which shadowed the builtin; the CLI flags are unchanged.
@click.option("--list", "-l", "list_files", is_flag=True, help="List all available printable files")
@click.option("--imports", "-i", is_flag=True, help="Include import statements")
def print_file(filename: str | None, list_files: bool, imports: bool) -> None:
    """
    Copy Python files from the printable folder to clipboard.

    This command helps transfer code to the restricted environment by copying
    file contents directly to the Windows clipboard.

    By default, import statements are excluded. Use -i to include them.

    Usage:

        specatwrap print diagnosis_kpi

        specatwrap print diagnosis_kpi -i

        specatwrap print --list

    \b
    Arguments:
        FILENAME: Name of the file to copy (without .py extension)
    """
    printable_dir = Path(__file__).parent

    # List available files
    if list_files:
        python_files = sorted(
            f for f in printable_dir.glob("*.py") if f.name != "__init__.py"
        )
        if not python_files:
            click.echo("No printable files available.")
            return

        click.echo("Available printable files:")
        for file in python_files:
            click.echo(f"  - {file.stem}")
        return

    # A filename is required once we are not just listing.
    if not filename:
        click.echo(
            "Error: Please provide a filename or use --list to see available files."
        )
        click.echo("Usage: specatwrap print <filename>")
        click.echo("       specatwrap print --list")
        raise click.Abort()

    # Add .py extension if not present.
    # Bug fix: the f-string previously contained a broken placeholder and
    # produced a constant bogus name instead of using the requested filename.
    if not filename.endswith(".py"):
        filename = f"{filename}.py"

    file_path = printable_dir / filename

    if not file_path.exists():
        click.echo(f"Error: File '{filename}' not found in printable folder.")
        click.echo("\nUse 'specatwrap print --list' to see available files.")
        raise click.Abort()

    # Read and copy to clipboard
    try:
        content = file_path.read_text()

        # Strip import/from lines unless -i was given, so the pasted snippet
        # does not pull in modules unavailable on the restricted machine.
        if not imports:
            content = "\n".join(
                line
                for line in content.split("\n")
                if not line.strip().startswith(("import ", "from "))
            )

        # Copy to clipboard using Windows clip.exe
        subprocess.run(["clip.exe"], input=content, text=True, check=True)

        import_status = "with imports" if imports else "without imports"
        click.echo(f"✓ Content of '{filename}' copied to clipboard ({import_status})!")

    except subprocess.CalledProcessError as e:
        click.echo(f"Error copying to clipboard: {e}")
        raise click.Abort()
    except Exception as e:
        click.echo(f"Error reading file: {e}")
        raise click.Abort()
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import polars as pl

diag_lf = pl.scan_parquet("diag.parquet")
death_lf = pl.scan_parquet("death.parquet")

# Normalize the death register to the event-log schema and label every row
# with the pseudo-diagnosis "Death". Each death timestamp is shifted to just
# before the end of its day (next day minus 1 microsecond) so that, after
# sorting, a death always orders after any diagnoses on the same calendar day.
death_lf = death_lf.rename({"PNR": "case:PNR", "DODDATO": "startTime"}).with_columns(
    pl.lit("Death").alias("TDiag"),
    pl.col("startTime").cast(pl.Datetime("us")).dt.offset_by("1d").dt.offset_by("-1us"),
)

# Diagonal concat unions the two schemas, filling missing columns with nulls.
combined = pl.concat([diag_lf, death_lf], how="diagonal").sort(
    ["case:PNR", "startTime"]
)
combined.sink_parquet("combined_diag_death.parquet")
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import polars as pl

# Deduplicate: keep only each patient's first occurrence of every diagnosis
# category, preserving chronological order (input is pre-sorted by case/time).
lf = pl.scan_parquet("combined_diag_death.parquet").unique(
    subset=["case:PNR", "TDiag"], keep="first", maintain_order=True
)

# Computing the average time between diagnoses for each case, excluding the death event
lf = (
    lf.with_columns(pl.col("startTime").diff().over("case:PNR").alias("_tmp_diff"))
    .with_columns(
        pl.col("_tmp_diff")
        .filter(pl.col("TDiag") != "Death")
        .mean()
        .over("case:PNR")
        .alias("case:avg_inter_diagnosis_time")
    )
    .drop("_tmp_diff")
)

# Compute time to death from the last diagnosis for cases that ended with death, ensuring we only compute this if there are multiple events in the case (i.e., Death isn't the only event)
lf = lf.with_columns(
    pl.when(
        (pl.col("TDiag") == "Death")
        & (pl.len().over("case:PNR") > 1)  # Ensure Death isn't the only event
    )
    .then(pl.col("startTime") - pl.col("startTime").shift(1).over("case:PNR"))
    .otherwise(None)
    .alias("case:time_to_death_from_last_diag")
)

# Computing variants
# 1. Group by case to get the sequence of events as a "variant"
#    The ">Death$" suffix is stripped so dying does not create a new variant.
#    mortality_kpi = share of patients with this exact variant who died.
lf_tmp = (
    lf.group_by("case:PNR")
    .agg(
        pl.col("TDiag").str.join(">").str.replace(r">Death$", "").alias("variants"),
        pl.col("case:avg_inter_diagnosis_time").first(),
        pl.col("case:time_to_death_from_last_diag").last(),
        (pl.col("TDiag") != "Death").sum().alias("diagnosis_count"),
        (pl.col("TDiag") == "Death").any().alias("died"),
    )
    .with_columns(pl.col("died").mean().over("variants").alias("mortality_kpi"))
    .drop("died")
)

# Per-variant summary: mortality share, patient count, and averaged timing
# KPIs. diagnosis_count is taken with .first() — every case sharing a variant
# has the same number of events, so any representative value is correct.
lf_summary = lf_tmp.group_by("variants").agg(
    pl.col("mortality_kpi").first(),
    pl.len().alias("count"),
    pl.col("diagnosis_count").first(),
    pl.col("case:avg_inter_diagnosis_time")
    .mean()
    .alias("avg_avg_inter_diagnosis_time"),
    pl.col("case:time_to_death_from_last_diag")
    .mean()
    .alias("avg_inter_diagnosis_time_to_death"),
)

# Joining the KPI back to the main log for further analysis or export
patient_kpi_map = lf_tmp.select(
    [
        pl.col("case:PNR"),
        pl.col("mortality_kpi").alias("case:mortality_kpi"),
        pl.col("case:avg_inter_diagnosis_time"),
        pl.col("case:time_to_death_from_last_diag"),
        pl.col("diagnosis_count"),
    ]
)
lf = lf.join(patient_kpi_map, on="case:PNR", how="left")
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import polars as pl

# Deduplicate on the raw ADIAG code (not the TORRA_DIAG category, unlike
# summary.py): keep each patient's first occurrence of every distinct code.
lf = pl.scan_parquet("combined_diag_death.parquet").unique(
    subset=["case:PNR", "ADiag"], keep="first", maintain_order=True
)

# Computing the average time between diagnoses for each case, excluding the death event
lf = (
    lf.with_columns(pl.col("startTime").diff().over("case:PNR").alias("_tmp_diff"))
    .with_columns(
        pl.col("_tmp_diff")
        .filter(pl.col("TDiag") != "Death")
        .mean()
        .over("case:PNR")
        .alias("case:avg_inter_diagnosis_time")
    )
    .drop("_tmp_diff")
)

# Computing variants
# 1. Group by case to get the sequence of events as a "variant"
#    The ">Death$" suffix is stripped so dying does not create a new variant;
#    case:mortality = share of patients with this exact variant who died.
lf_tmp = (
    lf.group_by("case:PNR")
    .agg(
        pl.col("TDiag").str.join(">").str.replace(r">Death$", "").alias("variants"),
        (pl.col("TDiag") != "Death").sum().alias("diagnosis_count"),
        (pl.col("TDiag") == "Death").any().alias("died"),
    )
    .with_columns(pl.col("died").mean().over("variants").alias("case:mortality"))
    .drop("died")
)

# Joining the KPI back to the main log for further analysis or export
patient_kpi_map = lf_tmp.select(
    [
        pl.col("case:PNR"),
        pl.col("case:mortality"),
        pl.col("diagnosis_count"),
    ]
)
lf = lf.join(patient_kpi_map, on="case:PNR", how="left")
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SAS to Parquet converter module.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to convert large SAS7BDAT files to Parquet format
|
|
5
|
+
in a memory-efficient way by processing the data in chunks.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
import time
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Optional
|
|
12
|
+
import shutil
|
|
13
|
+
import click
|
|
14
|
+
import pyreadstat
|
|
15
|
+
import polars as pl
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Table-specific column mappings
|
|
19
|
+
# Maps a --table-type CLI choice to the subset of SAS columns to extract.
# A table type absent from this dict (or None) means "read all columns".
TABLE_TYPE_COLUMNS: dict[str, list[str]] = {
    "diagnosis": [
        "BORGER_FOEDSELSDATO",
        "PNR",
        "ADIAG",
        "ADIAG_TEKST",
        "KONT_ANS_GEO_REG_TEKST",
        "KONT_LPR_ENTITY_ID",
        "KONT_INST_EJERTYPE",
        "KONT_STARTTIDSPUNKT",
        "KONT_SLUTTIDSPUNKT",
        "PRIORITET_TEKST",
        "KONT_TYPE_TEKST",
    ],
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def convert_sas_to_parquet(
    input_file: Path,
    output_path: Path,
    chunk_size: int = 100000,
    compression: str = "zstd",
    encoding: Optional[str] = None,
    overwrite: bool = False,
    verbose: bool = False,
    cols: Optional[list[str]] = None,
) -> None:
    """
    Convert a SAS7BDAT file to Parquet format in chunks.

    Each chunk of ``chunk_size`` rows is written to its own zero-padded
    ``chunk_NNNN.parquet`` file inside ``output_path``, so the whole input
    never has to fit in memory.

    Parameters
    ----------
    input_file : Path
        Path to the SAS file (.sas7bdat)
    output_path : Path
        Path to output directory for Parquet files
    chunk_size : int, optional
        Number of rows to read per chunk (default: 100000)
    compression : str, optional
        Parquet compression algorithm (default: 'zstd')
    encoding : str, optional
        File encoding (default: auto-detect)
    overwrite : bool, optional
        Whether to overwrite existing output directory (default: False)
    verbose : bool, optional
        Enable verbose output (default: False)
    cols : list[str], optional
        List of columns to read from the SAS file (default: None, reads all columns)

    Notes
    -----
    Validation failures (missing input file, wrong extension, output path
    that is not a directory) terminate the process via ``sys.exit(1)``
    rather than raising. Errors raised by pyreadstat/polars while reading
    or writing (e.g. ``PermissionError``, ``MemoryError``) propagate to
    the caller; the CLI wrapper translates them into error messages.
    """
    # Validate input file
    if not input_file.exists():
        click.secho(f"✗ Error: Input file not found: {input_file}", fg="red", err=True)
        sys.exit(1)

    if input_file.suffix.lower() != ".sas7bdat":
        click.secho("✗ Error: Input file must be a .sas7bdat file", fg="red", err=True)
        sys.exit(1)

    # Handle output directory
    output_path = Path(output_path)

    if output_path.exists():
        if not output_path.is_dir():
            click.secho(
                f"✗ Error: Output path exists and is not a directory: {output_path}",
                fg="red",
                err=True,
            )
            sys.exit(1)

        if overwrite:
            if verbose:
                click.echo(f"Removing existing directory: {output_path}")
            shutil.rmtree(output_path)
            output_path.mkdir(parents=True, exist_ok=True)
        else:
            # Refuse to silently mix freshly written chunks with leftovers
            # from a previous run — ask the user first.
            existing_parquet = list(output_path.glob("*.parquet"))
            if existing_parquet:
                click.secho(
                    f"⚠ Warning: Output directory already contains {len(existing_parquet)} .parquet file(s)",
                    fg="yellow",
                )
                if not click.confirm(
                    "Do you want to continue and potentially mix files?"
                ):
                    click.echo("Aborted.")
                    sys.exit(0)
    else:
        output_path.mkdir(parents=True, exist_ok=True)

    # Display processing information
    click.echo(f"Processing: {input_file}")
    click.echo(f"Output directory: {output_path}")
    if verbose:
        click.echo(f"Chunk size: {chunk_size:,} rows")
        click.echo(f"Compression: {compression}")
        click.echo(f"Encoding: {encoding if encoding else 'auto-detect'}")
    click.echo()

    # Process file in chunks
    click.echo("Converting SAS to Parquet...")

    reader = pyreadstat.read_file_in_chunks(
        pyreadstat.read_sas7bdat,
        str(input_file),
        chunksize=chunk_size,
        encoding=encoding,
        output_format="polars",
        usecols=cols,
    )

    chunk_num = 0
    total_rows = 0
    num_columns = 0

    for df_polars, meta in reader:
        # Time each chunk so slow disks / large chunks are visible.
        chunk_start_time = time.time()

        # Zero-padded numbering keeps chunk files lexically sorted for globs.
        chunk_filename = f"chunk_{chunk_num:04d}.parquet"
        chunk_path = output_path / chunk_filename

        # Write chunk to parquet
        df_polars.write_parquet(chunk_path, compression=compression)

        chunk_elapsed_time = time.time() - chunk_start_time

        # Update statistics
        chunk_rows = len(df_polars)
        total_rows += chunk_rows
        num_columns = len(meta.column_names)
        chunk_num += 1

        # Fix: the previous verbose/non-verbose branches printed the exact
        # same message, so the conditional was dead code.
        click.echo(
            f" Wrote chunk {chunk_num}: {chunk_rows:,} rows → {chunk_filename} (took {chunk_elapsed_time:.2f}s)"
        )

    # Calculate file sizes (note: the glob also counts any pre-existing
    # parquet files the user chose to keep in the output directory).
    input_size = input_file.stat().st_size / (1024 * 1024)  # MB
    output_files = list(output_path.glob("*.parquet"))
    output_size = sum(f.stat().st_size for f in output_files) / (1024 * 1024)  # MB
    # Guard against division by zero when no chunks were written.
    compression_ratio = input_size / output_size if output_size > 0 else 0

    # Display success message
    click.echo()
    click.secho("✓ Conversion completed successfully!", fg="green", bold=True)
    click.echo(f"Input file: {input_size:.2f} MB ({input_file.name})")
    click.echo(f"Output files: {output_size:.2f} MB ({len(output_files)} chunks)")
    click.echo(f"Compression: {compression_ratio:.2f}x")
    click.echo(f"Total rows: {total_rows:,}")
    click.echo(f"Columns: {num_columns}")
    click.echo(f"Chunk size: {chunk_size:,} rows/chunk")
    click.echo()
    click.echo("To read the data:")
    # Fix: was an f-string with no placeholders.
    click.echo(" import polars as pl")
    click.echo(f" df = pl.read_parquet('{output_path}/*.parquet')")
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
@click.command()
@click.argument("input_file", type=click.Path(exists=True, path_type=Path))
@click.argument("output_path", type=click.Path(path_type=Path))
@click.option(
    "--table-type",
    type=click.Choice(["diagnosis"], case_sensitive=False),
    default=None,
    help="Type of table to process (determines which columns to extract). If not specified, all columns are read.",
)
@click.option(
    "-c",
    "--chunk-size",
    type=int,
    default=100000,
    help="Number of rows to read per chunk (default: 100000)",
)
@click.option(
    "--compression",
    type=click.Choice(["zstd", "lz4", "snappy", "gzip"], case_sensitive=False),
    default="zstd",
    help="Parquet compression algorithm (default: zstd)",
)
@click.option(
    "--encoding",
    type=str,
    default=None,
    help="File encoding (default: auto-detect)",
)
@click.option(
    "--overwrite",
    is_flag=True,
    default=False,
    help="Overwrite existing output directory if it exists",
)
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output")
def sas2parquet(
    input_file,
    output_path,
    table_type,
    chunk_size,
    compression,
    encoding,
    overwrite,
    verbose,
):
    """
    Convert a SAS file to Parquet format in chunks.

    INPUT_FILE: Path to the SAS file (.sas7bdat)

    OUTPUT_PATH: Path to output directory for Parquet files

    This command processes large SAS files in chunks to minimize memory usage.
    Each chunk is written to a separate Parquet file in the output directory.

    Example usage:

    specatwrap sas2parquet input.sas7bdat output_dir/ --table-type diagnosis

    specatwrap sas2parquet input.sas7bdat output_dir/ --chunk-size 50000 -v

    specatwrap sas2parquet input.sas7bdat output_dir/ --table-type diagnosis --compression lz4

    Reading the data later with Polars:

    import polars as pl

    df = pl.read_parquet("output_dir/*.parquet")
    """
    # Get columns for the specified table type (None means all columns).
    # .lower() guards against click returning a differently-cased choice.
    cols = TABLE_TYPE_COLUMNS.get(table_type.lower()) if table_type else None

    # CLI boundary: translate expected failure modes into short, colored
    # error messages and a non-zero exit code.
    try:
        convert_sas_to_parquet(
            input_file=input_file,
            output_path=output_path,
            chunk_size=chunk_size,
            compression=compression,
            encoding=encoding,
            overwrite=overwrite,
            verbose=verbose,
            cols=cols,
        )
    except FileNotFoundError as e:
        click.secho(f"✗ Error: File not found - {e}", fg="red", err=True)
        sys.exit(1)
    except PermissionError as e:
        click.secho(f"✗ Error: Permission denied - {e}", fg="red", err=True)
        sys.exit(1)
    except ValueError as e:
        click.secho(f"✗ Error: Invalid input - {e}", fg="red", err=True)
        sys.exit(1)
    except MemoryError:
        # Fix: was an f-string with no placeholders.
        click.secho(
            "✗ Error: Out of memory. Try a smaller --chunk-size", fg="red", err=True
        )
        sys.exit(1)
    except Exception as e:
        # Last-resort handler so unexpected errors still exit cleanly;
        # the full traceback is only shown in verbose mode.
        click.secho(f"✗ Error: {e}", fg="red", err=True)
        if verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SAS file preview module.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to quickly preview the first N rows
|
|
5
|
+
of a SAS7BDAT file for inspection purposes.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
import sys
|
|
11
|
+
import pyreadstat
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@click.command()
@click.argument("input_file", type=click.Path(exists=True, path_type=Path))
@click.option(
    "-n",
    "--num-rows",
    type=int,
    default=10,
    help="Number of rows to display (default: 10)",
)
@click.option(
    "--encoding",
    type=str,
    default=None,
    help="File encoding (default: auto-detect)",
)
def preview(input_file, num_rows, encoding):
    """
    Preview the first N rows of a SAS file.

    INPUT_FILE: Path to the SAS file (.sas7bdat)

    This command quickly reads and displays the first few rows of a SAS file
    to help you inspect the data structure and content without converting
    the entire file.

    Example usage:

    specatwrap preview input.sas7bdat

    specatwrap preview input.sas7bdat --num-rows 20

    specatwrap preview input.sas7bdat --encoding latin1
    """
    try:
        # Defensive re-validation (click already checks existence, but the
        # extension check is ours). sys.exit raises SystemExit, which is a
        # BaseException and therefore not swallowed by the handlers below.
        if not input_file.exists():
            click.secho(f"✗ Error: File not found: {input_file}", fg="red", err=True)
            sys.exit(1)
        if input_file.suffix.lower() != ".sas7bdat":
            click.secho(
                "✗ Error: Input file must be a .sas7bdat file", fg="red", err=True
            )
            sys.exit(1)

        click.echo(f"Reading first {num_rows} rows from: {input_file.name}")
        click.echo()

        # Only the first num_rows rows are decoded — no full-file scan.
        frame, metadata = pyreadstat.read_sas7bdat(
            str(input_file),
            row_limit=num_rows,
            encoding=encoding,
            output_format="polars",
        )

        # Header: column overview plus the encoding when explicitly set.
        column_names = metadata.column_names
        click.echo(f"Total columns: {len(column_names)}")
        click.echo(f"Column names: {', '.join(column_names)}")
        if encoding:
            click.echo(f"Encoding: {encoding}")
        click.echo()

        # Body: lean on Polars' table rendering for the sample rows.
        click.echo(str(frame))
        click.echo()
        click.secho(f"✓ Showing {len(frame)} row(s)", fg="green")

    except FileNotFoundError as e:
        click.secho(f"✗ Error: File not found - {e}", fg="red", err=True)
        sys.exit(1)
    except PermissionError as e:
        click.secho(f"✗ Error: Permission denied - {e}", fg="red", err=True)
        sys.exit(1)
    except ValueError as e:
        click.secho(f"✗ Error: Invalid input - {e}", fg="red", err=True)
        sys.exit(1)
    except Exception as e:
        # CLI boundary: anything unexpected becomes a one-line error.
        click.secho(f"✗ Error: {e}", fg="red", err=True)
        sys.exit(1)
|