specatwrap88 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- specatwrap88-0.2.0/PKG-INFO +13 -0
- specatwrap88-0.2.0/README.md +0 -0
- specatwrap88-0.2.0/pyproject.toml +33 -0
- specatwrap88-0.2.0/src/specatwrap88/__init__.py +39 -0
- specatwrap88-0.2.0/src/specatwrap88/prep/__init__.py +112 -0
- specatwrap88-0.2.0/src/specatwrap88/prep/base_filter.py +42 -0
- specatwrap88-0.2.0/src/specatwrap88/prep/diagnosis.py +226 -0
- specatwrap88-0.2.0/src/specatwrap88/prep/io_handler.py +89 -0
- specatwrap88-0.2.0/src/specatwrap88/prep/pathways.py +35 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/DECLARE_ratio.py +32 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/DECLARE_templates.py +26 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/DIAG_all.py +96 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/DIAG_initial_sorting.py +23 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/DIAG_initial_sorting_all.py +15 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/DIAG_stats.py +49 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/__init__.py +146 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/burst.py +53 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/combine_death_contact.py +15 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/dicsover.py +28 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/proc_dfg.py +137 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/reinvent-burst.py +8 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/reinvent.py +77 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/summary.py +87 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/tmp_conv.py +25 -0
- specatwrap88-0.2.0/src/specatwrap88/printable/zoom_in.py +72 -0
- specatwrap88-0.2.0/src/specatwrap88/sas_converter.py +317 -0
- specatwrap88-0.2.0/src/specatwrap88/sas_preview.py +93 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: specatwrap88
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A simple wrapper
|
|
5
|
+
Author: Casper Lauge Nørup Koch
|
|
6
|
+
Author-email: Casper Lauge Nørup Koch <kochcasper@gmail.com>
|
|
7
|
+
Requires-Dist: click>=8.3.1
|
|
8
|
+
Requires-Dist: pm4py>=2.7.19.8
|
|
9
|
+
Requires-Dist: polars>=1.38.1
|
|
10
|
+
Requires-Dist: pyreadstat>=1.3.3
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
File without changes
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "specatwrap88"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "A simple wrapper"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Casper Lauge Nørup Koch", email = "kochcasper@gmail.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.12"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"click>=8.3.1",
|
|
12
|
+
"pm4py>=2.7.19.8",
|
|
13
|
+
"polars>=1.38.1",
|
|
14
|
+
"pyreadstat>=1.3.3",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.scripts]
|
|
18
|
+
specatwrap = "specatwrap88:main"
|
|
19
|
+
|
|
20
|
+
[build-system]
|
|
21
|
+
requires = ["uv_build>=0.10.3,<0.11.0"]
|
|
22
|
+
build-backend = "uv_build"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
[tool.pyright]
|
|
26
|
+
reportUnannotatedClassAttribute = "none"
|
|
27
|
+
reportImplicitOverride = "none"
|
|
28
|
+
reportUnknownParameterType = "none"
|
|
29
|
+
reportMissingParameterType = "none"
|
|
30
|
+
reportUnknownVariableType = "none"
|
|
31
|
+
reportUnknownMemberType = "none"
|
|
32
|
+
reportUnknownArgumentType = "none"
|
|
33
|
+
reportAny = "none"
|
|
"""
Specatwrap - A wrapper for processing healthcare data.

This module provides a CLI for converting and processing healthcare data files.
"""

import click

from .sas_converter import sas2parquet
from .prep import prep
from .sas_preview import preview
from .printable import print_file


@click.group()
# NOTE: keep in sync with the version declared in pyproject.toml.
# (Was "0.1.0", which contradicted the packaged version 0.2.0.)
@click.version_option(version="0.2.0")
def cli():
    """
    Specatwrap - A wrapper for processing healthcare data.

    A command-line tool for processing and converting healthcare data files.
    """
    pass


# Register command groups
cli.add_command(sas2parquet)
cli.add_command(prep)
cli.add_command(preview)
cli.add_command(print_file)


def main():
    """Entry point for the CLI application."""
    cli()


if __name__ == "__main__":
    main()
|
"""
Prep command for preprocessing healthcare data files.

This module provides a command for filtering and preprocessing parquet files
before converting them to XES format.
"""

import click
from pathlib import Path
import sys

from .io_handler import process_parquet_files


def _echo_discovered_inputs(input_dir: Path) -> None:
    """Echo the parquet files found in *input_dir* (verbose-mode helper)."""
    parquet_pattern = str(input_dir / "*.parquet")
    click.echo(f"Searching for parquet files: {parquet_pattern}")
    discovered = list(input_dir.glob("*.parquet"))
    if discovered:
        click.echo(f"Found {len(discovered)} parquet file(s):")
        for entry in discovered:
            click.echo(f"  - {entry.name}")
    click.echo()


@click.command()
@click.argument("filter_type", type=str)
@click.option(
    "-i",
    "--input",
    "input_dir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
    required=True,
    help="Path to directory containing parquet files to process.",
)
@click.option(
    "-o",
    "--output",
    "output_file",
    type=click.Path(dir_okay=False, file_okay=True, path_type=Path),
    required=True,
    help="Path to output parquet file.",
)
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
def prep(
    filter_type: str,
    input_dir: Path,
    output_file: Path,
    verbose: bool,
) -> None:
    """
    Preprocess and filter healthcare data files.

    FILTER_TYPE: Name of the registered filter to apply, such as
    "diagnosis" or "pathways".

    This command lazily reads all parquet files from a directory, applies
    the selected preprocessing transformation, and writes the results to
    a single output parquet file.
    """
    try:
        # Summarize the run configuration before doing any work.
        click.echo(f"Filter type: {filter_type}")
        click.echo(f"Input directory: {input_dir}")
        click.echo(f"Output file: {output_file}")
        click.echo()

        if verbose:
            _echo_discovered_inputs(input_dir)

        # Delegate all I/O and filtering to the io_handler layer.
        click.echo("Loading and applying filters...")
        with click.progressbar(
            length=100, label="Processing", show_eta=False, show_percent=True
        ) as bar:
            process_parquet_files(
                input_dir=input_dir,
                output_file=output_file,
                filter_type=filter_type,
                verbose=verbose,
            )
            # The bar jumps straight to 100% once processing returns;
            # intermediate progress is not reported by the handler.
            bar.update(100)

        click.secho("✓ Processing completed successfully!", fg="green", bold=True)
        click.echo(f"Output file: {output_file}")

        # Report the size of the freshly written file, if it exists.
        if output_file.exists():
            size_mb = output_file.stat().st_size / (1024 * 1024)
            click.echo(f"File size: {size_mb:.2f} MB")

    except FileNotFoundError as exc:
        click.secho(f"✗ Error: File or directory not found - {exc}", fg="red", err=True)
        sys.exit(1)
    except PermissionError as exc:
        click.secho(f"✗ Error: Permission denied - {exc}", fg="red", err=True)
        sys.exit(1)
    except ValueError as exc:
        click.secho(f"✗ Error: Invalid input - {exc}", fg="red", err=True)
        sys.exit(1)
    except MemoryError:
        click.secho(
            "✗ Error: Out of memory. Try processing smaller batches.",
            fg="red",
            err=True,
        )
        sys.exit(1)
    except Exception as exc:
        # Last-resort handler: report, optionally dump the traceback, exit 1.
        click.secho(f"✗ Error: {exc}", fg="red", err=True)
        if verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)
|
"""
Base filter class for parquet preprocessing.
"""

from abc import ABC, abstractmethod

import polars as pl


class BaseFilter(ABC):
    """
    Abstract base class for parquet file filters.

    Each filter type (diagnosis, procedure, medication, etc.) should inherit
    from this class and implement the apply() method with their specific
    filtering and transformation logic.
    """

    @abstractmethod
    def apply(self, lf: pl.LazyFrame) -> pl.LazyFrame:
        """
        Apply filtering and preprocessing logic to a LazyFrame.

        Args:
            lf: Input Polars LazyFrame to filter/transform

        Returns:
            Transformed Polars LazyFrame
        """
        # Fix: the docstring previously documented a nonexistent
        # ``lazy_frame`` argument; the parameter is ``lf``.
        pass

    def get_name(self) -> str:
        """
        Get the name of this filter type.

        Returns:
            Filter type name (defaults to class name without 'Filter' suffix)
        """
        # str.removesuffix is a no-op when the suffix is absent, replacing
        # the manual endswith()/slice pair (project requires Python >= 3.12).
        return self.__class__.__name__.removesuffix("Filter").lower()
"""
Diagnosis filter classes.

This module provides filter classes for preprocessing diagnosis parquet files.
"""

import polars as pl

from .base_filter import BaseFilter


class DiagnosisFilter(BaseFilter):
    """
    Filter for diagnosis parquet files.

    Applies diagnosis-specific filtering and column transformations:
    - Derives a TORRA_DIAG list of disease-category labels from ADIAG prefixes
    - Keeps only rows where at least one category matched
    - Joins the matched categories into a single comma-separated string
    - Selects and renames columns to a standardized event log format
      (``case:``-prefixed case attributes plus plain event attributes)
    """

    # Case-level attributes; renamed to "case:<name>" per the event-log
    # (XES-style) convention.
    CASE_ATTR = {
        k: f"case:{v}"
        for k, v in {
            "BORGER_FOEDSELSDATO": "BDay",
            "PNR": "PNR",
            "BORGER_KOEN": "gender",
        }.items()
    }

    # Event-level attributes mapped to their renamed output columns.
    EVENT_ATTR = {
        "TORRA_DIAG": "TDiag",
        "ADIAG": "ADiag",
        "ADIAG_TEKST": "diagText",
        "KONT_ANS_GEO_REG_TEKST": "region",
        "KONT_LPR_ENTITY_ID": "org:id",
        "KONT_INST_EJERTYPE": "org:type",
        "KONT_STARTTIDSPUNKT": "startTime",
        "KONT_SLUTTIDSPUNKT": "endTime",
        # "BORGER_ALDER_AAR_IND": "patient:age",
        "PRIORITET_TEKST": "priority",
        "KONT_TYPE_TEKST": "contact_type",
    }

    def apply(self, lf: pl.LazyFrame) -> pl.LazyFrame:
        """
        Apply diagnosis-specific filtering and transformations.

        Args:
            lf: Input LazyFrame containing diagnosis data

        Returns:
            Filtered and transformed LazyFrame
        """

        # Derive the TORRA_DIAG category list before filtering on it.
        lf = self._add_torra_diag(lf)

        return (
            # Drop rows whose ADIAG matched no disease category.
            lf.filter(pl.col("TORRA_DIAG").list.len() > 0)
            # Collapse the category list into one comma-separated string.
            .with_columns(pl.col("TORRA_DIAG").list.join(", "))
            .select(list((self.CASE_ATTR | self.EVENT_ATTR).keys()))
            .rename(self.CASE_ATTR | self.EVENT_ATTR)
        )

    def _add_torra_diag(self, lazy_frame: pl.LazyFrame) -> pl.LazyFrame:
        """
        Add a TORRA_DIAG list column of disease-category labels.

        Each branch below maps ADIAG code prefixes to one category label;
        a single row can match several categories (e.g. DE10 is both
        ENDOCRINE and DIABETES). Non-matches yield nulls that are dropped
        from the final list.

        NOTE(review): the prefixes look like Danish SKS codes
        ("D" + ICD-10) — confirm against the data dictionary.
        """
        col = pl.col("ADIAG")

        def starts_with(prefixes: list[str], label: str) -> pl.Expr:
            # Anchored alternation regex: label rows whose ADIAG begins
            # with any of the given prefixes, else null.
            pattern = f"^(?:{'|'.join(prefixes)})"
            return (
                pl.when(col.str.contains(pattern))
                .then(pl.lit(label))
                .otherwise(pl.lit(None))
            )

        return lazy_frame.with_columns(
            TORRA_DIAG=pl.concat_list(
                [
                    starts_with(
                        ["DJ41", "DJ42", "DJ43", "DJ44", "DJ45", "DJ46"], "LUNG"
                    ),
                    starts_with(
                        [
                            "DL405",
                            "DM05",
                            "DM06",
                            "DM07",
                            "DM15",
                            "DM16",
                            "DM17",
                            "DM45",
                            "DM47",
                            "DM50",
                            "DM51",
                            "DM53",
                            "DM54",
                            "DM80",
                            "DM81",
                            "DM82",
                        ],
                        "MUSCULOSKELETAL",
                    ),
                    starts_with(
                        ["DE03", "DE05", "DE10", "DE11", "DE12", "DE13", "DE14"],
                        "ENDOCRINE",
                    ),
                    # THYROIDEA and DIABETES are subsets of ENDOCRINE above.
                    starts_with(["DE03", "DE05"], "THYROIDEA"),
                    starts_with(["DE10", "DE11", "DE12", "DE13", "DE14"], "DIABETES"),
                    starts_with(["DG30", "DG318", "DG319", "DF"], "MENTAL"),
                    # CANCER: Matches DC but explicitly excludes DC44
                    pl.when(col.str.contains("^DC") & ~col.str.contains("^DC44"))
                    .then(pl.lit("CANCER"))
                    .otherwise(pl.lit(None)),
                    starts_with(
                        [
                            "DI60",
                            "DI61",
                            "DI62",
                            "DI63",
                            "DI64",
                            "DI69",
                            "DG20",
                            "DG35",
                            "DG40",
                            "DG43",
                        ],
                        "NEUROLOGICAL",
                    ),
                    starts_with(
                        [
                            "DK30",
                            "DK50",
                            "DK51",
                            "DK58",
                            "DK70",
                            "DK71",
                            "DK72",
                            "DK73",
                            "DK74",
                            "DK75",
                            "DK76",
                            "DK860",
                            "DK861",
                        ],
                        "GASTROINTESTINAL",
                    ),
                    starts_with(
                        [
                            "DI20",
                            "DI21",
                            "DI22",
                            "DI23",
                            "DI24",
                            "DI25",
                            "DI47",
                            "DI48",
                            "DI49",
                            "DI50",
                            "DI05",
                            "DI06",
                            "DI07",
                            "DI08",
                            "DI34",
                            "DI35",
                            "DI36",
                            "DI37",
                            "DI441",
                            "DI442",
                            "DI443",
                            "DI444",
                            "DI445",
                            "DI446",
                            "DI447",
                            "DI452",
                            "DI453",
                            "DI454",
                            "DI455",
                            "DI456",
                            "DI457",
                            "DI458",
                            "DI459",
                        ],
                        "CARDIOVASCULAR",
                    ),
                    starts_with(
                        [
                            "DN03",
                            "DN04",
                            "DN05",
                            "DN11",
                            "DN12",
                            "DN18",
                            "DN19",
                            "DZ49",
                            "DN80",
                            "DZ992",
                            "DN393",
                            "DN394",
                        ],
                        "GENITURINARY",
                    ),
                    starts_with(
                        [
                            "DH40",
                            "DH91",
                            "DL40",
                            "DH540",
                            "DH541",
                            "DH542",
                            "DH543",
                            "DH547",
                            "DH900",
                            "DH902",
                            "DH903",
                            "DH905",
                            "DH906",
                            "DH908",
                        ],
                        "SENSORY ORGANS",
                    ),
                    starts_with(
                        ["DL23", "DL24", "DL25", "DJ30", "DL500", "DJ450"], "ALLERGY"
                    ),
                ]
            ).list.drop_nulls()
        )
|
"""
I/O handler for parquet file processing.

This module provides generic I/O functionality for reading, filtering,
and writing parquet files. It uses a filter registry pattern to support
different filter types (diagnosis, procedure, medication, etc.).
"""

from pathlib import Path

import polars as pl

from .base_filter import BaseFilter
from .diagnosis import DiagnosisFilter
from .pathways import PathwaysFilter


# Registry mapping filter type strings to filter classes
FILTER_REGISTRY: dict[str, type[BaseFilter]] = {
    "diagnosis": DiagnosisFilter,
    "pathways": PathwaysFilter,
    # Future additions:
    # "procedure": ProcedureFilter,
    # "medication": MedicationFilter,
}


def process_parquet_files(
    input_dir: Path,
    output_file: Path,
    filter_type: str,
    verbose: bool = False,
) -> None:
    """
    Process parquet files with a specified filter type.

    This function handles all I/O operations:
    - Discovers parquet files in the input directory
    - Lazily loads them using Polars
    - Applies the specified filter transformation
    - Streams the result to the output file (no full materialization)

    Args:
        input_dir: Directory containing input parquet files
        output_file: Path to output parquet file
        filter_type: Type of filter to apply (e.g., "diagnosis", "pathways")
        verbose: Whether to include verbose processing (currently unused in I/O layer)

    Returns:
        None. Results are written to ``output_file``.
        (The docstring previously advertised a tuple return that was
        never implemented.)

    Raises:
        FileNotFoundError: If no parquet files found in input directory
        ValueError: If filter_type is not registered
        PermissionError: If unable to read/write files
        MemoryError: If insufficient memory to process data
    """
    # Validate filter type up front so the caller gets a clear message.
    if filter_type not in FILTER_REGISTRY:
        available_filters = ", ".join(FILTER_REGISTRY.keys())
        raise ValueError(
            f"Unknown filter type '{filter_type}'. "
            f"Available filters: {available_filters}"
        )

    # Check if input directory contains parquet files
    parquet_files = list(input_dir.glob("*.parquet"))
    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {input_dir}")

    # Lazy load all parquet files via a glob pattern.
    parquet_pattern = str(input_dir / "*.parquet")
    lazy_frame = pl.scan_parquet(parquet_pattern)

    # Instantiate the appropriate filter and apply its transformation.
    filter_class = FILTER_REGISTRY[filter_type]
    filter_instance = filter_class()
    filtered_lazy_frame = filter_instance.apply(lazy_frame)

    # Ensure output directory exists
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # sink_parquet streams the lazy query to disk without collecting
    # the full result in memory.
    filtered_lazy_frame.sink_parquet(output_file)
|
import polars as pl

from .base_filter import BaseFilter


class PathwaysFilter(BaseFilter):
    """Filter for pathway (forløb) parquet files.

    Selects the known pathway columns and renames them to standardized
    event-log names; case-level attributes get a ``case:`` prefix.
    """

    # Case-level attributes, written out literally with their final
    # "case:<name>" targets.
    CASE_ATTR = {
        "PNR": "case:PNR",
        "DW_EK_BORGER": "case:patientId",
        "DW_EK_HELBREDSFORLOEB": "case:caseId",
        "HELBREDSFORL_STARTTIDSPUNKT": "case:pathStartTime",
        "HELBREDSFORL_SLUTTIDSPUNKT": "case:pathEndTime",
    }

    # Event-level attributes keep plain renamed names.
    EVENT_ATTR = {
        "DW_EK_FORLOEB": "forlId",
        "FORL_AFSLUT_MAADE_TEKST": "endReason",
        "FORL_HENV_AARSAG": "diagnosis",
        "FORL_HENV_MAADE_TEKST": "referralReason",
        "FORL_LABEL": "label",
        "FORL_LABEL_TEKST": "labelText",
        "FORL_ANS": "ans",
        "FORL_ANS_INST": "ansInst",
        "FORL_STARTTIDSPUNKT": "startTime",
        "FORL_SLUTTIDSPUNKT": "endTime",
    }

    def apply(self, lf: pl.LazyFrame) -> pl.LazyFrame:
        """Select the known columns and rename them to event-log names."""
        rename_map = self.CASE_ATTR | self.EVENT_ATTR
        return lf.select(list(rename_map.keys())).rename(rename_map)
|
from typing import Any


def declare_counts_to_ratios(
    declare_model: dict[str, dict[Any, dict[str, int]]],
    num_cases: int,
) -> dict[str, dict[Any, dict[str, float]]]:
    """Convert per-constraint DECLARE counts into ratios.

    For every constraint, ``support`` becomes the fraction of cases that
    support it (support count / num_cases), and ``confidence`` becomes the
    fraction of supporting cases that also confirm it (confidence count /
    support count, or 0.0 when nothing supports the constraint).

    Args:
        declare_model: Mapping of template name -> constraint key -> counts
            (with optional "support"/"confidence" entries, defaulting to 0).
        num_cases: Total number of cases in the log; must be positive.

    Returns:
        A new mapping of the same shape with float ratios instead of counts.

    Raises:
        ValueError: If ``num_cases`` is not positive.
    """
    if num_cases <= 0:
        raise ValueError("num_cases must be > 0")

    converted: dict[str, dict[Any, dict[str, float]]] = {}

    for template_name, constraint_map in declare_model.items():
        ratios_for_template: dict[Any, dict[str, float]] = {}

        for key, counts in constraint_map.items():
            sup = float(counts.get("support", 0))
            conf = float(counts.get("confidence", 0))

            # Guard against division by zero: an unsupported constraint
            # gets zero confidence by definition.
            ratios_for_template[key] = {
                "support": sup / float(num_cases),
                "confidence": conf / sup if sup > 0 else 0.0,
            }

        converted[template_name] = ratios_for_template

    return converted
|
from pm4py.algo.discovery.declare import templates as t

# The full set of DECLARE constraint templates referenced by this project.
all_templates = {
    t.EXISTENCE,
    t.EXACTLY_ONE,
    t.INIT,
    t.RESPONDED_EXISTENCE,
    t.RESPONSE,
    t.PRECEDENCE,
    t.SUCCESSION,
    t.ALTRESPONSE,
    t.ALTPRECEDENCE,
    t.ALTSUCCESSION,
    t.CHAINRESPONSE,
    t.CHAINPRECEDENCE,
    t.CHAINSUCCESSION,
    t.ABSENCE,
    t.COEXISTENCE,
    t.NONCOEXISTENCE,
    t.NONSUCCESSION,
    t.NONCHAINSUCCESSION,
}

# Templates deliberately left out of discovery; `allowed` is everything else.
excluded = {t.ABSENCE, t.NONCOEXISTENCE}
allowed = all_templates.difference(excluded)