specatwrap45 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.3
2
+ Name: specatwrap45
3
+ Version: 0.2.0
4
+ Summary: A simple wrapper
5
+ Author: Casper Lauge Nørup Koch
6
+ Author-email: Casper Lauge Nørup Koch <kochcasper@gmail.com>
7
+ Requires-Dist: click>=8.3.1
8
+ Requires-Dist: pm4py>=2.7.19.8
9
+ Requires-Dist: polars>=1.38.1
10
+ Requires-Dist: pyreadstat>=1.3.3
11
+ Requires-Python: >=3.12
12
+ Description-Content-Type: text/markdown
13
+
File without changes
@@ -0,0 +1,33 @@
1
+ [project]
2
+ name = "specatwrap45"
3
+ version = "0.2.0"
4
+ description = "A simple wrapper"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Casper Lauge Nørup Koch", email = "kochcasper@gmail.com" }
8
+ ]
9
+ requires-python = ">=3.12"
10
+ dependencies = [
11
+ "click>=8.3.1",
12
+ "pm4py>=2.7.19.8",
13
+ "polars>=1.38.1",
14
+ "pyreadstat>=1.3.3",
15
+ ]
16
+
17
+ [project.scripts]
18
+ specatwrap = "specatwrap45:main"
19
+
20
+ [build-system]
21
+ requires = ["uv_build>=0.10.3,<0.11.0"]
22
+ build-backend = "uv_build"
23
+
24
+
25
+ [tool.pyright]
26
+ reportUnannotatedClassAttribute = "none"
27
+ reportImplicitOverride = "none"
28
+ reportUnknownParameterType = "none"
29
+ reportMissingParameterType = "none"
30
+ reportUnknownVariableType = "none"
31
+ reportUnknownMemberType = "none"
32
+ reportUnknownArgumentType = "none"
33
+ reportAny = "none"
@@ -0,0 +1,39 @@
1
+ """
2
+ Specatwrap - A wrapper for processing healthcare data.
3
+
4
+ This module provides a CLI for converting and processing healthcare data files.
5
+ """
6
+
7
+ import click
8
+
9
+ from .sas_converter import sas2parquet
10
+ from .prep import prep
11
+ from .sas_preview import preview
12
+ from .printable import print_file
13
+
14
+
15
@click.group()
# Resolve the reported version from the installed distribution's metadata so
# that ``--version`` cannot drift from the version declared in pyproject.toml
# (a hard-coded "0.1.0" was being reported for the 0.2.0 release).
@click.version_option(package_name="specatwrap45")
def cli():
    """
    Specatwrap - A wrapper for processing healthcare data.

    A command-line tool for processing and converting healthcare data files.
    """
    # The root group does no work of its own; subcommands are attached to it
    # with ``cli.add_command`` after this definition.
24
+
25
+
26
# Attach each imported command/group to the root ``cli`` group.
for _command in (sas2parquet, prep, preview, print_file):
    cli.add_command(_command)
del _command  # keep the loop variable out of the module namespace
31
+
32
+
33
def main():
    """Invoke the root ``cli`` group (target of the ``specatwrap`` script)."""
    cli()


# Allow running the package module directly as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,124 @@
1
+ """
2
+ Prep command group for preprocessing healthcare data files.
3
+
4
+ This module provides commands for filtering and preprocessing parquet files
5
+ before converting them to XES format.
6
+ """
7
+
8
+ import click
9
+ from pathlib import Path
10
+ import sys
11
+
12
+ from .io_handler import process_parquet_files
13
+
14
+
15
@click.group()
def prep():
    """
    Preprocess and filter healthcare data files.

    Commands in this group help prepare raw data files by filtering,
    cleaning, and transforming them before further processing.
    """
    # Container group only: the docstring above is the help text click shows,
    # and the actual work lives in the subcommands registered on this group.
24
+
25
+
26
@prep.command()
@click.option(
    "-i",
    "--input",
    "input_dir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
    required=True,
    help="Path to directory containing parquet files to process.",
)
@click.option(
    "-o",
    "--output",
    "output_file",
    type=click.Path(dir_okay=False, file_okay=True, path_type=Path),
    required=True,
    help="Path to output parquet file.",
)
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
def diagnosis(input_dir, output_file, verbose):
    """
    Filter and preprocess diagnosis parquet files.

    This command lazily reads all parquet files from a directory, applies
    filtering and preprocessing transformations, and writes the results to
    a single output parquet file.

    INPUT: Directory path containing parquet files to process.

    OUTPUT: Path to output parquet file for processed data.

    Example usage:

        specatwrap prep diagnosis -i ./data/parquet_files/ -o ./processed/diagnosis.parquet

        specatwrap prep diagnosis --input ./raw_data/ --output ./clean_data.parquet -v
    """

    def abort(message):
        # Shared failure path: red message on stderr, then exit status 1.
        click.secho(message, fg="red", err=True)
        sys.exit(1)

    try:
        # Echo the resolved paths up front so the user can confirm them.
        click.echo(f"Input directory: {input_dir}")
        click.echo(f"Output file: {output_file}")
        click.echo()

        # Verbose mode only: list the parquet files that the glob matches.
        if verbose:
            click.echo(f"Searching for parquet files: {input_dir / '*.parquet'}")
            matches = list(input_dir.glob("*.parquet"))
            if matches:
                click.echo(f"Found {len(matches)} parquet file(s):")
                for parquet_path in matches:
                    click.echo(f" - {parquet_path.name}")
            click.echo()

        click.echo("Loading and applying filters...")
        # The bar is only advanced once, after the handler returns, so it
        # jumps from 0 to 100 rather than tracking incremental progress.
        with click.progressbar(
            length=100, label="Processing", show_eta=False, show_percent=True
        ) as progress:
            process_parquet_files(
                input_dir=input_dir,
                output_file=output_file,
                filter_type="diagnosis",
                verbose=verbose,
            )
            progress.update(100)

        click.secho("✓ Processing completed successfully!", fg="green", bold=True)
        click.echo(f"Output file: {output_file}")

        # Report the size of the written file when it is present on disk.
        if output_file.exists():
            size_mb = output_file.stat().st_size / (1024 * 1024)
            click.echo(f"File size: {size_mb:.2f} MB")

    except FileNotFoundError as exc:
        abort(f"✗ Error: File or directory not found - {exc}")
    except PermissionError as exc:
        abort(f"✗ Error: Permission denied - {exc}")
    except ValueError as exc:
        abort(f"✗ Error: Invalid input - {exc}")
    except MemoryError:
        abort("✗ Error: Out of memory. Try processing smaller batches.")
    except Exception as exc:
        # Catch-all boundary for the CLI: show the message, and the full
        # traceback only when the user asked for verbose output.
        click.secho(f"✗ Error: {exc}", fg="red", err=True)
        if verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)
@@ -0,0 +1,426 @@
1
+ """
2
+ Diagnosis filter classes.
3
+
4
+ This module provides filter classes for preprocessing diagnosis parquet files.
5
+ """
6
+
7
+ from abc import ABC, abstractmethod
8
+ import polars as pl
9
+
10
+
11
class BaseFilter(ABC):
    """
    Abstract interface shared by all parquet file filters.

    A concrete filter (diagnosis, procedure, medication, etc.) subclasses
    this and supplies its own ``apply`` implementation.
    """

    @abstractmethod
    def apply(self, lf: pl.LazyFrame) -> pl.LazyFrame:
        """
        Filter and/or transform a LazyFrame.

        Args:
            lf: Input Polars LazyFrame to filter/transform

        Returns:
            Transformed Polars LazyFrame
        """

    def get_name(self) -> str:
        """
        Return the lowercase name of this filter type.

        Returns:
            The class name lowercased, with a trailing ``Filter`` removed
            when present (e.g. ``DiagnosisFilter`` -> ``diagnosis``).
        """
        return type(self).__name__.removesuffix("Filter").lower()
44
+
45
+
46
class DiagnosisFilter(BaseFilter):
    """
    Filter for diagnosis parquet files.

    Processing performed by ``apply``:
    - Tags every row with the diagnosis groups whose code prefixes match
      its ``ADIAG`` value (stored in a ``TORRA_DIAG`` column)
    - Drops rows that match no group
    - Keeps only the columns named in ``CASE_ATTR`` / ``EVENT_ATTR`` and
      renames them to the standardized event log format

    NOTE(review): no birthdate or region filtering happens in this class,
    although the filter has been described as doing both - confirm whether
    those filters are expected here.
    """

    # Source column -> event-log name for case-level attributes; every
    # target name receives the "case:" prefix.
    CASE_ATTR = {
        k: f"case:{v}"
        for k, v in {
            "BORGER_FOEDSELSDATO": "BDay",
            "PNR": "PNR",
            "BORGER_KOEN": "gender",
        }.items()
    }

    # Source column -> event-log name for event-level attributes.
    EVENT_ATTR = {
        "TORRA_DIAG": "TDiag",
        "ADIAG": "ADiag",
        "ADIAG_TEKST": "diagText",
        "KONT_ANS_GEO_REG_TEKST": "region",
        "KONT_LPR_ENTITY_ID": "org:id",
        "KONT_INST_EJERTYPE": "org:type",
        "KONT_STARTTIDSPUNKT": "startTime",
        "KONT_SLUTTIDSPUNKT": "endTime",
        # "BORGER_ALDER_AAR_IND": "patient:age",
        "PRIORITET_TEKST": "priority",
        "KONT_TYPE_TEKST": "contact_type",
    }

    def apply(self, lf: pl.LazyFrame) -> pl.LazyFrame:
        """
        Apply diagnosis-specific filtering and transformations.

        Args:
            lf: Input LazyFrame containing diagnosis data

        Returns:
            LazyFrame restricted to rows with at least one diagnosis group,
            with the group labels joined into one ", "-separated string and
            the retained columns renamed per CASE_ATTR / EVENT_ATTR
        """
        # Build the merged rename map once; its keys double as the selection.
        column_map = self.CASE_ATTR | self.EVENT_ATTR
        tagged = self._add_torra_diag(lf)

        return (
            tagged.filter(pl.col("TORRA_DIAG").list.len() > 0)
            .with_columns(pl.col("TORRA_DIAG").list.join(", "))
            .select(list(column_map))
            .rename(column_map)
        )

    def _add_torra_diag(self, lazy_frame: pl.LazyFrame) -> pl.LazyFrame:
        """
        Add a ``TORRA_DIAG`` list column of matching diagnosis group labels.

        Each group contributes its label when ``ADIAG`` starts with one of
        the group's code prefixes; non-matching groups contribute null and
        the nulls are dropped, so a row can carry zero or more labels.

        Args:
            lazy_frame: Input LazyFrame with an ``ADIAG`` column

        Returns:
            LazyFrame with the additional ``TORRA_DIAG`` column
        """
        col = pl.col("ADIAG")

        def starts_with(prefixes: list[str], label: str) -> pl.Expr:
            # Anchored alternation: label when ADIAG begins with any prefix.
            pattern = f"^(?:{'|'.join(prefixes)})"
            return (
                pl.when(col.str.contains(pattern))
                .then(pl.lit(label))
                .otherwise(pl.lit(None))
            )

        # NOTE(review): some prefixes overlap across groups ("DL40" also
        # matches "DL405...", "DJ45" also matches "DJ450..."), so such codes
        # receive more than one label - confirm this is intended.
        return lazy_frame.with_columns(
            TORRA_DIAG=pl.concat_list(
                [
                    starts_with(
                        ["DJ41", "DJ42", "DJ43", "DJ44", "DJ45", "DJ46"], "LUNG"
                    ),
                    starts_with(
                        [
                            "DL405",
                            "DM05",
                            "DM06",
                            "DM07",
                            "DM15",
                            "DM16",
                            "DM17",
                            "DM45",
                            "DM47",
                            "DM50",
                            "DM51",
                            "DM53",
                            "DM54",
                            "DM80",
                            "DM81",
                            "DM82",
                        ],
                        "MUSCULOSKELETAL",
                    ),
                    starts_with(
                        ["DE03", "DE05", "DE10", "DE11", "DE12", "DE13", "DE14"],
                        "ENDOCRINE",
                    ),
                    starts_with(["DE03", "DE05"], "THYROIDEA"),
                    starts_with(["DE10", "DE11", "DE12", "DE13", "DE14"], "DIABETES"),
                    starts_with(["DG30", "DG318", "DG319", "DF"], "MENTAL"),
                    # CANCER: Matches DC but explicitly excludes DC44
                    pl.when(col.str.contains("^DC") & ~col.str.contains("^DC44"))
                    .then(pl.lit("CANCER"))
                    .otherwise(pl.lit(None)),
                    starts_with(
                        [
                            "DI60",
                            "DI61",
                            "DI62",
                            "DI63",
                            "DI64",
                            "DI69",
                            "DG20",
                            "DG35",
                            "DG40",
                            "DG43",
                        ],
                        "NEUROLOGICAL",
                    ),
                    starts_with(
                        [
                            "DK30",
                            "DK50",
                            "DK51",
                            "DK58",
                            "DK70",
                            "DK71",
                            "DK72",
                            "DK73",
                            "DK74",
                            "DK75",
                            "DK76",
                            "DK860",
                            "DK861",
                        ],
                        "GASTROINTESTINAL",
                    ),
                    starts_with(
                        [
                            "DI20",
                            "DI21",
                            "DI22",
                            "DI23",
                            "DI24",
                            "DI25",
                            "DI47",
                            "DI48",
                            "DI49",
                            "DI50",
                            "DI05",
                            "DI06",
                            "DI07",
                            "DI08",
                            "DI34",
                            "DI35",
                            "DI36",
                            "DI37",
                            "DI441",
                            "DI442",
                            "DI443",
                            "DI444",
                            "DI445",
                            "DI446",
                            "DI447",
                            "DI452",
                            "DI453",
                            "DI454",
                            "DI455",
                            "DI456",
                            "DI457",
                            "DI458",
                            "DI459",
                        ],
                        "CARDIOVASCULAR",
                    ),
                    starts_with(
                        [
                            "DN03",
                            "DN04",
                            "DN05",
                            "DN11",
                            "DN12",
                            "DN18",
                            "DN19",
                            "DZ49",
                            "DN80",
                            "DZ992",
                            "DN393",
                            "DN394",
                        ],
                        "GENITURINARY",
                    ),
                    starts_with(
                        [
                            "DH40",
                            "DH91",
                            "DL40",
                            "DH540",
                            "DH541",
                            "DH542",
                            "DH543",
                            "DH547",
                            "DH900",
                            "DH902",
                            "DH903",
                            "DH905",
                            "DH906",
                            "DH908",
                        ],
                        "SENSORY ORGANS",
                    ),
                    starts_with(
                        ["DL23", "DL24", "DL25", "DJ30", "DL500", "DJ450"], "ALLERGY"
                    ),
                ]
            ).list.drop_nulls()
        )

    def print_torra_diag_method(self) -> None:
        """
        Print the source code of ``_add_torra_diag`` to stdout.

        The text is read from the live method rather than from a hand-copied
        string, so the printout always matches the code that actually runs.
        If the source cannot be retrieved, a short notice is printed instead.
        """
        import inspect

        try:
            method_code = inspect.getsource(self._add_torra_diag)
        except OSError:
            # getsource raises OSError when the source file is unavailable.
            print("Source code for _add_torra_diag is not available.")
            return
        # getsource keeps the trailing newline; print() adds its own.
        print(method_code.rstrip("\n"))