imperfekt 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. imperfekt-0.2.3/.gitignore +25 -0
  2. imperfekt-0.2.3/LICENSE +21 -0
  3. imperfekt-0.2.3/PKG-INFO +163 -0
  4. imperfekt-0.2.3/README.md +105 -0
  5. imperfekt-0.2.3/imperfekt/__init__.py +22 -0
  6. imperfekt-0.2.3/imperfekt/analysis/README.md +59 -0
  7. imperfekt-0.2.3/imperfekt/analysis/__init__.py +0 -0
  8. imperfekt-0.2.3/imperfekt/analysis/imperfekt.png +0 -0
  9. imperfekt-0.2.3/imperfekt/analysis/imperfekt.py +487 -0
  10. imperfekt-0.2.3/imperfekt/analysis/intervariable/README.md +449 -0
  11. imperfekt-0.2.3/imperfekt/analysis/intervariable/__init__.py +0 -0
  12. imperfekt-0.2.3/imperfekt/analysis/intervariable/asymmetric_analysis.py +734 -0
  13. imperfekt-0.2.3/imperfekt/analysis/intervariable/html_report_generator.py +877 -0
  14. imperfekt-0.2.3/imperfekt/analysis/intervariable/intervariable.py +912 -0
  15. imperfekt-0.2.3/imperfekt/analysis/intervariable/marmnar.py +366 -0
  16. imperfekt-0.2.3/imperfekt/analysis/intervariable/mcar.py +273 -0
  17. imperfekt-0.2.3/imperfekt/analysis/intervariable/row_statistics.py +248 -0
  18. imperfekt-0.2.3/imperfekt/analysis/intervariable/symmetric_correlation.py +378 -0
  19. imperfekt-0.2.3/imperfekt/analysis/intravariable/README.md +375 -0
  20. imperfekt-0.2.3/imperfekt/analysis/intravariable/__init__.py +0 -0
  21. imperfekt-0.2.3/imperfekt/analysis/intravariable/autocorrelation.py +148 -0
  22. imperfekt-0.2.3/imperfekt/analysis/intravariable/column_statistics.py +192 -0
  23. imperfekt-0.2.3/imperfekt/analysis/intravariable/date_time_statistics.py +178 -0
  24. imperfekt-0.2.3/imperfekt/analysis/intravariable/gap_statistics.py +325 -0
  25. imperfekt-0.2.3/imperfekt/analysis/intravariable/html_report_generator.py +877 -0
  26. imperfekt-0.2.3/imperfekt/analysis/intravariable/intravariable.py +718 -0
  27. imperfekt-0.2.3/imperfekt/analysis/intravariable/markov_chain_summary.py +208 -0
  28. imperfekt-0.2.3/imperfekt/analysis/intravariable/windowed_significance.py +125 -0
  29. imperfekt-0.2.3/imperfekt/analysis/preliminary/README.md +301 -0
  30. imperfekt-0.2.3/imperfekt/analysis/preliminary/html_report_generator.py +435 -0
  31. imperfekt-0.2.3/imperfekt/analysis/preliminary/preliminary.py +494 -0
  32. imperfekt-0.2.3/imperfekt/analysis/utils/events.py +153 -0
  33. imperfekt-0.2.3/imperfekt/analysis/utils/html_reporting_helpers.py +440 -0
  34. imperfekt-0.2.3/imperfekt/analysis/utils/kruskal_wallis.py +290 -0
  35. imperfekt-0.2.3/imperfekt/analysis/utils/masking.py +272 -0
  36. imperfekt-0.2.3/imperfekt/analysis/utils/pretty_printing.py +20 -0
  37. imperfekt-0.2.3/imperfekt/analysis/utils/statistics_utils.py +251 -0
  38. imperfekt-0.2.3/imperfekt/analysis/utils/visualization_utils.py +974 -0
  39. imperfekt-0.2.3/imperfekt/config/global_settings.py +13 -0
  40. imperfekt-0.2.3/imperfekt/features/README.md +198 -0
  41. imperfekt-0.2.3/imperfekt/features/__init__.py +0 -0
  42. imperfekt-0.2.3/imperfekt/features/core.py +220 -0
  43. imperfekt-0.2.3/imperfekt/features/interaction.py +114 -0
  44. imperfekt-0.2.3/imperfekt/features/temporal.py +171 -0
  45. imperfekt-0.2.3/imperfekt/features/window.py +138 -0
  46. imperfekt-0.2.3/pyproject.toml +118 -0
@@ -0,0 +1,25 @@
1
+ # Data
2
+ data/
3
+ *.parquet
4
+ *.csv
5
+ *.sas7bdat
6
+
7
+ # Python stuff
8
+ /**/__pycache__/
9
+ *$py.class
10
+ **.egg-info
11
+ dist/
12
+ build/
13
+ .eggs/
14
+
15
+ # Virtual environments
16
+ .venv/
17
+ venv/
18
+
19
+ # etc
20
+ .vscode
21
+ .idea/
22
+ .ruff_cache
23
+ .ipynb_checkpoints/
24
+ .DS_Store
25
+ Thumbs.db
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Tamara Krafft
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,163 @@
1
+ Metadata-Version: 2.4
2
+ Name: imperfekt
3
+ Version: 0.2.3
4
+ Summary: A framework to analyze imperfections (missingness, noise) in time-series datasets.
5
+ Project-URL: Homepage, https://github.com/krafftta/imperfekt
6
+ Project-URL: Repository, https://github.com/krafftta/imperfekt
7
+ Project-URL: Issues, https://github.com/krafftta/imperfekt/issues
8
+ Author: Tamara Krafft
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: data-quality,healthcare,missing-data,missingness-analysis,polars,time-series
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Healthcare Industry
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.10
24
+ Requires-Dist: numpy>=1.24.0
25
+ Requires-Dist: pandas>=2.0.0
26
+ Requires-Dist: plotly>=5.18.0
27
+ Requires-Dist: polars>=0.20.0
28
+ Requires-Dist: rich>=13.0.0
29
+ Requires-Dist: scipy>=1.13.0
30
+ Provides-Extra: dev
31
+ Requires-Dist: kaleido>=0.2.1; extra == 'dev'
32
+ Requires-Dist: matplotlib>=3.8.0; extra == 'dev'
33
+ Requires-Dist: missingno>=0.5.2; extra == 'dev'
34
+ Requires-Dist: mypy>=1.7.0; extra == 'dev'
35
+ Requires-Dist: pre-commit>=3.5.0; extra == 'dev'
36
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
37
+ Requires-Dist: pytest>=7.4.0; extra == 'dev'
38
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
39
+ Requires-Dist: scikit-posthocs>=0.9.0; extra == 'dev'
40
+ Requires-Dist: statsmodels>=0.14.0; extra == 'dev'
41
+ Provides-Extra: docs
42
+ Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
43
+ Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
44
+ Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
45
+ Provides-Extra: full
46
+ Requires-Dist: kaleido>=0.2.1; extra == 'full'
47
+ Requires-Dist: matplotlib>=3.8.0; extra == 'full'
48
+ Requires-Dist: missingno>=0.5.2; extra == 'full'
49
+ Requires-Dist: scikit-posthocs>=0.9.0; extra == 'full'
50
+ Requires-Dist: statsmodels>=0.14.0; extra == 'full'
51
+ Provides-Extra: stats
52
+ Requires-Dist: scikit-posthocs>=0.9.0; extra == 'stats'
53
+ Requires-Dist: statsmodels>=0.14.0; extra == 'stats'
54
+ Provides-Extra: viz
55
+ Requires-Dist: kaleido>=0.2.1; extra == 'viz'
56
+ Requires-Dist: matplotlib>=3.8.0; extra == 'viz'
57
+ Description-Content-Type: text/markdown
58
+
59
+ # Imperfekt - Understanding Data Imperfections in Time-Series
60
+
61
+ [![PyPI version](https://badge.fury.io/py/imperfekt.svg)](https://badge.fury.io/py/imperfekt)
62
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
63
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
64
+
65
+ A comprehensive analysis toolkit for studying "imperfect" data patterns in time-series datasets.
66
+ Imperfection refers to missingness, noise, and other data quality issues that can be indicated using a binary mask.
67
+
68
+ ## Overview
69
+
70
+ This library provides tools to analyze data quality issues in time-series data, including:
71
+ - **Intravariable analysis** of imperfection patterns for individual variables
72
+ - **Intervariable analysis** of co-occurring imperfections across multiple parameters
73
+ - **Feature generation** based on missingness patterns for downstream ML tasks
74
+
75
+ ## Installation
76
+
77
+ Install the library using `pip`:
78
+
79
+ ```bash
80
+ pip install imperfekt
81
+ ```
82
+
83
+ For additional features:
84
+ ```bash
85
+ # With visualization export support
86
+ pip install imperfekt[viz]
87
+
88
+ # With statistical tests
89
+ pip install imperfekt[stats]
90
+
91
+ # Full installation
92
+ pip install imperfekt[full]
93
+ ```
94
+
95
+ ## Quick Start
96
+
97
+ ```python
98
+ import polars as pl
99
+ from imperfekt import Imperfekt, FeatureGenerator
100
+
101
+ # Load your time-series data
102
+ df = pl.read_parquet("your_data.parquet")
103
+
104
+ # Run full imperfection analysis
105
+ analyzer = Imperfekt(
106
+ df=df,
107
+ id_col="id", # Unique identifier column
108
+ clock_col="clock", # Timestamp column
109
+ cols=["var1", "var2"], # Variables to analyze
110
+ save_path="./results"
111
+ )
112
+ results = analyzer.run()
113
+
114
+ # Or generate missingness-aware features for ML
115
+ fg = FeatureGenerator(
116
+ df=df,
117
+ id_col="id",
118
+ clock_col="clock",
119
+ variable_cols=["var1", "var2"]
120
+ )
121
+ features_df = fg.add_binary_masks().add_temporal_features().df
122
+ ```
123
+
124
+ ## Library Structure
125
+
126
+ ```
127
+ imperfekt/
128
+ ├── analysis/
129
+ │ ├── preliminary/ # Basic data exploration
130
+ │ ├── intravariable/ # Single variable analysis
131
+ │ ├── intervariable/ # Multi-variable patterns
132
+ │ └── utils/ # Shared utilities
133
+ ├── features/ # Feature engineering
134
+ │ ├── core.py # FeatureGenerator class
135
+ │ ├── temporal.py # Time-based features
136
+ │ └── interaction.py # Variable interactions
137
+ └── config/ # Default settings
138
+ ```
139
+
140
+ ## Data Format
141
+
142
+ The library expects time-series data with the following structure:
143
+
144
+ | Column | Description |
145
+ |--------|-------------|
146
+ | `id` | Unique identifier for each time-series (e.g., patient, sensor) |
147
+ | `clock` | Timestamp for each observation |
148
+ | `var1`, `var2`, ... | Variables to analyze |
149
+
150
+ ## Key Dependencies
151
+
152
+ - **polars**: High-performance data processing
153
+ - **plotly**: Interactive visualizations
154
+ - **scipy**: Statistical computations
155
+
156
+ ## Contributing
157
+
158
+ Contributions are welcome! Please feel free to submit a Pull Request.
159
+
160
+ ## License
161
+
162
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
163
+
@@ -0,0 +1,105 @@
1
+ # Imperfekt - Understanding Data Imperfections in Time-Series
2
+
3
+ [![PyPI version](https://badge.fury.io/py/imperfekt.svg)](https://badge.fury.io/py/imperfekt)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
6
+
7
+ A comprehensive analysis toolkit for studying "imperfect" data patterns in time-series datasets.
8
+ Imperfection refers to missingness, noise, and other data quality issues that can be indicated using a binary mask.
9
+
10
+ ## Overview
11
+
12
+ This library provides tools to analyze data quality issues in time-series data, including:
13
+ - **Intravariable analysis** of imperfection patterns for individual variables
14
+ - **Intervariable analysis** of co-occurring imperfections across multiple parameters
15
+ - **Feature generation** based on missingness patterns for downstream ML tasks
16
+
17
+ ## Installation
18
+
19
+ Install the library using `pip`:
20
+
21
+ ```bash
22
+ pip install imperfekt
23
+ ```
24
+
25
+ For additional features:
26
+ ```bash
27
+ # With visualization export support
28
+ pip install "imperfekt[viz]"
29
+
30
+ # With statistical tests
31
+ pip install "imperfekt[stats]"
32
+
33
+ # Full installation
34
+ pip install "imperfekt[full]"
35
+ ```
36
+
37
+ ## Quick Start
38
+
39
+ ```python
40
+ import polars as pl
41
+ from imperfekt import Imperfekt, FeatureGenerator
42
+
43
+ # Load your time-series data
44
+ df = pl.read_parquet("your_data.parquet")
45
+
46
+ # Run full imperfection analysis
47
+ analyzer = Imperfekt(
48
+ df=df,
49
+ id_col="id", # Unique identifier column
50
+ clock_col="clock", # Timestamp column
51
+ cols=["var1", "var2"], # Variables to analyze
52
+ save_path="./results"
53
+ )
54
+ results = analyzer.run()
55
+
56
+ # Or generate missingness-aware features for ML
57
+ fg = FeatureGenerator(
58
+ df=df,
59
+ id_col="id",
60
+ clock_col="clock",
61
+ variable_cols=["var1", "var2"]
62
+ )
63
+ features_df = fg.add_binary_masks().add_temporal_features().df
64
+ ```
65
+
66
+ ## Library Structure
67
+
68
+ ```
69
+ imperfekt/
70
+ ├── analysis/
71
+ │ ├── preliminary/ # Basic data exploration
72
+ │ ├── intravariable/ # Single variable analysis
73
+ │ ├── intervariable/ # Multi-variable patterns
74
+ │ └── utils/ # Shared utilities
75
+ ├── features/ # Feature engineering
76
+ │ ├── core.py # FeatureGenerator class
77
+ │ ├── temporal.py # Time-based features
78
+ │ └── interaction.py # Variable interactions
79
+ └── config/ # Default settings
80
+ ```
81
+
82
+ ## Data Format
83
+
84
+ The library expects time-series data with the following structure (the identifier and timestamp column names are configurable via `id_col` and `clock_col`):
85
+
86
+ | Column | Description |
87
+ |--------|-------------|
88
+ | `id` | Unique identifier for each time-series (e.g., patient, sensor) |
89
+ | `clock` | Timestamp for each observation |
90
+ | `var1`, `var2`, ... | Variables to analyze |
91
+
92
+ ## Key Dependencies
93
+
94
+ - **polars**: High-performance data processing
95
+ - **plotly**: Interactive visualizations
96
+ - **scipy**: Statistical computations
97
+
98
+ ## Contributing
99
+
100
+ Contributions are welcome! Please feel free to submit a Pull Request.
101
+
102
+ ## License
103
+
104
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
105
+
@@ -0,0 +1,22 @@
1
+ """
2
+ imperfekt: A framework to analyze imperfections in time-series datasets.
3
+
4
+ This library provides tools for:
5
+ - Intravariable analysis of missing data patterns
6
+ - Intervariable analysis of co-occurring missingness
7
+ - Feature generation based on missingness patterns
8
+ """
9
+
10
+ from imperfekt.analysis.imperfekt import Imperfekt
11
+ from imperfekt.analysis.intervariable.intervariable import IntervariableImperfection
12
+ from imperfekt.analysis.intravariable.intravariable import IntravariableImperfection
13
+ from imperfekt.features.core import FeatureGenerator
14
+
15
+ __version__ = "0.2.0"
16
+ __all__ = [
17
+ "Imperfekt",
18
+ "IntravariableImperfection",
19
+ "IntervariableImperfection",
20
+ "FeatureGenerator",
21
+ "__version__",
22
+ ]
@@ -0,0 +1,59 @@
1
+ # Analysis Module
2
+
3
+ This module provides statistical analysis tools for characterizing imperfection (missingness and noise) in time-series data.
4
+
5
+ ## Structure
6
+
7
+ | Submodule | Purpose |
8
+ |-----------|---------|
9
+ | `preliminary/` | Descriptive statistics, normality tests, correlation, and autocorrelation |
10
+ | `intravariable/` | Within-column analysis: gap patterns, Markov chains, windowed significance |
11
+ | `intervariable/` | Between-column analysis: MCAR tests, MAR/MNAR detection, symmetric/asymmetric correlation |
12
+ | `utils/` | Shared utilities for statistics, visualization, and HTML reporting |
13
+
14
+ ## Usage
15
+
16
+ ```python
17
+ import polars as pl
18
+ from imperfekt import Imperfekt
19
+ df = pl.DataFrame({
20
+ "patient": ["a", "a", "a", "a", "b", "b", "b"],
21
+ "time": [
22
+ "2023-01-01 08:00", "2023-01-01 08:05", "2023-01-01 08:10", "2023-01-01 08:15",
23
+ "2023-01-02 12:00", "2023-01-02 12:05", "2023-01-02 12:10"
24
+ ],
25
+ "heartrate": [60, None, 70, None, 80, 85, None],
26
+ "blood_pressure": [120, 125, None, None, 130, None, 140],
27
+ "resprate": [12, 14, None, 16, 18, None, 20],
28
+ }).with_columns(
29
+ pl.col("time").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M")
30
+ )
31
+
32
+ analyzer = Imperfekt(
33
+ df=df,
34
+ id_col="patient",
35
+ clock_col="time",
36
+ clock_no_col="time_no",
37
+ save_path="/path",
38
+ plot_library="matplotlib",
39
+ renderer="notebook_connected",
40
+ )
41
+ # Run all analyses
42
+ analyzer.run()
43
+
44
+ # Run preliminary analysis only
45
+ analyzer.preliminary.run()
46
+ ```
47
+
48
+ ## Detailed Documentation
49
+
50
+ Each submodule contains its own README with further details:
51
+
52
+ - [Preliminary Analysis](preliminary/README.md)
53
+ - [Intravariable Analysis](intravariable/README.md)
54
+ - [Intervariable Analysis](intervariable/README.md)
55
+
56
+ ### Overview Figure
57
+ ![Imperfekt Analysis Matrix](imperfekt.png "Imperfekt Analysis Matrix")
58
+
59
+
File without changes