mobts 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mobts/__init__.py +4 -0
- mobts/configs/__init__.py +0 -0
- mobts/configs/config_common.py +29 -0
- mobts/configs/config_imputation.py +64 -0
- mobts/configs/config_preprocessing.py +70 -0
- mobts/divider/__init__.py +29 -0
- mobts/divider/divider.py +55 -0
- mobts/divider/divider_error.py +61 -0
- mobts/imputation/__init__.py +2 -0
- mobts/imputation/donors.py +317 -0
- mobts/imputation/pipeline.py +254 -0
- mobts/imputation/selector.py +296 -0
- mobts/imputation/stl.py +279 -0
- mobts/main.py +32 -0
- mobts/preprocessing/__init__.py +1 -0
- mobts/preprocessing/cleaning.py +296 -0
- mobts/preprocessing/outliers.py +196 -0
- mobts/preprocessing/pipeline.py +361 -0
- mobts/preprocessing/plotting.py +204 -0
- mobts/utils/__init__.py +0 -0
- mobts/utils/formatting.py +208 -0
- mobts-0.1.0.dist-info/METADATA +80 -0
- mobts-0.1.0.dist-info/RECORD +26 -0
- mobts-0.1.0.dist-info/WHEEL +5 -0
- mobts-0.1.0.dist-info/licenses/LICENSE +21 -0
- mobts-0.1.0.dist-info/top_level.txt +1 -0
mobts/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class ColumnsConfig:
    """
    Canonical column names used in the pipeline after standardization.

    These names are what the rest of the package reads and writes (e.g. the
    pivot tables in the imputation module use `timestamp` as index, `counter`
    as columns and `count` as values); override them via the constructor if
    the input data uses different headers.
    """

    counter: str = 'name'         # counter (station) identifier column
    timestamp: str = 'timestamp'  # observation timestamp column
    count: str = 'count'          # observed count values column

    # derived calendar columns -- presumably added during preprocessing;
    # not produced in this module (TODO confirm against the preprocessing code)
    weekday: str = 'weekday'
    week_num: str = 'week_num'
    how: str = 'how'              # NOTE(review): likely "hour of week" -- confirm
    hour: str = 'hour'
    date: str = 'date'
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class SparsityConfig:
    """
    Settings for removing counters that do not have enough valid counts.
    """

    # when True, counters considered too sparse are dropped from the dataset
    drop_sparse_counters: bool = True
    # NOTE(review): presumably the minimum fraction of valid observations a
    # counter must have to be kept -- confirm against the preprocessing code
    sparse_threshold: float = 0.5
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class STLConfig:
    """
    Config used in STL imputation.

    All attributes are annotated so they are real dataclass fields: in a
    ``@dataclass``, an un-annotated assignment is a plain class attribute that
    cannot be overridden through the constructor (the original mixed both
    styles -- only the rolling-median settings were actual fields).
    """

    # STL seasonal period: 7 samples for daily data, 168 (one week of hours)
    # for hourly data
    stl_season_daily: int = 7
    stl_season_hourly: int = 168

    # lower bound applied when clipping reconstructed series
    # (counts cannot be negative)
    clip_lower: int = 0

    # rolling median smoothing
    rolling_median_window: int = 2
    rolling_median_min_valid: int = 1

    # whether to run STL in robust mode (slower, less sensitive to outliers)
    stl_robust: bool = False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class DonorsConfig:
    """
    Configs for donor-based imputation (scaled-median M7 and regression M8).

    All attributes are annotated so they are real dataclass fields and can be
    overridden through the constructor; the original class had no annotations
    at all, so ``@dataclass`` generated an empty constructor.
    """

    # use at most this many valid donors per target
    top_k_donor: int = 25
    # fraction of the ranked candidate donors actually considered
    max_donor_rate: float = 0.5

    # scaled median: minimum overlap between target and donors
    sm_min_overlap_day: int = 60
    sm_min_overlap_hour: int = sm_min_overlap_day * 24
    sm_min_neighbors: int = 20

    # regression: minimum mutual observations / prediction coverage
    min_mutual_days: int = 60
    min_mutual_hours: int = min_mutual_days * 24
    min_pred_days: int = 30
    min_pred_hours: int = min_pred_days * 24
    min_pred_coverage: float = 0.9
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class OutputConfig:
    """
    Configs for output columns and final selection.

    ``col_intp`` and ``col_stl_imputed`` are now annotated like their
    siblings: un-annotated assignments in a ``@dataclass`` are plain class
    attributes, not fields, so they could not be overridden through the
    constructor.
    """

    # calculated column names (one per imputation stage)
    col_intp: str = 'count_intp'
    col_stl_imputed: str = 'count_stl_imputed'
    col_sm_imputed: str = 'count_sm_imputed'
    col_reg_imputed: str = 'count_reg_imputed'
    col_final: str = 'count_imputed'
    col_method_used: str = 'imputation_method'

    # labels recorded in `col_method_used` for each imputation method
    stl_method: str = 'STL'
    sm_method: str = 'M7'
    reg_method: str = 'M8'
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from .config_common import ColumnsConfig, SparsityConfig
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class PreprocessConfig:
    """
    Parameters for low-count/zero-run cleaning and operational window trimming.

    If avail_min_valid days out of avail_window is not present, the whole
    window will be set as non-operational.
    """

    low_rel_daily: float = 0.01  # threshold as fraction of station median
    low_abs_daily: float = 5     # absolute floor threshold to be considered low count noise
    low_run_min_daily: int = 2   # consecutive low count days to be set to NaN

    zero_rate_max: float = 0.05  # threshold to consider 0s normal

    # hours treated as night; declared with default_factory because a plain
    # list literal would be one mutable object shared by every instance
    night_hours: list[int] = field(default_factory=lambda: [1, 2, 3, 4, 5, 6])

    zero_run_min: int = 6        # minimum zero-run length to flag
    island_max_len: int = 6      # NOTE(review): presumably max length of a valid "island" between runs -- confirm
    surround_min_len: int = 12   # NOTE(review): presumably min surrounding run length -- confirm
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class STLConfig:
    """
    Parameters for STL decomposition outlier scoring.
    """

    period: int = 28      # seasonal period in days, set to 4 weeks
    robust: bool = False  # set to False to avoid heavy computation
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class OutlierConfig:
    """
    Parameters for thresholding STL outlier scores.

    Points whose score exceeds the threshold for the project's frequency are
    flagged; both values are meant to be tuned by inspecting the plots.
    """

    threshold_daily: float = 20   # threshold to be tuned via plotting
    threshold_hourly: float = 45  # threshold to be tuned via plotting
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class PlotConfig:
    """
    Parameters for plotting the detected outliers.
    """

    ncols: int = 3                # number of subplot columns in the grid
    figsize_width: float = 15     # total figure width
    min_fig_height: float = 10    # lower bound on total figure height
    height_per_row: float = 3     # extra height added per subplot row
    # NOTE(review): the _d/_h suffixes presumably mean daily/hourly series --
    # confirm against plotting.py
    linewidth_d: float = 0.5
    linewidth_h: float = 0.3
    marker_size: float = 10       # size of the outlier markers
    x_label_rotation: int = 30    # x tick label rotation, in degrees
    # cap on how many stations get plotted; None means plot all of them
    max_stations: Optional[int] = None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class PipelineConfig:
    """
    Top-level bundle of every preprocessing config.

    Each sub-config uses ``field(default_factory=...)`` so every
    PipelineConfig instance owns fresh sub-config objects rather than
    sharing mutable defaults.
    """
    cols: ColumnsConfig = field(default_factory=ColumnsConfig)
    sparse: SparsityConfig = field(default_factory=SparsityConfig)
    # the STLConfig defined in this module (outlier scoring), not the
    # imputation-side STLConfig
    preprocess: PreprocessConfig = field(default_factory=PreprocessConfig)
    stl: STLConfig = field(default_factory=STLConfig)
    outliers: OutlierConfig = field(default_factory=OutlierConfig)
    plot: PlotConfig = field(default_factory=PlotConfig)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Module for division operations and custom exceptions.
|
|
5
|
+
|
|
6
|
+
This module provides functions and exceptions related to division operations.
|
|
7
|
+
It imports the `divide` function and the `CantDivideByZeroError` exception from
|
|
8
|
+
other modules and makes them available for use in this module.
|
|
9
|
+
|
|
10
|
+
Functions
|
|
11
|
+
---------
|
|
12
|
+
divide(a, b)
|
|
13
|
+
Divide two numbers, raising a custom exception if the divisor is zero.
|
|
14
|
+
|
|
15
|
+
Exceptions
|
|
16
|
+
----------
|
|
17
|
+
CantDivideByZeroError
|
|
18
|
+
Raised when an attempt is made to divide by zero.
|
|
19
|
+
|
|
20
|
+
Imports
|
|
21
|
+
--------
|
|
22
|
+
- divide: Function for performing division operations.
|
|
23
|
+
- CantDivideByZeroError: Exception raised for division by zero errors.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from .divider import divide
|
|
27
|
+
from .divider_error import CantDivideByZeroError
|
|
28
|
+
|
|
29
|
+
__all__ = ['divide', 'CantDivideByZeroError']
|
mobts/divider/divider.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Module for division operations with custom exceptions.
|
|
5
|
+
|
|
6
|
+
This module provides a function for performing division
|
|
7
|
+
and raises a custom exception when attempting to divide by zero.
|
|
8
|
+
|
|
9
|
+
Functions
|
|
10
|
+
---------
|
|
11
|
+
divide(a, b)
|
|
12
|
+
Divide two numbers, raising a custom exception if the divisor is zero.
|
|
13
|
+
|
|
14
|
+
Exceptions
|
|
15
|
+
----------
|
|
16
|
+
CantDivideByZeroError
|
|
17
|
+
Raised when an attempt is made to divide by zero.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from .divider_error import CantDivideByZeroError
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def divide(a, b):
    """
    Return the quotient of two numbers, rejecting a zero divisor.

    Parameters
    ----------
    a : float
        The dividend.
    b : float
        The divisor.

    Returns
    -------
    float
        The result of the division.

    Raises
    ------
    CantDivideByZeroError
        If the divisor (b) is zero.

    Examples
    --------
    >>> divide(10, 2)
    5.0
    >>> divide(10, 0)
    Traceback (most recent call last):
    ...
    CantDivideByZeroError
    """
    # guard-clause form: take the happy path first, raise otherwise
    if b != 0:
        return a / b
    raise CantDivideByZeroError()
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
#!python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Module for custom exceptions related to calculator operations.
|
|
5
|
+
|
|
6
|
+
This module defines custom exceptions used in calculator operations,
|
|
7
|
+
including a base exception class and a specific exception for division by zero errors.
|
|
8
|
+
|
|
9
|
+
Classes
|
|
10
|
+
-------
|
|
11
|
+
CalculatorError
|
|
12
|
+
Base class for exceptions in calculator operations.
|
|
13
|
+
CantDivideByZeroError
|
|
14
|
+
Exception raised when an attempt is made to divide by zero.
|
|
15
|
+
|
|
16
|
+
Exceptions
|
|
17
|
+
----------
|
|
18
|
+
CalculatorError
|
|
19
|
+
Base class for exceptions in the calculator domain.
|
|
20
|
+
CantDivideByZeroError
|
|
21
|
+
Raised specifically for division by zero errors.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class CalculatorError(Exception):
    """
    Base class for exceptions in calculator operations.

    This class is intended to be used as a base class for other
    calculator-related exceptions. It inherits from the built-in Exception
    class and allows for custom exception handling in the calculator domain.

    Parameters
    ----------
    args : tuple
        Variable length argument list passed to the base Exception class.
    """

    def __init__(self, *args):
        # BUG FIX: was `super().__init__(args)`, which passed the whole args
        # tuple as ONE argument, so str(exc) rendered a nested tuple like
        # "(('message',),)" instead of the message itself. Unpack instead.
        super().__init__(*args)


class CantDivideByZeroError(CalculatorError):
    """
    Exception raised when an attempt is made to divide by zero.

    This exception is a specific subclass of CalculatorError and is intended
    to be used when a division by zero error occurs. It provides a custom
    error message indicating that division by zero is not allowed.

    Notes
    -----
    The default message for this exception is "tu ne peux pas diviser par
    zéro" (French for "you cannot divide by zero").
    """

    def __init__(self):
        super().__init__('tu ne peux pas diviser par zéro')
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module concerned with the donor-based imputations
|
|
3
|
+
|
|
4
|
+
This module contains:
|
|
5
|
+
- determining the minimum overlap period for scaled median imputation method based on project's temporal frequency
|
|
6
|
+
- building pivot tables for further operations, where timestamp would be index, counters as columns, and counts as values
|
|
7
|
+
- creating a correlation matrix of counters based on pearson correlation between counts
|
|
8
|
+
- scaled medians imputation
|
|
9
|
+
- regression imputation
|
|
10
|
+
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
from typing import Iterable, Optional
|
|
16
|
+
from sklearn.linear_model import LinearRegression
|
|
17
|
+
|
|
18
|
+
from ..configs.config_common import ColumnsConfig
|
|
19
|
+
from ..configs.config_imputation import STLConfig, DonorsConfig, OutputConfig
|
|
20
|
+
from .selector import _select_regression_donors, _get_min_mutual_period
|
|
21
|
+
from ..utils.formatting import _determine_temporal_frequency, _validate_frequency
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _get_min_overlap_period_sm(freq: str, donors_cfg: DonorsConfig = DonorsConfig()) -> int:
    """
    Return the minimum overlap period required for scaled-median imputation.

    Parameters
    ----------
    freq : str
        Temporal frequency of the project.
    donors_cfg : DonorsConfig
        Donors' config supplying the per-frequency minimums.

    Returns
    -------
    int
        The minimum necessary overlap period for the given frequency.

    Raises
    ------
    ValueError
        If the (validated) frequency is neither 'hourly' nor 'daily'.
    """
    normalized = _validate_frequency(freq)

    if normalized == 'daily':
        return donors_cfg.sm_min_overlap_day
    if normalized == 'hourly':
        return donors_cfg.sm_min_overlap_hour

    raise ValueError(f'Unsupported frequency: {normalized}')
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _build_pivots(
    df: pd.DataFrame,
    cols: ColumnsConfig = ColumnsConfig(),
    stl_cfg: STLConfig = STLConfig(),
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build pivot tables of the data: timestamp as index, counters as columns,
    counts as values.

    FIX: the return annotation previously claimed ``pd.DataFrame`` although
    the function has always returned a 2-tuple of DataFrames.

    ------
    Parameters:

    - df: full network DataFrame; must contain 'stl_trend' and 'stl_season' columns
    - cols: columns config
    - stl_cfg: STL config (supplies the clipping lower bound)

    -----
    Returns:

    - pivot_raw: pivot built from the raw observed counts
    - pivot_ts: pivot built from the smoothed series (STL trend + seasonality)
    """

    out = df.copy()

    # reconstruct the smoothed series from the STL components; clip so the
    # reconstruction cannot go below the configured floor (counts are >= 0)
    ts = (out['stl_trend'] + out['stl_season']).clip(lower=stl_cfg.clip_lower)
    out['_ts_'] = ts

    # aggfunc='mean' collapses any duplicate (timestamp, counter) pairs
    pivot_raw = out.pivot_table(index=cols.timestamp, columns=cols.counter, values=cols.count, aggfunc='mean')
    pivot_ts = out.pivot_table(index=cols.timestamp, columns=cols.counter, values='_ts_', aggfunc='mean')

    return pivot_raw, pivot_ts
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _corralation_matrix_donors(pivot_for_corr: pd.DataFrame) -> pd.DataFrame:
|
|
84
|
+
"""
|
|
85
|
+
builds the correlation matrix of counters based on pearson correlation between counts, counters, and timestamps
|
|
86
|
+
|
|
87
|
+
------
|
|
88
|
+
Parameters:
|
|
89
|
+
|
|
90
|
+
- pivot_for_corr: the '_build_pivots' function's output, which is a pivot of counts
|
|
91
|
+
|
|
92
|
+
-----
|
|
93
|
+
Returns:
|
|
94
|
+
|
|
95
|
+
- the correlation matrix of counters
|
|
96
|
+
"""
|
|
97
|
+
|
|
98
|
+
corr = pivot_for_corr.corr()
|
|
99
|
+
correlation_matrix = {s: corr[s].drop(labels=[s]).sort_values(ascending=False).index.tolist() for s in corr.columns}
|
|
100
|
+
|
|
101
|
+
return correlation_matrix
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def impute_scaled_median(
    df: pd.DataFrame,
    pivot: pd.DataFrame,
    donor_map: dict[str, list[str]],
    freq: str,
    counters=None,
    cols: ColumnsConfig = ColumnsConfig(),
    donors_cfg: DonorsConfig = DonorsConfig(),
    out_cfg: OutputConfig = OutputConfig(),
) -> pd.DataFrame:
    """
    Fills missing values using scaled median of donors (M7)

    ------
    Parameters:

    - df: the complete network dataset
    - pivot: pivoted dataset of counters (timestamps x counters)
    - donor_map: dictionary mapping each counter to its ranked donor candidates
    - freq: temporal frequency of the project
    - counters: counters to be operated on. if None, all counters in donor_map
      will be processed
    - cols: columns config
    - donors_cfg: donors' config
    - out_cfg: output config

    -----
    Returns:

    - Imputed DataFrame using scaled medians method (M7)

    -----
    Notes:

    - the 'counters' argument is added in order to be utilized through the
      pipeline, to skip counters which do not have data holes
    - BUG FIX: the donor loop previously did `sm_counter = +1` (reassigning 1
      on every pass), so the `top_k_donor` cutoff never triggered and every
      candidate donor was evaluated; the loop now stops once `top_k_donor`
      valid donors have been collected
    """

    out = df.copy()

    # the imputed column starts as NaN and is filled per target below
    out[out_cfg.col_sm_imputed] = np.nan

    sm_min_overlap = _get_min_overlap_period_sm(freq=freq)

    targets = counters if counters is not None else donor_map.keys()

    # loop that goes through target counters, retrieves donors, identifies
    # eligible ones, and calculates the scaled median
    for target in targets:
        donors = donor_map.get(target, [])

        if target not in pivot.columns:
            continue

        # cap the candidate pool to limit calculations on the entirety of
        # donors (default rate set to 0.5)
        max_d = int(len(donors) * donors_cfg.max_donor_rate)
        donors = donors[:max_d]

        if not donors:
            continue

        # first check that there are enough observations on the target itself
        y_t = pivot[target]
        avail_idx = y_t.index[y_t.notna()]

        if avail_idx.size < sm_min_overlap:
            continue

        median_target = np.nanmedian(y_t.loc[avail_idx])

        if not np.isfinite(median_target):
            continue

        # go through each donor; if valid, record the donor and its median
        valid_donors = []
        donor_meds = []

        for d in donors:
            if d not in pivot.columns:
                continue

            # joint overlap requirement: target, this donor, and all donors
            # accepted so far must share at least sm_min_overlap observations
            if pivot[[target, d, *valid_donors]].notna().all(axis=1).sum() < sm_min_overlap:
                continue

            arr = pivot.loc[avail_idx, d].to_numpy(dtype=float)

            # infinity checks are run on multiple steps
            if np.isfinite(arr).any():
                md = np.nanmedian(arr)
                if np.isfinite(md):
                    valid_donors.append(d)
                    donor_meds.append(md)

            # once enough valid donors are collected, the donor loop ends
            # (replaces the broken `sm_counter = +1` counter)
            if len(valid_donors) >= donors_cfg.top_k_donor:
                break

        # robustness: no eligible donor found -- the original fed an empty
        # list to np.median, which warns and yields NaN
        if not donor_meds:
            continue

        median_donors = float(np.median(donor_meds))

        if not (np.isfinite(median_donors) and median_donors > 0):
            continue

        # the scale maps the donors' overall level onto the target's level
        scale = median_target / median_donors

        if not np.isfinite(scale):
            continue

        # per-timestamp median across the selected donors, rescaled
        mat = pivot[valid_donors].to_numpy(dtype=float)
        med_series = np.nanmedian(mat, axis=1) * scale
        donor_series = pd.Series(med_series, index=pivot.index)

        # only fill rows of this target where the observed count is missing
        mask = (out[cols.counter] == target) & out[cols.count].isna()

        if mask.any():
            out.loc[mask, out_cfg.col_sm_imputed] = out.loc[mask, cols.timestamp].map(donor_series)

    # keep observed counts; fall back to the donor estimate only where NaN
    out[out_cfg.col_sm_imputed] = out[cols.count].fillna(out[out_cfg.col_sm_imputed])

    return out
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def impute_regression(
    df: pd.DataFrame,
    pivot: pd.DataFrame,
    freq: str,
    donor_map: dict[str, list[str]],
    counters=None,
    cols: ColumnsConfig = ColumnsConfig(),
    donors_cfg: DonorsConfig = DonorsConfig(),
    stl_cfg: STLConfig = STLConfig(),
    out_cfg: OutputConfig = OutputConfig(),
) -> pd.DataFrame:
    """
    Fills missing values using regression prediction of donors (M8)

    ------
    Parameters:

    - df: the complete network dataset
    - pivot: pivoted dataset of counters (timestamps x counters)
    - donor_map: dictionary map of donors
    - freq: temporal frequency of the project
    - counters: counters to be operated on. if None, all counters in donor_map
      will be processed
    - cols: columns config
    - donors_cfg: donors' config
    - out_cfg: output config
    - stl_cfg: STL config (supplies the prediction clipping floor)

    -----
    Returns:

    - Imputed DataFrame using regression method (M8)

    -----
    Notes:

    - the 'counters' argument is added in order to be utilized through the
      pipeline, to skip counters which do not have data holes. this gives us
      the possibility to only process counters with holes
    """

    # shortened column names for ease of use (station, date/timestamp, value)
    s_col, d_col, v_col = cols.counter, cols.timestamp, cols.count

    # temporary prediction column, starts as all-NaN and is filled per target
    out = df.copy()
    pred_col = '_reg_pred_'
    out[pred_col] = np.nan

    min_mutual_period = _get_min_mutual_period(freq)

    targets = counters if counters is not None else donor_map.keys()

    # for each target counter, select regression donors via the pre-defined
    # selector helper
    for target in targets:
        donors = donor_map.get(target, [])
        if target not in pivot.columns:
            continue

        # NOTE(review): _select_regression_donors presumably filters `donors`
        # down to those meeting the overlap/coverage thresholds -- see selector.py
        selected = _select_regression_donors(target=target, pivot=pivot, freq=freq, donors=donors, donors_cfg=donors_cfg)

        # y_imp starts as a copy of the observed series; model predictions
        # overwrite it below wherever all selected donors have data
        y = pivot[target]
        y_imp = y.copy()

        if selected:
            X = pivot[selected]
            mask_fit = y.notna() & X.notna().all(axis=1)

            # require enough mutual observations between target and donors
            # to build the model (strict '>' on the configured minimum)
            if mask_fit.sum() > min_mutual_period:
                # builds and fits the model
                model = LinearRegression()
                model.fit(X.loc[mask_fit], y.loc[mask_fit])

                # predict wherever every donor has a value; observed target
                # rows get overwritten here too, but the final fillna below
                # only uses predictions where the observed count is NaN
                mask_pred = X.notna().all(axis=1)

                # replaces y_imp with the prediction (y_hat), floored at the
                # configured clip value so counts stay non-negative
                if mask_pred.any():
                    y_hat = model.predict(X.loc[mask_pred])
                    y_hat = np.maximum(y_hat, stl_cfg.clip_lower)
                    y_imp.loc[mask_pred] = y_hat

        # write this target's (possibly predicted) series back into the
        # long-format frame, aligning on timestamp
        mask_rows = out[s_col] == target
        if mask_rows.any():
            out.loc[mask_rows, pred_col] = out.loc[mask_rows, d_col].map(y_imp)

    # observed counts win; predictions only fill the holes
    out[out_cfg.col_reg_imputed] = out[v_col].fillna(out[pred_col])
    out.drop(columns=[pred_col], inplace=True)

    return out
|