mobts 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mobts/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .main import hello_world
2
+
3
+ from .preprocessing import run_preprocess_stage_1, apply_threshold, preprocess
4
+ from .imputation import impute
File without changes
@@ -0,0 +1,29 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
+
5
@dataclass
class ColumnsConfig:
    """
    Canonical column names used in the pipeline after standardization.

    Each attribute maps a logical role to the concrete column name in the
    working DataFrame.
    """

    # core columns
    counter: str = 'name'         # counter/station identifier column
    timestamp: str = 'timestamp'  # observation timestamp column
    count: str = 'count'          # observed count value column

    # derived calendar columns
    weekday: str = 'weekday'
    week_num: str = 'week_num'
    how: str = 'how'              # presumably "hour of week" — TODO confirm against consumers
    hour: str = 'hour'
    date: str = 'date'
20
+
21
+
22
@dataclass
class SparsityConfig:
    """
    For removing counters with not enough valid counts.
    """

    # whether sparse counters are dropped at all
    drop_sparse_counters: bool = True
    # cut-off for sparsity; presumably the minimum fraction of valid
    # observations a counter must have to be kept — TODO confirm against
    # the code that consumes this config
    sparse_threshold: float = 0.5
@@ -0,0 +1,64 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
@dataclass
class STLConfig:
    """
    Config used in STL imputation.

    All attributes carry type annotations so that they are real dataclass
    fields: without annotations they were plain class attributes, excluded
    from ``__init__``, ``repr``, ``asdict`` and ``replace``, and could not
    be overridden per instance. Defaults are unchanged.
    """

    # STL seasonal period: 7 days for daily data, 168 hours (one week) for hourly data
    stl_season_daily: int = 7
    stl_season_hourly: int = 168

    # lower bound used when clipping reconstructed series
    clip_lower: int = 0

    # rolling median smoothing
    rolling_median_window: int = 2
    rolling_median_min_valid: int = 1

    # whether to run STL in robust mode
    stl_robust: bool = False
24
+
25
+
26
@dataclass
class DonorsConfig:
    """
    Configs for donor-based imputation.

    All attributes carry type annotations so that they are real dataclass
    fields (unannotated attributes in a ``@dataclass`` are plain class
    attributes and cannot be overridden via ``__init__``). Defaults are
    unchanged. Note that the derived ``*_hour(s)`` defaults are computed
    once at class-definition time from the day defaults; overriding a day
    value per instance does NOT recompute the hourly value.
    """

    # cap on donors actually used per target
    top_k_donor: int = 25
    # fraction of a target's candidate donor list that may be considered
    max_donor_rate: float = 0.5

    # scaled median
    sm_min_overlap_day: int = 60
    sm_min_overlap_hour: int = sm_min_overlap_day * 24
    sm_min_neighbors: int = 20

    # regression
    min_mutual_days: int = 60
    min_mutual_hours: int = min_mutual_days * 24
    min_pred_days: int = 30
    min_pred_hours: int = min_pred_days * 24
    min_pred_coverage: float = 0.9
46
+
47
+
48
@dataclass
class OutputConfig:
    """
    Configs for output columns and final selection.

    ``col_intp`` and ``col_stl_imputed`` were previously unannotated and
    therefore not dataclass fields (they could not be overridden through
    ``__init__`` like their annotated siblings); annotations fix that
    inconsistency without changing any default.
    """

    # calculated column names
    col_intp: str = 'count_intp'
    col_stl_imputed: str = 'count_stl_imputed'
    col_sm_imputed: str = 'count_sm_imputed'
    col_reg_imputed: str = 'count_reg_imputed'
    col_final: str = 'count_imputed'
    col_method_used: str = 'imputation_method'

    # method labels recorded in col_method_used
    stl_method: str = 'STL'
    sm_method: str = 'M7'
    reg_method: str = 'M8'
@@ -0,0 +1,70 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+ from .config_common import ColumnsConfig, SparsityConfig
4
+
5
+
6
@dataclass
class PreprocessConfig:
    """
    Parameters for low-count/zero-run cleaning and operational window trimming.

    If avail_min_valid days out of avail_window are not present, the whole
    window will be set as non-operational.
    """

    low_rel_daily: float = 0.01  # threshold as fraction of station median
    low_abs_daily: float = 5  # absolute floor threshold to be considered low count noise
    low_run_min_daily: int = 2  # consecutive low count days to be set to NaN

    zero_rate_max: float = 0.05  # threshold to consider 0s normal

    # Hours treated as night. Previously this was an unannotated bare list,
    # i.e. a single class-level list shared by every instance (and not a
    # dataclass field at all); default_factory gives each instance its own copy.
    night_hours: list[int] = field(default_factory=lambda: [1, 2, 3, 4, 5, 6])

    zero_run_min: int = 6
    island_max_len: int = 6
    surround_min_len: int = 12
24
+
25
+
26
@dataclass
class STLConfig:
    """
    Parameters for STL decomposition outlier scoring.
    """

    # seasonal period in days, set to 4 weeks
    period: int = 28
    # robust STL mode; set to False to avoid heavy computation
    robust: bool = False
34
+
35
+
36
@dataclass
class OutlierConfig:
    """
    Parameters for thresholding STL outlier scores.
    """

    # score cut-offs above which a point is flagged as an outlier;
    # both thresholds are meant to be tuned via plotting
    threshold_daily: float = 20
    threshold_hourly: float = 45
44
+
45
+
46
@dataclass
class PlotConfig:
    """
    Parameters for plotting the detected outliers.
    """

    # grid layout
    ncols: int = 3
    figsize_width: float = 15
    min_fig_height: float = 10
    height_per_row: float = 3

    # line widths for daily (d) and hourly (h) series
    linewidth_d: float = 0.5
    linewidth_h: float = 0.3

    marker_size: float = 10
    x_label_rotation: int = 30

    # cap on the number of stations plotted; None plots them all
    max_stations: Optional[int] = None
61
+
62
+
63
@dataclass
class PipelineConfig:
    """
    Aggregates all preprocessing sub-configs into a single object.

    Every sub-config uses ``default_factory`` so each PipelineConfig
    instance owns its own (mutable) config objects instead of sharing
    class-level defaults.
    """

    cols: ColumnsConfig = field(default_factory=ColumnsConfig)
    sparse: SparsityConfig = field(default_factory=SparsityConfig)
    preprocess: PreprocessConfig = field(default_factory=PreprocessConfig)
    stl: STLConfig = field(default_factory=STLConfig)
    outliers: OutlierConfig = field(default_factory=OutlierConfig)
    plot: PlotConfig = field(default_factory=PlotConfig)
@@ -0,0 +1,29 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Module for division operations and custom exceptions.
5
+
6
+ This module provides functions and exceptions related to division operations.
7
+ It imports the `divide` function and the `CantDivideByZeroError` exception from
8
+ other modules and makes them available for use in this module.
9
+
10
+ Functions
11
+ ---------
12
+ divide(a, b)
13
+ Divide two numbers, raising a custom exception if the divisor is zero.
14
+
15
+ Exceptions
16
+ ----------
17
+ CantDivideByZeroError
18
+ Raised when an attempt is made to divide by zero.
19
+
20
+ Imports
21
+ --------
22
+ - divide: Function for performing division operations.
23
+ - CantDivideByZeroError: Exception raised for division by zero errors.
24
+ """
25
+
26
+ from .divider import divide
27
+ from .divider_error import CantDivideByZeroError
28
+
29
+ __all__ = ['divide', 'CantDivideByZeroError']
@@ -0,0 +1,55 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Module for division operations with custom exceptions.
5
+
6
+ This module provides a function for performing division
7
+ and raises a custom exception when attempting to divide by zero.
8
+
9
+ Functions
10
+ ---------
11
+ divide(a, b)
12
+ Divide two numbers, raising a custom exception if the divisor is zero.
13
+
14
+ Exceptions
15
+ ----------
16
+ CantDivideByZeroError
17
+ Raised when an attempt is made to divide by zero.
18
+ """
19
+
20
+ from .divider_error import CantDivideByZeroError
21
+
22
+
23
def divide(a, b):
    """
    Return the quotient of two numbers.

    Parameters
    ----------
    a : float
        The dividend.
    b : float
        The divisor.

    Returns
    -------
    float
        The result of the division.

    Raises
    ------
    CantDivideByZeroError
        If the divisor (b) is zero.

    Examples
    --------
    >>> divide(10, 2)
    5.0
    """
    # A zero divisor is reported through the calculator's domain-specific
    # exception rather than the built-in ZeroDivisionError.
    if b != 0:
        return a / b
    raise CantDivideByZeroError()
@@ -0,0 +1,61 @@
1
+ #!python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Module for custom exceptions related to calculator operations.
5
+
6
+ This module defines custom exceptions used in calculator operations,
7
+ including a base exception class and a specific exception for division by zero errors.
8
+
9
+ Classes
10
+ -------
11
+ CalculatorError
12
+ Base class for exceptions in calculator operations.
13
+ CantDivideByZeroError
14
+ Exception raised when an attempt is made to divide by zero.
15
+
16
+ Exceptions
17
+ ----------
18
+ CalculatorError
19
+ Base class for exceptions in the calculator domain.
20
+ CantDivideByZeroError
21
+ Raised specifically for division by zero errors.
22
+ """
23
+
24
+
25
class CalculatorError(Exception):
    """
    Base class for exceptions in calculator operations.

    This class is intended to be used as a base class for other
    calculator-related exceptions. It inherits from the built-in Exception
    class and allows for custom exception handling in the calculator domain.

    Parameters
    ----------
    args : tuple
        Variable length argument list passed to the base Exception class.
    """

    def __init__(self, *args):
        # Bug fix: unpack the arguments. The original called
        # super().__init__(args), wrapping the whole tuple as a single
        # argument, so str(exc) rendered as "(('msg',),)" instead of "msg"
        # and exc.args was a nested tuple.
        super().__init__(*args)
41
+
42
+
43
class CantDivideByZeroError(CalculatorError):
    """
    Exception raised when an attempt is made to divide by zero.

    This exception is a specific subclass of CalculatorError and is intended
    to be used when a division by zero error occurs. It carries a fixed,
    French-language error message indicating that division by zero is not
    allowed.

    Notes
    -----
    The default message is "tu ne peux pas diviser par zéro"
    ("you cannot divide by zero"). The message is part of the exception's
    observable behavior, so it is deliberately left untranslated here.
    """

    def __init__(self):
        # The French message string is runtime behavior; do not alter it.
        super().__init__('tu ne peux pas diviser par zéro')
@@ -0,0 +1,2 @@
1
+ from .pipeline import impute
2
+ from .donors import impute_scaled_median, impute_regression
@@ -0,0 +1,317 @@
1
+ """
2
+ Module concerned with the donor-based imputations
3
+
4
+ This module contains:
5
+ - determining the minimum overlap period for scaled median imputation method based on project's temporal frequency
6
+ - building pivot tables for further operations, where timestamp would be index, counters as columns, and counts as values
7
+ - creating a correlation matrix of counters based on pearson correlation between counts
8
+ - scaled medians imputation
9
+ - regression imputation
10
+
11
+ """
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ from typing import Iterable, Optional
16
+ from sklearn.linear_model import LinearRegression
17
+
18
+ from ..configs.config_common import ColumnsConfig
19
+ from ..configs.config_imputation import STLConfig, DonorsConfig, OutputConfig
20
+ from .selector import _select_regression_donors, _get_min_mutual_period
21
+ from ..utils.formatting import _determine_temporal_frequency, _validate_frequency
22
+
23
+
24
def _get_min_overlap_period_sm(freq: str, donors_cfg: DonorsConfig = DonorsConfig()) -> int:
    """
    Determine the minimum overlap period required for scaled-median imputation.

    Parameters
    ----------
    freq : str
        Temporal frequency of the project; validated via _validate_frequency.
    donors_cfg : DonorsConfig
        Donors' config providing the per-frequency thresholds.

    Returns
    -------
    int
        The minimum necessary overlap period for the given frequency.

    Raises
    ------
    ValueError
        If the validated frequency is neither 'hourly' nor 'daily'.
    """
    freq = _validate_frequency(freq)

    # dispatch table instead of an if/elif chain
    thresholds = {
        'hourly': donors_cfg.sm_min_overlap_hour,
        'daily': donors_cfg.sm_min_overlap_day,
    }
    if freq not in thresholds:
        raise ValueError(f'Unsupported frequency: {freq}')
    return thresholds[freq]
48
+
49
+
50
def _build_pivots(
    df: pd.DataFrame,
    cols: ColumnsConfig = ColumnsConfig(),
    stl_cfg: STLConfig = STLConfig(),
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build pivots of the data: timestamp as index, counters as columns, counts
    as values.

    Parameters
    ----------
    df : pd.DataFrame
        Full network DataFrame; must contain 'stl_trend' and 'stl_season'
        columns in addition to the configured timestamp/counter/count columns.
    cols : ColumnsConfig
        Columns config.
    stl_cfg : STLConfig
        STL config (provides the lower clip bound).

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        - pivot_raw: pivot built from the raw observed counts
        - pivot_ts: pivot built from the smoothed series (STL trend +
          seasonality, clipped below at stl_cfg.clip_lower)

    Notes
    -----
    Bug fix: the return annotation previously declared a single
    ``pd.DataFrame`` although the function returns two; it is corrected to
    a tuple. Runtime behavior is unchanged.
    """
    out = df.copy()

    # smoothed reconstruction: trend + seasonal component, clipped from below
    smoothed = (out['stl_trend'] + out['stl_season']).clip(lower=stl_cfg.clip_lower)
    out['_ts_'] = smoothed

    pivot_raw = out.pivot_table(index=cols.timestamp, columns=cols.counter, values=cols.count, aggfunc='mean')
    pivot_ts = out.pivot_table(index=cols.timestamp, columns=cols.counter, values='_ts_', aggfunc='mean')

    return pivot_raw, pivot_ts
81
+
82
+
83
+ def _corralation_matrix_donors(pivot_for_corr: pd.DataFrame) -> pd.DataFrame:
84
+ """
85
+ builds the correlation matrix of counters based on pearson correlation between counts, counters, and timestamps
86
+
87
+ ------
88
+ Parameters:
89
+
90
+ - pivot_for_corr: the '_build_pivots' function's output, which is a pivot of counts
91
+
92
+ -----
93
+ Returns:
94
+
95
+ - the correlation matrix of counters
96
+ """
97
+
98
+ corr = pivot_for_corr.corr()
99
+ correlation_matrix = {s: corr[s].drop(labels=[s]).sort_values(ascending=False).index.tolist() for s in corr.columns}
100
+
101
+ return correlation_matrix
102
+
103
+
104
def impute_scaled_median(
    df: pd.DataFrame,
    pivot: pd.DataFrame,
    donor_map: dict[str, list[str]],
    freq: str,
    counters=None,
    cols: ColumnsConfig = ColumnsConfig(),
    donors_cfg: DonorsConfig = DonorsConfig(),
    out_cfg: OutputConfig = OutputConfig(),
) -> pd.DataFrame:
    """
    Fill missing values using the scaled median of donors (M7).

    Parameters
    ----------
    df : pd.DataFrame
        The complete network dataset.
    pivot : pd.DataFrame
        Pivoted dataset of counters (timestamps x counters).
    donor_map : dict[str, list[str]]
        Dictionary map of candidate donors per counter.
    freq : str
        Temporal frequency of the project.
    counters : iterable, optional
        Counters to be operated on; if None, all counters in donor_map are
        processed. Provided so the pipeline can skip counters without holes.
    cols : ColumnsConfig
        Columns config.
    donors_cfg : DonorsConfig
        Donors' config.
    out_cfg : OutputConfig
        Output config.

    Returns
    -------
    pd.DataFrame
        Copy of df with the M7-imputed column (out_cfg.col_sm_imputed) added.

    Notes
    -----
    Bug fixes relative to the original:
    - ``sm_counter = +1`` pinned the counter at 1 on every iteration, so the
      ``top_k_donor`` early exit could never fire (unless top_k_donor == 1);
      it is now a proper increment.
    - ``_get_min_overlap_period_sm`` was called without forwarding the
      caller's ``donors_cfg``, silently using default thresholds.
    - An explicit empty-donor guard avoids the ``np.median([])`` runtime
      warning (the NaN result was already skipped by the isfinite check,
      so selection behavior is unchanged).
    """
    out = df.copy()

    # start with an all-NaN imputed column, filled per target below
    out[out_cfg.col_sm_imputed] = np.nan

    # bug fix: forward the caller's donors_cfg instead of the helper default
    sm_min_overlap = _get_min_overlap_period_sm(freq=freq, donors_cfg=donors_cfg)

    targets = counters if counters is not None else donor_map.keys()

    # for each target: retrieve donors, keep the eligible ones, scale their
    # median series to the target's level, and fill the target's holes
    for target in targets:
        donors = donor_map.get(target, [])

        if target not in pivot.columns:
            continue

        # cap the candidate donors considered (default: half the list),
        # to limit calculations on the entirety of donors
        max_d = int(len(donors) * donors_cfg.max_donor_rate)
        donors = donors[:max_d]

        if not donors:
            continue

        # first check that the target itself has enough observations
        y_t = pivot[target]
        avail_idx = y_t.index[y_t.notna()]

        if avail_idx.size < sm_min_overlap:
            continue

        median_target = np.nanmedian(y_t.loc[avail_idx])

        if not np.isfinite(median_target):
            continue

        # walk the donors; collect each valid donor and its median
        valid_donors = []
        donor_meds = []
        sm_counter = 0

        for d in donors:
            if d not in pivot.columns:
                continue

            # require enough rows where target, candidate and all already
            # accepted donors are simultaneously observed
            if pivot[[target, d, *valid_donors]].notna().all(axis=1).sum() < sm_min_overlap:
                continue

            arr = pivot.loc[avail_idx, d].to_numpy(dtype=float)

            # finiteness is re-checked at multiple steps
            if np.isfinite(arr).any():
                md = np.nanmedian(arr)
                if np.isfinite(md):
                    valid_donors.append(d)
                    donor_meds.append(md)

                    # bug fix: was 'sm_counter = +1' (assignment of +1),
                    # which froze the counter at 1 and disabled the break
                    sm_counter += 1

                    # once 'top_k_donor' valid donors are collected, stop
                    if sm_counter == donors_cfg.top_k_donor:
                        break

        # no eligible donor at all: skip (also avoids np.median([]) warning)
        if not donor_meds:
            continue

        median_donors = float(np.median(donor_meds))

        if not (np.isfinite(median_donors) and median_donors > 0):
            continue

        # scale factor that maps the donors' level onto the target's level
        scale = median_target / median_donors

        if not np.isfinite(scale):
            continue

        # per-timestamp median across the valid donors, rescaled to target
        mat = pivot[valid_donors].to_numpy(dtype=float)
        med_series = np.nanmedian(mat, axis=1) * scale
        donor_series = pd.Series(med_series, index=pivot.index)

        # fill only the target's missing observations
        mask = (out[cols.counter] == target) & out[cols.count].isna()

        if mask.any():
            out.loc[mask, out_cfg.col_sm_imputed] = out.loc[mask, cols.timestamp].map(donor_series)

    # keep observed counts where present; imputed values only fill the holes
    out[out_cfg.col_sm_imputed] = out[cols.count].fillna(out[out_cfg.col_sm_imputed])

    return out
226
+
227
+
228
def impute_regression(
    df: pd.DataFrame,
    pivot: pd.DataFrame,
    freq: str,
    donor_map: dict[str, list[str]],
    counters=None,
    cols: ColumnsConfig = ColumnsConfig(),
    donors_cfg: DonorsConfig = DonorsConfig(),
    stl_cfg: STLConfig = STLConfig(),
    out_cfg: OutputConfig = OutputConfig(),
) -> pd.DataFrame:
    """
    Fill missing values using regression prediction from donors (M8).

    Parameters
    ----------
    df : pd.DataFrame
        The complete network dataset.
    pivot : pd.DataFrame
        Pivoted dataset of counters (timestamps x counters).
    freq : str
        Temporal frequency of the project.
    donor_map : dict[str, list[str]]
        Dictionary map of candidate donors per counter.
    counters : iterable, optional
        Counters to be operated on; if None, all counters in donor_map are
        processed. Provided so the pipeline can skip counters without holes.
    cols : ColumnsConfig
        Columns config.
    donors_cfg : DonorsConfig
        Donors' config (forwarded to the donor selector).
    stl_cfg : STLConfig
        STL config (provides the lower clip bound for predictions).
    out_cfg : OutputConfig
        Output config.

    Returns
    -------
    pd.DataFrame
        Copy of df with the M8-imputed column (out_cfg.col_reg_imputed) added.
    """

    # shortened column names for ease of use:
    # s_col = counter id, d_col = timestamp, v_col = observed count
    s_col, d_col, v_col = cols.counter, cols.timestamp, cols.count

    # work on a copy; predictions accumulate in a temporary column that is
    # dropped before returning
    out = df.copy()
    pred_col = '_reg_pred_'
    out[pred_col] = np.nan

    min_mutual_period = _get_min_mutual_period(freq)

    targets = counters if counters is not None else donor_map.keys()

    # for each target counter, select regression donors and fit a model
    for target in targets:
        donors = donor_map.get(target, [])
        if target not in pivot.columns:
            continue

        selected = _select_regression_donors(target=target, pivot=pivot, freq=freq, donors=donors, donors_cfg=donors_cfg)

        # y_imp starts as the observed series and is overwritten with model
        # predictions where donors permit
        y = pivot[target]
        y_imp = y.copy()

        if selected:
            X = pivot[selected]
            mask_fit = y.notna() & X.notna().all(axis=1)

            # only fit when enough mutual observations exist between the
            # target and all selected donors
            if mask_fit.sum() > min_mutual_period:
                # builds and fits the model
                model = LinearRegression()
                model.fit(X.loc[mask_fit], y.loc[mask_fit])

                # NOTE: mask_pred covers every row where all donors are
                # observed, not only rows where the target is missing —
                # observed target rows are also overwritten in y_imp, but
                # the final fillna below keeps observed counts intact
                mask_pred = X.notna().all(axis=1)

                # replaces y_imp with the prediction (y_hat), clipped from
                # below at stl_cfg.clip_lower
                if mask_pred.any():
                    y_hat = model.predict(X.loc[mask_pred])
                    y_hat = np.maximum(y_hat, stl_cfg.clip_lower)
                    y_imp.loc[mask_pred] = y_hat

        # write this target's (possibly partially predicted) series back
        # into the long-format frame, keyed by timestamp
        mask_rows = out[s_col] == target
        if mask_rows.any():
            out.loc[mask_rows, pred_col] = out.loc[mask_rows, d_col].map(y_imp)

    # observed counts take precedence; predictions only fill the holes
    out[out_cfg.col_reg_imputed] = out[v_col].fillna(out[pred_col])
    out.drop(columns=[pred_col], inplace=True)

    return out