wizata-dsapi 2.0.0.dev22__tar.gz → 2.0.0.dev24__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wizata_dsapi-2.0.0.dev22/wizata_dsapi.egg-info → wizata_dsapi-2.0.0.dev24}/PKG-INFO +1 -1
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/mlmodel.py +5 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/script.py +7 -1
- wizata_dsapi-2.0.0.dev24/wizata_dsapi/scripts/__init__.py +1 -0
- wizata_dsapi-2.0.0.dev24/wizata_dsapi/scripts/common.py +372 -0
- wizata_dsapi-2.0.0.dev24/wizata_dsapi/version.py +1 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24/wizata_dsapi.egg-info}/PKG-INFO +1 -1
- wizata_dsapi-2.0.0.dev22/wizata_dsapi/scripts/__init__.py +0 -1
- wizata_dsapi-2.0.0.dev22/wizata_dsapi/scripts/common.py +0 -122
- wizata_dsapi-2.0.0.dev22/wizata_dsapi/version.py +0 -1
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/LICENSE.txt +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/README.rst +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/setup.cfg +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/setup.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/__init__.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/api_config.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/api_dto.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/api_interface.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/bucket.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/business_label.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/context.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/dashboard.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/dataframe_toolkit.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/datapoint.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/datastore.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/ds_dataframe.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/dsapi_json_encoder.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/edge_config.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/edge_device.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/edge_module.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/evaluation.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/execution.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/execution_log.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/experiment.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/graylog_log.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/group_system.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/ilogger.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/insight.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/mobile_asset.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/model_toolkit.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/models/__init__.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/models/common.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/notification.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/paged_query_result.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/pipeline.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/pipeline_image.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/plot.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/plots/__init__.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/plots/common.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/plots/theme.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/request.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/search.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/solution_component.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/streamlit_utils.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/template.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/template_config.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/trigger.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/twin.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/twinregistration.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/user.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/wizard_function.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/wizard_request.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/wizata_dsapi_client.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi/words.py +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi.egg-info/SOURCES.txt +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi.egg-info/dependency_links.txt +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi.egg-info/requires.txt +0 -0
- {wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi.egg-info/top_level.txt +0 -0
|
@@ -472,6 +472,7 @@ class MLModelConfig(ApiDto):
|
|
|
472
472
|
self.train_test_split_pct = train_test_split_pct
|
|
473
473
|
self.train_test_split_type = train_test_split_type
|
|
474
474
|
self.function = function
|
|
475
|
+
self.properties = None
|
|
475
476
|
self.properties_mapping = properties_mapping
|
|
476
477
|
|
|
477
478
|
# features management
|
|
@@ -540,6 +541,8 @@ class MLModelConfig(ApiDto):
|
|
|
540
541
|
self.train_test_split_type = obj["train_test_split_type"]
|
|
541
542
|
if "function" in obj.keys() and obj["function"] is not None:
|
|
542
543
|
self.function = obj["function"]
|
|
544
|
+
if "properties" in obj and obj["properties"] is not None:
|
|
545
|
+
self.properties = obj["properties"]
|
|
543
546
|
if "properties_mapping" in obj:
|
|
544
547
|
self.properties_mapping = obj["properties_mapping"]
|
|
545
548
|
|
|
@@ -617,6 +620,8 @@ class MLModelConfig(ApiDto):
|
|
|
617
620
|
obj["features"] = self.features
|
|
618
621
|
if self.features_from_file is not None:
|
|
619
622
|
obj["features_from_file"] = self.features_from_file
|
|
623
|
+
if self.properties is not None and isinstance(self.properties, dict):
|
|
624
|
+
obj["properties"] = self.properties
|
|
620
625
|
if self.properties_mapping is not None and isinstance(self.properties_mapping, dict):
|
|
621
626
|
obj["properties_mapping"] = self.properties_mapping
|
|
622
627
|
|
|
@@ -187,7 +187,8 @@ class Script(ApiDto):
|
|
|
187
187
|
return "json"
|
|
188
188
|
|
|
189
189
|
def __init__(self, script_id=None, description=None, function=None,
|
|
190
|
-
script_type=None, version=None, library=None, category=None
|
|
190
|
+
script_type=None, version=None, library=None, category=None,
|
|
191
|
+
properties=None):
|
|
191
192
|
|
|
192
193
|
# Id
|
|
193
194
|
if script_id is None:
|
|
@@ -202,6 +203,7 @@ class Script(ApiDto):
|
|
|
202
203
|
self.library = library
|
|
203
204
|
self.module = None
|
|
204
205
|
self.category = category
|
|
206
|
+
self.properties = properties if properties is not None else {}
|
|
205
207
|
|
|
206
208
|
# Source code property (not serialized)
|
|
207
209
|
self.source = None
|
|
@@ -271,6 +273,8 @@ class Script(ApiDto):
|
|
|
271
273
|
obj["source"] = self.source
|
|
272
274
|
if self.category is not None:
|
|
273
275
|
obj["category"] = self.category.value
|
|
276
|
+
if self.properties:
|
|
277
|
+
obj["properties"] = self.properties
|
|
274
278
|
return obj
|
|
275
279
|
|
|
276
280
|
def to_json(self, target: str = None):
|
|
@@ -304,6 +308,8 @@ class Script(ApiDto):
|
|
|
304
308
|
self.source = obj["source"]
|
|
305
309
|
if "category" in obj and obj["category"] is not None:
|
|
306
310
|
self.category = ScriptCategory(obj["category"])
|
|
311
|
+
if "properties" in obj and obj["properties"] is not None:
|
|
312
|
+
self.properties = obj["properties"]
|
|
307
313
|
|
|
308
314
|
def copy(self, myfunction):
|
|
309
315
|
"""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .common import filter_df, fillna, clustering, merge, normalize, interpolate, remove_outliers, resample, rolling, diff, lag, clip, steady_state_filter, pca, setpoint_deviation, formula, target_feat_to_binary
|
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
import wizata_dsapi
|
|
4
|
+
|
|
5
|
+
import pandas
|
|
6
|
+
import numpy
|
|
7
|
+
|
|
8
|
+
import sklearn
|
|
9
|
+
import sklearn.cluster
|
|
10
|
+
import sklearn.metrics
|
|
11
|
+
import sklearn.ensemble
|
|
12
|
+
import sklearn.preprocessing
|
|
13
|
+
import sklearn.decomposition
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Functions that may be called by name inside a `formula` expression, mapped
# to the NumPy callable used for each. Note that "min" and "max" are the
# element-wise two-argument forms (numpy.minimum / numpy.maximum), not
# reductions over a column.
_FORMULA_SAFE_FUNCTIONS = dict(
    abs=numpy.abs,
    sqrt=numpy.sqrt,
    log=numpy.log,
    log10=numpy.log10,
    exp=numpy.exp,
    clip=numpy.clip,
    round=numpy.round,
    min=numpy.minimum,
    max=numpy.maximum,
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def filter_df(context: wizata_dsapi.Context):
    """Filter dataframe rows with the pandas query strings listed in 'filters'.

    Every entry of ``context.properties['filters']`` is applied in order with
    ``DataFrame.query``, so the filters combine as a logical AND. The context
    dataframe is copied first and is left untouched.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: filtered copy of the dataframe.
    :raises ValueError: if 'filters' is missing or is not a list, or if a
        filter string cannot be parsed.
    """
    if "filters" not in context.properties or not isinstance(context.properties['filters'], list):
        raise ValueError('there is no list *filters* in properties - please set them on context or config')

    df = context.dataframe.copy()

    for filter_row in context.properties['filters']:
        try:
            df = df.query(filter_row)
        except (SyntaxError, pandas.errors.ParserError) as e:
            # DataFrame.query reports a malformed expression as SyntaxError;
            # ParserError is kept so anything the previous handler caught is
            # still converted.
            raise ValueError(f"error parsing filter string '{filter_row}': {e}") from e

    return df
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def clustering(context: wizata_dsapi.Context):
    """K-means clustering with the cluster count chosen by silhouette score.

    All columns of a copy of the context dataframe are standardised, then
    K-means is fitted for every cluster count from 2 up to (but excluding)
    ``min(10, number_of_rows)``. The count with the highest silhouette score
    is refitted and its labels are stored, shifted to start at 1, in a new
    'cluster_labels' column. When no candidate yields at least two distinct
    labels (including when there are too few rows to try any candidate),
    every row receives label 1.

    :param context: execution context exposing ``dataframe``.
    :return: copy of the dataframe with the extra 'cluster_labels' column.
    """
    df = context.dataframe.copy()
    scaled = sklearn.preprocessing.StandardScaler().fit_transform(df)

    candidate_counts = list(range(2, min(10, scaled.shape[0])))
    scores = []
    for num_clusters in candidate_counts:
        labels = sklearn.cluster.KMeans(n_clusters=num_clusters).fit(scaled).labels_
        # The silhouette score is undefined for a single cluster.
        if len(numpy.unique(labels)) >= 2:
            scores.append(sklearn.metrics.silhouette_score(scaled, labels))
        else:
            scores.append(numpy.nan)

    # numpy.isnan([]).all() is True, so "no candidates" lands here as well.
    if numpy.isnan(scores).all():
        df['cluster_labels'] = 0
    else:
        # nanargmax ignores undefined scores; the builtin max() can return NaN
        # when a NaN precedes the valid scores, which would select a
        # degenerate cluster count.
        best_nb_clusters = candidate_counts[int(numpy.nanargmax(scores))]
        best_fit = sklearn.cluster.KMeans(n_clusters=best_nb_clusters).fit(scaled)
        df['cluster_labels'] = best_fit.labels_

    # Expose 1-based labels.
    df['cluster_labels'] = df['cluster_labels'].apply(lambda x: int(x + 1))

    return df
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def merge(context: wizata_dsapi.Context):
    """Join all current dataframes of the context on their index.

    The join type defaults to "outer" and can be overridden with the 'how'
    property. Frames are combined left to right in iteration order of
    ``context.current_dataframes()``.

    :param context: execution context exposing ``current_dataframes()`` and
        ``properties``.
    :return: the merged dataframe.
    :raises ValueError: if fewer than two dataframes are available.
    """
    frames = context.current_dataframes()
    if not len(frames) > 1:
        raise ValueError('there is not enough dataframes to concat')

    how = context.properties["how"] if "how" in context.properties else "outer"

    merged = None
    for name in frames:
        frame = frames[name]
        if merged is None:
            merged = frame
            continue
        merged = merged.merge(frame, how=how, left_index=True, right_index=True)
    return merged
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def fillna(context: wizata_dsapi.Context):
    """Fill missing values column by column.

    The 'fillna' property maps a column name to the value used to replace
    missing entries in that column. The context dataframe is modified in
    place and returned.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: the context dataframe with the listed columns filled.
    :raises KeyError: if the 'fillna' property is not set.
    """
    frame = context.dataframe
    props = context.properties

    if "fillna" not in props:
        raise KeyError('please set a property dict fillna')

    replacements = props["fillna"]
    for column in replacements:
        frame[column] = frame[column].fillna(value=replacements[column])

    return frame
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def normalize(context: wizata_dsapi.Context):
    """Scale every numeric column of a copy of the dataframe.

    The 'method' property picks the scaler: 'minmax' (default) uses
    ``MinMaxScaler`` and 'zscore' uses ``StandardScaler``. Non-numeric
    columns are left as they are.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: scaled copy of the dataframe.
    :raises ValueError: if there is no numeric column or the method is unknown.
    """
    frame = context.dataframe.copy()
    method = context.properties.get("method", "minmax")
    numeric_cols = list(frame.select_dtypes(include="number").columns)

    if len(numeric_cols) == 0:
        raise ValueError('no numeric columns to normalize')

    if method not in ("minmax", "zscore"):
        raise ValueError(f"unknown normalize method '{method}', use 'minmax' or 'zscore'")

    preprocessing = sklearn.preprocessing
    scaler_cls = preprocessing.MinMaxScaler if method == "minmax" else preprocessing.StandardScaler

    frame[numeric_cols] = scaler_cls().fit_transform(frame[numeric_cols])
    return frame
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def interpolate(context: wizata_dsapi.Context):
    """Interpolate missing values in the numeric columns of a copy of the dataframe.

    The 'method' property selects the pandas interpolation method: 'time'
    (default), 'linear', 'nearest', 'pad', 'polynomial' or 'spline'. The
    optional 'order' property is forwarded to pandas; 'polynomial' and
    'spline' cannot run without it. Non-numeric columns are returned
    unchanged.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: interpolated copy of the dataframe.
    """
    df = context.dataframe.copy()

    method = context.properties.get("method", "time")

    # Only forward 'order' when it was set, so the other methods are called
    # exactly as before.
    extra = {}
    if "order" in context.properties:
        extra["order"] = context.properties["order"]

    # Restrict to numeric columns, as documented: interpolating text columns
    # is meaningless and not reliably supported by DataFrame.interpolate.
    columns = df.select_dtypes(include="number").columns.tolist()
    if not columns:
        return df

    df[columns] = df[columns].interpolate(method=method, **extra)
    return df
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def remove_outliers(context: wizata_dsapi.Context):
    """Drop rows that hold an outlier in any numeric column.

    The 'method' property selects the rule: 'iqr' (default) keeps values
    within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; 'zscore' keeps values within 3
    standard deviations of the column mean. Missing values are never treated
    as outliers under either rule. Works on a copy of the context dataframe.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: copy of the dataframe without the outlier rows (the unfiltered
        copy when there is no numeric column).
    :raises ValueError: if the method is unknown.
    """
    df = context.dataframe.copy()

    method = context.properties.get("method", "iqr")
    columns = df.select_dtypes(include="number").columns.tolist()

    if not columns:
        return df

    values = df[columns]

    if method == "iqr":
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        within = (values >= q1 - 1.5 * iqr) & (values <= q3 + 1.5 * iqr)
    elif method == "zscore":
        z = (values - values.mean()).abs().divide(values.std())
        # NaN z-scores come from missing values or zero variance; neither is
        # an outlier.
        within = z.fillna(0) <= 3
    else:
        raise ValueError(f"unknown remove_outliers method '{method}', use 'iqr' or 'zscore'")

    # Comparisons against NaN are False, so without this the IQR rule would
    # silently drop every row with a missing value, unlike the z-score rule.
    mask = (within | values.isna()).all(axis=1)

    return df[mask]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def resample(context: wizata_dsapi.Context):
    """Resample the dataframe to another time frequency.

    'freq' (required) is the pandas offset alias of the target frequency and
    'agg' (default 'mean') is the aggregation handed to the resampler.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: the resampled, aggregated dataframe.
    :raises KeyError: if the 'freq' property is not set.
    """
    frame = context.dataframe
    props = context.properties

    if "freq" not in props:
        raise KeyError("please set a 'freq' property (pandas offset alias, e.g. '1min', '1H')")

    aggregation = props.get("agg", "mean")
    resampler = frame.resample(props["freq"])
    return resampler.agg(aggregation)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def rolling(context: wizata_dsapi.Context):
    """Apply a rolling-window aggregation to the numeric columns.

    'window' (required) is the window size handed to ``DataFrame.rolling``
    and 'agg' (default 'mean') is the aggregation. Only numeric columns are
    aggregated, as documented; other columns are carried over unchanged so a
    mixed-dtype frame no longer breaks the aggregation. The context dataframe
    is not modified.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: new dataframe with the numeric columns replaced by their
        rolling aggregate.
    :raises KeyError: if the 'window' property is not set.
    """
    df = context.dataframe

    if "window" not in context.properties:
        raise KeyError("please set a 'window' property (integer number of rows)")

    window = context.properties["window"]
    agg = context.properties.get("agg", "mean")

    columns = df.select_dtypes(include="number").columns.tolist()

    result = df.copy()
    if columns:
        result[columns] = df[columns].rolling(window=window).agg(agg)
    return result
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def diff(context: wizata_dsapi.Context):
    """Return the row-to-row difference of the dataframe.

    'periods' (default 1) is the number of rows to shift before subtracting,
    as passed to ``DataFrame.diff``.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: new dataframe holding the differences.
    """
    shift_by = context.properties.get("periods", 1)
    frame = context.dataframe
    return frame.diff(periods=shift_by)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def lag(context: wizata_dsapi.Context):
    """Append a lagged copy of every numeric column.

    For each numeric column ``<col>`` a new column ``<col>_lag<N>`` is added,
    holding the values shifted by the required 'periods' property (N). The
    original columns are kept, and the context dataframe is modified in place.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: the context dataframe with the extra lag columns.
    :raises KeyError: if the 'periods' property is not set.
    """
    frame = context.dataframe
    props = context.properties

    if "periods" not in props:
        raise KeyError("please set a 'periods' property (integer number of rows to lag)")

    shift_by = props["periods"]

    # Snapshot the numeric columns first so the new lag columns are not
    # themselves lagged.
    numeric_cols = list(frame.select_dtypes(include="number").columns)
    for name in numeric_cols:
        lagged_name = f"{name}_lag{shift_by}"
        frame[lagged_name] = frame[name].shift(shift_by)

    return frame
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def clip(context: wizata_dsapi.Context):
    """Bound every numeric column to a range.

    The 'min' and 'max' properties give the lower and upper bound; either may
    be omitted but not both. The context dataframe is modified in place.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: the context dataframe with its numeric columns clipped.
    :raises KeyError: if neither 'min' nor 'max' is set.
    """
    frame = context.dataframe
    props = context.properties

    bounds = {"lower": props.get("min"), "upper": props.get("max")}
    if all(bound is None for bound in bounds.values()):
        raise KeyError("please set at least one of 'min' or 'max' properties")

    numeric_cols = list(frame.select_dtypes(include="number").columns)
    frame[numeric_cols] = frame[numeric_cols].clip(**bounds)

    return frame
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def steady_state_filter(context: wizata_dsapi.Context):
    """Keep only the rows where every numeric column is steady.

    A row is kept when, for each numeric column, the rolling standard
    deviation over the required 'window' property is at most the required
    'tolerance' property. Rows whose rolling standard deviation is undefined
    (the first rows of the frame) fail that comparison and are dropped.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: the rows in steady state (the dataframe itself when it has no
        numeric column).
    :raises KeyError: if 'window' or 'tolerance' is not set.
    """
    frame = context.dataframe
    props = context.properties

    required = (
        ("window", "please set a 'window' property (integer number of rows)"),
        ("tolerance", "please set a 'tolerance' property (max allowed rolling std)"),
    )
    for key, message in required:
        if key not in props:
            raise KeyError(message)

    numeric_cols = list(frame.select_dtypes(include="number").columns)
    if len(numeric_cols) == 0:
        return frame

    spread = frame[numeric_cols].rolling(window=props["window"]).std()
    steady = spread.le(props["tolerance"]).all(axis=1)

    return frame.loc[steady]
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def pca(context: wizata_dsapi.Context):
    """Project the numeric columns onto their principal components.

    The required 'n_components' property is passed straight to
    ``sklearn.decomposition.PCA``. The result contains only the component
    columns, named 'PC1', 'PC2', ..., on the original index.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: new dataframe of principal components.
    :raises KeyError: if 'n_components' is not set.
    :raises ValueError: if there is no numeric column or the numeric columns
        contain NaN.
    """
    frame = context.dataframe
    props = context.properties

    if "n_components" not in props:
        raise KeyError("please set an 'n_components' property (int or float in (0,1])")

    n_components = props["n_components"]
    numeric_cols = list(frame.select_dtypes(include="number").columns)

    if len(numeric_cols) == 0:
        raise ValueError("no numeric columns to reduce")

    numeric = frame[numeric_cols]
    if numeric.isna().to_numpy().any():
        raise ValueError("PCA cannot handle NaN values — run interpolate or fillna upstream")

    components = sklearn.decomposition.PCA(n_components=n_components).fit_transform(numeric)

    labels = [f"PC{position}" for position in range(1, components.shape[1] + 1)]
    return pandas.DataFrame(components, index=frame.index, columns=labels)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def setpoint_deviation(context: wizata_dsapi.Context):
    """Add '<measurement>_deviation' columns (measurement minus setpoint).

    Setpoints are the entries of ``context.datapoints`` that are present as
    dataframe columns and whose business type is
    ``BusinessType.SET_POINTS``. Each one is paired with the single
    non-setpoint datapoint column sharing its (non-null) category_id and its
    twin_id; setpoints with no match or several matches are skipped. The
    context dataframe is modified in place.

    :param context: execution context exposing ``dataframe`` and ``datapoints``.
    :return: the context dataframe with the deviation columns added.
    :raises ValueError: if no setpoint is found or no pair can be resolved.
    """
    frame = context.dataframe
    known = context.datapoints or {}
    set_point_type = wizata_dsapi.BusinessType.SET_POINTS

    setpoints = {}
    for name, dp in known.items():
        if name in frame.columns and dp.business_type == set_point_type:
            setpoints[name] = dp

    if len(setpoints) == 0:
        raise ValueError("no setpoint datapoints found in context (BusinessType.SET_POINTS)")

    resolved = 0
    for sp_name, sp in setpoints.items():
        matches = []
        for name, dp in known.items():
            if name not in frame.columns or name == sp_name:
                continue
            if dp.business_type == set_point_type:
                continue
            if dp.category_id is None or not dp.category_id == sp.category_id:
                continue
            if not dp.twin_id == sp.twin_id:
                continue
            matches.append(name)

        # Only an unambiguous pairing is used.
        if len(matches) != 1:
            continue

        measurement = matches[0]
        frame[f"{measurement}_deviation"] = frame[measurement] - frame[sp_name]
        resolved += 1

    if resolved == 0:
        raise ValueError("no setpoint/measurement pairs could be resolved — check that paired datapoints share category_id and twin_id")

    return frame
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def formula(context: wizata_dsapi.Context):
    """Add a column computed from a math expression over existing columns.

    'expression' (required) may reference column names, numbers, the
    arithmetic operators + - * / % ** and parentheses, and the functions
    listed in ``_FORMULA_SAFE_FUNCTIONS`` (abs, sqrt, log, log10, exp, clip,
    round, min, max). Column names that are not plain identifiers must be
    wrapped in backticks, e.g. '`motor temp` * 2'. 'result' (default
    'result') names the output column. The context dataframe is modified in
    place.

    The expression is checked token by token before evaluation: every
    identifier must be exactly a column name or an allowed function, and
    attribute access is rejected.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: the context dataframe with the new column.
    :raises KeyError: if 'expression' is not set.
    :raises ValueError: if the expression is not a string or contains a
        token that is not allowed.
    """
    df = context.dataframe

    if "expression" not in context.properties:
        raise KeyError("please set an 'expression' property (e.g. 'col_a + col_b')")

    expression = context.properties["expression"]
    result = context.properties.get("result", "result")

    if not isinstance(expression, str):
        raise ValueError("'expression' must be a string")

    # str() so that non-string column labels cannot break the validation.
    column_names = {str(col) for col in df.columns}
    allowed_names = column_names | set(_FORMULA_SAFE_FUNCTIONS)

    def _drop_backticked(match):
        return "" if match.group(1) in column_names else match.group(0)

    def _drop_identifier(match):
        return "" if match.group(0) in allowed_names else match.group(0)

    # Whitelist validation on whole tokens: remove what is allowed and flag
    # whatever is left. Matching whole identifiers (rather than substrings)
    # stops an unknown name assembled from allowed pieces from slipping
    # through. The lookbehind skips names that follow '.', so attribute
    # access always leaves a leftover and is rejected.
    leftover = re.sub(r"`([^`]*)`", _drop_backticked, expression)
    leftover = re.sub(r"(?<![\w.])[^\W\d]\w*", _drop_identifier, leftover)
    leftover = re.sub(r"[\d\.\+\-\*/\%\(\)\s,\^]", "", leftover)
    if leftover.strip():
        raise ValueError(
            f"expression contains disallowed tokens: '{leftover.strip()}' — "
            f"only column names, numbers, arithmetic operators, and these functions are allowed: "
            f"{list(_FORMULA_SAFE_FUNCTIONS.keys())}"
        )

    # Unprefixed names in DataFrame.eval are looked up through the resolvers,
    # so the functions are supplied there as well; otherwise calls such as
    # clip(...) or max(...) cannot be resolved. Functions whose name collides
    # with a column are left out so the column keeps precedence. A fresh dict
    # is passed because pandas may write into resolver mappings.
    functions = {
        name: fn for name, fn in _FORMULA_SAFE_FUNCTIONS.items()
        if name not in column_names
    }
    df[result] = df.eval(
        expression,
        local_dict=_FORMULA_SAFE_FUNCTIONS,
        resolvers=[functions],
        engine="python",
    )
    return df
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def target_feat_to_binary(context: wizata_dsapi.Context):
    """Turn a target column into a binary 0/1 class using a threshold.

    The 'target_feat' property is a dict with the keys 'sensor' (column
    name), 'operator' ('lt', 'lte', 'gt' or 'gte') and 'threshold'. Rows for
    which ``column <operator> threshold`` holds become 1, all others 0. The
    column is replaced in place on the context dataframe, but only once the
    result is known to contain both classes, so a failed call leaves the
    dataframe untouched.

    :param context: execution context exposing ``dataframe`` and ``properties``.
    :return: the context dataframe with the binarised target column.
    :raises KeyError: if 'target_feat' is not set, the operator is unknown,
        or the result does not contain both classes.
    """
    df = context.dataframe

    if "target_feat" not in context.properties:
        raise KeyError('please set a target feature to transform to binary class')

    spec = context.properties["target_feat"]
    target_feat = spec["sensor"]
    operator = spec["operator"]
    threshold = spec["threshold"]

    comparisons = {'lt': 'lt', 'lte': 'le', 'gt': 'gt', 'gte': 'ge'}
    if operator not in comparisons:
        raise KeyError('operator type for binarisation not know')

    # Series.lt / le / gt / ge are the method forms of < <= > >=.
    matches = getattr(df[target_feat], comparisons[operator])(threshold)
    binary = numpy.where(matches, 1, 0)

    # The values can only be 0 or 1, so the only failure mode is having
    # fewer than two classes. Check before touching the dataframe.
    n_classes = len(numpy.unique(binary))
    if n_classes == 0:
        raise KeyError('classification model requires 2 classes, none was detected')
    if n_classes == 1:
        raise KeyError('classification model requires 2 classes, only one was detected')

    df[target_feat] = binary
    return df
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "2.0.0.dev24"
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
from .common import filter_df, fillna, clustering, merge, target_feat_to_binary
|
|
@@ -1,122 +0,0 @@
|
|
|
1
|
-
import wizata_dsapi
|
|
2
|
-
|
|
3
|
-
import pandas
|
|
4
|
-
import numpy
|
|
5
|
-
|
|
6
|
-
import sklearn
|
|
7
|
-
import sklearn.cluster
|
|
8
|
-
import sklearn.metrics
|
|
9
|
-
import sklearn.ensemble
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def filter_df(context: wizata_dsapi.Context):
|
|
13
|
-
"""Filter dataframe rows using pandas query strings from the 'filters' property list."""
|
|
14
|
-
|
|
15
|
-
if "filters" not in context.properties or not isinstance(context.properties['filters'], list):
|
|
16
|
-
raise ValueError(f'there is no list *filters* in properties - please set them on context or config')
|
|
17
|
-
|
|
18
|
-
df = context.dataframe.copy()
|
|
19
|
-
|
|
20
|
-
filters = context.properties['filters']
|
|
21
|
-
for filter_row in filters:
|
|
22
|
-
try:
|
|
23
|
-
df = df.query(filter_row)
|
|
24
|
-
except pandas.errors.ParserError as e:
|
|
25
|
-
raise ValueError(f"error parsing filter string '{filter_row}': {e}")
|
|
26
|
-
|
|
27
|
-
return df
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def clustering(context: wizata_dsapi.Context):
|
|
31
|
-
"""K-means clustering with automatic cluster count selection via silhouette score."""
|
|
32
|
-
df = context.dataframe.copy()
|
|
33
|
-
scaler = sklearn.preprocessing.StandardScaler()
|
|
34
|
-
df_clustering_scaler = scaler.fit_transform(df)
|
|
35
|
-
|
|
36
|
-
range_n_clusters = list(range(2, min(10, df_clustering_scaler.shape[0])))
|
|
37
|
-
silhouette_avg = []
|
|
38
|
-
for num_clusters in range_n_clusters:
|
|
39
|
-
kmeans = sklearn.cluster.KMeans(n_clusters=num_clusters)
|
|
40
|
-
kmeans.fit(df_clustering_scaler)
|
|
41
|
-
cluster_labels = kmeans.labels_
|
|
42
|
-
unique, counts = numpy.unique(cluster_labels, return_counts=True)
|
|
43
|
-
|
|
44
|
-
if len(unique) >= 2:
|
|
45
|
-
silhouette_avg.append(sklearn.metrics.silhouette_score(df_clustering_scaler, cluster_labels))
|
|
46
|
-
else:
|
|
47
|
-
silhouette_avg.append(numpy.nan)
|
|
48
|
-
|
|
49
|
-
if numpy.isnan(silhouette_avg).all():
|
|
50
|
-
df['cluster_labels'] = 0
|
|
51
|
-
else:
|
|
52
|
-
best_nb_clusters = silhouette_avg.index(max(silhouette_avg)) + 2
|
|
53
|
-
kmeans = sklearn.cluster.KMeans(n_clusters=best_nb_clusters)
|
|
54
|
-
kmeans.fit(df_clustering_scaler)
|
|
55
|
-
cluster_labels = kmeans.labels_
|
|
56
|
-
df['cluster_labels'] = cluster_labels
|
|
57
|
-
df['cluster_labels'] = df['cluster_labels'].apply(lambda x: int(x + 1))
|
|
58
|
-
|
|
59
|
-
return df
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def merge(context: wizata_dsapi.Context):
|
|
63
|
-
"""Merge multiple dataframes by index using outer join (configurable via 'how' property)."""
|
|
64
|
-
dataframes = context.current_dataframes()
|
|
65
|
-
if len(dataframes) <= 1:
|
|
66
|
-
raise ValueError(f'there is not enough dataframes to concat')
|
|
67
|
-
|
|
68
|
-
how = "outer"
|
|
69
|
-
if "how" in context.properties:
|
|
70
|
-
how = context.properties["how"]
|
|
71
|
-
|
|
72
|
-
df = None
|
|
73
|
-
for key in dataframes:
|
|
74
|
-
if df is None:
|
|
75
|
-
df = dataframes[key]
|
|
76
|
-
else:
|
|
77
|
-
df = df.merge(dataframes[key], how=how, left_index=True, right_index=True)
|
|
78
|
-
return df
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def fillna(context: wizata_dsapi.Context):
|
|
82
|
-
"""Fill missing values per column using the 'fillna' property dict mapping column names to fill values."""
|
|
83
|
-
df = context.dataframe
|
|
84
|
-
|
|
85
|
-
if "fillna" not in context.properties:
|
|
86
|
-
raise KeyError(f'please set a property dict fillna')
|
|
87
|
-
|
|
88
|
-
for key in context.properties["fillna"]:
|
|
89
|
-
df[key] = df[key].fillna(value=context.properties["fillna"][key])
|
|
90
|
-
|
|
91
|
-
return df
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def target_feat_to_binary(context: wizata_dsapi.Context):
|
|
95
|
-
"""Convert a target feature column to binary (0/1) using a threshold operator (lt, lte, gt, gte)."""
|
|
96
|
-
df = context.dataframe
|
|
97
|
-
|
|
98
|
-
if "target_feat" not in context.properties:
|
|
99
|
-
raise KeyError(f'please set a target feature to transform to binary class')
|
|
100
|
-
|
|
101
|
-
target_feat = context.properties["target_feat"]["sensor"]
|
|
102
|
-
operator = context.properties["target_feat"]["operator"]
|
|
103
|
-
threshold = context.properties["target_feat"]["threshold"]
|
|
104
|
-
|
|
105
|
-
if operator == 'lt':
|
|
106
|
-
df[target_feat] = numpy.where(df[target_feat] < threshold, 1, 0)
|
|
107
|
-
elif operator == 'lte':
|
|
108
|
-
df[target_feat] = numpy.where(df[target_feat] <= threshold, 1, 0)
|
|
109
|
-
elif operator == 'gt':
|
|
110
|
-
df[target_feat] = numpy.where(df[target_feat] > threshold, 1, 0)
|
|
111
|
-
elif operator == 'gte':
|
|
112
|
-
df[target_feat] = numpy.where(df[target_feat] >= threshold, 1, 0)
|
|
113
|
-
else:
|
|
114
|
-
raise KeyError(f'operator type for binarisation not know')
|
|
115
|
-
|
|
116
|
-
# Check if at least 1 value of each class
|
|
117
|
-
if df[target_feat].nunique() == 1:
|
|
118
|
-
raise KeyError(f'classification model requires 2 classes, only one was detected')
|
|
119
|
-
elif df[target_feat].nunique() > 2:
|
|
120
|
-
raise KeyError(f'classification model requires 2 classes, more than 2 were detected')
|
|
121
|
-
|
|
122
|
-
return df
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "2.0.0.dev22"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wizata_dsapi-2.0.0.dev22 → wizata_dsapi-2.0.0.dev24}/wizata_dsapi.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|