k2pipe 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k2pipe-0.3.0/PKG-INFO +320 -0
- k2pipe-0.3.0/README.md +291 -0
- k2pipe-0.3.0/pyproject.toml +54 -0
- k2pipe-0.3.0/setup.cfg +4 -0
- k2pipe-0.3.0/src/k2pipe/__init__.py +0 -0
- k2pipe-0.3.0/src/k2pipe/case_data_downloader.py +405 -0
- k2pipe-0.3.0/src/k2pipe/features/__init__.py +33 -0
- k2pipe-0.3.0/src/k2pipe/features/feature_utils.py +216 -0
- k2pipe-0.3.0/src/k2pipe/my_rolling.py +74 -0
- k2pipe-0.3.0/src/k2pipe/mypipe.py +1051 -0
- k2pipe-0.3.0/src/k2pipe/pipe.py +168 -0
- k2pipe-0.3.0/src/k2pipe.egg-info/PKG-INFO +320 -0
- k2pipe-0.3.0/src/k2pipe.egg-info/SOURCES.txt +15 -0
- k2pipe-0.3.0/src/k2pipe.egg-info/dependency_links.txt +1 -0
- k2pipe-0.3.0/src/k2pipe.egg-info/requires.txt +13 -0
- k2pipe-0.3.0/src/k2pipe.egg-info/top_level.txt +2 -0
- k2pipe-0.3.0/src/perf/data_processing_performance.py +776 -0
k2pipe-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: k2pipe
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: k2pipe
|
|
5
|
+
Author-email: K2data <admin@k2data.com.cn>
|
|
6
|
+
Project-URL: Homepage, https://www.k2data.com.cn
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: pandas
|
|
18
|
+
Requires-Dist: numexpr
|
|
19
|
+
Requires-Dist: scipy
|
|
20
|
+
Requires-Dist: loguru
|
|
21
|
+
Requires-Dist: greenlet==3.1.0
|
|
22
|
+
Requires-Dist: k2magic
|
|
23
|
+
Requires-Dist: graphviz
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest-mock; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
28
|
+
Requires-Dist: pytz; extra == "dev"
|
|
29
|
+
|
|
30
|
+
# Pipe SDK (暂用名)
|
|
31
|
+
通过配置的方式,简化数据处理流程中常用操作,使得数据处理的业务逻辑更加直观和稳定。
|
|
32
|
+
|
|
33
|
+
## 一、安装
|
|
34
|
+
|
|
35
|
+
要求python 3.8或以上,建议3.8.10版本。
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
pip install -U k2pipe
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## 二、使用方法
|
|
42
|
+
|
|
43
|
+
### 2.1 数据加工
|
|
44
|
+
|
|
45
|
+
提供 `extract_features()` 方法,根据配置里的计算规则,将当前DataFrame处理并返回结果DataFrame。
|
|
46
|
+
|
|
47
|
+
#### 2.1.1 配置文件
|
|
48
|
+
|
|
49
|
+
配置DataFrame一般从csv文件中加载,它定义了对原始DataFrame进行数据转换的规则。配置文件中每一行定义结果DataFrame里的一个(新)列,其中feature是列名,expression是列的计算表达式。
|
|
50
|
+
|
|
51
|
+
配置里定义的列是顺序处理的,因此列表达式既可以引用原始DataFrame里的列,也可以引用前面新定义的列。
|
|
52
|
+
|
|
53
|
+
| feature | expression | comment |
|
|
54
|
+
|---------|------------------------------------|------------------------------|
|
|
55
|
+
| feat1 | col1 / 100 - col2 | 四则运算示例 |
|
|
56
|
+
| feat2 | col2.rolling(3).mean() | 按行数滚动窗口示例 |
|
|
57
|
+
| feat3 | col2.rolling('3D').mean() | 按时间滚动窗口示例(dataframe必须有时间索引列) |
|
|
58
|
+
| feat4 | feat1.my_func() | 自定义函数示例(函数需要在python代码里注册) |
|
|
59
|
+
| feat5 | "where(col1 > 50, col1, col2 * 2)" | 条件赋值示例(支持多层where嵌套) |
|
|
60
|
+
| # | 注释内容 | 注释行示例 |
|
|
61
|
+
| feat6 | (col3 - k_ts).dt.days | 时间处理示例 |
|
|
62
|
+
| * | | 表示保留原始dataframe的所有列 |
|
|
63
|
+
| feat7 | k_device.str[1] | 字符串操作示例 |
|
|
64
|
+
| feat8 | "where(col1.isna(), 1, 2)" | 空值判断示例 |
|
|
65
|
+
| feat9 | @df.shape[0] | 获取待处理的DataFrame的属性示例 |
|
|
66
|
+
| feat10 | feat1.round() | 取整操作示例 |
|
|
67
|
+
| feat11 | k_ts + col1.astype('timedelta64[s]') | 时间处理示例1(暂时无法进行常数时间计算) |
|
|
68
|
+
| feat12 | k_ts.time_shift('-10s') | 时间处理示例2(带有常数时间的计算) |
|
|
69
|
+
|
|
70
|
+
说明:
|
|
71
|
+
- 若表达式或注释包含逗号,需要用双引号包裹,双引号前不能有空格
|
|
72
|
+
- 表达式不支持apply()函数
|
|
73
|
+
- 内置变量列表:@df, @pd
|
|
74
|
+
- 内置函数列表:time_shift()
|
|
75
|
+
- 更多示例:[K2Pipe Examples](https://gitlab.kstonedata.k2/zhanghao/k2pipe/tree/develop/tests/example)
|
|
76
|
+
- 表达式语法参考:[Mastering Eval Expressions in Pandas](https://www.sparkcodehub.com/pandas/advanced/eval-expressions-guide)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
#### 2.1.2 Python代码
|
|
80
|
+
|
|
81
|
+
根据配置里定义的处理规则,将原始DataFrame数据转换为结果DataFrame数据。
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
import pandas as pd
|
|
85
|
+
from pathlib import Path
|
|
86
|
+
from k2pipe.mypipe import MyDataFrame
|
|
87
|
+
|
|
88
|
+
# 样例数据
|
|
89
|
+
df = MyDataFrame({'k_device': ['dev1','dev1', 'dev1', 'dev2', 'dev2', 'dev2'],
|
|
90
|
+
'col1': [50, 60, 70, 80, 90, 100],
|
|
91
|
+
'col2': [1, 2, 3, 4, 5, 6]})
|
|
92
|
+
df['k_ts'] = pd.date_range(start='2025-01-01 08:00:00', periods=len(df), freq='s')
|
|
93
|
+
df['col3'] = pd.date_range(start='2025-01-01 12:00:00', periods=len(df), freq='s')
|
|
94
|
+
df.set_index('k_ts', inplace=True)
|
|
95
|
+
|
|
96
|
+
# 配置信息
|
|
97
|
+
config = pd.read_csv(Path(__file__).parent / 'my_feat.csv')
|
|
98
|
+
config.columns = config.columns.str.strip()
|
|
99
|
+
|
|
100
|
+
# 注册自定义函数
|
|
101
|
+
pd.Series.my_func = (lambda x: x.rolling(3).mean())
|
|
102
|
+
|
|
103
|
+
# 处理数据
|
|
104
|
+
result = df.extract_features(config)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
#### 2.1.3 运行结果
|
|
108
|
+
|
|
109
|
+
原始数据:
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
k_device col1 col2 col3
|
|
113
|
+
k_ts
|
|
114
|
+
2025-01-01 08:00:00 dev1 50 1 2025-01-01 12:00:00
|
|
115
|
+
2025-01-01 08:00:01 dev1 60 2 2025-01-01 12:00:01
|
|
116
|
+
2025-01-01 08:00:02 dev1 70 3 2025-01-01 12:00:02
|
|
117
|
+
2025-01-01 08:00:03 dev2 80 4 2025-01-01 12:00:03
|
|
118
|
+
2025-01-01 08:00:04 dev2 90 5 2025-01-01 12:00:04
|
|
119
|
+
2025-01-01 08:00:05 dev2 100 6 2025-01-01 12:00:05
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
结果数据:
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
k_device feat01 feat02 feat03 feat04 feat05 feat06 feat07 feat08 feat09 feat10 feat11
|
|
127
|
+
k_ts
|
|
128
|
+
2025-01-01 08:00:00 dev1 -0.5 NaN 1.0 NaN 2 0 e 2 6 -0.0 2025-01-01 08:00:10
|
|
129
|
+
2025-01-01 08:00:01 dev1 -1.4 NaN 1.5 NaN 60 0 e 2 6 -1.0 2025-01-01 08:00:11
|
|
130
|
+
2025-01-01 08:00:02 dev1 -2.3 2.0 2.0 -1.4 70 0 e 2 6 -2.0 2025-01-01 08:00:12
|
|
131
|
+
2025-01-01 08:00:03 dev2 -3.2 3.0 2.5 -2.3 80 0 e 2 6 -3.0 2025-01-01 08:00:13
|
|
132
|
+
2025-01-01 08:00:04 dev2 -4.1 4.0 3.0 -3.2 90 0 e 2 6 -4.0 2025-01-01 08:00:14
|
|
133
|
+
2025-01-01 08:00:05 dev2 -5.0 5.0 3.5 -4.1 100 0 e 2 6 -5.0 2025-01-01 08:00:15
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### 2.2 函数库
|
|
137
|
+
|
|
138
|
+
#### 2.2.1 内置函数库
|
|
139
|
+
|
|
140
|
+
k2pipe提供了一批常用函数,主要用于对DataFrame的列计算slope、bias、smooth等指标。
|
|
141
|
+
|
|
142
|
+
使用方法:
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
import pandas as pd
|
|
146
|
+
from k2pipe import features
|
|
147
|
+
|
|
148
|
+
df = pd.DataFrame({'col1': [20, 40, 70, 80, 90, 100]})
|
|
149
|
+
df['col1_diff'] = df['col1'].series_diff(period=1)
|
|
150
|
+
print(df)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
输出结果:
|
|
154
|
+
```
|
|
155
|
+
col1 col1_diff
|
|
156
|
+
0 20 NaN
|
|
157
|
+
1 40 20.0
|
|
158
|
+
2 70 30.0
|
|
159
|
+
3 80 10.0
|
|
160
|
+
4 90 10.0
|
|
161
|
+
5 100 10.0
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
支持的函数:
|
|
165
|
+
|
|
166
|
+
| 函数名 | 参数 | 说明 |
|
|
167
|
+
|--------|------|------|
|
|
168
|
+
| series_current | 无 | 当前值(前向填充) |
|
|
169
|
+
| series_smooth| period: int = 30, method: str = 'mean' | 平滑值(移动平均),支持均值和中位数方法 |
|
|
170
|
+
| series_diff | period: int = 1 | 差值特征:当前值与前一值的差值 |
|
|
171
|
+
| series_bias | period: int = 0, base_window: int = 30, method: str = 'mean' | 偏差值:当前窗口均值与基准均值的差值 |
|
|
172
|
+
| series_linear_slope| period: int = 3 | 斜率特征:使用线性拟合方法 |
|
|
173
|
+
| series_sen_slope | period: int = 5, alpha: float = 0.75, p_thre: float = 0.25 | 斜率特征:使用 Sen's slope 方法 |
|
|
174
|
+
| series_strict_trend| period: int = 3 | 趋势特征:严格趋势判断 |
|
|
175
|
+
| series_soft_trend | period: int = 3 | 趋势特征:软趋势判断 |
|
|
176
|
+
| series_max | period: int = 3 | 最大值特征:指定窗口内的最大值 |
|
|
177
|
+
| series_min | period: int = 3 | 最小值特征:指定窗口内的最小值 |
|
|
178
|
+
| series_dmax | period: int = 3 | 差分后最大值特征:对差分序列取最大值 |
|
|
179
|
+
| series_dmin | period: int = 3 | 差分后最小值特征:对差分序列取最小值 |
|
|
180
|
+
| series_count | period: int = 3 | 计数特征:指定窗口内的非空值数量 |
|
|
181
|
+
| series_q05 | period: int = 3 | 5%分位数特征:指定窗口内的5%分位数值 |
|
|
182
|
+
| series_q95 | period: int = 3 | 95%分位数特征:指定窗口内的95%分位数值 |
|
|
183
|
+
| series_median | period: int = 3 | 中位数特征:指定窗口内的中位数 |
|
|
184
|
+
| series_mean | period: int = 3 | 均值特征:指定窗口内的均值 |
|
|
185
|
+
|
|
186
|
+
#### 2.2.2 自定义函数库
|
|
187
|
+
|
|
188
|
+
如果内置函数库不能满足需求,可以通过自定义函数库的方式扩展。注册自定义函数的方法示例如下:
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
import pandas as pd
|
|
192
|
+
|
|
193
|
+
# 定义自定义函数(返回值是pd.Series类型)
|
|
194
|
+
def my_func(self: pd.Series, my_ratio: int = 2) -> pd.Series:
|
|
195
|
+
return self * my_ratio
|
|
196
|
+
|
|
197
|
+
# 注册自定义函数
|
|
198
|
+
pd.Series.my_func = my_func
|
|
199
|
+
|
|
200
|
+
# 调用自定义函数
|
|
201
|
+
df = ...
|
|
202
|
+
df['col1_my_col'] = df['col1'].my_func(my_ratio=2)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### 2.3 统一数据格式
|
|
206
|
+
|
|
207
|
+
提供`format_columns()`方法,将输入的DataFrame内的`k_ts`列转为时间类型,将`k_device`列转为字符串类型。
|
|
208
|
+
|
|
209
|
+
```python
|
|
210
|
+
import pandas as pd
|
|
211
|
+
from k2pipe.mypipe import MyDataFrame, init_pipe
|
|
212
|
+
|
|
213
|
+
init_pipe()
|
|
214
|
+
df = MyDataFrame(...)
|
|
215
|
+
df = df.format_columns()
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
### 2.4 数据流分析
|
|
220
|
+
|
|
221
|
+
提供`generate_dataflow()`方法,用于生成svg格式的数据流图。此功能依赖`graphviz`包,安装方法见[这里](https://pypi.org/project/graphviz/)。
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
df1 = MyDataFrame({'k_device': ['dev1', 'dev1', 'dev1', 'dev2', 'dev2', 'dev2'],
|
|
225
|
+
'col1': [50, 60, 70, 80, 90, 100],
|
|
226
|
+
'col2': [1, 2, 3, 4, 5, 6]})
|
|
227
|
+
df1['k_ts'] = pd.date_range(start='2025-01-01 08:00:00', periods=len(df1), freq='s')
|
|
228
|
+
df2 = df1.copy()
|
|
229
|
+
df1 = df1.rename(columns={'col1': 'col3', 'col2': 'col4'})
|
|
230
|
+
result = df1.merge(df2, on=['k_device', 'k_ts'])
|
|
231
|
+
result.generate_dataflow(filename='dataflow_merge.svg', show_value=False)
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
数据流图效果(灰色矩形代表DataFrame实例,白色矩形表示数据列Series,黄色矩形表示没有连出边的数据列):
|
|
235
|
+
|
|
236
|
+

|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
### 2.5 长宽表转换
|
|
240
|
+
|
|
241
|
+
提供`wide_to_long()`方法和`long_to_wide()`方法,用于长表风格与宽表风格的转换。
|
|
242
|
+
|
|
243
|
+
宽表举例:
|
|
244
|
+
|
|
245
|
+
```
|
|
246
|
+
k_ts k_device f1_mean_3D f1_mean_5D f1_slope_3D f1_slope_5D f2_mean_3D f2_mean_5D f2_slope_3D f2_slope_5D
|
|
247
|
+
0 2025-01-01 dev1 8 24 67 87 79 48 10 94
|
|
248
|
+
1 2025-01-01 dev2 52 98 53 66 98 14 34 24
|
|
249
|
+
2 2025-01-02 dev1 15 60 58 16 9 93 86 2
|
|
250
|
+
3 2025-01-02 dev2 27 4 31 1 13 83 4 91
|
|
251
|
+
4 2025-01-03 dev1 59 67 7 49 47 65 61 14
|
|
252
|
+
5 2025-01-03 dev2 55 71 80 2 94 19 98 63
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
长表举例:
|
|
256
|
+
|
|
257
|
+
```
|
|
258
|
+
k_ts k_device feature measure period value
|
|
259
|
+
0 2025-01-01 dev1 f1 mean 3D 8
|
|
260
|
+
1 2025-01-01 dev1 f1 mean 5D 24
|
|
261
|
+
2 2025-01-01 dev1 f1 slope 3D 67
|
|
262
|
+
3 2025-01-01 dev1 f1 slope 5D 87
|
|
263
|
+
4 2025-01-01 dev1 f2 mean 3D 79
|
|
264
|
+
5 2025-01-01 dev1 f2 mean 5D 48
|
|
265
|
+
6 2025-01-01 dev1 f2 slope 3D 10
|
|
266
|
+
7 2025-01-01 dev1 f2 slope 5D 94
|
|
267
|
+
8 2025-01-01 dev2 f1 mean 3D 52
|
|
268
|
+
9 2025-01-01 dev2 f1 mean 5D 98
|
|
269
|
+
10 2025-01-01 dev2 f1 slope 3D 53
|
|
270
|
+
11 2025-01-01 dev2 f1 slope 5D 66
|
|
271
|
+
12 2025-01-01 dev2 f2 mean 3D 98
|
|
272
|
+
13 2025-01-01 dev2 f2 mean 5D 14
|
|
273
|
+
14 2025-01-01 dev2 f2 slope 3D 34
|
|
274
|
+
15 2025-01-01 dev2 f2 slope 5D 24
|
|
275
|
+
16 2025-01-02 dev1 f1 mean 3D 15
|
|
276
|
+
17 2025-01-02 dev1 f1 mean 5D 60
|
|
277
|
+
18 2025-01-02 dev1 f1 slope 3D 58
|
|
278
|
+
19 2025-01-02 dev1 f1 slope 5D 16
|
|
279
|
+
20 2025-01-02 dev1 f2 mean 3D 9
|
|
280
|
+
21 2025-01-02 dev1 f2 mean 5D 93
|
|
281
|
+
22 2025-01-02 dev1 f2 slope 3D 86
|
|
282
|
+
23 2025-01-02 dev1 f2 slope 5D 2
|
|
283
|
+
24 2025-01-02 dev2 f1 mean 3D 27
|
|
284
|
+
25 2025-01-02 dev2 f1 mean 5D 4
|
|
285
|
+
26 2025-01-02 dev2 f1 slope 3D 31
|
|
286
|
+
27 2025-01-02 dev2 f1 slope 5D 1
|
|
287
|
+
28 2025-01-02 dev2 f2 mean 3D 13
|
|
288
|
+
29 2025-01-02 dev2 f2 mean 5D 83
|
|
289
|
+
30 2025-01-02 dev2 f2 slope 3D 4
|
|
290
|
+
31 2025-01-02 dev2 f2 slope 5D 91
|
|
291
|
+
32 2025-01-03 dev1 f1 mean 3D 59
|
|
292
|
+
33 2025-01-03 dev1 f1 mean 5D 67
|
|
293
|
+
34 2025-01-03 dev1 f1 slope 3D 7
|
|
294
|
+
35 2025-01-03 dev1 f1 slope 5D 49
|
|
295
|
+
36 2025-01-03 dev1 f2 mean 3D 47
|
|
296
|
+
37 2025-01-03 dev1 f2 mean 5D 65
|
|
297
|
+
38 2025-01-03 dev1 f2 slope 3D 61
|
|
298
|
+
39 2025-01-03 dev1 f2 slope 5D 14
|
|
299
|
+
40 2025-01-03 dev2 f1 mean 3D 55
|
|
300
|
+
41 2025-01-03 dev2 f1 mean 5D 71
|
|
301
|
+
42 2025-01-03 dev2 f1 slope 3D 80
|
|
302
|
+
43 2025-01-03 dev2 f1 slope 5D 2
|
|
303
|
+
44 2025-01-03 dev2 f2 mean 3D 94
|
|
304
|
+
45 2025-01-03 dev2 f2 mean 5D 19
|
|
305
|
+
46 2025-01-03 dev2 f2 slope 3D 98
|
|
306
|
+
47 2025-01-03 dev2 f2 slope 5D 63
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
宽表转长表:
|
|
310
|
+
|
|
311
|
+
```python
|
|
312
|
+
df_long = df_wide.wide_to_long()
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
长表转宽表:
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
df_wide = df_long.long_to_wide()
|
|
319
|
+
```
|
|
320
|
+
|
k2pipe-0.3.0/README.md
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
# Pipe SDK (暂用名)
|
|
2
|
+
通过配置的方式,简化数据处理流程中常用操作,使得数据处理的业务逻辑更加直观和稳定。
|
|
3
|
+
|
|
4
|
+
## 一、安装
|
|
5
|
+
|
|
6
|
+
要求python 3.8或以上,建议3.8.10版本。
|
|
7
|
+
|
|
8
|
+
```
|
|
9
|
+
pip install -U k2pipe
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## 二、使用方法
|
|
13
|
+
|
|
14
|
+
### 2.1 数据加工
|
|
15
|
+
|
|
16
|
+
提供 `extract_features()` 方法,根据配置里的计算规则,将当前DataFrame处理并返回结果DataFrame。
|
|
17
|
+
|
|
18
|
+
#### 2.1.1 配置文件
|
|
19
|
+
|
|
20
|
+
配置DataFrame一般从csv文件中加载,它定义了对原始DataFrame进行数据转换的规则。配置文件中每一行定义结果DataFrame里的一个(新)列,其中feature是列名,expression是列的计算表达式。
|
|
21
|
+
|
|
22
|
+
配置里定义的列是顺序处理的,因此列表达式既可以引用原始DataFrame里的列,也可以引用前面新定义的列。
|
|
23
|
+
|
|
24
|
+
| feature | expression | comment |
|
|
25
|
+
|---------|------------------------------------|------------------------------|
|
|
26
|
+
| feat1 | col1 / 100 - col2 | 四则运算示例 |
|
|
27
|
+
| feat2 | col2.rolling(3).mean() | 按行数滚动窗口示例 |
|
|
28
|
+
| feat3 | col2.rolling('3D').mean() | 按时间滚动窗口示例(dataframe必须有时间索引列) |
|
|
29
|
+
| feat4 | feat1.my_func() | 自定义函数示例(函数需要在python代码里注册) |
|
|
30
|
+
| feat5 | "where(col1 > 50, col1, col2 * 2)" | 条件赋值示例(支持多层where嵌套) |
|
|
31
|
+
| # | 注释内容 | 注释行示例 |
|
|
32
|
+
| feat6 | (col3 - k_ts).dt.days | 时间处理示例 |
|
|
33
|
+
| * | | 表示保留原始dataframe的所有列 |
|
|
34
|
+
| feat7 | k_device.str[1] | 字符串操作示例 |
|
|
35
|
+
| feat8 | "where(col1.isna(), 1, 2)" | 空值判断示例 |
|
|
36
|
+
| feat9 | @df.shape[0] | 获取待处理的DataFrame的属性示例 |
|
|
37
|
+
| feat10 | feat1.round() | 取整操作示例 |
|
|
38
|
+
| feat11 | k_ts + col1.astype('timedelta64[s]') | 时间处理示例1(暂时无法进行常数时间计算) |
|
|
39
|
+
| feat12 | k_ts.time_shift('-10s') | 时间处理示例2(带有常数时间的计算) |
|
|
40
|
+
|
|
41
|
+
说明:
|
|
42
|
+
- 若表达式或注释包含逗号,需要用双引号包裹,双引号前不能有空格
|
|
43
|
+
- 表达式不支持apply()函数
|
|
44
|
+
- 内置变量列表:@df, @pd
|
|
45
|
+
- 内置函数列表:time_shift()
|
|
46
|
+
- 更多示例:[K2Pipe Examples](https://gitlab.kstonedata.k2/zhanghao/k2pipe/tree/develop/tests/example)
|
|
47
|
+
- 表达式语法参考:[Mastering Eval Expressions in Pandas](https://www.sparkcodehub.com/pandas/advanced/eval-expressions-guide)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
#### 2.1.2 Python代码
|
|
51
|
+
|
|
52
|
+
根据配置里定义的处理规则,将原始DataFrame数据转换为结果DataFrame数据。
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import pandas as pd
|
|
56
|
+
from pathlib import Path
|
|
57
|
+
from k2pipe.mypipe import MyDataFrame
|
|
58
|
+
|
|
59
|
+
# 样例数据
|
|
60
|
+
df = MyDataFrame({'k_device': ['dev1','dev1', 'dev1', 'dev2', 'dev2', 'dev2'],
|
|
61
|
+
'col1': [50, 60, 70, 80, 90, 100],
|
|
62
|
+
'col2': [1, 2, 3, 4, 5, 6]})
|
|
63
|
+
df['k_ts'] = pd.date_range(start='2025-01-01 08:00:00', periods=len(df), freq='s')
|
|
64
|
+
df['col3'] = pd.date_range(start='2025-01-01 12:00:00', periods=len(df), freq='s')
|
|
65
|
+
df.set_index('k_ts', inplace=True)
|
|
66
|
+
|
|
67
|
+
# 配置信息
|
|
68
|
+
config = pd.read_csv(Path(__file__).parent / 'my_feat.csv')
|
|
69
|
+
config.columns = config.columns.str.strip()
|
|
70
|
+
|
|
71
|
+
# 注册自定义函数
|
|
72
|
+
pd.Series.my_func = (lambda x: x.rolling(3).mean())
|
|
73
|
+
|
|
74
|
+
# 处理数据
|
|
75
|
+
result = df.extract_features(config)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
#### 2.1.3 运行结果
|
|
79
|
+
|
|
80
|
+
原始数据:
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
k_device col1 col2 col3
|
|
84
|
+
k_ts
|
|
85
|
+
2025-01-01 08:00:00 dev1 50 1 2025-01-01 12:00:00
|
|
86
|
+
2025-01-01 08:00:01 dev1 60 2 2025-01-01 12:00:01
|
|
87
|
+
2025-01-01 08:00:02 dev1 70 3 2025-01-01 12:00:02
|
|
88
|
+
2025-01-01 08:00:03 dev2 80 4 2025-01-01 12:00:03
|
|
89
|
+
2025-01-01 08:00:04 dev2 90 5 2025-01-01 12:00:04
|
|
90
|
+
2025-01-01 08:00:05 dev2 100 6 2025-01-01 12:00:05
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
结果数据:
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
k_device feat01 feat02 feat03 feat04 feat05 feat06 feat07 feat08 feat09 feat10 feat11
|
|
98
|
+
k_ts
|
|
99
|
+
2025-01-01 08:00:00 dev1 -0.5 NaN 1.0 NaN 2 0 e 2 6 -0.0 2025-01-01 08:00:10
|
|
100
|
+
2025-01-01 08:00:01 dev1 -1.4 NaN 1.5 NaN 60 0 e 2 6 -1.0 2025-01-01 08:00:11
|
|
101
|
+
2025-01-01 08:00:02 dev1 -2.3 2.0 2.0 -1.4 70 0 e 2 6 -2.0 2025-01-01 08:00:12
|
|
102
|
+
2025-01-01 08:00:03 dev2 -3.2 3.0 2.5 -2.3 80 0 e 2 6 -3.0 2025-01-01 08:00:13
|
|
103
|
+
2025-01-01 08:00:04 dev2 -4.1 4.0 3.0 -3.2 90 0 e 2 6 -4.0 2025-01-01 08:00:14
|
|
104
|
+
2025-01-01 08:00:05 dev2 -5.0 5.0 3.5 -4.1 100 0 e 2 6 -5.0 2025-01-01 08:00:15
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### 2.2 函数库
|
|
108
|
+
|
|
109
|
+
#### 2.2.1 内置函数库
|
|
110
|
+
|
|
111
|
+
k2pipe提供了一批常用函数,主要用于对DataFrame的列计算slope、bias、smooth等指标。
|
|
112
|
+
|
|
113
|
+
使用方法:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
import pandas as pd
|
|
117
|
+
from k2pipe import features
|
|
118
|
+
|
|
119
|
+
df = pd.DataFrame({'col1': [20, 40, 70, 80, 90, 100]})
|
|
120
|
+
df['col1_diff'] = df['col1'].series_diff(period=1)
|
|
121
|
+
print(df)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
输出结果:
|
|
125
|
+
```
|
|
126
|
+
col1 col1_diff
|
|
127
|
+
0 20 NaN
|
|
128
|
+
1 40 20.0
|
|
129
|
+
2 70 30.0
|
|
130
|
+
3 80 10.0
|
|
131
|
+
4 90 10.0
|
|
132
|
+
5 100 10.0
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
支持的函数:
|
|
136
|
+
|
|
137
|
+
| 函数名 | 参数 | 说明 |
|
|
138
|
+
|--------|------|------|
|
|
139
|
+
| series_current | 无 | 当前值(前向填充) |
|
|
140
|
+
| series_smooth| period: int = 30, method: str = 'mean' | 平滑值(移动平均),支持均值和中位数方法 |
|
|
141
|
+
| series_diff | period: int = 1 | 差值特征:当前值与前一值的差值 |
|
|
142
|
+
| series_bias | period: int = 0, base_window: int = 30, method: str = 'mean' | 偏差值:当前窗口均值与基准均值的差值 |
|
|
143
|
+
| series_linear_slope| period: int = 3 | 斜率特征:使用线性拟合方法 |
|
|
144
|
+
| series_sen_slope | period: int = 5, alpha: float = 0.75, p_thre: float = 0.25 | 斜率特征:使用 Sen's slope 方法 |
|
|
145
|
+
| series_strict_trend| period: int = 3 | 趋势特征:严格趋势判断 |
|
|
146
|
+
| series_soft_trend | period: int = 3 | 趋势特征:软趋势判断 |
|
|
147
|
+
| series_max | period: int = 3 | 最大值特征:指定窗口内的最大值 |
|
|
148
|
+
| series_min | period: int = 3 | 最小值特征:指定窗口内的最小值 |
|
|
149
|
+
| series_dmax | period: int = 3 | 差分后最大值特征:对差分序列取最大值 |
|
|
150
|
+
| series_dmin | period: int = 3 | 差分后最小值特征:对差分序列取最小值 |
|
|
151
|
+
| series_count | period: int = 3 | 计数特征:指定窗口内的非空值数量 |
|
|
152
|
+
| series_q05 | period: int = 3 | 5%分位数特征:指定窗口内的5%分位数值 |
|
|
153
|
+
| series_q95 | period: int = 3 | 95%分位数特征:指定窗口内的95%分位数值 |
|
|
154
|
+
| series_median | period: int = 3 | 中位数特征:指定窗口内的中位数 |
|
|
155
|
+
| series_mean | period: int = 3 | 均值特征:指定窗口内的均值 |
|
|
156
|
+
|
|
157
|
+
#### 2.2.2 自定义函数库
|
|
158
|
+
|
|
159
|
+
如果内置函数库不能满足需求,可以通过自定义函数库的方式扩展。注册自定义函数的方法示例如下:
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
import pandas as pd
|
|
163
|
+
|
|
164
|
+
# 定义自定义函数(返回值是pd.Series类型)
|
|
165
|
+
def my_func(self: pd.Series, my_ratio: int = 2) -> pd.Series:
|
|
166
|
+
return self * my_ratio
|
|
167
|
+
|
|
168
|
+
# 注册自定义函数
|
|
169
|
+
pd.Series.my_func = my_func
|
|
170
|
+
|
|
171
|
+
# 调用自定义函数
|
|
172
|
+
df = ...
|
|
173
|
+
df['col1_my_col'] = df['col1'].my_func(my_ratio=2)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### 2.3 统一数据格式
|
|
177
|
+
|
|
178
|
+
提供`format_columns()`方法,将输入的DataFrame内的`k_ts`列转为时间类型,将`k_device`列转为字符串类型。
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
import pandas as pd
|
|
182
|
+
from k2pipe.mypipe import MyDataFrame, init_pipe
|
|
183
|
+
|
|
184
|
+
init_pipe()
|
|
185
|
+
df = MyDataFrame(...)
|
|
186
|
+
df = df.format_columns()
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
### 2.4 数据流分析
|
|
191
|
+
|
|
192
|
+
提供`generate_dataflow()`方法,用于生成svg格式的数据流图。此功能依赖`graphviz`包,安装方法见[这里](https://pypi.org/project/graphviz/)。
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
df1 = MyDataFrame({'k_device': ['dev1', 'dev1', 'dev1', 'dev2', 'dev2', 'dev2'],
|
|
196
|
+
'col1': [50, 60, 70, 80, 90, 100],
|
|
197
|
+
'col2': [1, 2, 3, 4, 5, 6]})
|
|
198
|
+
df1['k_ts'] = pd.date_range(start='2025-01-01 08:00:00', periods=len(df1), freq='s')
|
|
199
|
+
df2 = df1.copy()
|
|
200
|
+
df1 = df1.rename(columns={'col1': 'col3', 'col2': 'col4'})
|
|
201
|
+
result = df1.merge(df2, on=['k_device', 'k_ts'])
|
|
202
|
+
result.generate_dataflow(filename='dataflow_merge.svg', show_value=False)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
数据流图效果(灰色矩形代表DataFrame实例,白色矩形表示数据列Series,黄色矩形表示没有连出边的数据列):
|
|
206
|
+
|
|
207
|
+

|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
### 2.5 长宽表转换
|
|
211
|
+
|
|
212
|
+
提供`wide_to_long()`方法和`long_to_wide()`方法,用于长表风格与宽表风格的转换。
|
|
213
|
+
|
|
214
|
+
宽表举例:
|
|
215
|
+
|
|
216
|
+
```
|
|
217
|
+
k_ts k_device f1_mean_3D f1_mean_5D f1_slope_3D f1_slope_5D f2_mean_3D f2_mean_5D f2_slope_3D f2_slope_5D
|
|
218
|
+
0 2025-01-01 dev1 8 24 67 87 79 48 10 94
|
|
219
|
+
1 2025-01-01 dev2 52 98 53 66 98 14 34 24
|
|
220
|
+
2 2025-01-02 dev1 15 60 58 16 9 93 86 2
|
|
221
|
+
3 2025-01-02 dev2 27 4 31 1 13 83 4 91
|
|
222
|
+
4 2025-01-03 dev1 59 67 7 49 47 65 61 14
|
|
223
|
+
5 2025-01-03 dev2 55 71 80 2 94 19 98 63
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
长表举例:
|
|
227
|
+
|
|
228
|
+
```
|
|
229
|
+
k_ts k_device feature measure period value
|
|
230
|
+
0 2025-01-01 dev1 f1 mean 3D 8
|
|
231
|
+
1 2025-01-01 dev1 f1 mean 5D 24
|
|
232
|
+
2 2025-01-01 dev1 f1 slope 3D 67
|
|
233
|
+
3 2025-01-01 dev1 f1 slope 5D 87
|
|
234
|
+
4 2025-01-01 dev1 f2 mean 3D 79
|
|
235
|
+
5 2025-01-01 dev1 f2 mean 5D 48
|
|
236
|
+
6 2025-01-01 dev1 f2 slope 3D 10
|
|
237
|
+
7 2025-01-01 dev1 f2 slope 5D 94
|
|
238
|
+
8 2025-01-01 dev2 f1 mean 3D 52
|
|
239
|
+
9 2025-01-01 dev2 f1 mean 5D 98
|
|
240
|
+
10 2025-01-01 dev2 f1 slope 3D 53
|
|
241
|
+
11 2025-01-01 dev2 f1 slope 5D 66
|
|
242
|
+
12 2025-01-01 dev2 f2 mean 3D 98
|
|
243
|
+
13 2025-01-01 dev2 f2 mean 5D 14
|
|
244
|
+
14 2025-01-01 dev2 f2 slope 3D 34
|
|
245
|
+
15 2025-01-01 dev2 f2 slope 5D 24
|
|
246
|
+
16 2025-01-02 dev1 f1 mean 3D 15
|
|
247
|
+
17 2025-01-02 dev1 f1 mean 5D 60
|
|
248
|
+
18 2025-01-02 dev1 f1 slope 3D 58
|
|
249
|
+
19 2025-01-02 dev1 f1 slope 5D 16
|
|
250
|
+
20 2025-01-02 dev1 f2 mean 3D 9
|
|
251
|
+
21 2025-01-02 dev1 f2 mean 5D 93
|
|
252
|
+
22 2025-01-02 dev1 f2 slope 3D 86
|
|
253
|
+
23 2025-01-02 dev1 f2 slope 5D 2
|
|
254
|
+
24 2025-01-02 dev2 f1 mean 3D 27
|
|
255
|
+
25 2025-01-02 dev2 f1 mean 5D 4
|
|
256
|
+
26 2025-01-02 dev2 f1 slope 3D 31
|
|
257
|
+
27 2025-01-02 dev2 f1 slope 5D 1
|
|
258
|
+
28 2025-01-02 dev2 f2 mean 3D 13
|
|
259
|
+
29 2025-01-02 dev2 f2 mean 5D 83
|
|
260
|
+
30 2025-01-02 dev2 f2 slope 3D 4
|
|
261
|
+
31 2025-01-02 dev2 f2 slope 5D 91
|
|
262
|
+
32 2025-01-03 dev1 f1 mean 3D 59
|
|
263
|
+
33 2025-01-03 dev1 f1 mean 5D 67
|
|
264
|
+
34 2025-01-03 dev1 f1 slope 3D 7
|
|
265
|
+
35 2025-01-03 dev1 f1 slope 5D 49
|
|
266
|
+
36 2025-01-03 dev1 f2 mean 3D 47
|
|
267
|
+
37 2025-01-03 dev1 f2 mean 5D 65
|
|
268
|
+
38 2025-01-03 dev1 f2 slope 3D 61
|
|
269
|
+
39 2025-01-03 dev1 f2 slope 5D 14
|
|
270
|
+
40 2025-01-03 dev2 f1 mean 3D 55
|
|
271
|
+
41 2025-01-03 dev2 f1 mean 5D 71
|
|
272
|
+
42 2025-01-03 dev2 f1 slope 3D 80
|
|
273
|
+
43 2025-01-03 dev2 f1 slope 5D 2
|
|
274
|
+
44 2025-01-03 dev2 f2 mean 3D 94
|
|
275
|
+
45 2025-01-03 dev2 f2 mean 5D 19
|
|
276
|
+
46 2025-01-03 dev2 f2 slope 3D 98
|
|
277
|
+
47 2025-01-03 dev2 f2 slope 5D 63
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
宽表转长表:
|
|
281
|
+
|
|
282
|
+
```python
|
|
283
|
+
df_long = df_wide.wide_to_long()
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
长表转宽表:
|
|
287
|
+
|
|
288
|
+
```python
|
|
289
|
+
df_wide = df_long.long_to_wide()
|
|
290
|
+
```
|
|
291
|
+
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "k2pipe"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
authors = [{ name = "K2data", email = "admin@k2data.com.cn" }]
|
|
9
|
+
description = "k2pipe"
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"pandas",
|
|
14
|
+
"numexpr",
|
|
15
|
+
"scipy",
|
|
16
|
+
"loguru",
|
|
17
|
+
"greenlet==3.1.0",
|
|
18
|
+
"k2magic",
|
|
19
|
+
"graphviz",
|
|
20
|
+
]
|
|
21
|
+
classifiers = [
|
|
22
|
+
"Development Status :: 3 - Alpha",
|
|
23
|
+
"Intended Audience :: Developers",
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"Programming Language :: Python :: 3.8",
|
|
26
|
+
"Programming Language :: Python :: 3.9",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Operating System :: OS Independent",
|
|
29
|
+
"Topic :: Software Development :: Libraries",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://www.k2data.com.cn"
|
|
34
|
+
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
dev = [
|
|
37
|
+
"pytest",
|
|
38
|
+
"pytest-mock",
|
|
39
|
+
"pytest-cov",
|
|
40
|
+
"pytz",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
[tool.setuptools]
|
|
44
|
+
package-dir = { "" = "src" }
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.packages.find]
|
|
47
|
+
where = ["src"]
|
|
48
|
+
|
|
49
|
+
[tool.setuptools.package-data]
|
|
50
|
+
"*" = ["*.dll", "*.so*"]
|
|
51
|
+
|
|
52
|
+
[tool.pytest.ini_options]
|
|
53
|
+
testpaths = ["tests"]
|
|
54
|
+
filterwarnings = ["ignore"]
|
k2pipe-0.3.0/setup.cfg
ADDED
|
File without changes
|