k2pipe 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k2pipe/__init__.py +0 -0
- k2pipe/mypipe.py +812 -0
- k2pipe/pipe.py +168 -0
- k2pipe-0.1.9.dist-info/METADATA +255 -0
- k2pipe-0.1.9.dist-info/RECORD +7 -0
- k2pipe-0.1.9.dist-info/WHEEL +5 -0
- k2pipe-0.1.9.dist-info/top_level.txt +1 -0
k2pipe/__init__.py
ADDED
File without changes
k2pipe/mypipe.py
ADDED
@@ -0,0 +1,812 @@

from __future__ import annotations
import ast
from pathlib import Path

import numpy as np
import pandas as pd
from pandas import Series
from threading import local

_thread_local = local()
_thread_local.accessed_cols = []

mydataframes = []

# Built-in helpers provided by K2Pipe for common operations that eval() cannot express yet, such as time arithmetic
def time_shift(self: Series, *args, **kwargs):
    return self + pd.to_timedelta(*args, **kwargs)
pd.Series.time_shift = time_shift


# patched version of pd.concat()
_original_concat = pd.concat
def my_concat(objs, axis=0, **kwargs):
    result = _original_concat(objs, axis=axis, **kwargs)

    if isinstance(result, pd.DataFrame):
        result = MyDataFrame(result)

    result.config = pd.DataFrame()
    if axis == 0:
        # vertical concatenation
        result.name = 'concat'
        result.config['feature'] = list(objs[0].columns)
        for col in objs[0].columns:
            result.actual_mappings[col] = col
    elif axis == 1:
        # horizontal concatenation
        result.name = 'concat(1)'

        all_features = []
        for obj in objs:
            if isinstance(obj, pd.Series):
                all_features.append(obj.name)
            elif isinstance(obj, pd.DataFrame):
                # merge() is implemented on top of concat(axis=1)
                # pandas allows duplicate column names here, but the resulting layout is error-prone, so forbid it
                if bool(set(all_features) & set(obj.columns.values)):
                    raise ValueError(f'横向拼接的DataFrame不能有同名列:{all_features} -- {obj.columns.values}')
                all_features.extend(obj.columns)
            else:
                raise ValueError('暂不支持非Series类型的拼接')
        result.config['feature'] = all_features
        for col in all_features:
            result.actual_mappings[col] = col

    # wire up the lineage links
    result.input_dfs = objs
    for obj in objs:
        obj.output_df = result

    # register in the processors list
    mydataframes.append(result)
    return result
# merge() also goes through the custom concat(); overriding it can raise:
# AttributeError: 'Series' object has no attribute 'columns'
# so, for now, the native concat is not meant to be overridden
pd.concat = my_concat

class MyDataFrame(pd.DataFrame):
    _metadata = ['name', 'config', 'actual_mappings', 'missing_mappings', 'input_dfs', 'output_df']

    def __init__(self, *args, name=None, config: pd.DataFrame = None, actual_mappings=None, missing_mappings=None, input_dfs=None, output_df=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.name = name
        self.config = config
        self.actual_mappings = actual_mappings    # computation relations that actually happened (e.g. with * already expanded)
        self.missing_mappings = missing_mappings  # computation relations that could not be traced (e.g. columns not produced via the configuration)
        self.input_dfs = input_dfs
        self.output_df = output_df
        if self.input_dfs is None:
            self.input_dfs = []
        if self.actual_mappings is None:
            self.actual_mappings = {}
        if self.missing_mappings is None:
            self.missing_mappings = []

    @property
    def _constructor(self):
        # make sure DataFrame operations (e.g. df.head(), df.copy()) still return a MyDataFrame
        return MyDataFrame


    def merge(self, right, on=None, **kwargs):
        # automatic _x/_y suffixing is not supported yet:
        # if the two DataFrames share column names (other than those in `on`), raise
        if on is None:
            raise ValueError("strict_merge 要求必须显式指定 `on` 参数。")
        # normalize `on` to a set
        if isinstance(on, str):
            on_cols = {on}
        else:
            on_cols = set(on)
        # check that all `on` columns exist in both DataFrames
        missing_in_left = on_cols - set(self.columns)
        missing_in_right = on_cols - set(right.columns)
        if missing_in_left or missing_in_right:
            raise KeyError(
                f"连接键缺失:left 缺少 {missing_in_left},right 缺少 {missing_in_right}"
            )
        common_cols = set(self.columns) & set(right.columns)
        extra_common = common_cols - on_cols
        if extra_common:
            raise ValueError(
                f"发现非连接键的同名列(除 `on={on}` 外): {sorted(extra_common)}。"
                "请重命名列或移除重复列后再合并。"
            )

        # the underlying merge result
        result = MyDataFrame(super().merge(right=right, **kwargs))

        # FIXME: duplicated names with suffixes are not handled yet
        # actual_mappings
        result.name = 'merge'
        result.config = pd.DataFrame()
        result.config['feature'] = list(self.columns) + list(right.columns)
        for col in self.columns:
            result.actual_mappings[col] = col
        for col in right.columns:
            result.actual_mappings[col] = col

        # wire up the lineage links
        result.input_dfs = [self, right]
        self.output_df = result
        right.output_df = result

        # register in the processors list
        mydataframes.append(result)
        return result


    # override pd.DataFrame.rename
    def rename(self, inplace=None, *args, **kwargs):
        if inplace:
            raise ValueError("mydataframe.rename 暂不支持 inplace=True 参数")  # TODO
        result = MyDataFrame(super().rename(*args, **kwargs))
        # actual_mappings
        result.name = 'rename'
        result.config = pd.DataFrame()
        result.config['feature'] = list(result.columns)
        for old, new in zip(list(self.columns), list(result.columns)):
            result.actual_mappings[new] = old
        # wire up the lineage links
        result.input_dfs = [self]
        self.output_df = result
        # register in the processors list
        mydataframes.append(result)
        return result


    # override pd.DataFrame.filter
    def filter(self, *args, **kwargs) -> MyDataFrame:
        result = MyDataFrame(super().filter(*args, **kwargs))
        # actual_mappings
        result.name = 'filter'
        result.config = pd.DataFrame()
        result.config['feature'] = list(result.columns)
        for col in result.columns:
            result.actual_mappings[col] = col
        # wire up the lineage links
        result.input_dfs = [self]
        self.output_df = result
        # register in the processors list
        mydataframes.append(result)
        return result

    # trace filter-style operations such as df[['col1', 'col2']]
    def __getitem__(self, key):
        if isinstance(key, tuple):
            key = list(key)

        if isinstance(key, list):
            result = self.filter(key)
            result.name = 'getitem'
            return result

        # record the access so __setitem__ can pick it up later
        if isinstance(key, str):
            _thread_local.accessed_cols.append(key)

        # all other key types fall through to the original result without tracing:
        # slice: used inside merge as left = self.left[:]
        # Series: used by drop_duplicates
        # str: too common, e.g. df['k_ts'] = pd.to_datetime(df['k_ts'])
        # there may be other types as well
        return super().__getitem__(key)


    # trace operations such as df['new_col'] = df['col1'] + df['col2']
    def __setitem__(self, key, value):
        super().__setitem__(key, value)

        # clear the thread-local record of columns accessed in __getitem__
        accessed = getattr(_thread_local, 'accessed_cols', [])
        _thread_local.accessed_cols = []

        # avoid generating trace records too frequently
        if not isinstance(key, str):
            return
        if not isinstance(value, Series):
            return
        if key in ['k_ts', 'k_device']:
            return
        if accessed == []:
            return

        # FIXME: dtype conversions via astype() are recorded as well, which is of little value for tracing
        # if key == 'jiuxy':
        #     print()

        # a new df node cannot be created in the flow graph here
        expression = '+'.join(accessed)
        if self.config is None:
            # self was not produced by extract_features, rename, etc.
            # typically it was constructed directly via MyDataFrame()
            self.config = pd.DataFrame()
            self.config['feature'] = [key]
            self.config['expression'] = expression
            self.actual_mappings[key] = expression
        else:
            # e.g. for result[feature_name] = _eval(result, expression), accessed is empty, so expression ends up empty
            # FIXME: duplicate entries can be added here
            self.config = pd.concat([self.config, pd.DataFrame([{'feature': key, 'expression': expression}])], ignore_index=True)
            self.actual_mappings[key] = expression

        # FIXME: a new MyDataFrame instance cannot be created here, so the name only hints that a setitem happened
        if self.name is None or 'set(' in self.name:
            return
        self.name = self.name + f'\nset({key})'


    # trace df = df.assign(new_col=df['col1'] + df['col2'])
    def assign(self, **kwargs):
        result = MyDataFrame(super().assign(**kwargs))

        result.name = 'assign'
        result.config = pd.DataFrame()
        result.config['feature'] = list(result.columns)
        for col in self.columns:
            result.actual_mappings[col] = col
        for key, value in kwargs.items():
            # FIXME: the original expression inside assign() cannot be recovered
            # assign() triggers __setitem__, so undo its effect on self.actual_mappings
            result.actual_mappings[key] = self.actual_mappings[key]
            self.actual_mappings.pop(key)

        # wire up the lineage links
        result.input_dfs = [self]
        self.output_df = result
        # register in the processors list
        mydataframes.append(result)
        return result


    # override pd.DataFrame.query
    def query(self, *args, **kwargs):
        result = MyDataFrame(super().query(*args, **kwargs))
        # actual_mappings
        result.name = 'query'
        result.config = pd.DataFrame()
        result.config['feature'] = list(result.columns)
        for col in result.columns:
            result.actual_mappings[col] = col
        # wire up the lineage links
        result.input_dfs = [self]
        self.output_df = result
        # register in the processors list
        mydataframes.append(result)
        return result


    # mypipe itself should use the original drop to avoid generating extra trace records
    def drop_old(self, *args, **kwargs):
        return super().drop(*args, **kwargs)


    # override pd.DataFrame.drop
    def drop(self, *args, **kwargs):
        result = MyDataFrame(super().drop(*args, **kwargs))
        result.name = 'drop'
        result.config = pd.DataFrame()
        result.config['feature'] = list(result.columns)
        for col in result.columns:
            result.actual_mappings[col] = col
        # wire up the lineage links
        result.input_dfs = [self]
        self.output_df = result
        # register in the processors list
        mydataframes.append(result)
        return result


    def extract_features(self, config: pd.DataFrame, step_name: str = None):
        result = MyDataFrame(self)  # do not use copy(): it would carry over actual_mappings and the other attributes
        result.name = step_name
        result.columns = result.columns.str.strip()  # leading/trailing spaces in column names cause hard-to-debug errors

        # expand the first '*' into all column names and put them first
        if '*' in config['feature'].values:
            config.drop(config[config['feature'] == '*'].index, inplace=True)
            new_df = pd.DataFrame(columns=config.columns)
            for col in list(self.columns):
                new_df.loc[len(new_df)] = {'feature': col, 'expression': col, 'comment': '*'}
            for idx, row in config.iterrows():
                new_df.loc[len(new_df)] = row
            config = new_df

        result.config = config

        for _, row in config.iterrows():
            # skip comment rows
            if row[0].startswith('#'):
                continue

            feature_name = row['feature']
            if not pd.isna(feature_name):
                feature_name = feature_name.strip()
            else:
                raise ValueError(f"特征名称不能为空 {row}, line: {_}")

            _validate_var_name(feature_name)

            expression = row['expression']
            if not pd.isna(expression):
                expression = expression.strip()
            else:
                result[feature_name] = np.nan
                continue

            # eval() is error-prone for non-numeric types, so assign directly in that case
            if feature_name == expression:
                result[feature_name] = result[expression]
            else:
                result[feature_name] = _eval(result, expression)

            # record the columns that were actually produced
            expression_values = {}
            cols = _extract_column_names(expression)
            for col in cols:
                expression_values[col] = result[col]
            result.actual_mappings[feature_name] = expression

        # columns not produced via the configuration are also recorded so they can be traced:
        # add columns that exist in result but are missing from config to missing_mappings
        if self.config is not None:
            missing_columns = set(self.columns) - set(self.config['feature'])
            for col in missing_columns:
                result.missing_mappings.append({
                    'feature': col,
                    'expression': '(Unknown)',
                    # 'feature_value': result[col].copy(),
                    # 'expression_values': {}
                })

        mydataframes.append(result)

        # drop columns that exist in self but are not defined in config
        config_columns = set(config['feature'].dropna())
        original_columns = set(self.columns)
        columns_to_drop = original_columns - config_columns
        result = result.drop_old(columns=columns_to_drop, errors='ignore')

        result = _sort_columns(result)

        self.output_df = result
        result.input_dfs = [self]

        return result


    # trace the computation logic of a given column of this df backwards
    def trace_column(self, feature_to_trace: str):
        assert isinstance(feature_to_trace, str)

        # start_line: the row index at which the reverse scan starts (None means scan all rows)
        def _build_pipe_tree_recursive(df, feature, depth=0, start_line: int = None):
            if df.input_dfs is None:
                return None

            if start_line is None:
                start_line = len(df.actual_mappings)

            # scan in reverse order
            # get the key/value pairs of actual_mappings
            mappings_list = list(df.actual_mappings.items())
            for idx in range(start_line - 1, -1, -1):  # from start_line-1 down to 0
                mapped_feature, expr = mappings_list[idx]
                if mapped_feature == feature:
                    # avoid infinite recursion (when searching recursively within the same configuration file)
                    # if df is self and feature == expr:
                    #     continue
                    input_names = _extract_column_names(expr)

                    children = []
                    for name in input_names:

                        # recursive match within the same configuration file:
                        # continue the reverse scan from the row above the current one
                        if idx > 1:  # FIXME: should this be > 0?
                            child_ast_self = _build_pipe_tree_recursive(df, name, depth + 1, idx - 1)
                            if child_ast_self:
                                children.append(child_ast_self)

                        # recursive match within the previous configuration file
                        for input_df in df.input_dfs:
                            child_ast_prev = _build_pipe_tree_recursive(input_df, name, depth + 1)
                            if child_ast_prev:
                                children.append(child_ast_prev)

                    return {
                        "feature": feature,
                        "df": df.copy(),
                        "mapping": {"feature": mapped_feature, "expression": expr},
                        "expression": expr,
                        "children": children,
                        "depth": depth
                    }

        def _print_pipe_tree(ast_node, indent=0):
            if ast_node is None:
                print("└── (empty)")
                return
            spaces = " " * indent
            expr = ast_node["expression"]
            feature = ast_node['feature']
            df = ast_node["df"]
            missing_features = [item['feature'] for item in df.missing_mappings]
            exp_missing_features = set(_extract_column_names(expr)).intersection(set(missing_features))
            # if feature == expr:
            #     print(f"{spaces}└── [{df.name}] - " +
            #           (f" // missing: {exp_missing_features}" if exp_missing_features else ""))
            # else:
            print(f"{spaces}└── [{df.name}] {feature} = {expr} " +
                  (f" // missing: {exp_missing_features}" if exp_missing_features else ""))
            for child in ast_node["children"]:
                _print_pipe_tree(child, indent + 1)

        tree = _build_pipe_tree_recursive(self, feature_to_trace)
        _print_pipe_tree(tree)
        return tree


    # trace several columns backwards
    def trace_columns(self, features_to_trace: list):
        for feature in features_to_trace:
            print(feature)
            self.trace_column(feature)
            print()


    # wide table to long table, e.g.:
    # k_ts, f1_mean_3D, f1_slope_3D, f2_mean_3D, f2_slope_3D
    # 2025-01-01, 1, 2, 3, 4
    # 2025-01-02, 5, 6, 7, 8
    # becomes:
    # k_ts, feature, measure, period, value
    # 2025-01-01, f1, mean, 3D, 1
    # 2025-01-01, f1, slope, 3D, 2
    # 2025-01-01, f2, mean, 3D, 3
    # 2025-01-01, f2, slope, 3D, 4
    # 2025-01-02, f1, mean, 3D, 5
    # 2025-01-02, f1, slope, 3D, 6
    # 2025-01-02, f2, mean, 3D, 7
    # 2025-01-02, f2, slope, 3D, 8
    def wide_to_long(self):
        id_vars = ['k_ts', 'k_device']
        value_vars = [col for col in self.columns if col != 'k_ts' and col != 'k_device']
        df_melted = self.melt(id_vars=id_vars, value_vars=value_vars, var_name='feature_measure_period',
                              value_name='value')
        split_cols = df_melted['feature_measure_period'].str.rsplit('_', n=2, expand=True)
        df_melted[['feature', 'measure', 'period']] = split_cols
        result = df_melted[['k_ts', 'k_device', 'feature', 'measure', 'period', 'value']]
        result = result.sort_values(['k_ts', 'feature', 'measure']).reset_index(drop=True)
        return result


    # long table to wide table
    def long_to_wide(self):
        required_cols = ['k_ts', 'k_device', 'feature', 'measure', 'period', 'value']
        missing_cols = [col for col in required_cols if col not in self.columns]
        if missing_cols:
            raise ValueError(f"缺少必需的列: {missing_cols}")
        wide_df = self.copy()
        wide_df['new_col'] = wide_df['feature'] + '_' + wide_df['measure'] + '_' + wide_df['period']
        wide_df = MyDataFrame(wide_df.pivot(index=['k_ts', 'k_device'], columns='new_col', values='value'))
        wide_df = wide_df.reset_index()
        wide_df.columns.name = None
        return wide_df


    # generate the dataflow graph
    # show_value: whether to show the value of each column (first row)
    # highlight_useless_column: whether to highlight columns with no outgoing edge (unused columns)
    def generate_dataflow(self, filename: Path = None, show_value=False, highlight_useless_column=True):
        # graphviz needs a locally installed binary (pip install graphviz alone is not enough), which is inconvenient,
        # so a developer may not have what is needed to render the graph locally.
        # In that case only warn and skip rendering (no exception, so test cases can still complete).
        try:
            import os
            import graphviz
            from graphviz import ExecutableNotFound
        except ImportError as e:
            print(f"警告: 未安装graphviz,请先安装graphviz应用,然后 pip install graphviz {e}")
            return None

        if filename.suffix.lower() != '.svg':
            raise ValueError(f"仅支持 .svg 格式: {filename.suffix}")

        dot = graphviz.Digraph(comment='DataFlow Graph', format='svg')
        # ranksep: horizontal distance between df rectangles (inches)
        # nodesep: vertical distance between column rectangles (inches)
        dot.attr(rankdir='LR', splines='spline', ranksep='1', nodesep='0.12')
        # set CJK-capable fonts, preferring fonts present on the system
        dot.attr('graph', fontname='SimHei,SimSun,Microsoft YaHei,DejaVu Sans,Arial,sans-serif', fontsize='12')
        dot.attr('node', fontname='SimHei,SimSun,Microsoft YaHei,DejaVu Sans,Arial,sans-serif',
                 shape='box', style='filled', fillcolor='white', fontsize='10', height='0.3')
        dot.attr('edge', fontname='SimHei,SimSun,Microsoft YaHei,DejaVu Sans,Arial,sans-serif')

        # track visited nodes in sets to avoid processing them twice
        visited_dfs = set()
        visited_edges = set()
        all_col_nodes = set()    # IDs of all column nodes
        output_sources = set()   # IDs of source nodes that have outgoing edges

        def add_dataframe_node(df):
            """Add a DataFrame node to the graph."""
            if id(df) in visited_dfs:
                return
            visited_dfs.add(id(df))

            # represent the DataFrame as a subgraph; the cluster_ prefix makes graphviz render it as a framed group
            with dot.subgraph(name=f'cluster_{id(df)}') as c:
                c.attr(label=f'{df.name or "DataFrame"}',
                       fontname='SimHei,SimSun,Microsoft YaHei,DejaVu Sans,Arial,sans-serif')
                c.attr(style='filled', color='lightgrey')
                c.attr(rankdir='TB')

                # add the column nodes, forcing them into a single vertical column
                prev_col_node_id = None
                for i, col in enumerate(df.columns):
                    col_node_id = f'col_{id(df)}_{i}_{col}'
                    label = f'{col} ({df.iloc[0][col]})' if show_value else col
                    c.node(col_node_id, label=label)
                    all_col_nodes.add(col_node_id)

                    # force vertical ordering: keep two columns of the same df from sitting side by side, which makes the edges hard to read
                    # if prev_col_node_id:
                    #     c.edge(prev_col_node_id, col_node_id, style='invis')
                    # prev_col_node_id = col_node_id

                # force vertical ordering: keep two columns of the same df from sitting side by side, which makes the edges hard to read
                if len(df.columns) > 1:
                    with c.subgraph() as s:
                        s.attr(rank='same')
                        for i, col in enumerate(df.columns):
                            col_node_id = f'col_{id(df)}_{i}_{col}'
                            s.node(col_node_id)

        def build_graph_recursive(current_df):
            """Build the graph recursively."""
            # add the current DataFrame node
            add_dataframe_node(current_df)

            # process each upstream node and create the column edges defined by actual_mappings
            # note: current_df has no duplicate column names and neither do the input_dfs among themselves,
            # but current_df and its input_dfs may share column names
            for input_df in current_df.input_dfs:
                build_graph_recursive(input_df)

                for feature, expression in current_df.actual_mappings.items():
                    create_edges_from_mapping(feature, expression, input_df, current_df)

            # create the edges internal to this node
            for feature, expression in current_df.actual_mappings.items():
                create_edges_from_mapping(feature, expression, current_df, current_df)

        # create edges based on the feature mapping information
        def create_edges_from_mapping(feature, expression, input_df, current_df):
            target_feature = feature

            # extract the input columns referenced by the expression
            input_cols = _extract_column_names(expression)

            # find the corresponding columns in the upstream DataFrame and connect them
            for input_col in input_cols:
                # skip a column's edge to itself within current_df
                if current_df is input_df and target_feature == input_col:
                    continue
                if input_col in input_df.columns:
                    target_idx = list(current_df.columns).index(target_feature)
                    target_node_id = f'col_{id(current_df)}_{target_idx}_{target_feature}'

                    # locate the source column node in the upstream DataFrame
                    source_idx = list(input_df.columns).index(input_col)
                    source_node_id = f'col_{id(input_df)}_{source_idx}_{input_col}'

                    # create the edge, avoiding duplicates
                    edge_key = (source_node_id, target_node_id)
                    if edge_key not in visited_edges:
                        if input_df is current_df:
                            dot.edge(source_node_id, target_node_id, color='gray')  # edges within the same DataFrame are grey
                        else:
                            dot.edge(source_node_id, target_node_id)
                        visited_edges.add(edge_key)
                        output_sources.add(source_node_id)

        # build the graph starting from the current DataFrame
        build_graph_recursive(self)

        # if highlight_useless_column is enabled, highlight columns without outgoing edges
        if highlight_useless_column:
            no_output_nodes = all_col_nodes - output_sources
            for node_id in no_output_nodes:
                dot.node(node_id, fillcolor='yellow')

        try:
            # render the image; the filename passed to render() must not include the extension
            dot.render(os.path.splitext(filename)[0], cleanup=True)  # cleanup=True removes the intermediate file
            print(f"数据流图已保存到: {filename}")
            return dot
        except ExecutableNotFound as e:
            print(f"警告: 未安装graphviz应用,请先下载安装。 {e}")
            return None

    # def trace_unused_columns(self):
    #     """
    #     Identify redundant columns across the whole processing pipeline.
    #     Returns a dict whose keys are processor names and whose values are the unused features of each processor.
    #     """
    #
    #     # result dict: key is the processor name, value is the list of unused columns
    #     redundant_dict = {}
    #
    #     # iterate over all processors
    #     for processor in processors:
    #         # collect all feature columns defined by this processor
    #         # if 'feature' in processor.columns:
    #         #     processor_columns = set(processor['feature'].dropna())
    #         #     processor_columns.discard('*')  # exclude the wildcard
    #         processor_columns = set(pd.DataFrame(processor.actual_mappings)['feature'])
    #
    #         # collect the columns this processor actually uses (i.e. referenced in other feature expressions)
    #         used_in_expressions = set()
    #         for proc in processors:
    #             for mapping in proc.actual_mappings:
    #                 expr_cols = _extract_column_names(mapping['expression'])
    #                 used_in_expressions.update(expr_cols)
    #
    #         # columns of this processor that are never used
    #         redundant_columns = processor_columns - used_in_expressions
    #
    #         # store in the result dict
    #         if processor.name:
    #             redundant_dict[processor.name] = list(sorted(redundant_columns))
    #         else:
    #             redundant_dict[f"Unnamed_Processor_{id(processor)}"] = list(sorted(redundant_columns))
    #
    #     # print the result
    #     print("Redundant Columns by Processor:")
    #     for processor_name, columns in redundant_dict.items():
    #         if columns:
    #             print(f"  [{processor_name}]:")
    #             for col in columns:
    #                 print(f"    - {col}")
    #         else:
    #             print(f"  [{processor_name}]: (no redundant columns)")
    #
    #     return redundant_dict


    # normalize the timestamp and device columns of the DataFrame (timestamp as index)
    # convert object columns to the string dtype, since object does not work with eval()
    def format_columns(self) -> MyDataFrame:
        result = MyDataFrame(self)
        result.name = self.name
        if 'k_ts' in result.columns:
            result['k_ts'] = pd.to_datetime(result['k_ts'])
            # if k_ts is both an index level and a regular column, merge() fails ('k_ts' is both an index level and a column label, which is ambiguous.)
            # if k_ts is only an index, df['k_ts'] raises a KeyError
            # result = result.set_index(['k_ts'], drop=True)
        if 'k_device' in result.columns:
            result['k_device'] = result['k_device'].astype(str)

        # convert object columns to the string dtype to avoid errors inside eval()
        object_cols = result.select_dtypes(include=['object']).columns
        result[object_cols] = result[object_cols].astype('string')

        # strip leading/trailing spaces from column names to prevent hard-to-spot errors
        result.columns = result.columns.str.strip()

        # sort the columns to make debugging comparisons easier
        result = _sort_columns(result)

        return result


# print the data related to a given feature of a given processor
def print_processor(processor_name, feature_name):
    found = False
    for processor in mydataframes:
        if processor.name == processor_name:
            for mapping in processor.actual_mappings:
                if mapping['feature'] == feature_name:
                    found = True
                    print(f"[{processor.name}]")
                    print(f"{feature_name:<10}{mapping['feature_value'].tolist()}")
                    values = mapping['expression_values']
                    for key, value in values.items():
                        print(f"{key:<10}{value.tolist()}")
    if not found:
        print(f"未找到processor[{processor_name}]中feature[{feature_name}]")



# validate that a column name is legal
def _validate_var_name(var_name: str):
    forbidden_chars = {'.', '[', ']', '-', '+', '*', '/', '\\', '%', '&'}
    if any(char in forbidden_chars for char in var_name):
        raise ValueError(f"变量名 '{var_name}' 包含非法字符")


# try the numexpr engine first and fall back to the python engine on failure
def _eval(df: pd.DataFrame, expression: str):
    result = None

    # DataFrame.eval() does not support where-expressions, so handle them manually
    if expression.startswith('where'):
        args = _parse_where_args(expression)
        if len(args) == 3:
            return np.where(_eval(df, args[0]), _eval(df, args[1]), _eval(df, args[2]))
        else:
            raise ValueError(f"无效的where表达式格式: {expression}")

    try:
        result = df.eval(expression, engine='numexpr')
    except Exception as e:
        # numexpr does not support strings and some other operations; fall back to the (slower) python engine
        # typical error messages: 'unknown type object', 'unknown type datetimedelta64[ns]'
        try:
            result = df.eval(expression, engine='python')
        except Exception as e:
            # the python engine failed as well, so raise
            cols = _extract_column_names(expression)
            print('\n表达式执行失败相关输入数据:')
            print(df[cols])
            raise Exception(f'表达式 {expression} 执行失败(python): {e}')
    return result


# manual parsing (instead of the earlier regex approach) so that nested where() calls work
def _parse_where_args(s):
    if not s.startswith('where(') or not s.endswith(')'):
        raise ValueError("Not a where expression")
    # strip the leading 'where(' and the trailing ')'
    inner = s[6:-1]
    args = []
    paren_level = 0
    current = []
    for char in inner:
        if char == ',' and paren_level == 0:
            args.append(''.join(current).strip())
            current = []
        else:
            if char == '(':
                paren_level += 1
            elif char == ')':
                paren_level -= 1
            current.append(char)
    args.append(''.join(current).strip())  # the last argument
    return args


def _extract_column_names(expr: str):
    if expr.startswith('where'):
        args = _parse_where_args(expr)
        # FIXME: choose args[1] or args[2] depending on the situation
        return []  # FIXME

    # FIXME: expressions containing @ cannot be parsed (e.g. @pd.shape[0])
    if '@' in expr:
        return []  # FIXME

    tree = ast.parse(expr, mode='eval')
    names = set()

    class NameVisitor(ast.NodeVisitor):
        def visit_Name(self, node):
            names.add(node.id)
            self.generic_visit(node)

    NameVisitor().visit(tree)
    return sorted(names)  # or return names (a set) directly


# sort the columns alphabetically
def _sort_columns(df: pd.DataFrame):
    cols = sorted(df.columns)
    if 'k_device' in cols:
        cols = ['k_device'] + [col for col in cols if col != 'k_device']
    if 'k_ts' in cols:
        cols = ['k_ts'] + [col for col in cols if col != 'k_ts']
    # avoid the df[cols] form so that no trace record is produced
    return df.reindex(columns=cols)
k2pipe/pipe.py
ADDED
@@ -0,0 +1,168 @@

import ast

import numpy as np
import pandas as pd
from loguru import logger

##### ##### ##### ##### ##### #####
##### to be replaced by MyDataFrame later #####
##### ##### ##### ##### ##### #####

# initialization: attach the custom methods to DataFrame
def init_pipe():
    pd.DataFrame.extract_features = extract_features
    pd.DataFrame.format_columns = format_columns


# extract features from a DataFrame according to a configuration file
def extract_features(self, config: pd.DataFrame) -> pd.DataFrame:
    result = self.copy()
    result.columns = result.columns.str.strip()  # leading/trailing spaces in column names cause hard-to-debug errors
    for _, row in config.iterrows():
        # skip comment rows
        if row[0].startswith('#'):
            continue

        feature_name = row['feature']
        if not pd.isna(feature_name):
            feature_name = feature_name.strip()
        else:
            raise ValueError(f"特征名称不能为空 {row}, line: {_}")

        if feature_name == '*':
            continue

        _validate_var_name(feature_name)

        expression = row['expression']
        if not pd.isna(expression):
            expression = expression.strip()
        else:
            result[feature_name] = np.nan
            continue

        # eval() is error-prone for non-numeric types, so assign directly in that case
        if feature_name == expression:
            result[feature_name] = result[expression]
            continue

        result[feature_name] = _eval(result, expression)

    # drop columns that exist in self but are not defined in config;
    # if config contains '*', all original columns are kept and this step is skipped
    if not '*' in config['feature'].values:
        config_columns = set(config['feature'].dropna())
        original_columns = set(self.columns)
        columns_to_drop = original_columns - config_columns
        result = result.drop(columns=columns_to_drop, errors='ignore')

    result = _sort_columns(result)

    return result


# try the numexpr engine first and fall back to the python engine on failure
def _eval(df: pd.DataFrame, expression: str):
    result = None

    # DataFrame.eval() does not support where-expressions, so handle them manually
    if expression.startswith('where'):
        args = _parse_where_args(expression)
        if len(args) == 3:
            return np.where(_eval(df, args[0]), _eval(df, args[1]), _eval(df, args[2]))
        else:
            raise ValueError(f"无效的where表达式格式: {expression}")

    try:
        result = df.eval(expression, engine='numexpr')
    except Exception as e:
        # numexpr does not support strings and some other operations; fall back to the (slower) python engine
        # typical error messages: 'unknown type object', 'unknown type datetimedelta64[ns]'
        try:
            result = df.eval(expression, engine='python')
        except Exception as e:
            cols = _extract_column_names(expression)
            print('\n表达式执行失败相关输入数据:')
            print(df[cols])
            raise Exception(f'表达式 {expression} 执行失败(python): {e}')
    return result


# normalize the timestamp and device columns of the DataFrame (timestamp as index)
# convert object columns to the string dtype, since object does not work with eval()
def format_columns(self) -> pd.DataFrame:
    result = self.copy()
    if 'k_ts' in result.columns:
        result['k_ts'] = pd.to_datetime(result['k_ts'])
    if 'k_device' in result.columns:
        result['k_device'] = result['k_device'].astype(str)
    # result = result.set_index(['k_ts'], drop=False)

    object_cols = result.select_dtypes(include=['object']).columns
    result[object_cols] = result[object_cols].astype('string')

    result = _sort_columns(result)

    return result


def _extract_column_names(expr: str):
    if expr.startswith('where'):
        args = _parse_where_args(expr)
        # FIXME: choose args[1] or args[2] depending on the situation
        return []  # FIXME

    # FIXME: expressions containing @ cannot be parsed (e.g. @pd.shape[0])
    if '@' in expr:
        return []  # FIXME

    tree = ast.parse(expr, mode='eval')
    names = set()

    class NameVisitor(ast.NodeVisitor):
        def visit_Name(self, node):
            names.add(node.id)
            self.generic_visit(node)

    NameVisitor().visit(tree)
    return sorted(names)  # or return names (a set) directly


# sort the columns alphabetically
def _sort_columns(df: pd.DataFrame):
    cols = sorted(df.columns)
    if 'k_device' in cols:
        cols = ['k_device'] + [col for col in cols if col != 'k_device']
    if 'k_ts' in cols:
        cols = ['k_ts'] + [col for col in cols if col != 'k_ts']
    return df[cols]


# manual parsing (instead of the earlier regex approach) so that nested where() calls work
def _parse_where_args(s):
    if not s.startswith('where(') or not s.endswith(')'):
        raise ValueError("Not a where expression")
    # strip the leading 'where(' and the trailing ')'
    inner = s[6:-1]
    args = []
    paren_level = 0
    current = []
    for char in inner:
        if char == ',' and paren_level == 0:
            args.append(''.join(current).strip())
            current = []
        else:
            if char == '(':
                paren_level += 1
            elif char == ')':
                paren_level -= 1
            current.append(char)
    args.append(''.join(current).strip())  # the last argument
    return args


def _validate_var_name(var_name: str):
    forbidden_chars = {'.', '[', ']', '-', '+', '*', '/', '\\', '%', '&'}
    if any(char in forbidden_chars for char in var_name):
        raise ValueError(f"变量名 '{var_name}' 包含非法字符")
    return True
k2pipe-0.1.9.dist-info/METADATA
ADDED

@@ -0,0 +1,255 @@

Metadata-Version: 2.4
Name: k2pipe
Version: 0.1.9
Summary: k2pipe
Home-page: https://www.k2data.com.cn
Author: K2data
Author-email: admin@k2data.com.cn
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Operating System :: OS Independent
Classifier: Topic :: Software Development :: Libraries
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: pandas
Requires-Dist: numexpr
Requires-Dist: loguru
Requires-Dist: graphviz
Requires-Dist: pytest
Requires-Dist: pytest-mock
Requires-Dist: pytest-cov
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# Pipe SDK (working name)
Simplifies common operations in data-processing workflows through configuration, making the business logic of data processing more intuitive and stable.

## 1. Installation

Requires Python 3.8 or later; version 3.8.10 is recommended.

```
pip install -U k2pipe
```

## 2. Usage

### 2.1 Data processing

Provides the `extract_features()` method, which transforms the current DataFrame according to the calculation rules in the configuration and returns the result DataFrame.

#### 2.1.1 Configuration file

The configuration DataFrame is usually loaded from a CSV file and defines the rules for transforming the original DataFrame. Each row of the configuration defines one (new) column of the result DataFrame: `feature` is the column name and `expression` is the expression that computes it.

Configured columns are processed in order, so an expression may reference columns of the original DataFrame as well as columns defined in earlier rows.

| feature | expression                            | comment                                                          |
|---------|---------------------------------------|------------------------------------------------------------------|
| feat1   | col1 / 100 - col2                     | Arithmetic example                                               |
| feat2   | col2.rolling(3).mean()                | Rolling window by row count                                      |
| feat3   | col2.rolling('3D').mean()             | Rolling window by time (the DataFrame needs a datetime index)    |
| feat4   | feat1.my_func()                       | Custom function example (the function is registered in Python)   |
| feat5   | "where(col1 > 50, col1, col2 * 2)"    | Conditional assignment example (nested where() is supported)     |
| #       | comment text                          | Comment row example                                              |
| feat6   | (col3 - k_ts).dt.days                 | Datetime example                                                 |
| *       |                                       | Keep all columns of the original DataFrame                       |
| feat7   | k_device.str[1]                       | String operation example                                         |
| feat8   | "where(col1.isna(), 1, 2)"            | Null-check example                                               |
| feat9   | @df.shape[0]                          | Accessing an attribute of the DataFrame being processed          |
| feat10  | feat01.round()                        | Rounding example                                                 |
| feat11  | k_ts + col1.astype('timedelta64[s]')  | Datetime example 1 (constant time offsets are not supported here)|
| feat12  | k_ts.time_shift('-10s')               | Datetime example 2 (calculation with a constant time offset)     |

Notes:
- If an expression or comment contains a comma, wrap it in double quotes; no space is allowed before the opening quote.
- Expressions do not support the apply() function.
- Built-in variables: @df
- Built-in functions: time_shift()
- More examples: [K2Pipe Examples](https://gitlab.kstonedata.k2/zhanghao/k2pipe/tree/develop/tests/example)
- Expression syntax reference: [Mastering Eval Expressions in Pandas](https://www.sparkcodehub.com/pandas/advanced/eval-expressions-guide)

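For reference, a minimal configuration CSV along the lines of the table above might look like this (the feature names and expressions here are illustrative and not shipped with the package):

```
feature,expression,comment
*,,keep all original columns
feat1,col1 / 100 - col2,arithmetic
feat5,"where(col1 > 50, col1, col2 * 2)",conditional assignment
feat12,k_ts.time_shift('-10s'),shift the timestamp by a constant offset
```
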
#### 2.1.2 Python code

Transforms the original DataFrame into the result DataFrame according to the rules defined in the configuration.

```python
import pandas as pd
from pathlib import Path
from k2pipe.mypipe import MyDataFrame

# sample data
df = MyDataFrame({'k_device': ['dev1', 'dev1', 'dev1', 'dev2', 'dev2', 'dev2'],
                  'col1': [50, 60, 70, 80, 90, 100],
                  'col2': [1, 2, 3, 4, 5, 6]})
df['k_ts'] = pd.date_range(start='2025-01-01 08:00:00', periods=len(df), freq='S')
df['ts2'] = pd.date_range(start='2025-01-01 12:00:00', periods=len(df), freq='S')
df.set_index('k_ts', inplace=True)

# configuration
config = pd.read_csv(Path(__file__).parent / 'my_feat.csv')
df.columns = df.columns.str.strip()

# register a custom function
pd.Series.my_func = (lambda x: x.rolling(3).mean())

# process the data
result = df.extract_features(config)
```

#### 2.1.3 Output

Original data:

```
                    k_device  col1  col2                col3
k_ts
2025-01-01 08:00:00     dev1    50     1 2025-01-01 12:00:00
2025-01-01 08:00:01     dev1    60     2 2025-01-01 12:00:01
2025-01-01 08:00:02     dev1    70     3 2025-01-01 12:00:02
2025-01-01 08:00:03     dev2    80     4 2025-01-01 12:00:03
2025-01-01 08:00:04     dev2    90     5 2025-01-01 12:00:04
2025-01-01 08:00:05     dev2   100     6 2025-01-01 12:00:05

```

Result data:

```
                    k_device  feat01  feat02  feat03  feat04  feat05  feat06 feat07  feat08  feat09  feat10              feat11
k_ts
2025-01-01 08:00:00     dev1    -0.5     NaN     1.0     NaN       2       0      e       2       6    -0.0 2025-01-01 08:00:10
2025-01-01 08:00:01     dev1    -1.4     NaN     1.5     NaN      60       0      e       2       6    -1.0 2025-01-01 08:00:11
2025-01-01 08:00:02     dev1    -2.3     2.0     2.0    -1.4      70       0      e       2       6    -2.0 2025-01-01 08:00:12
2025-01-01 08:00:03     dev2    -3.2     3.0     2.5    -2.3      80       0      e       2       6    -3.0 2025-01-01 08:00:13
2025-01-01 08:00:04     dev2    -4.1     4.0     3.0    -3.2      90       0      e       2       6    -4.0 2025-01-01 08:00:14
2025-01-01 08:00:05     dev2    -5.0     5.0     3.5    -4.1     100       0      e       2       6    -5.0 2025-01-01 08:00:15
```

### 2.2 Normalizing data formats

Provides the `format_columns()` method, which converts the `k_ts` column of the input DataFrame to a datetime type and the `k_device` column to a string type.

```python
import pandas as pd
from k2pipe.mypipe import MyDataFrame

df = MyDataFrame(...)
df = df.format_columns()
```
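
As a rough sketch of the effect (the data below is made up), `format_columns()` normalizes dtypes and column layout roughly as follows:

```python
df = MyDataFrame({'k_ts': ['2025-01-01 08:00:00', '2025-01-01 08:00:01'],
                  'k_device': [1, 2],
                  'note': ['a', 'b']})
df = df.format_columns()
# k_ts -> datetime64 (via pd.to_datetime)
# k_device and other object columns -> strings (pandas 'string' dtype), which eval() can handle
# column names are stripped of surrounding spaces and reordered: k_ts, k_device, then the rest alphabetically
```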


### 2.3 Dataflow analysis

Provides the `generate_dataflow()` method, which generates a dataflow graph in SVG format. This feature depends on the `graphviz` package; see [here](https://pypi.org/project/graphviz/) for installation instructions.

```python
df1 = MyDataFrame({'k_device': ['dev1', 'dev1', 'dev1', 'dev2', 'dev2', 'dev2'],
                   'col1': [50, 60, 70, 80, 90, 100],
                   'col2': [1, 2, 3, 4, 5, 6]})
df1['k_ts'] = pd.date_range(start='2025-01-01 08:00:00', periods=len(df1), freq='S')
df2 = df1.copy()
df1 = df1.rename(columns={'col1': 'col3', 'col2': 'col4'})
result = df1.merge(df2, on=['k_device', 'k_ts'])
result.generate_dataflow(filename='dataflow_merge.svg', show_value=False)
```

Example dataflow graph (grey rectangles are DataFrame instances, white rectangles are column Series, and yellow rectangles are columns with no outgoing edges):




### 2.4 Wide/long table conversion

Provides the `wide_to_long()` and `long_to_wide()` methods for converting between wide-format and long-format tables.

Wide table example:

```
        k_ts k_device  f1_mean_3D  f1_mean_5D  f1_slope_3D  f1_slope_5D  f2_mean_3D  f2_mean_5D  f2_slope_3D  f2_slope_5D
0 2025-01-01     dev1           8          24           67           87          79          48           10           94
1 2025-01-01     dev2          52          98           53           66          98          14           34           24
2 2025-01-02     dev1          15          60           58           16           9          93           86            2
3 2025-01-02     dev2          27           4           31            1          13          83            4           91
4 2025-01-03     dev1          59          67            7           49          47          65           61           14
5 2025-01-03     dev2          55          71           80            2          94          19           98           63
```

Long table example:

```
         k_ts k_device feature measure period  value
0  2025-01-01     dev1      f1    mean     3D      8
1  2025-01-01     dev1      f1    mean     5D     24
2  2025-01-01     dev1      f1   slope     3D     67
3  2025-01-01     dev1      f1   slope     5D     87
4  2025-01-01     dev1      f2    mean     3D     79
5  2025-01-01     dev1      f2    mean     5D     48
6  2025-01-01     dev1      f2   slope     3D     10
7  2025-01-01     dev1      f2   slope     5D     94
8  2025-01-01     dev2      f1    mean     3D     52
9  2025-01-01     dev2      f1    mean     5D     98
10 2025-01-01     dev2      f1   slope     3D     53
11 2025-01-01     dev2      f1   slope     5D     66
12 2025-01-01     dev2      f2    mean     3D     98
13 2025-01-01     dev2      f2    mean     5D     14
14 2025-01-01     dev2      f2   slope     3D     34
15 2025-01-01     dev2      f2   slope     5D     24
16 2025-01-02     dev1      f1    mean     3D     15
17 2025-01-02     dev1      f1    mean     5D     60
18 2025-01-02     dev1      f1   slope     3D     58
19 2025-01-02     dev1      f1   slope     5D     16
20 2025-01-02     dev1      f2    mean     3D      9
21 2025-01-02     dev1      f2    mean     5D     93
22 2025-01-02     dev1      f2   slope     3D     86
23 2025-01-02     dev1      f2   slope     5D      2
24 2025-01-02     dev2      f1    mean     3D     27
25 2025-01-02     dev2      f1    mean     5D      4
26 2025-01-02     dev2      f1   slope     3D     31
27 2025-01-02     dev2      f1   slope     5D      1
28 2025-01-02     dev2      f2    mean     3D     13
29 2025-01-02     dev2      f2    mean     5D     83
30 2025-01-02     dev2      f2   slope     3D      4
31 2025-01-02     dev2      f2   slope     5D     91
32 2025-01-03     dev1      f1    mean     3D     59
33 2025-01-03     dev1      f1    mean     5D     67
34 2025-01-03     dev1      f1   slope     3D      7
35 2025-01-03     dev1      f1   slope     5D     49
36 2025-01-03     dev1      f2    mean     3D     47
37 2025-01-03     dev1      f2    mean     5D     65
38 2025-01-03     dev1      f2   slope     3D     61
39 2025-01-03     dev1      f2   slope     5D     14
40 2025-01-03     dev2      f1    mean     3D     55
41 2025-01-03     dev2      f1    mean     5D     71
42 2025-01-03     dev2      f1   slope     3D     80
43 2025-01-03     dev2      f1   slope     5D      2
44 2025-01-03     dev2      f2    mean     3D     94
45 2025-01-03     dev2      f2    mean     5D     19
46 2025-01-03     dev2      f2   slope     3D     98
47 2025-01-03     dev2      f2   slope     5D     63
```

Wide to long:

```python
df_long = df_wide.wide_to_long()
```
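
Internally the wide-format column names are split from the right into `feature`, `measure`, and `period` (via `str.rsplit('_', n=2)`), so the feature part may itself contain underscores. A small illustration with made-up names:

```python
import pandas as pd

names = pd.Series(['f1_mean_3D', 'rotor_temp_slope_5D'])
print(names.str.rsplit('_', n=2, expand=True))
#             0      1   2
# 0          f1   mean  3D
# 1  rotor_temp  slope  5D
```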

Long to wide:

```python
df_wide = df_long.long_to_wide()
```

k2pipe-0.1.9.dist-info/RECORD
ADDED

@@ -0,0 +1,7 @@

k2pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
k2pipe/mypipe.py,sha256=O8h65Polb-xDw6k8pT5iIvp9zrT9Urrmez8kvtdsrT4,34388
k2pipe/pipe.py,sha256=K6uXTOHgQvXchvOChRO72h37C7VJOuWbkGWNF6XKif0,5797
k2pipe-0.1.9.dist-info/METADATA,sha256=chHK4FCNsqwcH3YsejM67KcwXWoEq3e5ngDLNT1E9RU,11509
k2pipe-0.1.9.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
k2pipe-0.1.9.dist-info/top_level.txt,sha256=0fzk4s78hGSUO5KiuC_PlFZOI_3Z2RMlNCjOVQSxxpI,7
k2pipe-0.1.9.dist-info/RECORD,,
k2pipe-0.1.9.dist-info/top_level.txt
ADDED

@@ -0,0 +1 @@

k2pipe