k2pipe-0.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
k2pipe/__init__.py ADDED
File without changes
k2pipe/mypipe.py ADDED
@@ -0,0 +1,812 @@
+ from __future__ import annotations
+ import ast
+ from pathlib import Path
+ 
+ import numpy as np
+ import pandas as pd
+ from pandas import Series
+ from threading import local
+ 
+ _thread_local = local()
+ _thread_local.accessed_cols = []
+ 
+ mydataframes = []
+ 
+ # Built-in helpers provided by K2Pipe for common operations that eval() cannot express yet, such as time arithmetic
+ def time_shift(self: Series, *args, **kwargs):
+     return self + pd.to_timedelta(*args, **kwargs)
+ pd.Series.time_shift = time_shift
+ 
+ 
+ # Patch the pd.concat() method
+ _original_concat = pd.concat
+ def my_concat(objs, axis=0, **kwargs):
+     result = _original_concat(objs, axis=axis, **kwargs)
+ 
+     if isinstance(result, pd.DataFrame):
+         result = MyDataFrame(result)
+ 
+     result.config = pd.DataFrame()
+     if axis == 0:
+         # Vertical (row-wise) concatenation
+         result.name = 'concat'
+         result.config['feature'] = list(objs[0].columns)
+         for col in objs[0].columns:
+             result.actual_mappings[col] = col
+     elif axis == 1:
+         # Horizontal (column-wise) concatenation
+         result.name = 'concat(1)'
+ 
+         all_features = []
+         for obj in objs:
+             if isinstance(obj, pd.Series):
+                 all_features.append(obj.name)
+             elif isinstance(obj, pd.DataFrame):
+                 # The merge implementation calls concat(axis=1).
+                 # pandas concat does allow two DataFrames to share column names, but the resulting layout is complex and error-prone, so it is forbidden here.
+                 if bool(set(all_features) & set(obj.columns.values)):
+                     raise ValueError(f'DataFrames concatenated horizontally must not share column names: {all_features} -- {obj.columns.values}')
+                 all_features.extend(obj.columns)
+             else:
+                 raise ValueError('Concatenation of non-Series types is not supported yet')
+         result.config['feature'] = all_features
+         for col in all_features:
+             result.actual_mappings[col] = col
+ 
+     # Link the nodes together
+     result.input_dfs = objs
+     for obj in objs:
+         obj.output_df = result
+ 
+     # Add to the list of processors
+     mydataframes.append(result)
+     return result
+ # merge also goes through the custom concat(); overriding it can raise:
+ # AttributeError: 'Series' object has no attribute 'columns'
+ # For now, do not override the native concat
+ pd.concat = my_concat
+ 
+ class MyDataFrame(pd.DataFrame):
+     _metadata = ['name', 'config', 'actual_mappings', 'missing_mappings', 'input_dfs', 'output_df']
+ 
+     def __init__(self, *args, name=None, config: pd.DataFrame = None, actual_mappings=None, missing_mappings=None, input_dfs=None, output_df=None, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.name = name
+         self.config = config
+         self.actual_mappings = actual_mappings  # computation relations that actually happened (e.g. with * already expanded)
+         self.missing_mappings = missing_mappings  # computation relations that could not be tracked (e.g. columns not produced via the config)
+         self.input_dfs = input_dfs
+         self.output_df = output_df
+         if self.input_dfs is None:
+             self.input_dfs = []
+         if self.actual_mappings is None:
+             self.actual_mappings = {}
+         if self.missing_mappings is None:
+             self.missing_mappings = []
+ 
+     @property
+     def _constructor(self):
+         # Make sure DataFrame operations (e.g. df.head(), df.copy()) still return a MyDataFrame
+         return MyDataFrame
+ 
+ 
+     def merge(self, right, on=None, **kwargs):
+         # Automatic _x/_y suffixing is not supported yet.
+         # If the two DataFrames share column names (other than those in `on`), raise an exception.
+         if on is None:
+             raise ValueError("strict_merge requires the `on` parameter to be specified explicitly.")
+         # Normalize `on` to a set
+         if isinstance(on, str):
+             on_cols = {on}
+         else:
+             on_cols = set(on)
+         # Check that all `on` columns exist in both DataFrames
+         missing_in_left = on_cols - set(self.columns)
+         missing_in_right = on_cols - set(right.columns)
+         if missing_in_left or missing_in_right:
+             raise KeyError(
+                 f"Join keys missing: left lacks {missing_in_left}, right lacks {missing_in_right}"
+             )
+         common_cols = set(self.columns) & set(right.columns)
+         extra_common = common_cols - on_cols
+         if extra_common:
+             raise ValueError(
+                 f"Found shared columns that are not join keys (besides `on={on}`): {sorted(extra_common)}. "
+                 "Rename or drop the duplicated columns before merging."
+             )
+ 
+         # Plain merge result
+         result = MyDataFrame(super().merge(right=right, **kwargs))
+ 
+         # FIXME: duplicated names that received suffixes are not handled yet
+         # actual_mappings
+         result.name = 'merge'
+         result.config = pd.DataFrame()
+         result.config['feature'] = list(self.columns) + list(right.columns)
+         for col in self.columns:
+             result.actual_mappings[col] = col
+         for col in right.columns:
+             result.actual_mappings[col] = col
+ 
+         # Link the nodes together
+         result.input_dfs = [self, right]
+         self.output_df = result
+         right.output_df = result
+ 
+         # Add to the list of processors
+         mydataframes.append(result)
+         return result
+ 
+ 
+     # Override pd.DataFrame's rename method
+     def rename(self, inplace=None, *args, **kwargs):
+         if inplace:
+             raise ValueError("mydataframe.rename does not support inplace=True yet")  # TODO
+         result = MyDataFrame(super().rename(*args, **kwargs))
+         # actual_mappings
+         result.name = 'rename'
+         result.config = pd.DataFrame()
+         result.config['feature'] = list(result.columns)
+         for old, new in zip(list(self.columns), list(result.columns)):
+             result.actual_mappings[new] = old
+         # Link the nodes together
+         result.input_dfs = [self]
+         self.output_df = result
+         # Add to the list of processors
+         mydataframes.append(result)
+         return result
+ 
+ 
+     # Override pd.DataFrame's filter method
+     def filter(self, *args, **kwargs) -> MyDataFrame:
+         result = MyDataFrame(super().filter(*args, **kwargs))
+         # actual_mappings
+         result.name = 'filter'
+         result.config = pd.DataFrame()
+         result.config['feature'] = list(result.columns)
+         for col in result.columns:
+             result.actual_mappings[col] = col
+         # Link the nodes together
+         result.input_dfs = [self]
+         self.output_df = result
+         # Add to the list of processors
+         mydataframes.append(result)
+         return result
+ 
+     # Trace filter-style operations such as df[['col1','col2']]
+     def __getitem__(self, key):
+         if isinstance(key, tuple):
+             key = list(key)
+ 
+         if isinstance(key, list):
+             result = self.filter(key)
+             result.name = 'getitem'
+             return result
+ 
+         # Record the access so that __setitem__ can use it later
+         if isinstance(key, str):
+             _thread_local.accessed_cols.append(key)
+ 
+         # For any other key type, return the original result without tracing:
+         # slice: merge uses the form left = self.left[:]
+         # Series: used by drop_duplicates
+         # str: far too common, e.g. df['k_ts'] = pd.to_datetime(df['k_ts'])
+         # there may be other types as well
+         return super().__getitem__(key)
+ 
+ 
+     # Trace operations such as df['new_col'] = df['col1'] + df['col2']
+     def __setitem__(self, key, value):
+         super().__setitem__(key, value)
+ 
+         # Clear the global record of columns accessed in __getitem__
+         accessed = getattr(_thread_local, 'accessed_cols', [])
+         _thread_local.accessed_cols = []
+ 
+         # Avoid producing trace records too often
+         if not isinstance(key, str):
+             return
+         if not isinstance(value, Series):
+             return
+         if key in ['k_ts', 'k_device']:
+             return
+         if accessed == []:
+             return
+ 
+         # FIXME: column type conversions via astype() are also recorded, which is of little tracing value
+         # if key == 'jiuxy':
+         #     print()
+ 
+         # A new df node cannot be created in the flow graph
+         expression = '+'.join(accessed)
+         if self.config is None:
+             # self was not produced by extract_features, rename, etc.
+             # It is usually a DataFrame created directly via MyDataFrame()
+             self.config = pd.DataFrame()
+             self.config['feature'] = [key]
+             self.config['expression'] = expression
+             self.actual_mappings[key] = expression
+         else:
+             # e.g. result[feature_name] = _eval(result, expression), where accessed is empty and therefore expression is empty
+             # FIXME: duplicate entries can be added
+             self.config = pd.concat([self.config, pd.DataFrame([{'feature': key, 'expression': expression}])], ignore_index=True)
+             self.actual_mappings[key] = expression
+ 
+         # FIXME: a new MyDataFrame instance cannot be created here, so only the name hints that a setitem happened
+         if self.name is None or 'set(' in self.name:
+             return
+         self.name = self.name + f'\nset({key})'
+ 
+ 
+     # Trace df = df.assign(new_col = df['col1'] + df['col2'])
+     def assign(self, **kwargs):
+         result = MyDataFrame(super().assign(**kwargs))
+ 
+         result.name = 'assign'
+         result.config = pd.DataFrame()
+         result.config['feature'] = list(result.columns)
+         for col in self.columns:
+             result.actual_mappings[col] = col
+         for key, value in kwargs.items():
+             # FIXME: the original expression inside assign cannot be recovered
+             # assign triggers __setitem__, so its effect on self.actual_mappings has to be undone
+             result.actual_mappings[key] = self.actual_mappings[key]
+             self.actual_mappings.pop(key)
+ 
+         # Link the nodes together
+         result.input_dfs = [self]
+         self.output_df = result
+         # Add to the list of processors
+         mydataframes.append(result)
+         return result
+ 
+ 
+     # Override pd.DataFrame's query method
+     def query(self, *args, **kwargs):
+         result = MyDataFrame(super().query(*args, **kwargs))
+         # actual_mappings
+         result.name = 'query'
+         result.config = pd.DataFrame()
+         result.config['feature'] = list(result.columns)
+         for col in result.columns:
+             result.actual_mappings[col] = col
+         # Link the nodes together
+         result.input_dfs = [self]
+         self.output_df = result
+         # Add to the list of processors
+         mydataframes.append(result)
+         return result
+ 
+ 
+     # Internal mypipe code should use the original drop to avoid producing extra trace records
+     def drop_old(self, *args, **kwargs):
+         return super().drop(*args, **kwargs)
+ 
+ 
+     # Override pd.DataFrame's drop method
+     def drop(self, *args, **kwargs):
+         result = MyDataFrame(super().drop(*args, **kwargs))
+         result.name = 'drop'
+         result.config = pd.DataFrame()
+         result.config['feature'] = list(result.columns)
+         for col in result.columns:
+             result.actual_mappings[col] = col
+         # Link the nodes together
+         result.input_dfs = [self]
+         self.output_df = result
+         # Add to the list of processors
+         mydataframes.append(result)
+         return result
+ 
+ 
+     def extract_features(self, config: pd.DataFrame, step_name: str = None):
+         result = MyDataFrame(self)  # do not create the new instance with copy(), it would carry actual_mappings etc. over
+         result.name = step_name
+         result.columns = result.columns.str.strip()  # guard against leading/trailing spaces in column names, which cause hard-to-diagnose errors
+ 
+         # Expand the first * into all column names and put them at the front
+         if '*' in config['feature'].values:
+             config.drop(config[config['feature'] == '*'].index, inplace=True)
+             new_df = pd.DataFrame(columns=config.columns)
+             for col in list(self.columns):
+                 new_df.loc[len(new_df)] = {'feature': col, 'expression': col, 'comment': '*'}
+             for idx, row in config.iterrows():
+                 new_df.loc[len(new_df)] = row
+             config = new_df
+ 
+         result.config = config
+ 
+         for _, row in config.iterrows():
+             # Skip comment rows
+             if row[0].startswith('#'):
+                 continue
+ 
+             feature_name = row['feature']
+             if not pd.isna(feature_name):
+                 feature_name = feature_name.strip()
+             else:
+                 raise ValueError(f"Feature name must not be empty {row}, line: {_}")
+ 
+             _validate_var_name(feature_name)
+ 
+             expression = row['expression']
+             if not pd.isna(expression):
+                 expression = expression.strip()
+             else:
+                 result[feature_name] = np.nan
+                 continue
+ 
+             # eval easily fails on non-numeric types, so assign directly in that case
+             if feature_name == expression:
+                 result[feature_name] = result[expression]
+             else:
+                 result[feature_name] = _eval(result, expression)
+ 
+             # Record the columns that were actually produced
+             expression_values = {}
+             cols = _extract_column_names(expression)
+             for col in cols:
+                 expression_values[col] = result[col]
+             result.actual_mappings[feature_name] = expression
+ 
+         # Columns not produced via the config also go into the mappings so they can be traced later.
+         # Add columns that exist in result but are missing from config.mappings to config.missing_mappings.
+         if self.config is not None:
+             missing_columns = set(self.columns) - set(self.config['feature'])
+             for col in missing_columns:
+                 result.missing_mappings.append({
+                     'feature': col,
+                     'expression': '(Unknown)',
+                     # 'feature_value': result[col].copy(),
+                     # 'expression_values': {}
+                 })
+ 
+         mydataframes.append(result)
+ 
+         # Drop columns that exist in self but are not defined in the config
+         config_columns = set(config['feature'].dropna())
+         original_columns = set(self.columns)
+         columns_to_drop = original_columns - config_columns
+         result = result.drop_old(columns=columns_to_drop, errors='ignore')
+ 
+         result = _sort_columns(result)
+ 
+         self.output_df = result
+         result.input_dfs = [self]
+ 
+         return result
+ 
+ 
+     # Trace the computation logic of a given column of this df backwards
+     def trace_column(self, feature_to_trace: str):
+         assert isinstance(feature_to_trace, str)
+ 
+         # start_line: row index at which the reverse scan starts (None means scan all rows)
+         def _build_pipe_tree_recursive(df, feature, depth=0, start_line: int = None):
+             if df.input_dfs is None:
+                 return None
+ 
+             if start_line is None:
+                 start_line = len(df.actual_mappings)
+ 
+             # Iterate in reverse order
+             # Get the key/value pairs of actual_mappings as a list
+             mappings_list = list(df.actual_mappings.items())
+             for idx in range(start_line - 1, -1, -1):  # from start_line-1 down to 0
+                 mapped_feature, expr = mappings_list[idx]
+                 if mapped_feature == feature:
+                     # Avoid infinite recursion (when searching recursively inside the same config file)
+                     # if df is self and feature == expr:
+                     #     continue
+                     input_names = _extract_column_names(expr)
+ 
+                     children = []
+                     for name in input_names:
+ 
+                         # Recursive match inside the same config file,
+                         # continuing the reverse scan from the row above the current one
+                         if idx > 1:  # FIXME: change to > 0?
+                             child_ast_self = _build_pipe_tree_recursive(df, name, depth + 1, idx - 1)
+                             if child_ast_self:
+                                 children.append(child_ast_self)
+ 
+                         # Recursive match inside the previous config file
+                         for input_df in df.input_dfs:
+                             child_ast_prev = _build_pipe_tree_recursive(input_df, name, depth + 1)
+                             if child_ast_prev:
+                                 children.append(child_ast_prev)
+ 
+                     return {
+                         "feature": feature,
+                         "df": df.copy(),
+                         "mapping": {"feature": mapped_feature, "expression": expr},
+                         "expression": expr,
+                         "children": children,
+                         "depth": depth
+                     }
+ 
+         def _print_pipe_tree(ast_node, indent=0):
+             if ast_node is None:
+                 print("└── (empty)")
+                 return
+             spaces = " " * indent
+             expr = ast_node["expression"]
+             feature = ast_node['feature']
+             df = ast_node["df"]
+             missing_features = [item['feature'] for item in df.missing_mappings]
+             exp_missing_features = set(_extract_column_names(expr)).intersection(set(missing_features))
+             # if feature == expr:
+             #     print(f"{spaces}└── [{df.name}] - " +
+             #           (f" // missing: {exp_missing_features}" if exp_missing_features else ""))
+             # else:
+             print(f"{spaces}└── [{df.name}] {feature} = {expr} " +
+                   (f" // missing: {exp_missing_features}" if exp_missing_features else ""))
+             for child in ast_node["children"]:
+                 _print_pipe_tree(child, indent + 1)
+ 
+         tree = _build_pipe_tree_recursive(self, feature_to_trace)
+         _print_pipe_tree(tree)
+         return tree
+ 
+ 
+     # Trace several columns backwards
+     def trace_columns(self, features_to_trace: list):
+         for feature in features_to_trace:
+             print(feature)
+             self.trace_column(feature)
+             print()
+ 
+ 
+     # Wide table to long table, for example:
+     # k_ts, f1_mean_3D, f1_slope_3D, f2_mean_3D, f2_slope_3D
+     # 2025-01-01, 1, 2, 3, 4
+     # 2025-01-02, 5, 6, 7, 8
+     # becomes:
+     # k_ts, feature, measure, period, value
+     # 2025-01-01, f1, mean, 3D, 1
+     # 2025-01-01, f1, slope, 3D, 2
+     # 2025-01-01, f2, mean, 3D, 3
+     # 2025-01-01, f2, slope, 3D, 4
+     # 2025-01-02, f1, mean, 3D, 5
+     # 2025-01-02, f1, slope, 3D, 6
+     # 2025-01-02, f2, mean, 3D, 7
+     # 2025-01-02, f2, slope, 3D, 8
+     def wide_to_long(self):
+         id_vars = ['k_ts', 'k_device']
+         value_vars = [col for col in self.columns if col != 'k_ts' and col != 'k_device']
+         df_melted = self.melt(id_vars=id_vars, value_vars=value_vars, var_name='feature_measure_period',
+                               value_name='value')
+         split_cols = df_melted['feature_measure_period'].str.rsplit('_', n=2, expand=True)
+         df_melted[['feature', 'measure', 'period']] = split_cols
+         result = df_melted[['k_ts', 'k_device', 'feature', 'measure', 'period', 'value']]
+         result = result.sort_values(['k_ts', 'feature', 'measure']).reset_index(drop=True)
+         return result
+ 
+ 
+     # Long table to wide table
+     def long_to_wide(self):
+         required_cols = ['k_ts', 'k_device', 'feature', 'measure', 'period', 'value']
+         missing_cols = [col for col in required_cols if col not in self.columns]
+         if missing_cols:
+             raise ValueError(f"Missing required columns: {missing_cols}")
+         wide_df = self.copy()
+         wide_df['new_col'] = wide_df['feature'] + '_' + wide_df['measure'] + '_' + wide_df['period']
+         wide_df = MyDataFrame(wide_df.pivot(index=['k_ts', 'k_device'], columns='new_col', values='value'))
+         wide_df = wide_df.reset_index()
+         wide_df.columns.name = None
+         return wide_df
+ 
+ 
+     # Generate the data flow graph
+     # show_value: whether to show the column's data value (first row)
+     # highlight_useless_column: whether to highlight columns that have no outgoing edge (unused columns)
+     def generate_dataflow(self, filename: Path = None, show_value=False, highlight_useless_column=True):
+         # graphviz needs the native application installed (pip install graphviz alone is not enough), which is inconvenient,
+         # so a developer may not be able to generate the data flow graph locally.
+         # In that case only warn and skip generating the graph (no exception is raised, so test cases can still complete).
+         try:
+             import os
+             import graphviz
+             from graphviz import ExecutableNotFound
+         except ImportError as e:
+             print(f"Warning: graphviz is not installed; install the graphviz application first, then pip install graphviz {e}")
+             return None
+ 
+         if filename.suffix.lower() != '.svg':
+             raise ValueError(f"Only the .svg format is supported: {filename.suffix}")
+ 
+         dot = graphviz.Digraph(comment='DataFlow Graph', format='svg')
+         # ranksep: horizontal distance between df boxes (inches)
+         # nodesep: vertical distance between column boxes (inches)
+         dot.attr(rankdir='LR', splines='spline', ranksep='1', nodesep='0.12')
+         # Set Chinese-capable fonts, preferring fonts that exist on the system
+         dot.attr('graph', fontname='SimHei,SimSun,Microsoft YaHei,DejaVu Sans,Arial,sans-serif', fontsize='12')
+         dot.attr('node', fontname='SimHei,SimSun,Microsoft YaHei,DejaVu Sans,Arial,sans-serif',
+                  shape='box', style='filled', fillcolor='white', fontsize='10', height='0.3')
+         dot.attr('edge', fontname='SimHei,SimSun,Microsoft YaHei,DejaVu Sans,Arial,sans-serif')
+ 
+         # Track visited nodes with sets to avoid processing them twice
+         visited_dfs = set()
+         visited_edges = set()
+         all_col_nodes = set()  # IDs of all column nodes
+         output_sources = set()  # IDs of source nodes that have an outgoing edge
+ 
+         def add_dataframe_node(df):
+             """Add a DataFrame node to the graph"""
+             if id(df) in visited_dfs:
+                 return
+             visited_dfs.add(id(df))
+ 
+             # Represent the DataFrame as a subgraph; the cluster prefix makes graphviz render it as a framed group
+             with dot.subgraph(name=f'cluster_{id(df)}') as c:
+                 c.attr(label=f'{df.name or "DataFrame"}',
+                        fontname='SimHei,SimSun,Microsoft YaHei,DejaVu Sans,Arial,sans-serif')
+                 c.attr(style='filled', color='lightgrey')
+                 c.attr(rankdir='TB')
+ 
+                 # Add the column nodes - force them into a single vertical column
+                 prev_col_node_id = None
+                 for i, col in enumerate(df.columns):
+                     col_node_id = f'col_{id(df)}_{i}_{col}'
+                     label = f'{col} ({df.iloc[0][col]})' if show_value else col
+                     c.node(col_node_id, label=label)
+                     all_col_nodes.add(col_node_id)
+ 
+                     # Force vertical layout: keep two columns of the same df from being placed side by side, which makes the edges hard to read
+                     # if prev_col_node_id:
+                     #     c.edge(prev_col_node_id, col_node_id, style='invis')
+                     # prev_col_node_id = col_node_id
+ 
+                 # Force vertical layout: keep two columns of the same df from being placed side by side, which makes the edges hard to read
+                 if len(df.columns) > 1:
+                     with c.subgraph() as s:
+                         s.attr(rank='same')
+                         for i, col in enumerate(df.columns):
+                             col_node_id = f'col_{id(df)}_{i}_{col}'
+                             s.node(col_node_id)
+ 
+         def build_graph_recursive(current_df):
+             """Build the graph recursively"""
+             # Add the current DataFrame node
+             add_dataframe_node(current_df)
+ 
+             # Process upstream nodes
+             for input_df in current_df.input_dfs:
+                 build_graph_recursive(input_df)
+ 
+                 # Create column-to-column edges according to actual_mappings
+                 # Note: current_df has no duplicated column names internally, and neither do the input_dfs among themselves;
+                 # but current_df and the input_dfs may share column names
+                 for feature, expression in current_df.actual_mappings.items():
+                     create_edges_from_mapping(feature, expression, input_df, current_df)
+ 
+             # Handle edges internal to this node
+             for feature, expression in current_df.actual_mappings.items():
+                 create_edges_from_mapping(feature, expression, current_df, current_df)
+ 
+         # Create edges based on the feature mapping information
+         def create_edges_from_mapping(feature, expression, input_df, current_df):
+             target_feature = feature
+ 
+             # Extract the input columns from the expression
+             input_cols = _extract_column_names(expression)
+ 
+             # Find the matching columns in the upstream DataFrame and create the edges
+             for input_col in input_cols:
+                 # Avoid self-loops on current_df's own columns
+                 if current_df is input_df and target_feature == input_col:
+                     continue
+                 if input_col in input_df.columns:
+                     target_idx = list(current_df.columns).index(target_feature)
+                     target_node_id = f'col_{id(current_df)}_{target_idx}_{target_feature}'
+ 
+                     # Find the ID of the source column node in the upstream DataFrame
+                     source_idx = list(input_df.columns).index(input_col)
+                     source_node_id = f'col_{id(input_df)}_{source_idx}_{input_col}'
+ 
+                     # Create the edge, avoiding duplicates
+                     edge_key = (source_node_id, target_node_id)
+                     if edge_key not in visited_edges:
+                         if input_df is current_df:
+                             dot.edge(source_node_id, target_node_id, color='gray')  # edges inside the same DataFrame are gray
+                         else:
+                             dot.edge(source_node_id, target_node_id)
+                         visited_edges.add(edge_key)
+                         output_sources.add(source_node_id)
+ 
+         # Build the graph starting from the current DataFrame
+         build_graph_recursive(self)
+ 
+         # If highlight_useless_column is enabled, highlight columns without an outgoing edge
+         if highlight_useless_column:
+             no_output_nodes = all_col_nodes - output_sources
+             for node_id in no_output_nodes:
+                 dot.node(node_id, fillcolor='yellow')
+ 
+         try:
+             # Render the image; the filename passed to render must not include the extension
+             dot.render(os.path.splitext(filename)[0], cleanup=True)  # cleanup=True removes temporary files
+             print(f"Data flow graph saved to: {filename}")
+             return dot
+         except ExecutableNotFound as e:
+             print(f"Warning: the graphviz application is not installed; please download and install it first. {e}")
+             return None
+ 
+     # def trace_unused_columns(self):
+     #     """
+     #     Implement the trace_redundant_columns method to identify redundant columns in the whole processing pipeline.
+     #     Return a dict whose keys are processor names and whose values are the lists of unused features in that processor.
+     #     """
+     #
+     #     # Result dict: key is the processor name, value is the list of unused columns
+     #     redundant_dict = {}
+     #
+     #     # Iterate over all processors
+     #     for processor in processors:
+     #         # Collect all feature columns defined in this processor
+     #         # if 'feature' in processor.columns:
+     #         #     processor_columns = set(processor['feature'].dropna())
+     #         #     processor_columns.discard('*')  # exclude the wildcard
+     #         processor_columns = set(pd.DataFrame(processor.actual_mappings)['feature'])
+     #
+     #         # Collect the columns this processor actually uses (i.e. columns referenced by other feature expressions)
+     #         used_in_expressions = set()
+     #         for proc in processors:
+     #             for mapping in proc.actual_mappings:
+     #                 expr_cols = _extract_column_names(mapping['expression'])
+     #                 used_in_expressions.update(expr_cols)
+     #
+     #         # Find the columns of this processor that are never used
+     #         redundant_columns = processor_columns - used_in_expressions
+     #
+     #         # Store in the result dict
+     #         if processor.name:
+     #             redundant_dict[processor.name] = list(sorted(redundant_columns))
+     #         else:
+     #             redundant_dict[f"Unnamed_Processor_{id(processor)}"] = list(sorted(redundant_columns))
+     #
+     #     # Print the result
+     #     print("Redundant Columns by Processor:")
+     #     for processor_name, columns in redundant_dict.items():
+     #         if columns:
+     #             print(f"  [{processor_name}]:")
+     #             for col in columns:
+     #                 print(f"    - {col}")
+     #         else:
+     #             print(f"  [{processor_name}]: (no redundant columns)")
+     #
+     #     return redundant_dict
+ 
+ 
+     # Ensure the types of the DataFrame's timestamp and device columns; the timestamp serves as the index.
+     # Convert object columns to the string dtype, because object columns are not supported by eval().
+     def format_columns(self) -> MyDataFrame:
+         result = MyDataFrame(self)
+         result.name = self.name
+         if 'k_ts' in result.columns:
+             result['k_ts'] = pd.to_datetime(result['k_ts'])
+             # If k_ts is both an index level and a regular column, merge fails ('k_ts' is both an index level and a column label, which is ambiguous.)
+             # If k_ts is only an index, df['k_ts'] raises a KeyError
+             # result = result.set_index(['k_ts'], drop=True)
+         if 'k_device' in result.columns:
+             result['k_device'] = result['k_device'].astype(str)
+ 
+         # Convert object columns to the string dtype to avoid errors inside eval()
+         object_cols = result.select_dtypes(include=['object']).columns
+         result[object_cols] = result[object_cols].astype('string')
+ 
+         # Strip leading/trailing spaces from column names to prevent hard-to-spot errors
+         result.columns = result.columns.str.strip()
+ 
+         # Sort the column names to make debugging comparisons easier
+         result = _sort_columns(result)
+ 
+         return result
+ 
+ 
+ # Print the data related to a given feature of a given processor
+ def print_processor(processor_name, feature_name):
+     found = False
+     for processor in mydataframes:
+         if processor.name == processor_name:
+             for mapping in processor.actual_mappings:
+                 if mapping['feature'] == feature_name:
+                     found = True
+                     print(f"[{processor.name}]")
+                     print(f"{feature_name:<10}{mapping['feature_value'].tolist()}")
+                     values = mapping['expression_values']
+                     for key, value in values.items():
+                         print(f"{key:<10}{value.tolist()}")
+     if not found:
+         print(f"Feature [{feature_name}] not found in processor [{processor_name}]")
+ 
+ 
+ 
+ # Check whether a column name is legal
+ def _validate_var_name(var_name: str):
+     forbidden_chars = {'.', '[', ']', '-', '+', '*', '/', '\\', '%', '&'}
+     if any(char in forbidden_chars for char in var_name):
+         raise ValueError(f"Variable name '{var_name}' contains illegal characters")
+ 
+ 
+ # Try the numexpr engine first; if it fails, fall back to the python engine
+ def _eval(df: pd.DataFrame, expression: str):
+     result = None
+ 
+     # DataFrame.eval() does not support where expressions, so implement them here
+     if expression.startswith('where'):
+         args = _parse_where_args(expression)
+         if len(args) == 3:
+             return np.where(_eval(df, args[0]), _eval(df, args[1]), _eval(df, args[2]))
+         else:
+             raise ValueError(f"Invalid where expression format: {expression}")
+ 
+     try:
+         result = df.eval(expression, engine='numexpr')
+     except Exception as e:
+         # numexpr does not support strings and some other operations; fall back to the python engine (slower)
+         # Typical error messages: 'unknown type object', 'unknown type datetimedelta64[ns]'
+         try:
+             result = df.eval(expression, engine='python')
+         except Exception as e:
+             # If the python engine also fails, raise
+             cols = _extract_column_names(expression)
+             print('\nInput data related to the failed expression:')
+             print(df[cols])
+             raise Exception(f'Expression {expression} failed (python): {e}')
+     return result
+ 
+ 
+ # To handle nested where() calls, the original regex-based approach was replaced by manual parsing
+ def _parse_where_args(s):
+     if not s.startswith('where(') or not s.endswith(')'):
+         raise ValueError("Not a where expression")
+     # Strip the leading 'where(' and the trailing ')'
+     inner = s[6:-1]
+     args = []
+     paren_level = 0
+     current = []
+     for char in inner:
+         if char == ',' and paren_level == 0:
+             args.append(''.join(current).strip())
+             current = []
+         else:
+             if char == '(':
+                 paren_level += 1
+             elif char == ')':
+                 paren_level -= 1
+             current.append(char)
+     args.append(''.join(current).strip())  # the last argument
+     return args
+ 
+ 
+ def _extract_column_names(expr: str):
+     if expr.startswith('where'):
+         args = _parse_where_args(expr)
+         # FIXME: choose arg[1] or arg[2] depending on the actual case
+         return []  # FIXME
+ 
+     # FIXME: expressions containing @pd cannot be parsed (e.g. @pd.shape[0])
+     if '@' in expr:
+         return []  # FIXME
+ 
+     tree = ast.parse(expr, mode='eval')
+     names = set()
+ 
+     class NameVisitor(ast.NodeVisitor):
+         def visit_Name(self, node):
+             names.add(node.id)
+             self.generic_visit(node)
+ 
+     NameVisitor().visit(tree)
+     return sorted(names)  # or return names (a set) directly
+ 
+ 
+ # Sort columns alphabetically
+ def _sort_columns(df: pd.DataFrame):
+     cols = sorted(df.columns)
+     if 'k_device' in cols:
+         cols = ['k_device'] + [col for col in cols if col != 'k_device']
+     if 'k_ts' in cols:
+         cols = ['k_ts'] + [col for col in cols if col != 'k_ts']
+     # Avoid the df[cols] form so that no trace record is produced
+     return df.reindex(columns=cols)
k2pipe/pipe.py ADDED
@@ -0,0 +1,168 @@
+ import ast
+ 
+ import numpy as np
+ import pandas as pd
+ from loguru import logger
+ 
+ ##### ##### ##### ##### ##### #####
+ ##### To be replaced by MyDataFrame later #####
+ ##### ##### ##### ##### ##### #####
+ 
+ # Initialization: attach the custom methods to DataFrame
+ def init_pipe():
+     pd.DataFrame.extract_features = extract_features
+     pd.DataFrame.format_columns = format_columns
+ 
+ 
+ # Extract features from a DataFrame according to a config file
+ def extract_features(self, config: pd.DataFrame) -> pd.DataFrame:
+     result = self.copy()
+     result.columns = result.columns.str.strip()  # guard against leading/trailing spaces in column names, which cause hard-to-diagnose errors
+     for _, row in config.iterrows():
+         # Skip comment rows
+         if row[0].startswith('#'):
+             continue
+ 
+         feature_name = row['feature']
+         if not pd.isna(feature_name):
+             feature_name = feature_name.strip()
+         else:
+             raise ValueError(f"Feature name must not be empty {row}, line: {_}")
+ 
+         if feature_name == '*':
+             continue
+ 
+         _validate_var_name(feature_name)
+ 
+         expression = row['expression']
+         if not pd.isna(expression):
+             expression = expression.strip()
+         else:
+             result[feature_name] = np.nan
+             continue
+ 
+         # eval easily fails on non-numeric types, so assign directly in that case
+         if feature_name == expression:
+             result[feature_name] = result[expression]
+             continue
+ 
+         result[feature_name] = _eval(result, expression)
+ 
+     # Drop columns that exist in self but are not defined in the config.
+     # If the config contains *, all original columns are kept, so skip this step.
+     if not '*' in config['feature'].values:
+         config_columns = set(config['feature'].dropna())
+         original_columns = set(self.columns)
+         columns_to_drop = original_columns - config_columns
+         result = result.drop(columns=columns_to_drop, errors='ignore')
+ 
+     result = _sort_columns(result)
+ 
+     return result
+ 
+ 
+ # Try the numexpr engine first; if it fails, fall back to the python engine
+ def _eval(df: pd.DataFrame, expression: str):
+     result = None
+ 
+     # DataFrame.eval() does not support where expressions, so implement them here
+     if expression.startswith('where'):
+         args = _parse_where_args(expression)
+         if len(args) == 3:
+             return np.where(_eval(df, args[0]), _eval(df, args[1]), _eval(df, args[2]))
+         else:
+             raise ValueError(f"Invalid where expression format: {expression}")
+ 
+     try:
+         result = df.eval(expression, engine='numexpr')
+     except Exception as e:
+         # numexpr does not support strings and some other operations; fall back to the python engine (slower)
+         # Typical error messages: 'unknown type object', 'unknown type datetimedelta64[ns]'
+         try:
+             result = df.eval(expression, engine='python')
+         except Exception as e:
+             cols = _extract_column_names(expression)
+             print('\nInput data related to the failed expression:')
+             print(df[cols])
+             raise Exception(f'Expression {expression} failed (python): {e}')
+     return result
+ 
+ 
+ # Ensure the types of the DataFrame's timestamp and device columns; the timestamp serves as the index.
+ # Convert object columns to the string dtype, because object columns are not supported by eval().
+ def format_columns(self) -> pd.DataFrame:
+     result = self.copy()
+     if 'k_ts' in result.columns:
+         result['k_ts'] = pd.to_datetime(result['k_ts'])
+     if 'k_device' in result.columns:
+         result['k_device'] = result['k_device'].astype(str)
+     # result = result.set_index(['k_ts'], drop=False)
+ 
+     object_cols = result.select_dtypes(include=['object']).columns
+     result[object_cols] = result[object_cols].astype('string')
+ 
+     result = _sort_columns(result)
+ 
+     return result
+ 
+ 
+ def _extract_column_names(expr: str):
+     if expr.startswith('where'):
+         args = _parse_where_args(expr)
+         # FIXME: choose arg[1] or arg[2] depending on the actual case
+         return []  # FIXME
+ 
+     # FIXME: expressions containing @pd cannot be parsed (e.g. @pd.shape[0])
+     if '@' in expr:
+         return []  # FIXME
+ 
+     tree = ast.parse(expr, mode='eval')
+     names = set()
+ 
+     class NameVisitor(ast.NodeVisitor):
+         def visit_Name(self, node):
+             names.add(node.id)
+             self.generic_visit(node)
+ 
+     NameVisitor().visit(tree)
+     return sorted(names)  # or return names (a set) directly
+ 
+ 
+ # Sort columns alphabetically
+ def _sort_columns(df: pd.DataFrame):
+     cols = sorted(df.columns)
+     if 'k_device' in cols:
+         cols = ['k_device'] + [col for col in cols if col != 'k_device']
+     if 'k_ts' in cols:
+         cols = ['k_ts'] + [col for col in cols if col != 'k_ts']
+     return df[cols]
+ 
+ 
+ # To handle nested where() calls, the original regex-based approach was replaced by manual parsing
+ def _parse_where_args(s):
+     if not s.startswith('where(') or not s.endswith(')'):
+         raise ValueError("Not a where expression")
+     # Strip the leading 'where(' and the trailing ')'
+     inner = s[6:-1]
+     args = []
+     paren_level = 0
+     current = []
+     for char in inner:
+         if char == ',' and paren_level == 0:
+             args.append(''.join(current).strip())
+             current = []
+         else:
+             if char == '(':
+                 paren_level += 1
+             elif char == ')':
+                 paren_level -= 1
+             current.append(char)
+     args.append(''.join(current).strip())  # the last argument
+     return args
+ 
+ 
+ def _validate_var_name(var_name: str):
+     forbidden_chars = {'.', '[', ']', '-', '+', '*', '/', '\\', '%', '&'}
+     if any(char in forbidden_chars for char in var_name):
+         raise ValueError(f"Variable name '{var_name}' contains illegal characters")
+     return True
k2pipe-0.1.9.dist-info/METADATA ADDED
@@ -0,0 +1,255 @@
+ Metadata-Version: 2.4
+ Name: k2pipe
+ Version: 0.1.9
+ Summary: k2pipe
+ Home-page: https://www.k2data.com.cn
+ Author: K2data
+ Author-email: admin@k2data.com.cn
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Operating System :: OS Independent
+ Classifier: Topic :: Software Development :: Libraries
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: pandas
+ Requires-Dist: numexpr
+ Requires-Dist: loguru
+ Requires-Dist: graphviz
+ Requires-Dist: pytest
+ Requires-Dist: pytest-mock
+ Requires-Dist: pytest-cov
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+ 
+ # Pipe SDK (provisional name)
+ Simplifies the common operations of data-processing pipelines through configuration, making the business logic of data processing more intuitive and robust.
+ 
+ ## 1. Installation
+ 
+ Requires Python 3.8 or above; version 3.8.10 is recommended.
+ 
+ ```
+ pip install -U k2pipe
+ ```
+ 
+ ## 2. Usage
+ 
+ ### 2.1 Data processing
+ 
+ The `extract_features()` method processes the current DataFrame according to the computation rules in the config and returns the resulting DataFrame.
+ 
+ #### 2.1.1 Config file
+ 
+ The config DataFrame is usually loaded from a csv file; it defines the rules for transforming the original DataFrame. Each row of the config defines one (new) column of the resulting DataFrame, where feature is the column name and expression is the formula that computes it.
+ 
+ The configured columns are processed in order, so an expression may reference columns of the original DataFrame as well as columns defined in earlier rows.
+ 
+ | feature | expression | comment |
+ |---------|------------------------------------|------------------------------|
+ | feat1 | col1 / 100 - col2 | basic arithmetic |
+ | feat2 | col2.rolling(3).mean() | rolling window by row count |
+ | feat3 | col2.rolling('3D').mean() | rolling window by time (the DataFrame must have a datetime index) |
+ | feat4 | feat1.my_func() | custom function (must be registered in Python code) |
+ | feat5 | "where(col1 > 50, col1, col2 * 2)" | conditional assignment (where may be nested) |
+ | # | comment text | comment row |
+ | feat6 | (col3 - k_ts).dt.days | datetime arithmetic |
+ | * | | keep all columns of the original DataFrame |
+ | feat7 | k_device.str[1] | string operation |
+ | feat8 | "where(col1.isna(), 1, 2)" | null check |
+ | feat9 | @df.shape[0] | access an attribute of the DataFrame being processed |
+ | feat10 | feat01.round() | rounding |
+ | feat11 | k_ts + col1.astype('timedelta64[s]') | datetime arithmetic 1 (constant time offsets cannot be computed this way) |
+ | feat12 | k_ts.time_shift('-10s') | datetime arithmetic 2 (with a constant time offset) |
+ 
+ Notes:
+ - If an expression or comment contains a comma, it must be wrapped in double quotes, with no space before the opening quote
+ - Expressions do not support the apply() function
+ - Built-in variables: @df
+ - Built-in functions: time_shift()
+ - More examples: [K2Pipe Examples](https://gitlab.kstonedata.k2/zhanghao/k2pipe/tree/develop/tests/example)
+ - Expression syntax reference: [Mastering Eval Expressions in Pandas](https://www.sparkcodehub.com/pandas/advanced/eval-expressions-guide)
+ 
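+ For illustration, a minimal configuration with the `feature` / `expression` / `comment` columns described above could also be built directly as a DataFrame instead of loading a csv file (the specific rows here are only a made-up sketch):
+ 
+ ```python
+ import pandas as pd
+ 
+ # Hypothetical minimal config; normally this is loaded via pd.read_csv('my_feat.csv')
+ config = pd.DataFrame([
+     {'feature': 'feat1', 'expression': 'col1 / 100 - col2',                'comment': 'basic arithmetic'},
+     {'feature': 'feat5', 'expression': 'where(col1 > 50, col1, col2 * 2)', 'comment': 'conditional assignment'},
+     {'feature': '*',     'expression': None,                               'comment': 'keep original columns'},
+ ])
+ ```
+ 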
+ 
+ #### 2.1.2 Python code
+ 
+ Transforms the original DataFrame into the resulting DataFrame according to the processing rules defined in the config.
+ 
+ ```python
+ import pandas as pd
+ from pathlib import Path
+ from k2pipe.mypipe import MyDataFrame
+ 
+ # Sample data
+ df = MyDataFrame({'k_device': ['dev1', 'dev1', 'dev1', 'dev2', 'dev2', 'dev2'],
+                   'col1': [50, 60, 70, 80, 90, 100],
+                   'col2': [1, 2, 3, 4, 5, 6]})
+ df['k_ts'] = pd.date_range(start='2025-01-01 08:00:00', periods=len(df), freq='S')
+ df['col3'] = pd.date_range(start='2025-01-01 12:00:00', periods=len(df), freq='S')
+ df.set_index('k_ts', inplace=True)
+ 
+ # Config
+ config = pd.read_csv(Path(__file__).parent / 'my_feat.csv')
+ df.columns = df.columns.str.strip()
+ 
+ # Register a custom function
+ pd.Series.my_func = (lambda x: x.rolling(3).mean())
+ 
+ # Process the data
+ result = df.extract_features(config)
+ ```
+ 
+ #### 2.1.3 Output
+ 
+ Original data:
+ 
+ ```
+ k_device col1 col2 col3
+ k_ts
+ 2025-01-01 08:00:00 dev1 50 1 2025-01-01 12:00:00
+ 2025-01-01 08:00:01 dev1 60 2 2025-01-01 12:00:01
+ 2025-01-01 08:00:02 dev1 70 3 2025-01-01 12:00:02
+ 2025-01-01 08:00:03 dev2 80 4 2025-01-01 12:00:03
+ 2025-01-01 08:00:04 dev2 90 5 2025-01-01 12:00:04
+ 2025-01-01 08:00:05 dev2 100 6 2025-01-01 12:00:05
+ 
+ ```
+ 
+ Result data:
+ 
+ ```
+ k_device feat01 feat02 feat03 feat04 feat05 feat06 feat07 feat08 feat09 feat10 feat11
+ k_ts
+ 2025-01-01 08:00:00 dev1 -0.5 NaN 1.0 NaN 2 0 e 2 6 -0.0 2025-01-01 08:00:10
+ 2025-01-01 08:00:01 dev1 -1.4 NaN 1.5 NaN 60 0 e 2 6 -1.0 2025-01-01 08:00:11
+ 2025-01-01 08:00:02 dev1 -2.3 2.0 2.0 -1.4 70 0 e 2 6 -2.0 2025-01-01 08:00:12
+ 2025-01-01 08:00:03 dev2 -3.2 3.0 2.5 -2.3 80 0 e 2 6 -3.0 2025-01-01 08:00:13
+ 2025-01-01 08:00:04 dev2 -4.1 4.0 3.0 -3.2 90 0 e 2 6 -4.0 2025-01-01 08:00:14
+ 2025-01-01 08:00:05 dev2 -5.0 5.0 3.5 -4.1 100 0 e 2 6 -5.0 2025-01-01 08:00:15
+ ```
+ 
+ ### 2.2 Normalizing the data format
+ 
+ The `format_columns()` method converts the `k_ts` column of the input DataFrame to a datetime type and the `k_device` column to a string type.
+ 
+ ```python
+ import pandas as pd
+ from k2pipe.mypipe import MyDataFrame
+ 
+ df = MyDataFrame(...)
+ df = df.format_columns()
+ ```
+ 
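+ A small, self-contained sketch of what the conversion does (the column values are made up; only the `k_ts` / `k_device` naming convention is taken from above):
+ 
+ ```python
+ import pandas as pd
+ from k2pipe.mypipe import MyDataFrame
+ 
+ # Hypothetical raw data: k_ts as text, k_device as integers, plus a free-text object column
+ raw = MyDataFrame({'k_ts': ['2025-01-01 08:00:00', '2025-01-01 08:00:01'],
+                    'k_device': [1, 2],
+                    'note': ['ok', 'warn']})
+ formatted = raw.format_columns()
+ # k_ts becomes datetime64; k_device and other object columns end up with the pandas string dtype
+ print(formatted.dtypes)
+ ```
+ 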
+ 
+ ### 2.3 Data flow analysis
+ 
+ The `generate_dataflow()` method generates a data flow graph in svg format. This feature depends on the `graphviz` package; see [here](https://pypi.org/project/graphviz/) for installation instructions.
+ 
+ ```python
+ df1 = MyDataFrame({'k_device': ['dev1', 'dev1', 'dev1', 'dev2', 'dev2', 'dev2'],
+                    'col1': [50, 60, 70, 80, 90, 100],
+                    'col2': [1, 2, 3, 4, 5, 6]})
+ df1['k_ts'] = pd.date_range(start='2025-01-01 08:00:00', periods=len(df1), freq='S')
+ df2 = df1.copy()
+ df1 = df1.rename(columns={'col1': 'col3', 'col2': 'col4'})
+ result = df1.merge(df2, on=['k_device', 'k_ts'])
+ result.generate_dataflow(filename='dataflow_merge.svg', show_value=False)
+ ```
+ 
+ The resulting data flow graph (grey boxes represent DataFrame instances, white boxes represent data columns, i.e. Series, and yellow boxes represent data columns with no outgoing edge):
+ 
+ ![merge](https://gitlab.kstonedata.k2/zhanghao/k2pipe/raw/dev_hao/static/dataflow_merge.png)
+ 
+ 
+ ### 2.4 Wide/long table conversion
+ 
+ The `wide_to_long()` and `long_to_wide()` methods convert between the long-table and wide-table layouts.
+ 
+ Wide table example:
+ 
+ ```
+ k_ts k_device f1_mean_3D f1_mean_5D f1_slope_3D f1_slope_5D f2_mean_3D f2_mean_5D f2_slope_3D f2_slope_5D
+ 0 2025-01-01 dev1 8 24 67 87 79 48 10 94
+ 1 2025-01-01 dev2 52 98 53 66 98 14 34 24
+ 2 2025-01-02 dev1 15 60 58 16 9 93 86 2
+ 3 2025-01-02 dev2 27 4 31 1 13 83 4 91
+ 4 2025-01-03 dev1 59 67 7 49 47 65 61 14
+ 5 2025-01-03 dev2 55 71 80 2 94 19 98 63
+ ```
+ 
+ Long table example:
+ 
+ ```
+ k_ts k_device feature measure period value
+ 0 2025-01-01 dev1 f1 mean 3D 8
+ 1 2025-01-01 dev1 f1 mean 5D 24
+ 2 2025-01-01 dev1 f1 slope 3D 67
+ 3 2025-01-01 dev1 f1 slope 5D 87
+ 4 2025-01-01 dev1 f2 mean 3D 79
+ 5 2025-01-01 dev1 f2 mean 5D 48
+ 6 2025-01-01 dev1 f2 slope 3D 10
+ 7 2025-01-01 dev1 f2 slope 5D 94
+ 8 2025-01-01 dev2 f1 mean 3D 52
+ 9 2025-01-01 dev2 f1 mean 5D 98
+ 10 2025-01-01 dev2 f1 slope 3D 53
+ 11 2025-01-01 dev2 f1 slope 5D 66
+ 12 2025-01-01 dev2 f2 mean 3D 98
+ 13 2025-01-01 dev2 f2 mean 5D 14
+ 14 2025-01-01 dev2 f2 slope 3D 34
+ 15 2025-01-01 dev2 f2 slope 5D 24
+ 16 2025-01-02 dev1 f1 mean 3D 15
+ 17 2025-01-02 dev1 f1 mean 5D 60
+ 18 2025-01-02 dev1 f1 slope 3D 58
+ 19 2025-01-02 dev1 f1 slope 5D 16
+ 20 2025-01-02 dev1 f2 mean 3D 9
+ 21 2025-01-02 dev1 f2 mean 5D 93
+ 22 2025-01-02 dev1 f2 slope 3D 86
+ 23 2025-01-02 dev1 f2 slope 5D 2
+ 24 2025-01-02 dev2 f1 mean 3D 27
+ 25 2025-01-02 dev2 f1 mean 5D 4
+ 26 2025-01-02 dev2 f1 slope 3D 31
+ 27 2025-01-02 dev2 f1 slope 5D 1
+ 28 2025-01-02 dev2 f2 mean 3D 13
+ 29 2025-01-02 dev2 f2 mean 5D 83
+ 30 2025-01-02 dev2 f2 slope 3D 4
+ 31 2025-01-02 dev2 f2 slope 5D 91
+ 32 2025-01-03 dev1 f1 mean 3D 59
+ 33 2025-01-03 dev1 f1 mean 5D 67
+ 34 2025-01-03 dev1 f1 slope 3D 7
+ 35 2025-01-03 dev1 f1 slope 5D 49
+ 36 2025-01-03 dev1 f2 mean 3D 47
+ 37 2025-01-03 dev1 f2 mean 5D 65
+ 38 2025-01-03 dev1 f2 slope 3D 61
+ 39 2025-01-03 dev1 f2 slope 5D 14
+ 40 2025-01-03 dev2 f1 mean 3D 55
+ 41 2025-01-03 dev2 f1 mean 5D 71
+ 42 2025-01-03 dev2 f1 slope 3D 80
+ 43 2025-01-03 dev2 f1 slope 5D 2
+ 44 2025-01-03 dev2 f2 mean 3D 94
+ 45 2025-01-03 dev2 f2 mean 5D 19
+ 46 2025-01-03 dev2 f2 slope 3D 98
+ 47 2025-01-03 dev2 f2 slope 5D 63
+ ```
+ 
+ Wide to long:
+ 
+ ```python
+ df_long = df_wide.wide_to_long()
+ ```
+ 
+ Long to wide:
+ 
+ ```python
+ df_wide = df_long.long_to_wide()
+ ```
+ 
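+ A minimal round trip, assuming a wide table that follows the `<feature>_<measure>_<period>` column naming shown above (the values are arbitrary):
+ 
+ ```python
+ import pandas as pd
+ from k2pipe.mypipe import MyDataFrame
+ 
+ df_wide = MyDataFrame({'k_ts': pd.to_datetime(['2025-01-01', '2025-01-02']),
+                        'k_device': ['dev1', 'dev1'],
+                        'f1_mean_3D': [8, 15],
+                        'f1_slope_3D': [67, 58]})
+ df_long = df_wide.wide_to_long()   # columns: k_ts, k_device, feature, measure, period, value
+ df_back = df_long.long_to_wide()   # back to one column per feature_measure_period
+ ```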
k2pipe-0.1.9.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ k2pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ k2pipe/mypipe.py,sha256=O8h65Polb-xDw6k8pT5iIvp9zrT9Urrmez8kvtdsrT4,34388
+ k2pipe/pipe.py,sha256=K6uXTOHgQvXchvOChRO72h37C7VJOuWbkGWNF6XKif0,5797
+ k2pipe-0.1.9.dist-info/METADATA,sha256=chHK4FCNsqwcH3YsejM67KcwXWoEq3e5ngDLNT1E9RU,11509
+ k2pipe-0.1.9.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+ k2pipe-0.1.9.dist-info/top_level.txt,sha256=0fzk4s78hGSUO5KiuC_PlFZOI_3Z2RMlNCjOVQSxxpI,7
+ k2pipe-0.1.9.dist-info/RECORD,,
k2pipe-0.1.9.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.3.1)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+ 
k2pipe-0.1.9.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ k2pipe