staran 0.6.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- staran/__init__.py +10 -0
- staran/tools/__init__.py +5 -5
- staran-1.0.0.dist-info/METADATA +301 -0
- staran-1.0.0.dist-info/RECORD +8 -0
- staran/banks/__init__.py +0 -30
- staran/banks/xinjiang_icbc/__init__.py +0 -90
- staran/engines/__init__.py +0 -65
- staran/engines/base.py +0 -255
- staran/engines/hive.py +0 -163
- staran/engines/spark.py +0 -252
- staran/engines/turing.py +0 -439
- staran/examples/__init__.py +0 -8
- staran/examples/aum_longtail.py +0 -250
- staran/examples/aum_longtail_old.py +0 -487
- staran/features/__init__.py +0 -59
- staran/features/engines.py +0 -284
- staran/features/generator.py +0 -603
- staran/features/manager.py +0 -155
- staran/features/schema.py +0 -193
- staran/models/__init__.py +0 -72
- staran/models/bank_configs.py +0 -269
- staran/models/config.py +0 -271
- staran/models/daifa_models.py +0 -361
- staran/models/registry.py +0 -281
- staran/models/target.py +0 -321
- staran/schemas/__init__.py +0 -27
- staran/schemas/aum/__init__.py +0 -210
- staran/schemas/document_generator.py +0 -350
- staran/tools/document_generator.py +0 -350
- staran-0.6.0.dist-info/METADATA +0 -564
- staran-0.6.0.dist-info/RECORD +0 -33
- {staran-0.6.0.dist-info → staran-1.0.0.dist-info}/WHEEL +0 -0
- {staran-0.6.0.dist-info → staran-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {staran-0.6.0.dist-info → staran-1.0.0.dist-info}/top_level.txt +0 -0
staran/engines/turing.py
DELETED
@@ -1,439 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
"""
|
5
|
-
图灵平台引擎
|
6
|
-
继承Spark引擎,重写执行和下载方法以使用turingPythonLib
|
7
|
-
"""
|
8
|
-
|
9
|
-
from typing import Dict, Any, Optional, List, Callable
|
10
|
-
import sys
|
11
|
-
import os
|
12
|
-
from datetime import datetime
|
13
|
-
from .spark import SparkEngine
|
14
|
-
|
15
|
-
# Try to import turingPythonLib, which is only importable inside the Turing
# platform environment. "/nfsHome/" is appended to the module search path
# before the import is attempted.
try:
    sys.path.append("/nfsHome/")
    import turingPythonLib as tp
except ImportError:
    # Library missing: engines defined in this module run in simulation mode.
    tp = None
    TURINGLIB_AVAILABLE = False
else:
    TURINGLIB_AVAILABLE = True
|
23
|
-
|
24
|
-
|
25
|
-
class TuringEngine(SparkEngine):
    """
    Turing platform engine.

    Extends the Spark engine and routes SQL execution, table creation and
    data download through turingPythonLib. When turingPythonLib could not be
    imported (``TURINGLIB_AVAILABLE`` is False) every operation runs in a
    simulation mode that only reports what would have been done.
    """

    # Resources attached to cluster-mode jobs when the caller does not supply
    # a (truthy) ``spark_resource``. Always copied before use so the shared
    # class-level dict is never handed out or mutated.
    _DEFAULT_SPARK_RESOURCE: Dict[str, str] = {
        'num_executors': '4',
        'driver_cores': '2',
        'driver_memory': '4G',
        'executor_cores': '2',
        'executor_memory': '4G'
    }

    def __init__(self, database_name: str, sql_executor: Optional[Callable] = None):
        """
        Create the engine for ``database_name``.

        Args:
            database_name: Database name forwarded to the Spark engine.
            sql_executor: Accepted for interface compatibility but ignored;
                execution goes through turingPythonLib instead.
        """
        super().__init__(database_name, None)

        # Warn once at construction time if we will only be simulating.
        if not TURINGLIB_AVAILABLE:
            print("⚠️ 警告: turingPythonLib不可用,将使用模拟模式")

    def get_engine_name(self) -> str:
        """Return the display name of this engine."""
        return "Turing Platform (Spark)"

    # ==================== Private helpers ====================

    @classmethod
    def _apply_spark_resource(cls, params: Dict[str, Any], mode: str,
                              spark_resource: Optional[Dict[str, str]]) -> None:
        """Attach a Spark resource configuration to ``params`` for cluster-mode jobs."""
        if mode != 'cluster':
            # Non-cluster jobs never carry a resource configuration,
            # even if one was supplied.
            return
        if spark_resource:
            params['spark_resource'] = spark_resource
        else:
            params['spark_resource'] = dict(cls._DEFAULT_SPARK_RESOURCE)

    @staticmethod
    def _validate_output_path(output_path: str) -> None:
        """Raise ValueError unless ``output_path`` starts with 'file:///nfsHome/'."""
        if not output_path.startswith('file:///nfsHome/'):
            raise ValueError("输出路径必须以 'file:///nfsHome/' 开头")

    @classmethod
    def _build_download_params(cls, sql: str, source: str, output_path: str,
                               overwrite_path: str, mode: str,
                               spark_resource: Optional[Dict[str, str]]) -> Dict[str, Any]:
        """Build the parameter dict handed to ``tp.download``."""
        params = {
            'sql': sql,
            'source': source,
            'outputPath': output_path,
            'overwrite_path': overwrite_path,
            'mode': mode
        }
        cls._apply_spark_resource(params, mode, spark_resource)
        return params

    def _run_download(self, params: Dict[str, Any], output_path: str,
                      identity: Dict[str, Any], success_message: str,
                      simulated_message: str, failure_prefix: str) -> Dict[str, Any]:
        """
        Run (or simulate) a download and normalise the outcome into a result dict.

        ``identity`` holds extra keys (e.g. ``table_name``) placed right after
        ``status``/``message`` in every result. Exceptions are never propagated;
        they are reported as ``status == 'error'``.
        """
        try:
            if not TURINGLIB_AVAILABLE:
                # Simulation mode: nothing is downloaded.
                return {
                    'status': 'simulated',
                    'message': simulated_message,
                    **identity,
                    'output_path': output_path,
                    'turinglib_result': {'success': '0', 'message': '模拟下载成功'},
                    'params': params,
                    'simulated': True
                }

            tp_result = tp.download(params)

            # The download is treated as successful only when the result is a
            # dict whose 'success' field equals the string '0'.
            if isinstance(tp_result, dict):
                if tp_result.get('success') == '0':
                    return {
                        'status': 'success',
                        'message': success_message,
                        **identity,
                        'output_path': output_path,
                        'turinglib_result': tp_result,
                        'params': params
                    }
                detail = tp_result.get('data', '未知错误')
            else:
                # A non-dict result has no 'data' field to read. Report a
                # plain failure (keeping the raw result) instead of tripping
                # over AttributeError on ``.get``.
                detail = '未知错误'

            return {
                'status': 'error',
                'message': f"下载失败: {detail}",
                **identity,
                'turinglib_result': tp_result,
                'params': params
            }

        except Exception as e:
            return {
                'status': 'error',
                'message': f"{failure_prefix}: {str(e)}",
                **identity,
                'error': str(e),
                'params': params
            }

    # ==================== SQL execution overrides ====================

    def execute_sql(self, sql: str, description: str = "") -> Any:
        """
        Execute SQL through turingPythonLib.

        Every call appends exactly one entry to ``self.execution_history``.

        Args:
            sql: SQL statement.
            description: Description of the statement.

        Returns:
            The raw ``tp.execute_sql`` result on success; a dict with
            ``status == 'error'`` if that call raised; or a dict with
            ``status == 'simulated'`` when turingPythonLib is unavailable.
        """
        if TURINGLIB_AVAILABLE:
            platform = 'turingPythonLib'
            try:
                result = tp.execute_sql(sql)
            except Exception as e:
                # Failures are returned as a result dict rather than raised.
                result = {
                    'status': 'error',
                    'message': f"执行SQL失败: {str(e)}",
                    'error': str(e)
                }
        else:
            # Simulation mode: only echo a preview of the statement.
            platform = 'simulation'
            print(f"模拟执行SQL: {description or 'SQL语句'}")
            print(f" {sql[:100]}...")
            result = {
                'status': 'simulated',
                'message': '模拟执行成功',
                'sql': sql[:100] + '...'
            }

        self.execution_history.append({
            'sql': sql,
            'description': description,
            'timestamp': datetime.now(),
            'result': result,
            'platform': platform
        })

        return result

    def create_table(self, table_name: str, select_sql: str,
                     execute: bool = False, mode: str = "cluster",
                     spark_resource: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
        """
        Create a table from a SELECT statement through turingPythonLib.

        Args:
            table_name: Table name.
            select_sql: SELECT statement that defines the table contents.
            execute: Whether to run the creation immediately.
            mode: Run mode ('local' or 'cluster').
            spark_resource: Spark resource configuration (cluster mode only).

        Returns:
            A result dict whose ``status`` is 'prepared' (not executed),
            'simulated', 'success' or 'error'.
        """
        full_table_name = self.get_full_table_name(table_name)

        result = {
            'table_name': table_name,
            'full_table_name': full_table_name,
            'select_sql': select_sql,
            'executed': execute,
            'mode': mode
        }

        if not execute:
            result['status'] = 'prepared'
            return result

        if not TURINGLIB_AVAILABLE:
            # Simulation mode.
            result.update({
                'status': 'simulated',
                'message': f"模拟创建表: {full_table_name}",
                'simulated': True
            })
            return result

        # Parameters for tp.create_hive_table.
        params = {
            'create_mode': 'by_select',
            'table_name': full_table_name,
            'sql': select_sql,
            'run_mode': mode
        }
        self._apply_spark_resource(params, mode, spark_resource)

        try:
            tp_result = tp.create_hive_table(params)
        except Exception as e:
            result.update({
                'status': 'error',
                'message': f"创建表失败: {str(e)}",
                'error': str(e),
                'params': params
            })
        else:
            result.update({
                'status': 'success',
                'message': f"成功创建表: {full_table_name}",
                'turinglib_result': tp_result,
                'params': params
            })

        return result

    # ==================== Data download overrides ====================

    def download_table_data(self, table_name: str, output_path: str,
                            source: str = "hadoop", mode: str = "cluster",
                            columns: str = "*", condition: str = "",
                            overwrite_path: str = "yes",
                            spark_resource: Optional[Dict[str, str]] = None,
                            **kwargs) -> Dict[str, Any]:
        """
        Download a table's data through turingPythonLib.

        Args:
            table_name: Name of the table to download.
            output_path: Output path; must start with 'file:///nfsHome/'.
            source: Data source type ('hadoop' or 'mppdb').
            mode: Run mode ('local' or 'cluster').
            columns: Columns to select, "*" by default.
            condition: WHERE condition; the 'WHERE' keyword is added if absent.
            overwrite_path: Whether to overwrite the path ('yes' or 'no').
            spark_resource: Resource configuration for cluster mode.
            **kwargs: Accepted but not used.

        Returns:
            A result dict with ``status`` 'success', 'error' or 'simulated'.

        Raises:
            ValueError: If ``output_path`` does not start with 'file:///nfsHome/'.
        """
        self._validate_output_path(output_path)

        full_table_name = self.get_full_table_name(table_name)

        # NOTE: ``columns`` and ``condition`` are interpolated into the SQL
        # text verbatim, so they must come from trusted callers.
        sql = f"SELECT {columns} FROM {full_table_name}"
        if condition.strip():
            if not condition.upper().strip().startswith('WHERE'):
                condition = f"WHERE {condition}"
            sql += f" {condition}"

        params = self._build_download_params(
            sql, source, output_path, overwrite_path, mode, spark_resource
        )

        return self._run_download(
            params,
            output_path,
            identity={'table_name': table_name},
            success_message=f'数据已下载到: {output_path}',
            simulated_message=f'模拟下载到: {output_path}',
            failure_prefix='下载异常'
        )

    def download_query_result(self, sql: str, output_path: str,
                              source: str = "hadoop", mode: str = "cluster",
                              overwrite_path: str = "yes",
                              spark_resource: Optional[Dict[str, str]] = None,
                              **kwargs) -> Dict[str, Any]:
        """
        Download the result of an arbitrary query through turingPythonLib.

        Args:
            sql: Query SQL.
            output_path: Output path; must start with 'file:///nfsHome/'.
            source: Data source type.
            mode: Run mode.
            overwrite_path: Whether to overwrite the path.
            spark_resource: Resource configuration for cluster mode.
            **kwargs: Accepted but not used.

        Returns:
            A result dict with ``status`` 'success', 'error' or 'simulated'.

        Raises:
            ValueError: If ``output_path`` does not start with 'file:///nfsHome/'.
        """
        self._validate_output_path(output_path)

        params = self._build_download_params(
            sql, source, output_path, overwrite_path, mode, spark_resource
        )

        return self._run_download(
            params,
            output_path,
            identity={},
            success_message=f'查询结果已下载到: {output_path}',
            simulated_message=f'模拟下载查询结果到: {output_path}',
            failure_prefix='下载查询结果失败'
        )

    # ==================== Turing-platform-specific methods ====================

    def install_python_packages(self, packages: List[str]) -> Dict[str, Any]:
        """
        Install Python packages one by one via ``tp.pip_install``.

        A failure for one package is recorded and does not stop the others.

        Args:
            packages: Names of the packages to install.

        Returns:
            A dict with ``total_packages``, ``successful_installs`` (entries
            whose status is 'success'; simulated installs are not counted)
            and the per-package ``results`` list.
        """
        results = []

        for package in packages:
            try:
                if TURINGLIB_AVAILABLE:
                    tp.pip_install(package)
                    outcome = {
                        'package': package,
                        'status': 'success',
                        'message': f'成功安装 {package}'
                    }
                else:
                    outcome = {
                        'package': package,
                        'status': 'simulated',
                        'message': f'模拟安装 {package} (turingPythonLib不可用)'
                    }
            except Exception as e:
                outcome = {
                    'package': package,
                    'status': 'error',
                    'error': str(e),
                    'message': f'安装 {package} 失败'
                }
            results.append(outcome)

        return {
            'total_packages': len(packages),
            'successful_installs': sum(1 for r in results if r['status'] == 'success'),
            'results': results
        }

    def get_platform_info(self) -> Dict[str, Any]:
        """Return a snapshot of engine and runtime-environment information."""
        return {
            'engine_name': self.get_engine_name(),
            'engine_type': self.get_engine_type().value,
            'turinglib_available': TURINGLIB_AVAILABLE,
            'nfs_home_exists': os.path.exists('/nfsHome'),
            'database_name': self.database_name,
            'current_working_dir': os.getcwd(),
            'python_path': sys.path[:3]  # only the first three entries
        }
|
425
|
-
|
426
|
-
|
427
|
-
# 便捷创建函数
|
428
|
-
def create_turing_engine(database_name: str, **kwargs) -> TuringEngine:
    """
    Convenience factory for a Turing engine instance.

    Args:
        database_name: Database name.
        **kwargs: Extra keyword arguments forwarded to ``TuringEngine``.

    Returns:
        The newly constructed Turing engine.
    """
    engine = TuringEngine(database_name, **kwargs)
    return engine
|
staran/examples/__init__.py
DELETED
staran/examples/aum_longtail.py
DELETED
@@ -1,250 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
AUM代发长尾模型示例
|
3
|
-
基于Staran v0.3.0架构,使用schemas模块的预定义表结构
|
4
|
-
"""
|
5
|
-
|
6
|
-
from typing import Dict, Optional
|
7
|
-
from ..engines import create_turing_engine
|
8
|
-
from ..features import FeatureManager, FeatureConfig, FeatureType
|
9
|
-
from ..tools import Date
|
10
|
-
from ..schemas.aum import get_aum_schemas
|
11
|
-
|
12
|
-
|
13
|
-
class AUMLongtailExample:
    """Example pipeline for the AUM payroll (代发) long-tail model."""

    def __init__(self, database: str = "dwegdata03000"):
        """
        Initialise the example.

        Args:
            database: Database name, "dwegdata03000" by default.
        """
        self.database = database
        self.engine = create_turing_engine(database)
        self.schemas = get_aum_schemas()  # predefined table schemas from the schemas module

    def run(self, feature_date: Optional[str] = None, output_path: str = "file:///nfsHome/aum_longtail") -> Dict:
        """
        Run the complete feature-engineering flow.

        Args:
            feature_date: Feature date in YYYYMM form; defaults to the first
                six characters of today's compact date.
            output_path: Output location, file:///nfsHome/aum_longtail by default.

        Returns:
            A dict with one entry per generated table ('behavior',
            'asset_avg', 'asset_config', 'monthly_stat') plus 'exports'.
        """
        if feature_date is None:
            feature_date = Date.today().format_compact()[:6]

        print(f"🚀 开始AUM长尾模型特征工程 - {feature_date}")
        print("="*60)

        results = {}

        # Step 1: table A (behaviour) - raw-copy and aggregation features only.
        print("📊 步骤1: 生成客户行为特征...")
        results['behavior'] = self._generate_behavior_features('behavior', feature_date)

        # Step 2: table B (asset averages) - full feature set.
        print("💰 步骤2: 生成资产平均值特征...")
        results['asset_avg'] = self._generate_full_features('asset_avg', feature_date)

        # Step 3: table C (asset allocation) - full feature set.
        print("📈 步骤3: 生成资产配置特征...")
        results['asset_config'] = self._generate_full_features('asset_config', feature_date)

        # Step 4: table D (monthly statistics) - full feature set.
        print("📋 步骤4: 生成月度统计特征...")
        results['monthly_stat'] = self._generate_full_features('monthly_stat', feature_date)

        # Step 5: export the feature tables.
        print("💾 步骤5: 导出特征表...")
        results['exports'] = self._export_features(feature_date, output_path)

        print("="*60)
        print("✅ AUM长尾模型特征工程完成!")
        return results

    @staticmethod
    def _behavior_config() -> "FeatureConfig":
        """Build the table-A config: raw copy + aggregation, MoM and YoY disabled."""
        config = FeatureConfig()
        config.enable_feature(FeatureType.RAW_COPY)
        config.enable_feature(FeatureType.AGGREGATION)
        config.disable_feature(FeatureType.MOM)  # no month-over-month features
        config.disable_feature(FeatureType.YOY)  # no year-over-year features
        return config

    @staticmethod
    def _full_config() -> "FeatureConfig":
        """Build the table-B/C/D config: aggregation + MoM (window 5) + YoY (window 12)."""
        config = FeatureConfig()
        config.enable_feature(FeatureType.AGGREGATION)
        config.enable_feature(FeatureType.MOM, mom_windows=[5])
        config.enable_feature(FeatureType.YOY, yoy_windows=[12])
        return config

    def _generate(self, table_type: str, feature_date: str, config_factory, done_label: str) -> Dict:
        """Generate features for one schema and print progress; shared by both generators."""
        schema = self.schemas[table_type]
        manager = FeatureManager(self.engine, self.database)
        config = config_factory()

        print(f" 🔧 生成{schema.table_name}的特征...")
        result = manager.generate_features(
            schema=schema,
            config=config,
            feature_date=feature_date
        )

        feature_count = manager.count_features(schema, config)
        print(f" ✅ {done_label}表特征生成完成: {feature_count}个特征")
        return result

    def _generate_behavior_features(self, table_type: str, feature_date: str) -> Dict:
        """Generate behaviour features (table A): raw-copy and aggregation only."""
        return self._generate(table_type, feature_date, self._behavior_config, 'A')

    def _generate_full_features(self, table_type: str, feature_date: str) -> Dict:
        """Generate the full feature set (tables B, C, D): aggregation + MoM + YoY."""
        return self._generate(table_type, feature_date, self._full_config, table_type)

    def _export_features(self, feature_date: str, output_path: str) -> Dict:
        """Download every feature table below ``output_path``; returns results keyed by table type."""
        file_prefixes = {
            'behavior': 'aum_behavior_features',
            'asset_avg': 'aum_asset_avg_features',
            'asset_config': 'aum_asset_config_features',
            'monthly_stat': 'monthly_stat_features'
        }

        results = {}
        for table_type, file_prefix in file_prefixes.items():
            print(f" 💾 导出{table_type}表...")

            # Feature table name: <schema table>_<feature date>_f001.
            table_name = f"{self.schemas[table_type].table_name}_{feature_date}_f001"

            # NOTE(review): the name passed here is already database-qualified;
            # confirm the engine does not prepend the database a second time.
            result = self.engine.download_table_data(
                table_name=f"{self.database}.{table_name}",
                output_path=f"{output_path}/{file_prefix}_{feature_date}.parquet",
                mode="cluster"
            )

            results[table_type] = result
            print(f" ✅ 导出 {table_type}: {result.get('status', 'unknown')}")

        return results

    def get_summary(self) -> Dict:
        """
        Summarise the configured tables and their feature counts.

        For each schema the feature count comes from
        ``FeatureManager.count_features``, using the same configuration the
        generation methods use. If counting fails for any reason, an
        estimate based on the field count is reported instead and the entry
        is tagged ``'mode': 'estimated'``.

        Returns:
            A dict with 'database', per-table details under 'tables', and
            'total_features'.
        """
        summary = {
            'database': self.database,
            'tables': {},
            'total_features': 0
        }

        for table_type, schema in self.schemas.items():
            is_behavior = table_type == 'behavior'

            try:
                manager = FeatureManager(self.engine, self.database)
                # Use the exact configs used for generation so the reported
                # count describes what would really be generated.
                config = self._behavior_config() if is_behavior else self._full_config()
                feature_count = manager.count_features(schema, config)
                estimated = False
            except Exception:
                # Best-effort fallback (e.g. simulation mode): estimate from
                # the field count. Table A: raw copy + aggregation.
                # NOTE(review): the x8 factor for B/C/D does not equal the sum
                # of the per-kind estimates below (1 + 5 + 1) - confirm intent.
                estimated = True
                feature_count = len(schema.fields) * (2 if is_behavior else 8)

            field_count = len(schema.fields)
            entry = {
                'table_name': schema.table_name,
                'field_count': field_count,
                'feature_count': feature_count
            }
            if estimated:
                entry['mode'] = 'estimated'
            # Per-kind numbers are estimates derived from the field count.
            entry['features'] = {
                'total': feature_count,
                'aggregation': field_count,
                'mom': 0 if is_behavior else field_count * 5,
                'yoy': 0 if is_behavior else field_count * 1
            }

            summary['tables'][table_type] = entry
            summary['total_features'] += feature_count

        return summary
|
208
|
-
|
209
|
-
|
210
|
-
# 简化的API函数
|
211
|
-
def create_aum_example(database: str = "dwegdata03000") -> AUMLongtailExample:
    """
    Build an AUM long-tail model example in one call.

    Args:
        database: Database name, "dwegdata03000" by default.

    Returns:
        A new AUMLongtailExample instance.
    """
    example = AUMLongtailExample(database)
    return example
|
222
|
-
|
223
|
-
|
224
|
-
def run_aum_example(feature_date: Optional[str] = None,
                    database: str = "dwegdata03000",
                    output_path: str = "file:///nfsHome/aum_longtail") -> Dict:
    """
    Create the AUM long-tail example and run its feature engineering in one call.

    Args:
        feature_date: Feature date in YYYYMM form; passed through unchanged
            (None lets the example pick its default).
        database: Database name, "dwegdata03000" by default.
        output_path: Output location, file:///nfsHome/aum_longtail by default.

    Returns:
        The dict returned by ``AUMLongtailExample.run``.

    Example:
        >>> results = run_aum_example('202507')
        >>> print(sorted(results))
    """
    return create_aum_example(database).run(feature_date, output_path)
|
244
|
-
|
245
|
-
|
246
|
-
# Public API of this module.
__all__ = ['AUMLongtailExample', 'create_aum_example', 'run_aum_example']
|