staran 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,487 +0,0 @@
1
- """
2
- AUM代发长尾模型示例
3
- 基于Staran v0.3.0架构,使用schemas模块的预定义表结构
4
- """
5
-
6
- from typing import Dict, List, Optional
7
- from ..engines import create_turing_engine
8
- from ..features import FeatureManager, FeatureConfig, FeatureType
9
- from ..tools import Date
10
- from ..schemas.aum import get_aum_schemas
11
-
12
-
13
class AUMLongtailExample:
    """Example pipeline for the AUM payroll long-tail model.

    The instance holds an engine created for ``database`` and the table
    schemas returned by ``get_aum_schemas()``. ``run`` generates feature
    tables for the four schema keys used throughout this class
    ('behavior', 'asset_avg', 'asset_config', 'monthly_stat') and then
    exports them.
    """

    # Prefix carried by the period codes ("MON3", "MON12") of table D fields.
    _PERIOD_PREFIX = "MON"

    def __init__(self, database: str = "dwegdata03000"):
        """Initialise the AUM long-tail example.

        Args:
            database: Database name; defaults to dwegdata03000.
        """
        self.database = database
        self.engine = create_turing_engine(database)
        # Predefined table structures supplied by the schemas module.
        self.schemas = get_aum_schemas()

    def _create_schemas(self) -> Dict:
        """Build the four table schemas by hand and return them keyed by type.

        This method used to be a first definition of ``run``; because the
        class defines ``run`` again further down, that first definition was
        silently replaced and could never be called. It is kept under its
        own name so both pieces of code are reachable.

        Returns:
            Dict with keys 'behavior', 'asset_avg', 'asset_config' and
            'monthly_stat'.
        """
        return {
            # Table A: bi_hlwj_dfcw_f1_f4_wy - customer behaviour features
            # (raw copy and aggregation features only).
            'behavior': self._create_behavior_schema(),
            # Table B: bi_hlwj_zi_chan_avg_wy - average asset balances
            # (all feature types).
            'asset_avg': self._create_asset_avg_schema(),
            # Table C: bi_hlwj_zi_chang_month_total_zb - monthly asset
            # allocation (all feature types).
            'asset_config': self._create_asset_config_schema(),
            # Table D: bi_hlwj_realy_month_stat_wy - monthly statistics
            # (all feature types).
            'monthly_stat': self._create_monthly_stat_schema(),
        }

    # NOTE(review): ``TableSchema`` is not imported anywhere in the visible
    # file. The return annotations below are written as strings so that the
    # class body no longer raises NameError at import time, but the
    # ``_create_*_schema`` methods still need ``TableSchema`` in scope when
    # they are called - confirm the correct import and add it.

    def _create_behavior_schema(self) -> "TableSchema":
        """Create the schema of table A - customer behaviour features."""
        schema = TableSchema('bi_hlwj_dfcw_f1_f4_wy')
        schema.add_primary_key('party_id', 'string')
        schema.add_date_field('data_dt', 'string')

        # Basic behaviour fields.
        schema.add_field("buy_ct", "string", comment="购买次数", aggregatable=True)
        schema.add_field("recency", "string", comment="最近一次购买距今天数", aggregatable=True)
        schema.add_field("tenure", "string", comment="客户关系持续时间", aggregatable=True)
        schema.add_field("window1", "string", comment="时间窗口标记")
        schema.add_field("freq", "string", comment="总购买频率", aggregatable=True)
        schema.add_field("freq1", "string", comment="最近时间段购买频率", aggregatable=True)
        schema.add_field("productidcount", "string", comment="产品种类数", aggregatable=True)
        schema.add_field("orderidcount", "string", comment="订单数", aggregatable=True)
        schema.add_field("productcategorycount", "string", comment="产品品类数", aggregatable=True)

        # Statistic fields for productamount and m1..m4: one field per
        # (measure, statistic) pair, named "<measure>_<statistic>".
        stats_fields = [
            ("max", "最大值"), ("min", "最小值"), ("sum", "总和"),
            ("avg", "平均值"), ("var", "方差"), ("std", "标准差"),
            ("rng", "范围"), ("med", "中位数"),
        ]

        m_fields = {
            "productamount": "购买金额",
            "m1": "去重订单数",
            "m2": "去重商品数",
            "m3": "去重渠道数",
            "m4": "去重产品品类数",
        }

        for prefix, meaning in m_fields.items():
            for stat_key, stat_desc in stats_fields:
                field_name = f"{prefix}_{stat_key}"
                description = f"{meaning}的{stat_desc}"
                schema.add_field(field_name, "string", comment=description, aggregatable=True)

        # Customer attribute fields.
        schema.add_field("life_day", "string", comment="客户生命周期天数", aggregatable=True)
        schema.add_field("gender", "string", comment="性别(编码)")
        schema.add_field("open_day", "string", comment="开户天数", aggregatable=True)
        schema.add_field("label", "string", comment="标签值(如是否购买)")

        # Table A is not unique per customer per month.
        schema.set_monthly_unique(False)
        return schema

    def _create_asset_avg_schema(self) -> "TableSchema":
        """Create the schema of table B - average asset balances."""
        schema = TableSchema('bi_hlwj_zi_chan_avg_wy')
        schema.add_primary_key('party_id', 'string')
        schema.add_date_field('data_dt', 'string')

        # Total balance fields.
        schema.add_field("asset_total_bal", "string", comment="总资产余额", aggregatable=True)
        schema.add_field("liab_total_bal", "string", comment="总负债余额", aggregatable=True)
        schema.add_field("dpsit_total_bal", "string", comment="存款总余额", aggregatable=True)
        schema.add_field("loan_total_bal", "string", comment="贷款总余额", aggregatable=True)
        schema.add_field("card_total_bal", "string", comment="信用卡总余额", aggregatable=True)
        schema.add_field("mid_busi_total_bal", "string", comment="中间业务总余额", aggregatable=True)

        # Average asset balance fields.
        for period in ["month", "year", "3", "6", "12"]:
            schema.add_field(
                f"avg_asset_bal_{period}",
                "string",
                comment=f"平均资产余额 ({period}期)",
                aggregatable=True,
            )

        # Average deposit balance fields.
        for period in ["3", "12"]:
            schema.add_field(
                f"avg_dpsit_bal_{period}",
                "string",
                comment=f"平均存款余额 ({period}期)",
                aggregatable=True,
            )

        # Table B is unique per customer per month.
        schema.set_monthly_unique(True)
        return schema

    def _create_asset_config_schema(self) -> "TableSchema":
        """Create the schema of table C - monthly asset allocation."""
        schema = TableSchema('bi_hlwj_zi_chang_month_total_zb')
        schema.add_primary_key('party_id', 'string')
        schema.add_date_field('data_dt', 'string')

        # Asset allocation fields as (field name, comment) pairs.
        asset_fields = [
            ("SEG_ASSET_TOTAL", "总资产余额"),
            ("INDV_CONSM_LOAN_AMT", "个人消费贷款余额"),
            ("INDV_HOUSE_LOAN_AMT", "个人住房贷款余额"),
            ("INDV_OPER_LOAN_AMT", "个人经营贷款余额"),
            ("DPSIT_BAL", "存款余额"),
            ("TBOND_BAL", "国债余额"),
            ("FUND_BAL", "基金余额"),
            ("BOND_BAL", "债券余额"),
            ("GOLD_BAL", "黄金余额"),
            ("WCURR_CHREM_BAL", "外币现钞余额"),
            ("PRESV_MGMT_SECU_BAL", "保值管理证券余额"),
            ("INSURE_FORM_BAL", "保险单余额"),
            ("CRDT_CARD_OD_BAL", "信用卡透支余额"),
            ("CRDT_CARD_CON_AMT", "信用卡消费金额"),
            ("SEMI_CRDT_CARD_OD_BAL", "准贷记卡透支余额"),
            ("SEMI_CRDT_CARD_CON_AMT", "准贷记卡消费金额"),
            ("INTER_CARD_CON_AMT", "国际卡消费金额"),
            ("INTER_CARD_OD_BAL", "国际卡透支余额"),
            ("CRDT_CARD_DPSIT_BAL", "信用卡存款余额"),
            ("SEMI_CRDT_CARD_DPSIT_BAL", "准贷记卡存款余额"),
            ("INTER_CARD_DPSIT_BAL", "国际卡存款余额"),
            ("SILVER_BAL", "白银余额"),
            ("AGENT_SOLID_SILVER_BAL", "代发实物白银余额"),
            ("PT_BAL", "个人养老金余额"),
            ("PD_BAL", "个人养老金存款余额"),
            ("OTHER_METAL_BAL", "其他金属余额"),
            ("CURR_DPSIT_BAL", "活期存款余额"),
            ("TIME_DPSIT_BAL", "定期存款余额"),
            ("OIL_BAL", "石油余额"),
            ("FX_BAL", "外汇余额"),
        ]

        for field_name, description in asset_fields:
            schema.add_field(field_name, "string", comment=description, aggregatable=True)

        # Table C is unique per customer per month.
        schema.set_monthly_unique(True)
        return schema

    @staticmethod
    def _months_from_period(period: str) -> str:
        """Return the month count of a period code, e.g. "MON3" -> "3".

        The previous code used ``period[-2:]``, which is only right for
        two-digit periods: "MON3" produced "N3" in the field comment.
        """
        prefix = AUMLongtailExample._PERIOD_PREFIX
        if period.startswith(prefix):
            return period[len(prefix):]
        return period

    def _create_monthly_stat_schema(self) -> "TableSchema":
        """Create the schema of table D - monthly statistics."""
        schema = TableSchema('bi_hlwj_realy_month_stat_wy')
        # NOTE: unlike the other tables, the primary key here is party_dt.
        schema.add_primary_key('party_dt', 'string')
        schema.add_date_field('data_dt', 'string')

        # Deposit / withdrawal fields per channel: "<channel>_IN" and
        # "<channel>_OUT".
        channels = {
            "CASH_DEPIST": "现金",
            "REMIT": "汇款",
            "YY": "邮政储蓄",
            "UNIONPAY": "银联",
            "FIN_ASSET": "理财产品",
            "CORP_ACCT": "对公账户",
        }

        for prefix, desc in channels.items():
            schema.add_field(f"{prefix}_IN", "string", comment=f"{desc}存入金额", aggregatable=True)
            schema.add_field(f"{prefix}_OUT", "string", comment=f"{desc}取出金额", aggregatable=True)

        # Other deposit / withdrawal fields.
        schema.add_field("AGENT_SALARY_IN", "string", comment="代发工资存入金额", aggregatable=True)
        schema.add_field("CREDIT_CARD_OUT", "string", comment="信用卡取出金额", aggregatable=True)
        schema.add_field("DEBIT_CARD_OUT", "string", comment="借记卡取出金额", aggregatable=True)
        schema.add_field("BATCH_DEDUCT_OUT", "string", comment="批量扣款金额", aggregatable=True)

        # Transaction-channel metrics as (channel code, label, period code).
        fields = [
            ("DEBIT_CARD", "借记卡", "MON3"),
            ("CREDIT_CARD", "信用卡", "MON3"),
            ("THIRD_PAYMENT", "第三方支付", "MON3"),
            ("MOBBANK", "手机银行", "MON12"),
            ("TELBANK", "电话银行", "MON12"),
        ]

        metrics = [("TX_CNT", "交易次数"), ("TX_AMT", "交易金额")]

        for channel, desc, period in fields:
            months = self._months_from_period(period)
            for metric_code, metric_desc in metrics:
                field_name = f"{channel}_{metric_code}_{period}"
                description = f"{desc}{metric_desc}(近{months}个月)"
                schema.add_field(field_name, "string", comment=description, aggregatable=True)

        # Other transaction fields.
        schema.add_field("COUNTER_TX_CNT_MON12", "string", comment="柜台交易次数(近12个月)", aggregatable=True)
        schema.add_field("WEBBANK_TX_CNT_MON12", "string", comment="网银交易次数(近12个月)", aggregatable=True)

        # Overseas transaction fields for countries 1..5; the *_CD fields
        # are codes and therefore not aggregatable.
        for i in range(1, 6):
            schema.add_field(f"Y1_OVERS_CTY{i}_CNT", "string", comment=f"近一年境外国家{i}的交易次数", aggregatable=True)
            schema.add_field(f"Y1_OVERS_CNT_CTY{i}_CD", "string", comment=f"近一年境外国家{i}的交易次数(编码)")
            schema.add_field(f"Y1_OVERS_CTY{i}_AMT", "string", comment=f"近一年境外国家{i}的交易金额", aggregatable=True)
            schema.add_field(f"Y1_OVERS_AMT_CTY{i}_CD", "string", comment=f"近一年境外国家{i}的交易金额(编码)")

        schema.add_field("Y1_OVERS_OTHER_CTY_CNT", "string", comment="近一年其他境外国家的交易次数", aggregatable=True)
        schema.add_field("Y1_OVERS_OTHER_CTY_AMT", "string", comment="近一年其他境外国家的交易金额", aggregatable=True)

        # Table D is unique per customer per month.
        schema.set_monthly_unique(True)
        return schema

    def run(self, feature_date: Optional[str] = None, output_path: str = "file:///nfsHome/aum_longtail") -> Dict:
        """Run the complete AUM long-tail feature engineering flow.

        Args:
            feature_date: Feature month as YYYYMM; defaults to the current
                month (first six characters of today's compact date).
            output_path: Export destination; defaults to
                file:///nfsHome/aum_longtail.

        Returns:
            Dict with one entry per table type ('behavior', 'asset_avg',
            'asset_config', 'monthly_stat') plus 'export'.
        """
        if feature_date is None:
            feature_date = Date.today().format_compact()[:6]

        print(f"🚀 开始AUM长尾模型特征工程 - {feature_date}")
        print("="*60)

        results = {}

        # 1. Table A: raw copy and aggregation features only.
        print("📊 A表:客户行为特征(原始拷贝 + 聚合特征)")
        results['behavior'] = self._generate_behavior_features(feature_date)

        # 2. Table B: aggregation + 5 months MoM + 1 year YoY.
        print("💰 B表:资产平均余额特征(聚合 + 环比5个月 + 同比1年)")
        results['asset_avg'] = self._generate_full_features('asset_avg', feature_date)

        # 3. Table C: all feature types.
        print("📈 C表:月度资产配置特征(聚合 + 环比5个月 + 同比1年)")
        results['asset_config'] = self._generate_full_features('asset_config', feature_date)

        # 4. Table D: all feature types.
        print("📋 D表:月度实际统计特征(聚合 + 环比5个月 + 同比1年)")
        results['monthly_stat'] = self._generate_full_features('monthly_stat', feature_date)

        # 5. Export the training data.
        print("💾 导出训练数据...")
        results['export'] = self._export_datasets(feature_date, output_path)

        print("\n" + "="*60)
        print("✅ AUM长尾模型特征工程完成!")
        print(f"📂 输出路径: {output_path}")

        return results

    def _build_feature_table(self, schema, manager, config, feature_date: str):
        """Create one feature table for ``feature_date`` and return the result.

        Shared by the behaviour and full-feature generators, which differ
        only in the ``config`` they pass in.

        Args:
            schema: Table schema taken from ``self.schemas``.
            manager: ``FeatureManager`` for this engine/database.
            config: ``FeatureConfig`` selecting the feature types.
            feature_date: Month as YYYYMM; split into year and month.
        """
        # Kept as a function-level import, exactly as in the original code.
        from ..features.generator import FeatureGenerator

        generator = FeatureGenerator(schema, manager, config)
        return generator.create_feature_table(
            feature_type=FeatureType.AGGREGATION,  # primary feature type
            year=int(feature_date[:4]),
            month=int(feature_date[4:6]),
            feature_num=1,
            execute=True,
        )

    def _generate_behavior_features(self, feature_date: str) -> Dict:
        """Generate table A features (raw copy and aggregation only)."""
        schema = self.schemas['behavior']
        manager = FeatureManager(self.engine, self.database)

        # Only raw copy and aggregation are enabled for table A.
        config = FeatureConfig()
        config.enable_feature(FeatureType.RAW_COPY)
        config.enable_feature(FeatureType.AGGREGATION)
        config.disable_feature(FeatureType.MOM)
        config.disable_feature(FeatureType.YOY)

        # Use the full set of aggregation types.
        config.set_aggregation_types(['sum', 'avg', 'max', 'min', 'count', 'stddev'])

        result = self._build_feature_table(schema, manager, config, feature_date)

        print(f" ✅ 生成表: {result}")
        return {'table_name': result, 'feature_types': ['raw_copy', 'aggregation']}

    def _generate_full_features(self, table_type: str, feature_date: str) -> Dict:
        """Generate all features (aggregation + 5 months MoM + 1 year YoY).

        Args:
            table_type: Key into ``self.schemas``.
            feature_date: Month as YYYYMM.
        """
        schema = self.schemas[table_type]
        manager = FeatureManager(self.engine, self.database)

        # Enable every feature type.
        config = FeatureConfig()
        config.enable_feature(FeatureType.RAW_COPY)
        config.enable_feature(FeatureType.AGGREGATION)
        config.enable_feature(FeatureType.MOM)
        config.enable_feature(FeatureType.YOY)

        # Month-over-month against each of the previous 5 months.
        config.set_mom_periods([1, 2, 3, 4, 5])
        # Year-over-year against the previous 1 year.
        config.set_yoy_periods([1])

        result = self._build_feature_table(schema, manager, config, feature_date)

        print(f" ✅ 生成表: {result}")
        return {
            'table_name': result,
            'feature_types': ['raw_copy', 'aggregation', 'mom_5m', 'yoy_1y'],
        }

    def _export_datasets(self, feature_date: str, output_path: str) -> Dict:
        """Export the generated feature tables as training data sets.

        For every table type the table
        ``<database>.<schema table name>_<feature_date>_f001`` is downloaded
        to ``<output_path>/<file prefix>_<feature_date>.parquet``.

        Returns:
            Dict mapping table type to the engine's download result.
        """
        results = {}

        # Table type -> output file prefix.
        table_mappings = {
            'behavior': 'behavior_features',
            'asset_avg': 'asset_avg_features',
            'asset_config': 'asset_config_features',
            'monthly_stat': 'monthly_stat_features',
        }

        for table_type, file_prefix in table_mappings.items():
            table_name = f"{self.schemas[table_type].table_name}_{feature_date}_f001"

            result = self.engine.download_table_data(
                table_name=f"{self.database}.{table_name}",
                output_path=f"{output_path}/{file_prefix}_{feature_date}.parquet",
                mode="cluster",
            )

            results[table_type] = result
            print(f" ✅ 导出 {table_type}: {result.get('status', 'unknown')}")

        return results

    def get_summary(self) -> Dict:
        """Return summary information about the example.

        For each schema the feature count is requested from
        ``FeatureManager.count_features``; if that fails for any reason the
        count is estimated from the number of fields and the entry is
        marked with ``'mode': 'estimated'``.

        Returns:
            Dict with 'database', 'tables' (per table type: 'table_name',
            'field_count', 'feature_count' and a 'features' breakdown with
            'total', 'raw_copy', 'aggregation', 'mom', 'yoy') and
            'total_features'.
        """
        summary = {
            'database': self.database,
            'tables': {},
            'total_features': 0,
        }

        for table_type, schema in self.schemas.items():
            # Table A has no MoM / YoY features.
            is_behavior = table_type == 'behavior'

            try:
                manager = FeatureManager(self.engine, self.database)

                config = FeatureConfig()
                if is_behavior:
                    # Table A: raw copy and aggregation only.
                    config.enable_feature(FeatureType.RAW_COPY)
                    config.enable_feature(FeatureType.AGGREGATION)
                    config.disable_feature(FeatureType.MOM)
                    config.disable_feature(FeatureType.YOY)
                else:
                    # Tables B, C, D: aggregation + 5 months MoM + 1 year YoY.
                    # NOTE(review): this call shape differs from the
                    # set_mom_periods / set_yoy_periods calls used when the
                    # tables are generated - confirm enable_feature accepts
                    # these keyword arguments.
                    config.enable_feature(FeatureType.AGGREGATION, mom_windows=[5], yoy_windows=[12])

                feature_count = manager.count_features(schema, config)
                field_count = len(schema.fields)
                table_info = {
                    'table_name': schema.table_name,
                    'field_count': field_count,
                    'feature_count': feature_count,
                    'features': {
                        'total': feature_count,
                        # The per-type numbers are estimates from the
                        # field count.
                        'raw_copy': field_count,
                        'aggregation': field_count,
                        'mom': 0 if is_behavior else field_count * 5,
                        'yoy': 0 if is_behavior else field_count * 1,
                    },
                }
            except Exception:
                # Deliberate best-effort fallback (e.g. simulation mode):
                # report estimated counts instead of failing.
                field_count = len(schema.fields)
                if is_behavior:
                    # raw copy + aggregation
                    feature_count = field_count * 2
                    mom_count = 0
                    yoy_count = 0
                else:
                    # raw copy + aggregation + 5x MoM + 1x YoY
                    feature_count = field_count * 8
                    mom_count = field_count * 5
                    yoy_count = field_count * 1

                table_info = {
                    'table_name': schema.table_name,
                    'field_count': field_count,
                    'feature_count': feature_count,
                    'mode': 'estimated',
                    'features': {
                        'total': feature_count,
                        'raw_copy': field_count,
                        'aggregation': field_count,
                        'mom': mom_count,
                        'yoy': yoy_count,
                    },
                }

            summary['tables'][table_type] = table_info
            summary['total_features'] += feature_count

        return summary

    def print_summary(self):
        """Print the summary produced by ``get_summary``."""
        summary = self.get_summary()

        print("🎯 AUM代发长尾模型示例摘要")
        print("="*50)
        print(f"数据库: {summary['database']}")
        print(f"总特征数: {summary['total_features']}")
        print()

        for table_type, info in summary['tables'].items():
            features = info['features']
            print(f"📊 {table_type.upper()}表 ({info['table_name']})")
            # The summary key is 'field_count'; the former 'fields_count'
            # lookup raised KeyError.
            print(f" - 字段数: {info['field_count']}")
            print(f" - 总特征: {features['total']}")
            print(f" - 原始拷贝: {features['raw_copy']}")
            print(f" - 聚合特征: {features['aggregation']}")
            print(f" - 环比特征: {features['mom']}")
            print(f" - 同比特征: {features['yoy']}")
            print()
456
-
457
-
458
- # 简化的使用接口
459
def create_aum_example(database: str = "dwegdata03000") -> AUMLongtailExample:
    """Create an AUM long-tail model example.

    Args:
        database: Database name.

    Returns:
        A new AUMLongtailExample instance.
    """
    example = AUMLongtailExample(database)
    return example
470
-
471
-
472
def run_aum_example(feature_date: Optional[str] = None,
                    database: str = "dwegdata03000",
                    output_path: str = "file:///nfsHome/aum_longtail") -> Dict:
    """Build an AUM long-tail example and run it in one call.

    Args:
        feature_date: Feature date in YYYYMM format.
        database: Database name.
        output_path: Output path.

    Returns:
        The result of the example's ``run`` call.
    """
    return create_aum_example(database).run(feature_date, output_path)