staran 0.6.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
staran/models/target.py DELETED
@@ -1,321 +0,0 @@
1
- """
2
- 目标变量定义模块
3
-
4
- 提供基于SQL的目标变量定义和生成功能
5
- """
6
-
7
- from enum import Enum
8
- from typing import Dict, Any, List, Optional, Union
9
- from dataclasses import dataclass, field
10
- from datetime import datetime
11
- import re
12
-
13
-
14
- class TargetType(Enum):
15
- """目标变量类型"""
16
- BINARY_CLASSIFICATION = "binary_classification" # 二分类
17
- MULTI_CLASSIFICATION = "multi_classification" # 多分类
18
- REGRESSION = "regression" # 回归
19
- RANKING = "ranking" # 排序
20
- CLUSTERING = "clustering" # 聚类
21
- SQL_BASED = "sql_based" # 基于SQL的自定义目标
22
-
23
-
24
- class TargetEncoding(Enum):
25
- """目标变量编码方式"""
26
- NONE = "none" # 不编码
27
- LABEL_ENCODING = "label" # 标签编码
28
- ONE_HOT = "one_hot" # 独热编码
29
- ORDINAL = "ordinal" # 序数编码
30
- BINARY = "binary" # 二进制编码
31
-
32
-
33
- @dataclass
34
- class TargetDefinition:
35
- """目标变量定义类"""
36
- # 基本信息
37
- name: str # 目标变量名称
38
- target_type: TargetType # 目标类型
39
- description: str = "" # 描述
40
-
41
- # SQL定义 (核心功能)
42
- sql_query: str = "" # 生成目标变量的SQL查询
43
- target_column: str = "target" # 目标列名
44
-
45
- # 数据信息
46
- data_type: str = "float" # 数据类型
47
- encoding: TargetEncoding = TargetEncoding.NONE # 编码方式
48
-
49
- # 分类相关
50
- class_labels: List[str] = field(default_factory=list) # 类别标签
51
- class_weights: Dict[str, float] = field(default_factory=dict) # 类别权重
52
-
53
- # 回归相关
54
- min_value: Optional[float] = None # 最小值
55
- max_value: Optional[float] = None # 最大值
56
- normalization: bool = False # 是否标准化
57
-
58
- # 时间相关
59
- time_window: str = "" # 时间窗口 (如 "30_days", "3_months")
60
- prediction_horizon: str = "" # 预测时间范围
61
-
62
- # 银行特定
63
- bank_code: str = "generic" # 银行代码
64
- business_rules: Dict[str, Any] = field(default_factory=dict) # 业务规则
65
-
66
- # 元数据
67
- created_at: datetime = field(default_factory=datetime.now)
68
- created_by: str = "system"
69
- version: str = "1.0.0"
70
- tags: List[str] = field(default_factory=list)
71
-
72
- # 验证配置
73
- validation_rules: Dict[str, Any] = field(default_factory=dict)
74
-
75
- def __post_init__(self):
76
- """初始化后处理"""
77
- if not self.sql_query and self.target_type != TargetType.SQL_BASED:
78
- self.sql_query = self._generate_default_sql()
79
-
80
- # 验证SQL语法
81
- if self.sql_query:
82
- self._validate_sql()
83
-
84
- def _generate_default_sql(self) -> str:
85
- """根据目标类型生成默认SQL"""
86
- if self.target_type == TargetType.BINARY_CLASSIFICATION:
87
- return f"""
88
- SELECT party_id,
89
- CASE WHEN condition THEN 1 ELSE 0 END as {self.target_column}
90
- FROM source_table
91
- WHERE data_dt = '{{data_dt}}'
92
- """
93
- elif self.target_type == TargetType.REGRESSION:
94
- return f"""
95
- SELECT party_id,
96
- target_value as {self.target_column}
97
- FROM source_table
98
- WHERE data_dt = '{{data_dt}}'
99
- """
100
- else:
101
- return ""
102
-
103
- def _validate_sql(self):
104
- """验证SQL语法基本正确性"""
105
- sql = self.sql_query.strip().upper()
106
-
107
- # 基本SQL结构检查 (支持WITH语句)
108
- if not (sql.startswith('SELECT') or sql.startswith('WITH')):
109
- raise ValueError("SQL查询必须以SELECT或WITH开始")
110
-
111
- # 检查是否包含目标列
112
- if self.target_column.upper() not in sql:
113
- print(f"警告: SQL中未找到目标列 '{self.target_column}'")
114
-
115
- # 检查参数占位符
116
- placeholders = re.findall(r'\{(\w+)\}', self.sql_query)
117
- if placeholders:
118
- print(f"发现参数占位符: {placeholders}")
119
-
120
- def generate_sql(self, **params) -> str:
121
- """
122
- 生成最终的SQL查询,替换参数占位符
123
-
124
- Args:
125
- **params: SQL参数字典
126
-
127
- Returns:
128
- 最终的SQL查询字符串
129
- """
130
- sql = self.sql_query
131
-
132
- # 替换参数占位符
133
- for key, value in params.items():
134
- placeholder = f"{{{key}}}"
135
- sql = sql.replace(placeholder, str(value))
136
-
137
- return sql
138
-
139
- def get_sample_sql(self, data_dt: str = "20250728") -> str:
140
- """获取示例SQL"""
141
- return self.generate_sql(data_dt=data_dt)
142
-
143
- def validate_target_values(self, values: List[Any]) -> bool:
144
- """验证目标值是否符合定义"""
145
- if self.target_type == TargetType.BINARY_CLASSIFICATION:
146
- unique_values = set(values)
147
- return unique_values.issubset({0, 1, 0.0, 1.0})
148
-
149
- elif self.target_type == TargetType.MULTI_CLASSIFICATION:
150
- if self.class_labels:
151
- unique_values = set(values)
152
- return unique_values.issubset(set(self.class_labels))
153
-
154
- elif self.target_type == TargetType.REGRESSION:
155
- if self.min_value is not None and self.max_value is not None:
156
- return all(self.min_value <= v <= self.max_value for v in values)
157
-
158
- return True
159
-
160
- def to_dict(self) -> Dict[str, Any]:
161
- """转换为字典格式"""
162
- return {
163
- 'name': self.name,
164
- 'target_type': self.target_type.value,
165
- 'description': self.description,
166
- 'sql_query': self.sql_query,
167
- 'target_column': self.target_column,
168
- 'data_type': self.data_type,
169
- 'encoding': self.encoding.value,
170
- 'class_labels': self.class_labels,
171
- 'class_weights': self.class_weights,
172
- 'min_value': self.min_value,
173
- 'max_value': self.max_value,
174
- 'normalization': self.normalization,
175
- 'time_window': self.time_window,
176
- 'prediction_horizon': self.prediction_horizon,
177
- 'bank_code': self.bank_code,
178
- 'business_rules': self.business_rules,
179
- 'created_at': self.created_at.isoformat(),
180
- 'created_by': self.created_by,
181
- 'version': self.version,
182
- 'tags': self.tags,
183
- 'validation_rules': self.validation_rules
184
- }
185
-
186
-
187
- def create_target_definition(
188
- name: str,
189
- target_type: str,
190
- sql_query: str,
191
- target_column: str = "target",
192
- bank_code: str = "generic",
193
- **kwargs
194
- ) -> TargetDefinition:
195
- """
196
- 创建目标变量定义的便捷函数
197
-
198
- Args:
199
- name: 目标变量名称
200
- target_type: 目标类型
201
- sql_query: SQL查询
202
- target_column: 目标列名
203
- bank_code: 银行代码
204
- **kwargs: 其他配置参数
205
-
206
- Returns:
207
- TargetDefinition实例
208
- """
209
- return TargetDefinition(
210
- name=name,
211
- target_type=TargetType(target_type),
212
- sql_query=sql_query,
213
- target_column=target_column,
214
- bank_code=bank_code,
215
- **kwargs
216
- )
217
-
218
-
219
- # 预定义的目标变量模板
220
- TARGET_TEMPLATES = {
221
- "aum_longtail_purchase": {
222
- "target_type": "binary_classification",
223
- "description": "AUM长尾客户未来购买预测",
224
- "sql_query": """
225
- SELECT
226
- a.party_id,
227
- CASE
228
- WHEN b.purchase_amount > 0 THEN 1
229
- ELSE 0
230
- END as target
231
- FROM
232
- bi_hlwj_dfcw_f1_f4_wy a
233
- LEFT JOIN (
234
- SELECT party_id, SUM(productamount_sum) as purchase_amount
235
- FROM bi_hlwj_dfcw_f1_f4_wy
236
- WHERE data_dt BETWEEN '{start_dt}' AND '{end_dt}'
237
- GROUP BY party_id
238
- ) b ON a.party_id = b.party_id
239
- WHERE a.data_dt = '{feature_dt}'
240
- """,
241
- "target_column": "target",
242
- "time_window": "30_days",
243
- "class_labels": ["no_purchase", "purchase"]
244
- },
245
-
246
- "customer_value_prediction": {
247
- "target_type": "regression",
248
- "description": "客户价值预测",
249
- "sql_query": """
250
- SELECT
251
- party_id,
252
- asset_total_bal as target
253
- FROM
254
- bi_hlwj_zi_chan_avg_wy
255
- WHERE
256
- data_dt = '{target_dt}'
257
- """,
258
- "target_column": "target",
259
- "data_type": "float",
260
- "normalization": True
261
- },
262
-
263
- "risk_level_classification": {
264
- "target_type": "multi_classification",
265
- "description": "风险等级分类",
266
- "sql_query": """
267
- SELECT
268
- party_id,
269
- CASE
270
- WHEN asset_total_bal < 10000 THEN 'low_risk'
271
- WHEN asset_total_bal < 100000 THEN 'medium_risk'
272
- ELSE 'high_risk'
273
- END as target
274
- FROM
275
- bi_hlwj_zi_chan_avg_wy
276
- WHERE
277
- data_dt = '{data_dt}'
278
- """,
279
- "target_column": "target",
280
- "class_labels": ["low_risk", "medium_risk", "high_risk"]
281
- }
282
- }
283
-
284
-
285
- def create_preset_target(preset_name: str, **overrides) -> TargetDefinition:
286
- """
287
- 基于预设模板创建目标变量定义
288
-
289
- Args:
290
- preset_name: 预设模板名称
291
- **overrides: 覆盖的配置参数
292
-
293
- Returns:
294
- TargetDefinition实例
295
- """
296
- if preset_name not in TARGET_TEMPLATES:
297
- raise ValueError(f"未知的目标变量模板: {preset_name}")
298
-
299
- template = TARGET_TEMPLATES[preset_name].copy()
300
- template.update(overrides)
301
-
302
- return create_target_definition(
303
- name=preset_name,
304
- **template
305
- )
306
-
307
-
308
- def create_icbc_target(name: str, sql_query: str, target_type: str = "binary_classification", **kwargs) -> TargetDefinition:
309
- """创建工商银行专用目标变量定义"""
310
- return create_target_definition(
311
- name=name,
312
- target_type=target_type,
313
- sql_query=sql_query,
314
- bank_code="icbc",
315
- business_rules={
316
- "data_retention_days": 90,
317
- "privacy_compliance": True,
318
- "audit_required": True
319
- },
320
- **kwargs
321
- )
@@ -1,27 +0,0 @@
1
- """
2
- Staran Schemas模块 - 新疆工行代发长尾客户表结构定义
3
-
4
- 提供新疆工行代发长尾客户的标准化表结构定义和字段管理功能。
5
-
6
- 主要功能:
7
- - 代发长尾客户表结构定义
8
- - 业务字段含义管理
9
- - 新疆工行专用配置
10
- - 表结构文档生成
11
- """
12
-
13
- from ..tools.document_generator import SchemaDocumentGenerator
14
- from .aum import *
15
-
16
- __all__ = [
17
- 'SchemaDocumentGenerator',
18
- # 新疆工行代发长尾客户表
19
- 'XinjiangICBCDaifaLongtailBehaviorSchema',
20
- 'XinjiangICBCDaifaLongtailAssetAvgSchema',
21
- 'XinjiangICBCDaifaLongtailAssetConfigSchema',
22
- 'XinjiangICBCDaifaLongtailMonthlyStatSchema',
23
- 'get_xinjiang_icbc_daifa_longtail_schemas',
24
- 'export_xinjiang_icbc_daifa_longtail_docs'
25
- ]
26
-
27
- __version__ = "0.6.0"
@@ -1,210 +0,0 @@
1
- """
2
- 新疆工行代发长尾客户表结构定义模块
3
-
4
- 包含新疆工行代发长尾客户相关的所有表结构:
5
- - 代发长尾客户行为特征表 (XinjiangICBCDaifaLongtailBehaviorSchema)
6
- - 代发长尾客户资产平均值表 (XinjiangICBCDaifaLongtailAssetAvgSchema)
7
- - 代发长尾客户资产配置表 (XinjiangICBCDaifaLongtailAssetConfigSchema)
8
- - 代发长尾客户月度统计表 (XinjiangICBCDaifaLongtailMonthlyStatSchema)
9
-
10
- 数据库: xinjiang_icbc_daifa_longtail
11
- 业务范围: 代发长尾客户(资产10k-100k)
12
-
13
- 这些表结构可以用于:
14
- 1. 代发长尾客户特征工程
15
- 2. 提升模型和防流失模型构建
16
- 3. 业务文档生成
17
- 4. 数据质量检查
18
- """
19
-
20
- from typing import Dict
21
- from ...features.schema import TableSchema
22
- from ...tools.document_generator import SchemaDocumentGenerator
23
-
24
-
25
- class XinjiangICBCDaifaLongtailBehaviorSchema:
26
- """新疆工行代发长尾客户行为特征表 - 严格按照已提供给行方的字段"""
27
-
28
- # 统计指标定义 - 与原始定义完全一致
29
- _STATS = [
30
- ("max", "最大值"),
31
- ("min", "最小值"),
32
- ("sum", "总和"),
33
- ("avg", "均值"),
34
- ("var", "方差"),
35
- ("std", "标准差"),
36
- ("rng", "极差"),
37
- ("med", "中位数"),
38
- ]
39
-
40
- @staticmethod
41
- def create() -> TableSchema:
42
- """创建新疆工行代发长尾客户行为特征表结构"""
43
- schema = TableSchema('xinjiang_icbc_daifa_hlwj_dfcw_f1_f4_wy')
44
- schema.add_primary_key('party_id', 'string')
45
- schema.add_date_field('data_dt', 'string')
46
-
47
- # 基础字段 - 严格按照原始定义
48
- schema.add_field("buy_ct", "int", comment="购买次数", aggregatable=True)
49
- schema.add_field("recency", "int", comment="最近一次购买距今天数", aggregatable=True)
50
- schema.add_field("tenure", "int", comment="客户关系持续时间", aggregatable=True)
51
- schema.add_field("window1", "string", comment="时间窗口标记")
52
- schema.add_field("freq", "float", comment="总购买频率", aggregatable=True)
53
- schema.add_field("freq1", "float", comment="最近时间段购买频率", aggregatable=True)
54
- schema.add_field("productidcount", "int", comment="产品种类数", aggregatable=True)
55
- schema.add_field("orderidcount", "int", comment="订单数", aggregatable=True)
56
- schema.add_field("label", "float", comment="标签值(如是否购买)", aggregatable=True)
57
-
58
- # productamount and m1 ~ m4 的含义描述
59
- m_fields = {
60
- "productamount": "购买金额",
61
- "m1": "去重订单数",
62
- "m2": "去重商品数",
63
- "m3": "去重渠道数",
64
- "m4": "去重产品品类数",
65
- }
66
-
67
- # 使用循环注册 productamount and m1~m4 各统计字段
68
- for prefix, meaning in m_fields.items():
69
- for stat_key, stat_desc in XinjiangICBCDaifaLongtailBehaviorSchema._STATS:
70
- field_name = f"{prefix}_{stat_key}"
71
- description = f"{meaning}的{stat_desc}"
72
- schema.add_field(field_name, "float", comment=description, aggregatable=True)
73
-
74
- # 其他字段
75
- schema.add_field("life_day", "float", comment="客户生命周期天数", aggregatable=True)
76
- schema.add_field("gender", "float", comment="性别(编码)", aggregatable=True)
77
- schema.add_field("open_day", "float", comment="开户天数", aggregatable=True)
78
-
79
- schema.set_monthly_unique(False) # 每人每日记录
80
- return schema
81
-
82
-
83
- class XinjiangICBCDaifaLongtailAssetAvgSchema:
84
- """新疆工行代发长尾客户资产平均值表"""
85
-
86
- @staticmethod
87
- def create() -> TableSchema:
88
- """创建新疆工行代发长尾客户资产平均值表结构"""
89
- schema = TableSchema('xinjiang_icbc_daifa_hlwj_zi_chan_avg_wy')
90
- schema.add_primary_key('party_id', 'string')
91
- schema.add_date_field('data_dt', 'string')
92
-
93
- # 基础余额字段
94
- schema.add_field("asset_total_bal", "decimal", comment="总资产余额", aggregatable=True)
95
- schema.add_field("liab_total_bal", "decimal", comment="总负债余额", aggregatable=True)
96
- schema.add_field("net_asset_bal", "decimal", comment="净资产余额", aggregatable=True)
97
-
98
- # 存款相关字段
99
- schema.add_field("dep_bal", "decimal", comment="存款余额", aggregatable=True)
100
- schema.add_field("current_dep_bal", "decimal", comment="活期存款余额", aggregatable=True)
101
- schema.add_field("time_dep_bal", "decimal", comment="定期存款余额", aggregatable=True)
102
-
103
- # 理财投资字段
104
- schema.add_field("wealth_bal", "decimal", comment="理财余额", aggregatable=True)
105
- schema.add_field("fund_bal", "decimal", comment="基金余额", aggregatable=True)
106
- schema.add_field("insurance_bal", "decimal", comment="保险余额", aggregatable=True)
107
-
108
- schema.set_monthly_unique(True) # 每人每月一条记录
109
- return schema
110
-
111
-
112
- class XinjiangICBCDaifaLongtailAssetConfigSchema:
113
- """新疆工行代发长尾客户资产配置表"""
114
-
115
- @staticmethod
116
- def create() -> TableSchema:
117
- """创建新疆工行代发长尾客户资产配置表结构"""
118
- schema = TableSchema('xinjiang_icbc_daifa_hlwj_zi_chan_config_wy')
119
- schema.add_primary_key('party_id', 'string')
120
- schema.add_date_field('data_dt', 'string')
121
-
122
- # 资产配置比例字段
123
- schema.add_field("cash_ratio", "float", comment="现金类资产占比", aggregatable=True)
124
- schema.add_field("fixed_income_ratio", "float", comment="固收类资产占比", aggregatable=True)
125
- schema.add_field("equity_ratio", "float", comment="权益类资产占比", aggregatable=True)
126
- schema.add_field("alternative_ratio", "float", comment="另类资产占比", aggregatable=True)
127
-
128
- # 风险偏好相关
129
- schema.add_field("risk_level", "int", comment="风险偏好等级(1-5)", aggregatable=True)
130
- schema.add_field("investment_experience", "int", comment="投资经验年限", aggregatable=True)
131
-
132
- # 配置变化指标
133
- schema.add_field("config_change_freq", "int", comment="配置调整频率", aggregatable=True)
134
- schema.add_field("rebalance_count", "int", comment="再平衡次数", aggregatable=True)
135
-
136
- schema.set_monthly_unique(True)
137
- return schema
138
-
139
-
140
- class XinjiangICBCDaifaLongtailMonthlyStatSchema:
141
- """新疆工行代发长尾客户月度统计表"""
142
-
143
- @staticmethod
144
- def create() -> TableSchema:
145
- """创建新疆工行代发长尾客户月度统计表结构"""
146
- schema = TableSchema('xinjiang_icbc_daifa_hlwj_monthly_stat_wy')
147
- schema.add_primary_key('party_id', 'string')
148
- schema.add_date_field('data_dt', 'string')
149
-
150
- # 月度交易统计
151
- schema.add_field("monthly_txn_count", "int", comment="月度交易笔数", aggregatable=True)
152
- schema.add_field("monthly_txn_amount", "decimal", comment="月度交易金额", aggregatable=True)
153
- schema.add_field("monthly_deposit_amount", "decimal", comment="月度存入金额", aggregatable=True)
154
- schema.add_field("monthly_withdraw_amount", "decimal", comment="月度取出金额", aggregatable=True)
155
-
156
- # 代发工资相关统计
157
- schema.add_field("salary_amount", "decimal", comment="月度代发工资金额", aggregatable=True)
158
- schema.add_field("salary_date", "string", comment="代发工资日期")
159
- schema.add_field("salary_stability", "float", comment="工资稳定性指数", aggregatable=True)
160
-
161
- # 长尾客户特征
162
- schema.add_field("longtail_score", "float", comment="长尾客户评分", aggregatable=True)
163
- schema.add_field("upgrade_potential", "float", comment="提升潜力评分", aggregatable=True)
164
- schema.add_field("churn_risk", "float", comment="流失风险评分", aggregatable=True)
165
-
166
- # 活跃度指标
167
- schema.add_field("login_days", "int", comment="月度登录天数", aggregatable=True)
168
- schema.add_field("channel_usage", "string", comment="渠道使用情况")
169
-
170
- schema.set_monthly_unique(True)
171
- return schema
172
-
173
-
174
- def get_xinjiang_icbc_daifa_longtail_schemas() -> Dict[str, TableSchema]:
175
- """获取新疆工行代发长尾客户所有表结构"""
176
- return {
177
- 'daifa_longtail_behavior': XinjiangICBCDaifaLongtailBehaviorSchema.create(),
178
- 'daifa_longtail_asset_avg': XinjiangICBCDaifaLongtailAssetAvgSchema.create(),
179
- 'daifa_longtail_asset_config': XinjiangICBCDaifaLongtailAssetConfigSchema.create(),
180
- 'daifa_longtail_monthly_stat': XinjiangICBCDaifaLongtailMonthlyStatSchema.create(),
181
- }
182
-
183
-
184
- def export_xinjiang_icbc_daifa_longtail_docs(output_dir: str = "./docs") -> Dict[str, str]:
185
- """导出新疆工行代发长尾客户表结构文档"""
186
- generator = SchemaDocumentGenerator()
187
- schemas = get_xinjiang_icbc_daifa_longtail_schemas()
188
- exported_files = {}
189
-
190
- for table_type, schema in schemas.items():
191
- file_path = generator.export_schema_doc(
192
- schema,
193
- business_domain="新疆工行代发长尾客户",
194
- table_type=table_type,
195
- output_dir=output_dir
196
- )
197
- exported_files[table_type] = file_path
198
-
199
- return exported_files
200
-
201
-
202
- # 导出主要组件
203
- __all__ = [
204
- 'XinjiangICBCDaifaLongtailBehaviorSchema',
205
- 'XinjiangICBCDaifaLongtailAssetAvgSchema',
206
- 'XinjiangICBCDaifaLongtailAssetConfigSchema',
207
- 'XinjiangICBCDaifaLongtailMonthlyStatSchema',
208
- 'get_xinjiang_icbc_daifa_longtail_schemas',
209
- 'export_xinjiang_icbc_daifa_longtail_docs'
210
- ]