staran 0.6.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,155 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- """
5
- 特征管理器
6
- 负责特征工程的核心管理功能,基于新的引擎架构
7
- """
8
-
9
- from typing import Optional, Dict, Any, List, Callable
10
- from datetime import datetime
11
- from ..engines import BaseEngine, create_engine, DatabaseType
12
-
13
-
14
class FeatureManager:
    """Facade over a database engine used for feature engineering.

    Every public method forwards to the engine built in ``__init__``; this
    class holds no logic of its own beyond creating that engine.
    """

    def __init__(self, database_name: str, engine_type: str = "spark",
                 sql_executor: Optional[Callable] = None):
        """Build the underlying engine and remember how it was configured.

        Args:
            database_name: Name of the database the engine works against.
            engine_type: Engine identifier handed to ``create_engine``
                (the original documentation lists 'spark', 'hive', 'turing').
            sql_executor: Optional callable forwarded to ``create_engine``
                (originally documented as only used by non-turing engines).
        """
        self.database_name = database_name
        self.engine_type = engine_type

        # The engine does all the real work; everything below delegates to it.
        engine_options = {
            "engine_type": engine_type,
            "database_name": database_name,
            "sql_executor": sql_executor,
        }
        self.engine = create_engine(**engine_options)

    # ------------------------------------------------------------------
    # Methods delegated to the engine
    # ------------------------------------------------------------------
    def execute_sql(self, sql: str, description: str = "") -> Any:
        """Run a SQL statement through the engine and return its result."""
        backend = self.engine
        return backend.execute_sql(sql, description)

    def get_full_table_name(self, table_name: str) -> str:
        """Return the engine's fully qualified name for ``table_name``."""
        backend = self.engine
        return backend.get_full_table_name(table_name)

    def generate_table_name(self, base_name: str, year: int, month: int,
                            suffix: str = "raw") -> str:
        """Ask the engine for a standardized table name.

        The documented format is ``{base_name}_{yyyy}_{MM}_{suffix}``; the
        actual formatting is performed by the engine.
        """
        backend = self.engine
        return backend.generate_table_name(base_name, year, month, suffix)

    def create_table(self, table_name: str, select_sql: str,
                     execute: bool = False, **kwargs) -> Dict[str, Any]:
        """Create a table from ``select_sql``; extra kwargs go to the engine."""
        backend = self.engine
        return backend.create_table(table_name, select_sql, execute, **kwargs)

    def drop_table(self, table_name: str, execute: bool = False) -> Dict[str, Any]:
        """Drop a table via the engine and return the engine's result dict."""
        backend = self.engine
        return backend.drop_table(table_name, execute)

    def download_table_data(self, table_name: str, output_path: str,
                            **kwargs) -> Dict[str, Any]:
        """Download the contents of a table to ``output_path``."""
        backend = self.engine
        return backend.download_table_data(table_name, output_path, **kwargs)

    def download_query_result(self, sql: str, output_path: str,
                              **kwargs) -> Dict[str, Any]:
        """Download the result of a query to ``output_path``."""
        backend = self.engine
        return backend.download_query_result(sql, output_path, **kwargs)

    def get_execution_history(self) -> List[Dict]:
        """Return the engine's SQL execution history."""
        backend = self.engine
        return backend.get_execution_history()

    def clear_history(self):
        """Clear the engine's SQL execution history."""
        backend = self.engine
        backend.clear_history()

    def __str__(self):
        return f"FeatureManager(engine={self.engine})"
85
-
86
-
87
class FeatureTableManager:
    """Create and drop feature tables while tracking what was created.

    Tracking is purely in-memory: only tables whose creation was executed
    successfully through this instance are remembered.
    """

    def __init__(self, feature_manager: "FeatureManager"):
        """Bind the manager to a feature manager.

        Args:
            feature_manager: Object used to name, create and drop tables.
        """
        self.feature_manager = feature_manager
        self.created_tables = []

    def create_feature_table(self, base_name: str, year: int, month: int,
                             version: int, sql: str, execute: bool = False,
                             **kwargs) -> str:
        """Create a feature table and return its generated name.

        Args:
            base_name: Base table name.
            year: Year component of the table name.
            month: Month component of the table name.
            version: Version number. NOTE(review): accepted but not used
                anywhere in this method - confirm whether it should
                influence the table name.
            sql: SQL used to populate the table.
            execute: Whether to run the statement immediately.
            **kwargs: Extra options forwarded to ``create_table``.

        Returns:
            The generated table name, whether or not anything was executed.
        """
        manager = self.feature_manager
        generated_name = manager.generate_table_name(base_name, year, month)
        outcome = manager.create_table(generated_name, sql, execute, **kwargs)

        # Only remember tables that were really created.
        created = execute and outcome.get('status') == 'success'
        if created:
            self.created_tables.append(generated_name)

        return generated_name

    def drop_feature_table(self, table_name: str, execute: bool = False) -> str:
        """Drop a feature table.

        Args:
            table_name: Name of the table to drop.
            execute: Whether to run the statement immediately.

        Returns:
            The ``'sql'`` entry of the drop result, or ``''`` if absent.
        """
        outcome = self.feature_manager.drop_table(table_name, execute)

        dropped = execute and outcome.get('status') == 'success'
        if dropped and table_name in self.created_tables:
            self.created_tables.remove(table_name)

        return outcome.get('sql', '')

    def get_created_tables(self) -> List[str]:
        """Return a copy of the tracked table names."""
        tracked = self.created_tables
        return tracked.copy()

    def table_exists(self, table_name: str) -> bool:
        """Tell whether this instance tracked ``table_name`` as created.

        This only consults the in-memory list; the database is not queried.
        """
        tracked = self.created_tables
        return table_name in tracked
staran/features/schema.py DELETED
@@ -1,193 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- """
5
- 表结构定义模块
6
- 定义数据库表的字段、类型和分析特性
7
- """
8
-
9
- from enum import Enum
10
- from typing import Dict, List, Optional, Union
11
- from dataclasses import dataclass
12
-
13
-
14
class FieldType(Enum):
    """Column data types a schema field can have."""
    STRING = "string"
    INTEGER = "int"
    BIGINT = "bigint"
    DECIMAL = "decimal"
    DOUBLE = "double"
    FLOAT = "float"
    DATE = "date"
    TIMESTAMP = "timestamp"
    BOOLEAN = "boolean"


# Field types treated as numeric, and therefore aggregatable by default.
_NUMERIC_FIELD_TYPES = (
    FieldType.INTEGER,
    FieldType.BIGINT,
    FieldType.DECIMAL,
    FieldType.DOUBLE,
    FieldType.FLOAT,
)


@dataclass
class Field:
    """Definition of a single table column.

    Attributes are set through the generated constructor:

    * ``name`` / ``field_type`` - column name and data type.
    * ``is_primary_key`` / ``is_date_field`` - role flags.
    * ``aggregatable`` - whether the column may be aggregated. Leave it as
      ``None`` (the default) to have it inferred: numeric types become
      ``True``, everything else ``False``. An explicit ``True``/``False``
      is kept as given.
    * ``nullable`` / ``comment`` - nullability and free-form comment.
    """
    name: str
    field_type: FieldType
    is_primary_key: bool = False
    is_date_field: bool = False
    # None means "not specified"; resolved to a bool in __post_init__.
    aggregatable: Optional[bool] = None
    nullable: bool = True
    comment: str = ""

    def __post_init__(self):
        """Resolve the default for ``aggregatable`` when none was given.

        The previous implementation checked for a ``_aggregatable_set``
        attribute that could not exist yet during construction, so an
        explicit ``aggregatable=False`` on a numeric field was overwritten
        with ``True``. A ``None`` sentinel distinguishes "not specified"
        from an explicit choice.
        """
        if self.aggregatable is None:
            self.aggregatable = self.field_type in _NUMERIC_FIELD_TYPES

    def set_aggregatable(self, aggregatable: bool):
        """Explicitly set whether the field is aggregatable.

        Returns:
            ``self``, so the call can be chained.
        """
        self.aggregatable = aggregatable
        # Kept for backward compatibility with code that inspects the marker.
        self._aggregatable_set = True
        return self
51
-
52
-
53
class TableSchema:
    """Describe a table: its fields plus its primary-key and date columns.

    The ``add_*`` and ``set_*`` methods return ``self`` so they can be chained.

    NOTE(review): registering a second primary key or date field repoints
    ``primary_key`` / ``date_field`` but leaves the role flag on the earlier
    ``Field`` untouched, and ``add_field`` can replace a role field under the
    same name without clearing the pointer - confirm this is intended.
    """

    def __init__(self, table_name: str, comment: str = ""):
        """Create an empty schema.

        Args:
            table_name: Name of the table.
            comment: Free-form table comment.
        """
        self.table_name = table_name
        self.comment = comment
        self.fields: Dict[str, Field] = {}
        self.primary_key: Optional[str] = None
        self.date_field: Optional[str] = None
        self.is_monthly_unique: bool = False

    @staticmethod
    def _coerce_type(field_type: Union[str, FieldType]) -> FieldType:
        """Turn a type name (any letter case) into its ``FieldType`` member."""
        if isinstance(field_type, str):
            return FieldType(field_type.lower())
        return field_type

    def add_field(self, name: str, field_type: Union[str, FieldType],
                  aggregatable: bool = None, nullable: bool = True,
                  comment: str = "") -> 'TableSchema':
        """Add (or replace) an ordinary field.

        Args:
            name: Field name.
            field_type: ``FieldType`` member or its string value.
            aggregatable: Explicit aggregatable flag; ``None`` keeps the
                default chosen by ``Field``.
            nullable: Whether the field may be null.
            comment: Field comment.

        Returns:
            ``self``, to allow chaining.
        """
        resolved_type = self._coerce_type(field_type)
        new_field = Field(
            name=name,
            field_type=resolved_type,
            nullable=nullable,
            comment=comment,
        )

        if aggregatable is not None:
            new_field.set_aggregatable(aggregatable)

        self.fields[name] = new_field
        return self

    def add_primary_key(self, name: str, field_type: Union[str, FieldType],
                        comment: str = "主键") -> 'TableSchema':
        """Add the primary-key field: non-nullable and never aggregatable."""
        resolved_type = self._coerce_type(field_type)
        key_field = Field(
            name=name,
            field_type=resolved_type,
            is_primary_key=True,
            nullable=False,
            comment=comment,
        )
        key_field.set_aggregatable(False)

        self.fields[name] = key_field
        self.primary_key = name
        return self

    def add_date_field(self, name: str, field_type: Union[str, FieldType] = FieldType.DATE,
                       comment: str = "日期字段") -> 'TableSchema':
        """Add the date field: non-nullable and never aggregatable."""
        resolved_type = self._coerce_type(field_type)
        date_column = Field(
            name=name,
            field_type=resolved_type,
            is_date_field=True,
            nullable=False,
            comment=comment,
        )
        date_column.set_aggregatable(False)

        self.fields[name] = date_column
        self.date_field = name
        return self

    def set_monthly_unique(self, is_unique: bool = True) -> 'TableSchema':
        """Record whether the data is unique per person per month."""
        self.is_monthly_unique = is_unique
        return self

    def get_aggregatable_fields(self) -> List[Field]:
        """Return the fields flagged as aggregatable, in insertion order."""
        selected = []
        for candidate in self.fields.values():
            if candidate.aggregatable:
                selected.append(candidate)
        return selected

    def get_non_aggregatable_fields(self) -> List[Field]:
        """Return non-aggregatable fields that are neither primary key nor date.

        The original documentation describes these as the fields copied
        through as-is.
        """
        selected = []
        for candidate in self.fields.values():
            if not (candidate.aggregatable or candidate.is_primary_key
                    or candidate.is_date_field):
                selected.append(candidate)
        return selected

    def validate(self) -> bool:
        """Check that a primary key and a date field are defined and present.

        Returns:
            ``True`` when the schema is valid.

        Raises:
            ValueError: If the primary key or date field is undefined, or
                names a field that is not in ``fields``.
        """
        key_name = self.primary_key
        if not key_name:
            raise ValueError("表必须定义主键")

        date_name = self.date_field
        if not date_name:
            raise ValueError("表必须定义日期字段")

        known_fields = self.fields
        if key_name not in known_fields:
            raise ValueError(f"主键字段 {self.primary_key} 不存在")

        if date_name not in known_fields:
            raise ValueError(f"日期字段 {self.date_field} 不存在")

        return True

    def __str__(self) -> str:
        """Render a multi-line, human-readable summary of the schema."""
        header = [f"Table: {self.table_name}"]
        if self.comment:
            header.append(f"Comment: {self.comment}")

        header.extend([
            f"Primary Key: {self.primary_key}",
            f"Date Field: {self.date_field}",
            f"Monthly Unique: {self.is_monthly_unique}",
            "Fields:",
        ])

        rows = []
        for field in self.fields.values():
            markers = []
            if field.is_primary_key:
                markers.append("[PK]")
            if field.is_date_field:
                markers.append("[DATE]")
            if field.aggregatable:
                markers.append("[AGG]")
            flag_str = "".join(markers)

            rows.append(f" {field.name}: {field.field_type.value} {flag_str}")

        return "\n".join(header + rows)
staran/models/__init__.py DELETED
@@ -1,72 +0,0 @@
1
- """
2
- staran.models v0.6.0 - 新疆工行代发长尾客户模型管理
3
-
4
- 专门针对新疆工行代发长尾客户的两个核心模型:
5
- 1. 代发长尾客户提升3k预测模型 (daifa_longtail_upgrade_3k)
6
- 2. 代发长尾客户防流失1.5k预测模型 (daifa_longtail_churn_1_5k)
7
-
8
- 主要功能:
9
- - 模型配置管理
10
- - SQL驱动的目标变量定义
11
- - 模型注册和版本控制
12
- - 新疆工行特定配置
13
- """
14
-
15
- from .config import ModelConfig, create_model_config
16
- from .target import TargetDefinition, create_target_definition
17
- from .registry import ModelRegistry, register_model, save_model_registry
18
- from .daifa_models import (
19
- create_daifa_longtail_upgrade_model,
20
- create_daifa_longtail_churn_model,
21
- get_available_daifa_models,
22
- create_both_daifa_models
23
- )
24
-
25
- # 便捷函数
26
def create_xinjiang_icbc_models(output_dir: str = "./xinjiang_models") -> dict:
    """Create both payroll long-tail customer models for Xinjiang ICBC.

    Thin convenience wrapper: forwards ``output_dir`` to
    ``create_both_daifa_models`` and returns its result unchanged.
    """
    created_models = create_both_daifa_models(output_dir)
    return created_models
29
-
30
def list_available_models() -> list:
    """List the available payroll long-tail customer models.

    Thin convenience wrapper around ``get_available_daifa_models``.
    """
    available = get_available_daifa_models()
    return available
33
-
34
def get_model_summary() -> dict:
    """Return a static overview of the package's models.

    A fresh dictionary is built on every call, holding the version, bank,
    business domain, and one entry per model.
    """
    upgrade_model = {
        "name": "daifa_longtail_upgrade_3k",
        "description": "预测下个月代发长尾客户资产提升3k的概率",
        "target_amount": 3000,
        "model_type": "binary_classification",
    }
    churn_model = {
        "name": "daifa_longtail_churn_1_5k",
        "description": "预测下个月代发长尾客户流失1.5k资产的风险",
        "target_amount": 1500,
        "model_type": "binary_classification",
    }

    summary = {
        "version": "0.6.0",
        "bank": "新疆工行",
        "business_domain": "代发长尾客户",
    }
    summary["models"] = [upgrade_model, churn_model]
    return summary
55
-
56
- __all__ = [
57
- # 核心组件
58
- 'ModelConfig', 'TargetDefinition', 'ModelRegistry',
59
-
60
- # 创建函数
61
- 'create_model_config', 'create_target_definition', 'register_model',
62
-
63
- # 代发长尾模型
64
- 'create_daifa_longtail_upgrade_model', 'create_daifa_longtail_churn_model',
65
- 'create_both_daifa_models', 'get_available_daifa_models',
66
-
67
- # 便捷函数
68
- 'create_xinjiang_icbc_models', 'list_available_models', 'get_model_summary',
69
- 'save_model_registry'
70
- ]
71
-
72
- __version__ = "0.6.0"
@@ -1,269 +0,0 @@
1
- """
2
- 银行特定配置模块
3
-
4
- 为不同银行提供定制化的配置和业务规则
5
- """
6
-
7
- from enum import Enum
8
- from typing import Dict, Any, List, Optional
9
- from dataclasses import dataclass, field
10
-
11
-
12
class BankCode(Enum):
    """Identifiers of the supported banks."""

    # Industrial and Commercial Bank of China
    ICBC = "icbc"
    # China Construction Bank
    CCB = "ccb"
    # Bank of China
    BOC = "boc"
    # Agricultural Bank of China
    ABC = "abc"
    # China Merchants Bank
    CMB = "cmb"
    # Generic configuration, not tied to a specific bank
    GENERIC = "generic"
20
-
21
-
22
@dataclass
class BankConfig:
    """Per-bank configuration: naming maps, rules and deployment settings."""

    # --- Basic information ---
    bank_code: str  # bank code
    bank_name: str  # bank display name
    region: str = "cn"  # region code

    # Database configuration
    database_config: Dict[str, Any] = field(default_factory=dict)

    # Standard table name -> bank-specific table name
    table_mappings: Dict[str, str] = field(default_factory=dict)

    # Table -> (standard field name -> bank-specific field name)
    field_mappings: Dict[str, Dict[str, str]] = field(default_factory=dict)

    # Business rules
    business_rules: Dict[str, Any] = field(default_factory=dict)

    # Compliance requirements, keyed by operation name
    compliance_rules: Dict[str, Any] = field(default_factory=dict)

    # Data processing rules
    data_processing_rules: Dict[str, Any] = field(default_factory=dict)

    # Model deployment configuration
    deployment_config: Dict[str, Any] = field(default_factory=dict)

    # Feature engineering configuration
    feature_engineering_config: Dict[str, Any] = field(default_factory=dict)

    def get_table_name(self, standard_table: str) -> str:
        """Return the bank-specific table name, or the input if unmapped."""
        mapping = self.table_mappings
        return mapping.get(standard_table, standard_table)

    def get_field_name(self, table: str, standard_field: str) -> str:
        """Return the bank-specific field name, or the input if unmapped."""
        return self.field_mappings.get(table, {}).get(standard_field, standard_field)

    def get_business_rule(self, rule_name: str, default=None):
        """Look up a business rule, returning ``default`` when it is absent."""
        rules = self.business_rules
        return rules.get(rule_name, default)

    def validate_compliance(self, operation: str) -> bool:
        """Return the ``'enabled'`` flag of the rules for ``operation``.

        Operations with no rules, or rules without an ``'enabled'`` key,
        yield ``True``. No further compliance logic is implemented here.
        """
        operation_rules = self.compliance_rules.get(operation, {})
        return operation_rules.get('enabled', True)
72
-
73
-
74
# Registry of bank configurations, keyed by bank code.
_BANK_CONFIGS: Dict[str, BankConfig] = {}


def register_bank_config(config: BankConfig):
    """Store ``config`` in the registry under its ``bank_code``.

    An existing entry with the same code is replaced. A confirmation line is
    printed to standard output.
    """
    _BANK_CONFIGS.update({config.bank_code: config})
    message = f"✅ 银行配置 {config.bank_code} ({config.bank_name}) 注册成功"
    print(message)
82
-
83
-
84
def get_bank_config(bank_code: str) -> Optional[BankConfig]:
    """Return the configuration registered for ``bank_code``, or ``None``."""
    registered = _BANK_CONFIGS.get(bank_code)
    return registered
87
-
88
-
89
def list_bank_configs() -> List[Dict[str, str]]:
    """Summarize every registered configuration.

    Returns:
        One dict per configuration with its ``bank_code``, ``bank_name`` and
        ``region``, in registry order.
    """
    summaries = []
    for entry in _BANK_CONFIGS.values():
        summary = {
            'bank_code': entry.bank_code,
            'bank_name': entry.bank_name,
            'region': entry.region,
        }
        summaries.append(summary)
    return summaries
99
-
100
-
101
- # 预定义银行配置
102
def create_icbc_config() -> BankConfig:
    """Build a new ``BankConfig`` holding the ICBC settings.

    Every call constructs fresh dictionaries, so callers may mutate the
    returned object without affecting later calls.
    """
    database = {
        "default_database": "dwegdata03000",
        "connection_pool_size": 10,
        "query_timeout": 300,
    }

    tables = {
        "behavior_table": "bi_hlwj_dfcw_f1_f4_wy",
        "asset_avg_table": "bi_hlwj_zi_chan_avg_wy",
        "asset_config_table": "bi_hlwj_zi_chang_month_total_zb",
        "monthly_stat_table": "bi_hlwj_realy_month_stat_wy",
    }

    fields_by_table = {
        "behavior_table": {
            "customer_id": "party_id",
            "date_field": "data_dt",
        },
    }

    business = {
        "data_retention_days": 90,
        "min_sample_size": 1000,
        "max_features": 500,
        "risk_threshold": 0.8,
        "aum_threshold": 100000,
        "longtail_definition": {
            "asset_threshold": 50000,
            "activity_threshold": 0.3,
        },
    }

    compliance = {
        "data_export": {
            "enabled": True,
            "approval_required": True,
            "encryption_required": True,
        },
        "model_deployment": {
            "enabled": True,
            "testing_required": True,
            "documentation_required": True,
        },
        "feature_selection": {
            "enabled": True,
            "sensitive_data_allowed": False,
            "audit_trail_required": True,
        },
    }

    processing = {
        "missing_value_strategy": "median",
        "outlier_detection": True,
        "outlier_threshold": 3.0,
        "feature_scaling": "standard",
        "categorical_encoding": "one_hot",
    }

    deployment = {
        "platform": "turing",
        "environment": "production",
        "monitoring_enabled": True,
        "auto_scaling": True,
        "backup_required": True,
    }

    feature_engineering = {
        "time_windows": ["1_month", "3_months", "6_months", "1_year"],
        "aggregation_functions": ["sum", "avg", "max", "min", "std"],
        "interaction_features": True,
        "polynomial_features": False,
        "target_encoding": True,
    }

    return BankConfig(
        bank_code="icbc",
        bank_name="中国工商银行",
        region="cn",
        database_config=database,
        table_mappings=tables,
        field_mappings=fields_by_table,
        business_rules=business,
        compliance_rules=compliance,
        data_processing_rules=processing,
        deployment_config=deployment,
        feature_engineering_config=feature_engineering,
    )
183
-
184
-
185
def create_generic_config() -> BankConfig:
    """Build a new ``BankConfig`` holding the generic settings.

    Settings not listed here (field mappings, deployment and feature
    engineering configuration) keep the empty defaults of ``BankConfig``.
    """
    database = {
        "default_database": "default_db",
        "connection_pool_size": 5,
        "query_timeout": 180,
    }

    tables = {
        "behavior_table": "customer_behavior",
        "asset_avg_table": "customer_assets",
        "asset_config_table": "asset_config",
        "monthly_stat_table": "monthly_stats",
    }

    business = {
        "data_retention_days": 30,
        "min_sample_size": 100,
        "max_features": 100,
    }

    compliance = {
        "data_export": {"enabled": True},
        "model_deployment": {"enabled": True},
    }

    processing = {
        "missing_value_strategy": "mean",
        "outlier_detection": False,
        "feature_scaling": "none",
    }

    return BankConfig(
        bank_code="generic",
        bank_name="通用银行配置",
        region="generic",
        database_config=database,
        table_mappings=tables,
        business_rules=business,
        compliance_rules=compliance,
        data_processing_rules=processing,
    )
222
-
223
-
224
# Default bank configurations
def initialize_default_configs():
    """Register the built-in configurations: ICBC first, then generic."""
    default_factories = (create_icbc_config, create_generic_config)
    for build_config in default_factories:
        register_bank_config(build_config())


# Executed when the module is imported, so the defaults are always present.
initialize_default_configs()
236
-
237
-
238
# Xinjiang ICBC specific configuration
def create_xinjiang_icbc_config() -> BankConfig:
    """Build the Xinjiang ICBC configuration on top of the base ICBC one.

    Starts from a fresh ``create_icbc_config()`` result, overrides its
    identity fields, and adds region-specific business and data-processing
    rules to the existing ones.
    """
    config = create_icbc_config()

    # Identity overrides on top of the base ICBC configuration.
    config.bank_code = "xinjiang_icbc"
    config.bank_name = "新疆工商银行"
    config.region = "xinjiang"

    # Xinjiang-specific business rules.
    regional_business_rules = {
        "regional_compliance": True,
        "minority_customer_support": True,
        "language_support": ["zh", "ug"],  # Chinese and Uyghur
        "timezone": "Asia/Urumqi",
        "currency_support": ["CNY"],
        "cross_border_transaction": True,
    }
    config.business_rules.update(regional_business_rules)

    # Xinjiang-specific data processing rules.
    regional_processing_rules = {
        "character_encoding": "utf-8",
        "regional_holidays": True,
        "time_zone_conversion": True,
    }
    config.data_processing_rules.update(regional_processing_rules)

    return config


# Register the Xinjiang ICBC configuration when the module is imported.
register_bank_config(create_xinjiang_icbc_config())