staran 0.6.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {staran-0.6.0.dist-info → staran-0.6.1.dist-info}/METADATA +38 -16
- {staran-0.6.0.dist-info → staran-0.6.1.dist-info}/RECORD +5 -10
- staran/examples/__init__.py +0 -8
- staran/examples/aum_longtail.py +0 -250
- staran/examples/aum_longtail_old.py +0 -487
- staran/models/bank_configs.py +0 -269
- staran/schemas/document_generator.py +0 -350
- {staran-0.6.0.dist-info → staran-0.6.1.dist-info}/WHEEL +0 -0
- {staran-0.6.0.dist-info → staran-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {staran-0.6.0.dist-info → staran-0.6.1.dist-info}/top_level.txt +0 -0
@@ -1,42 +1,64 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: staran
|
3
|
-
Version: 0.6.0
|
3
|
+
Version: 0.6.1
|
4
4
|
Summary: staran - 高性能Python工具库
|
5
5
|
Home-page: https://github.com/starlxa/staran
|
6
6
|
Author: StarAn
|
7
|
-
Author-email: starlxa@icloud.com
|
7
|
+
Author-email: StarAn <starlxa@icloud.com>
|
8
|
+
License: MIT
|
9
|
+
Project-URL: Homepage, https://github.com/starlxa/staran
|
10
|
+
Project-URL: Bug Reports, https://github.com/starlxa/staran/issues
|
11
|
+
Project-URL: Source, https://github.com/starlxa/staran
|
12
|
+
Keywords: machine-learning,feature-engineering,data-processing,sql-generation
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
14
|
+
Classifier: Intended Audience :: Developers
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
8
17
|
Classifier: Programming Language :: Python :: 3
|
9
|
-
Classifier:
|
18
|
+
Classifier: Programming Language :: Python :: 3.7
|
19
|
+
Classifier: Programming Language :: Python :: 3.8
|
20
|
+
Classifier: Programming Language :: Python :: 3.9
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
24
|
+
Classifier: Operating System :: OS Independent
|
10
25
|
Requires-Python: >=3.7
|
11
26
|
Description-Content-Type: text/markdown
|
12
27
|
License-File: LICENSE
|
13
|
-
Requires-Dist: datetime
|
14
|
-
Requires-Dist: calendar
|
15
|
-
Requires-Dist: re
|
16
28
|
Dynamic: author
|
17
|
-
Dynamic: author-email
|
18
|
-
Dynamic: classifier
|
19
|
-
Dynamic: description
|
20
|
-
Dynamic: description-content-type
|
21
29
|
Dynamic: home-page
|
22
30
|
Dynamic: license-file
|
23
|
-
Dynamic: requires-dist
|
24
31
|
Dynamic: requires-python
|
25
|
-
Dynamic: summary
|
26
32
|
|
27
|
-
# Star## ✨ v0.6.0 新特性
|
33
|
+
# Star## Staran ✨ v0.6.1 新特性
|
28
34
|
|
29
|
-
-
|
35
|
+
- 🔧 **完善的包管理** - 优化setup.py配置,移除不必要的标准库依赖
|
36
|
+
- 📋 **独立Schema模块** - 专门的表结构定义和管理模块
|
30
37
|
- 📄 **文档自动生成** - 支持Markdown/PDF/HTML格式的技术文档生成
|
31
38
|
- 🏢 **业务域支持** - AUM等业务领域的标准表结构定义
|
32
|
-
- 🔗 **无缝集成** - Schema
|
39
|
+
- 🔗 **无缝集成** - Schema与特征工程模块完美集成
|
33
40
|
- 🛠️ **模块化引擎架构** - 独立的引擎模块,支持Spark、Hive、图灵平台
|
34
41
|
- 🔧 **统一接口设计** - 所有引擎提供一致的SQL生成、执行和下载接口
|
35
42
|
- 🎯 **继承复用架构** - TuringEngine继承SparkEngine,复用SQL生成逻辑
|
36
43
|
- 📦 **清晰代码分离** - SQL生成与平台特定执行逻辑完全分离
|
37
44
|
- 🚀 **易于扩展** - 新增数据库支持只需实现BaseEngine接口
|
38
45
|
- 📁 **独立引擎存储** - engines/文件夹专门存放所有数据库引擎
|
39
|
-
- 🔄 **向后兼容** - 保持对原有API
|
46
|
+
- 🔄 **向后兼容** - 保持对原有API的完全兼容
|
47
|
+
|
48
|
+
## 🎯 专为机器学习设计的Python工具包
|
49
|
+
|
50
|
+
Staran是一个强大的特征工程和数据处理工具包,提供从数据到模型的完整解决方案。特别针对工银图灵平台优化,让特征工程和模型训练变得前所未有的简单。
|
51
|
+
|
52
|
+
## ✨ v0.6.1 新特性
|
53
|
+
|
54
|
+
- 🔧 **完善的包管理** - 优化setup.py配置,移除不必要的标准库依赖
|
55
|
+
- 🛠️ **模块化引擎架构** - 独立的引擎模块,支持Spark、Hive、图灵平台
|
56
|
+
- 🔧 **统一接口设计** - 所有引擎提供一致的SQL生成、执行和下载接口
|
57
|
+
- 🎯 **继承复用架构** - TuringEngine继承SparkEngine,复用SQL生成逻辑
|
58
|
+
- 📦 **清晰代码分离** - SQL生成与平台特定执行逻辑完全分离
|
59
|
+
- 🚀 **易于扩展** - 新增数据库支持只需实现BaseEngine接口
|
60
|
+
- 📁 **独立引擎存储** - engines/文件夹专门存放所有数据库引擎
|
61
|
+
- 🔄 **向后兼容** - 保持对原有API的完全兼容
|
40
62
|
|
41
63
|
## 🎯 专为机器学习设计的Python工具包
|
42
64
|
|
@@ -6,28 +6,23 @@ staran/engines/base.py,sha256=IIN-QxPsO-q3KmQ3Lz0cB9bs6Oac0Wy5MIF605HrHVw,7969
|
|
6
6
|
staran/engines/hive.py,sha256=-KwZiAvK5cxwnoyYQlqGWrcZkeKhbd8QCX3chpbezd0,5894
|
7
7
|
staran/engines/spark.py,sha256=XPxzefD9UF8oigeQISBW892RINJ9dGLbl994FWpIKBc,9361
|
8
8
|
staran/engines/turing.py,sha256=XEKkEMMWedvaGxKQ2vEHmB3TWLNLxOu1upgiBylwqjA,15516
|
9
|
-
staran/examples/__init__.py,sha256=rXjHvD_EA1sl04WAcOMGnktOwZstjUxaei6bo7pPMII,229
|
10
|
-
staran/examples/aum_longtail.py,sha256=UFeLzhslS0Qw1defD9M8mI6Jq4G2BHoyqdjNfX0cgH0,9915
|
11
|
-
staran/examples/aum_longtail_old.py,sha256=wZW_3NsU8lOjohtzI1ewzFIqTDAt8lnUberQJVYePfs,21723
|
12
9
|
staran/features/__init__.py,sha256=uMloEuevUjUPfro8Yv4STwvxpSVL0J1xsQTzN_EkLpo,1828
|
13
10
|
staran/features/engines.py,sha256=kqdS2xjmCVi0Xz1Oc3WaTMIavgAriX8F7VvUgVcpfqo,10039
|
14
11
|
staran/features/generator.py,sha256=CI1F_PshOvokQJelsqSaVp-SNQpMc-WVmjMQKzgdeLw,23114
|
15
12
|
staran/features/manager.py,sha256=2-3Hc3qthtyzwiuQy5QTz6RfhKK3szoylconzI3moc4,5201
|
16
13
|
staran/features/schema.py,sha256=FwOfpTcxq4K8zkO3MFNqKPQBp_e8qY-N6gazqm9_lAQ,6067
|
17
14
|
staran/models/__init__.py,sha256=VbfrRjmnp8KlFSEZOa-buECAaERptzAnvUUZK9dpgtY,2390
|
18
|
-
staran/models/bank_configs.py,sha256=wN3GA_8cb5wevDC-sWRcJ3lMuaHahZVjC85K_t2aQt0,8177
|
19
15
|
staran/models/config.py,sha256=fTbZtJq4-ZuCSSd1eW7TkIbEdDyZv2agHJCYnwOCJ_s,8886
|
20
16
|
staran/models/daifa_models.py,sha256=J7aqK41NDMDjacsjmxqwyuJfgf1kJx-Kaxj5CGQLISE,13166
|
21
17
|
staran/models/registry.py,sha256=Zeey4TtbHtJ40odyZQzOLijyZCmlMBRuniPk_znS2Q8,10223
|
22
18
|
staran/models/target.py,sha256=gKTTatxvOJjmE50qD6G6mhlYLuZL3Cvn3FLNbXl1eeU,10531
|
23
19
|
staran/schemas/__init__.py,sha256=ztrBlQ3irbgM7gHB_dhiLEX1ZpDX2AAWOeiPnZTe-sk,779
|
24
|
-
staran/schemas/document_generator.py,sha256=Mr7TjmKwspqxXnp9DhzZxsRx0l2Bo7MOI8mOxRtgwxU,13600
|
25
20
|
staran/schemas/aum/__init__.py,sha256=z0cuC6A3z-1cPKMDYrn0wCumjKkpk_0kfqGfW1JNEbc,9815
|
26
21
|
staran/tools/__init__.py,sha256=KtudrYnxKD9HZEL4H-mrWlKrmsI3rYjJrLeC9YDTpG4,1054
|
27
22
|
staran/tools/date.py,sha256=-QyEMWVx6czMuOIwcV7kR3gBMRVOwb5qevo7GEFSJKE,10488
|
28
23
|
staran/tools/document_generator.py,sha256=Mr7TjmKwspqxXnp9DhzZxsRx0l2Bo7MOI8mOxRtgwxU,13600
|
29
|
-
staran-0.6.
|
30
|
-
staran-0.6.
|
31
|
-
staran-0.6.
|
32
|
-
staran-0.6.
|
33
|
-
staran-0.6.
|
24
|
+
staran-0.6.1.dist-info/licenses/LICENSE,sha256=2EmsBIyDCono4iVXNpv5_px9qt2b7hfPq1WuyGVMNP4,1361
|
25
|
+
staran-0.6.1.dist-info/METADATA,sha256=_lLAAbTSVcGJ4hSj8DVXig4fGhgGVoYlu_YjR14IJ0M,20444
|
26
|
+
staran-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
27
|
+
staran-0.6.1.dist-info/top_level.txt,sha256=NOUZtXSh5oSIEjHrC0lQ9WmoKtD010Q00dghWyag-Zs,7
|
28
|
+
staran-0.6.1.dist-info/RECORD,,
|
staran/examples/__init__.py
DELETED
staran/examples/aum_longtail.py
DELETED
@@ -1,250 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
AUM代发长尾模型示例
|
3
|
-
基于Staran v0.3.0架构,使用schemas模块的预定义表结构
|
4
|
-
"""
|
5
|
-
|
6
|
-
from typing import Dict, Optional
|
7
|
-
from ..engines import create_turing_engine
|
8
|
-
from ..features import FeatureManager, FeatureConfig, FeatureType
|
9
|
-
from ..tools import Date
|
10
|
-
from ..schemas.aum import get_aum_schemas
|
11
|
-
|
12
|
-
|
13
|
-
class AUMLongtailExample:
|
14
|
-
"""AUM代发长尾模型示例类"""
|
15
|
-
|
16
|
-
def __init__(self, database: str = "dwegdata03000"):
|
17
|
-
"""
|
18
|
-
初始化AUM长尾模型示例
|
19
|
-
|
20
|
-
Args:
|
21
|
-
database: 数据库名称,默认为dwegdata03000
|
22
|
-
"""
|
23
|
-
self.database = database
|
24
|
-
self.engine = create_turing_engine(database)
|
25
|
-
self.schemas = get_aum_schemas() # 从schemas模块获取预定义的表结构
|
26
|
-
|
27
|
-
def run(self, feature_date: Optional[str] = None, output_path: str = "file:///nfsHome/aum_longtail") -> Dict:
|
28
|
-
"""
|
29
|
-
运行完整的AUM长尾模型特征工程
|
30
|
-
|
31
|
-
Args:
|
32
|
-
feature_date: 特征日期,格式为YYYYMM,默认为当前月
|
33
|
-
output_path: 输出路径,默认为file:///nfsHome/aum_longtail
|
34
|
-
|
35
|
-
Returns:
|
36
|
-
包含所有结果的字典
|
37
|
-
"""
|
38
|
-
if feature_date is None:
|
39
|
-
feature_date = Date.today().format_compact()[:6]
|
40
|
-
|
41
|
-
print(f"🚀 开始AUM长尾模型特征工程 - {feature_date}")
|
42
|
-
print("="*60)
|
43
|
-
|
44
|
-
results = {}
|
45
|
-
|
46
|
-
# 步骤1: 生成A表特征(行为特征表)- 只生成原始拷贝和聚合特征
|
47
|
-
print("📊 步骤1: 生成客户行为特征...")
|
48
|
-
results['behavior'] = self._generate_behavior_features('behavior', feature_date)
|
49
|
-
|
50
|
-
# 步骤2: 生成B表特征(资产平均值表)- 完整特征
|
51
|
-
print("💰 步骤2: 生成资产平均值特征...")
|
52
|
-
results['asset_avg'] = self._generate_full_features('asset_avg', feature_date)
|
53
|
-
|
54
|
-
# 步骤3: 生成C表特征(资产配置表)- 完整特征
|
55
|
-
print("📈 步骤3: 生成资产配置特征...")
|
56
|
-
results['asset_config'] = self._generate_full_features('asset_config', feature_date)
|
57
|
-
|
58
|
-
# 步骤4: 生成D表特征(月度统计表)- 完整特征
|
59
|
-
print("📋 步骤4: 生成月度统计特征...")
|
60
|
-
results['monthly_stat'] = self._generate_full_features('monthly_stat', feature_date)
|
61
|
-
|
62
|
-
# 步骤5: 导出特征表
|
63
|
-
print("💾 步骤5: 导出特征表...")
|
64
|
-
results['exports'] = self._export_features(feature_date, output_path)
|
65
|
-
|
66
|
-
print("="*60)
|
67
|
-
print("✅ AUM长尾模型特征工程完成!")
|
68
|
-
return results
|
69
|
-
|
70
|
-
def _generate_behavior_features(self, table_type: str, feature_date: str) -> Dict:
|
71
|
-
"""生成行为特征(A表)- 只生成原始拷贝和聚合特征"""
|
72
|
-
schema = self.schemas[table_type]
|
73
|
-
manager = FeatureManager(self.engine, self.database)
|
74
|
-
|
75
|
-
# A表特征配置:只启用原始拷贝和聚合
|
76
|
-
config = FeatureConfig()
|
77
|
-
config.enable_feature(FeatureType.RAW_COPY)
|
78
|
-
config.enable_feature(FeatureType.AGGREGATION)
|
79
|
-
config.disable_feature(FeatureType.MOM) # 不生成环比
|
80
|
-
config.disable_feature(FeatureType.YOY) # 不生成同比
|
81
|
-
|
82
|
-
print(f" 🔧 生成{schema.table_name}的特征...")
|
83
|
-
result = manager.generate_features(
|
84
|
-
schema=schema,
|
85
|
-
config=config,
|
86
|
-
feature_date=feature_date
|
87
|
-
)
|
88
|
-
|
89
|
-
feature_count = manager.count_features(schema, config)
|
90
|
-
print(f" ✅ A表特征生成完成: {feature_count}个特征")
|
91
|
-
return result
|
92
|
-
|
93
|
-
def _generate_full_features(self, table_type: str, feature_date: str) -> Dict:
|
94
|
-
"""生成完整特征(B、C、D表)- 聚合+5个月环比+1年同比"""
|
95
|
-
schema = self.schemas[table_type]
|
96
|
-
manager = FeatureManager(self.engine, self.database)
|
97
|
-
|
98
|
-
# B、C、D表特征配置:完整特征集
|
99
|
-
config = FeatureConfig()
|
100
|
-
config.enable_feature(FeatureType.AGGREGATION)
|
101
|
-
config.enable_feature(FeatureType.MOM, mom_windows=[5]) # 5个月环比
|
102
|
-
config.enable_feature(FeatureType.YOY, yoy_windows=[12]) # 1年同比
|
103
|
-
|
104
|
-
print(f" 🔧 生成{schema.table_name}的特征...")
|
105
|
-
result = manager.generate_features(
|
106
|
-
schema=schema,
|
107
|
-
config=config,
|
108
|
-
feature_date=feature_date
|
109
|
-
)
|
110
|
-
|
111
|
-
feature_count = manager.count_features(schema, config)
|
112
|
-
print(f" ✅ {table_type}表特征生成完成: {feature_count}个特征")
|
113
|
-
return result
|
114
|
-
|
115
|
-
def _export_features(self, feature_date: str, output_path: str) -> Dict:
|
116
|
-
"""导出所有特征表到指定路径"""
|
117
|
-
file_prefixes = {
|
118
|
-
'behavior': 'aum_behavior_features',
|
119
|
-
'asset_avg': 'aum_asset_avg_features',
|
120
|
-
'asset_config': 'aum_asset_config_features',
|
121
|
-
'monthly_stat': 'monthly_stat_features'
|
122
|
-
}
|
123
|
-
|
124
|
-
results = {}
|
125
|
-
for table_type, file_prefix in file_prefixes.items():
|
126
|
-
print(f" 💾 导出{table_type}表...")
|
127
|
-
|
128
|
-
# 构建特征表名
|
129
|
-
table_name = f"{self.schemas[table_type].table_name}_{feature_date}_f001"
|
130
|
-
|
131
|
-
result = self.engine.download_table_data(
|
132
|
-
table_name=f"{self.database}.{table_name}",
|
133
|
-
output_path=f"{output_path}/{file_prefix}_{feature_date}.parquet",
|
134
|
-
mode="cluster"
|
135
|
-
)
|
136
|
-
|
137
|
-
results[table_type] = result
|
138
|
-
print(f" ✅ 导出 {table_type}: {result.get('status', 'unknown')}")
|
139
|
-
|
140
|
-
return results
|
141
|
-
|
142
|
-
def get_summary(self) -> Dict:
|
143
|
-
"""获取示例摘要信息"""
|
144
|
-
summary = {
|
145
|
-
'database': self.database,
|
146
|
-
'tables': {},
|
147
|
-
'total_features': 0
|
148
|
-
}
|
149
|
-
|
150
|
-
for table_type, schema in self.schemas.items():
|
151
|
-
try:
|
152
|
-
manager = FeatureManager(self.engine, self.database)
|
153
|
-
|
154
|
-
if table_type == 'behavior':
|
155
|
-
# A表只有原始拷贝和聚合特征
|
156
|
-
config = FeatureConfig()
|
157
|
-
config.enable_feature(FeatureType.RAW_COPY)
|
158
|
-
config.enable_feature(FeatureType.AGGREGATION)
|
159
|
-
config.disable_feature(FeatureType.MOM)
|
160
|
-
config.disable_feature(FeatureType.YOY)
|
161
|
-
else:
|
162
|
-
# B、C、D表包含完整特征:聚合+5个月MoM+1年YoY
|
163
|
-
config = FeatureConfig()
|
164
|
-
config.enable_feature(FeatureType.AGGREGATION, mom_windows=[5], yoy_windows=[12])
|
165
|
-
|
166
|
-
feature_count = manager.count_features(schema, config)
|
167
|
-
summary['tables'][table_type] = {
|
168
|
-
'table_name': schema.table_name,
|
169
|
-
'field_count': len(schema.fields),
|
170
|
-
'feature_count': feature_count,
|
171
|
-
'features': {
|
172
|
-
'total': feature_count,
|
173
|
-
'aggregation': len(schema.fields), # 估算
|
174
|
-
'mom': len(schema.fields) * 5 if table_type != 'behavior' else 0,
|
175
|
-
'yoy': len(schema.fields) * 1 if table_type != 'behavior' else 0
|
176
|
-
}
|
177
|
-
}
|
178
|
-
summary['total_features'] += feature_count
|
179
|
-
except Exception as e:
|
180
|
-
# 在模拟模式下返回预估数量
|
181
|
-
base_fields = len(schema.fields)
|
182
|
-
if table_type == 'behavior':
|
183
|
-
estimated_features = base_fields * 2 # 原始拷贝 + 聚合
|
184
|
-
agg_count = base_fields
|
185
|
-
mom_count = 0
|
186
|
-
yoy_count = 0
|
187
|
-
else:
|
188
|
-
estimated_features = base_fields * 8 # 聚合 + MoM + YoY 组合
|
189
|
-
agg_count = base_fields
|
190
|
-
mom_count = base_fields * 5
|
191
|
-
yoy_count = base_fields * 1
|
192
|
-
|
193
|
-
summary['tables'][table_type] = {
|
194
|
-
'table_name': schema.table_name,
|
195
|
-
'field_count': base_fields,
|
196
|
-
'feature_count': estimated_features,
|
197
|
-
'mode': 'estimated',
|
198
|
-
'features': {
|
199
|
-
'total': estimated_features,
|
200
|
-
'aggregation': agg_count,
|
201
|
-
'mom': mom_count,
|
202
|
-
'yoy': yoy_count
|
203
|
-
}
|
204
|
-
}
|
205
|
-
summary['total_features'] += estimated_features
|
206
|
-
|
207
|
-
return summary
|
208
|
-
|
209
|
-
|
210
|
-
# 简化的API函数
|
211
|
-
def create_aum_example(database: str = "dwegdata03000") -> AUMLongtailExample:
|
212
|
-
"""
|
213
|
-
一键创建AUM长尾模型示例
|
214
|
-
|
215
|
-
Args:
|
216
|
-
database: 数据库名称,默认为dwegdata03000
|
217
|
-
|
218
|
-
Returns:
|
219
|
-
AUMLongtailExample实例
|
220
|
-
"""
|
221
|
-
return AUMLongtailExample(database)
|
222
|
-
|
223
|
-
|
224
|
-
def run_aum_example(feature_date: Optional[str] = None,
|
225
|
-
database: str = "dwegdata03000",
|
226
|
-
output_path: str = "file:///nfsHome/aum_longtail") -> Dict:
|
227
|
-
"""
|
228
|
-
一键运行AUM长尾模型特征工程
|
229
|
-
|
230
|
-
Args:
|
231
|
-
feature_date: 特征日期,格式为YYYYMM,默认为当前月
|
232
|
-
database: 数据库名称,默认为dwegdata03000
|
233
|
-
output_path: 输出路径,默认为file:///nfsHome/aum_longtail
|
234
|
-
|
235
|
-
Returns:
|
236
|
-
包含所有结果的字典
|
237
|
-
|
238
|
-
Example:
|
239
|
-
>>> results = run_aum_example('202507')
|
240
|
-
>>> print(f"生成特征数: {len(results)}")
|
241
|
-
"""
|
242
|
-
example = create_aum_example(database)
|
243
|
-
return example.run(feature_date, output_path)
|
244
|
-
|
245
|
-
|
246
|
-
__all__ = [
|
247
|
-
'AUMLongtailExample',
|
248
|
-
'create_aum_example',
|
249
|
-
'run_aum_example'
|
250
|
-
]
|