staran 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- staran/__init__.py +24 -9
- staran/engines/__init__.py +65 -0
- staran/engines/base.py +255 -0
- staran/engines/hive.py +163 -0
- staran/engines/spark.py +252 -0
- staran/engines/turing.py +439 -0
- staran/examples/__init__.py +8 -0
- staran/examples/aum_longtail.py +250 -0
- staran/examples/aum_longtail_old.py +487 -0
- staran/features/__init__.py +59 -0
- staran/features/engines.py +284 -0
- staran/features/generator.py +603 -0
- staran/features/manager.py +155 -0
- staran/features/schema.py +193 -0
- staran/schemas/__init__.py +28 -0
- staran/schemas/aum/__init__.py +314 -0
- staran/schemas/document_generator.py +337 -0
- {staran-0.2.4.dist-info → staran-0.4.0.dist-info}/METADATA +151 -33
- staran-0.4.0.dist-info/RECORD +24 -0
- staran-0.2.4.dist-info/RECORD +0 -8
- {staran-0.2.4.dist-info → staran-0.4.0.dist-info}/WHEEL +0 -0
- {staran-0.2.4.dist-info → staran-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {staran-0.2.4.dist-info → staran-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,155 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
"""
|
5
|
+
特征管理器
|
6
|
+
负责特征工程的核心管理功能,基于新的引擎架构
|
7
|
+
"""
|
8
|
+
|
9
|
+
from typing import Optional, Dict, Any, List, Callable
|
10
|
+
from datetime import datetime
|
11
|
+
from ..engines import BaseEngine, create_engine, DatabaseType
|
12
|
+
|
13
|
+
|
14
|
+
class FeatureManager:
    """Facade for feature-engineering work on top of a pluggable database engine.

    One engine is built in ``__init__``; every public method simply forwards
    its arguments to that engine and hands back whatever the engine returns.
    """

    def __init__(
        self,
        database_name: str,
        engine_type: str = "spark",
        sql_executor: Optional[Callable] = None,
    ):
        """Store the settings and build the backing engine.

        Args:
            database_name: Name of the database to work in.
            engine_type: Engine kind; the original notes list 'spark',
                'hive' and 'turing'.
            sql_executor: Optional callable that runs SQL (per the original
                notes, only relevant for non-turing engines).
        """
        self.database_name = database_name
        self.engine_type = engine_type

        # Gather the factory arguments, then let create_engine build the engine.
        engine_options = dict(
            engine_type=engine_type,
            database_name=database_name,
            sql_executor=sql_executor,
        )
        self.engine = create_engine(**engine_options)

    # ------------------------------------------------------------------
    # Everything below delegates straight to the engine.
    # ------------------------------------------------------------------
    def execute_sql(self, sql: str, description: str = "") -> Any:
        """Run a SQL statement through the engine and return its result."""
        backend = self.engine
        return backend.execute_sql(sql, description)

    def get_full_table_name(self, table_name: str) -> str:
        """Return the engine's fully qualified name for ``table_name``."""
        backend = self.engine
        return backend.get_full_table_name(table_name)

    def generate_table_name(
        self,
        base_name: str,
        year: int,
        month: int,
        suffix: str = "raw",
    ) -> str:
        """Ask the engine for a standardized table name.

        The documented format is ``{base_name}_{yyyy}_{MM}_{suffix}``; the
        actual formatting is performed by the engine.
        """
        backend = self.engine
        return backend.generate_table_name(base_name, year, month, suffix)

    def create_table(
        self,
        table_name: str,
        select_sql: str,
        execute: bool = False,
        **kwargs,
    ) -> Dict[str, Any]:
        """Forward a create-table request (plus extra options) to the engine."""
        backend = self.engine
        return backend.create_table(table_name, select_sql, execute, **kwargs)

    def drop_table(self, table_name: str, execute: bool = False) -> Dict[str, Any]:
        """Forward a drop-table request to the engine."""
        backend = self.engine
        return backend.drop_table(table_name, execute)

    def download_table_data(
        self,
        table_name: str,
        output_path: str,
        **kwargs,
    ) -> Dict[str, Any]:
        """Forward a table-download request to the engine."""
        backend = self.engine
        return backend.download_table_data(table_name, output_path, **kwargs)

    def download_query_result(
        self,
        sql: str,
        output_path: str,
        **kwargs,
    ) -> Dict[str, Any]:
        """Forward a query-result download request to the engine."""
        backend = self.engine
        return backend.download_query_result(sql, output_path, **kwargs)

    def get_execution_history(self) -> List[Dict]:
        """Return the SQL execution history kept by the engine."""
        backend = self.engine
        return backend.get_execution_history()

    def clear_history(self):
        """Ask the engine to discard its execution history."""
        backend = self.engine
        backend.clear_history()

    def __str__(self):
        return f"FeatureManager(engine={self.engine})"
|
85
|
+
|
86
|
+
|
87
|
+
class FeatureTableManager:
    """Create and drop feature tables while remembering which ones exist.

    Tracking is purely in-memory: a table is recorded only when it was
    created through this instance with ``execute=True`` and the manager
    reported ``status == 'success'``.
    """

    def __init__(self, feature_manager: "FeatureManager"):
        """Bind the table manager to a feature manager.

        Args:
            feature_manager: Manager used to name, create and drop tables.
        """
        self.feature_manager = feature_manager
        # Names of tables created (and not yet dropped) via this instance.
        self.created_tables: List[str] = []

    def create_feature_table(self, base_name: str, year: int, month: int,
                             version: int, sql: str, execute: bool = False,
                             **kwargs) -> str:
        """Create a feature table and return its generated name.

        Args:
            base_name: Base table name.
            year: Year component of the table name.
            month: Month component of the table name.
            version: Version number. NOTE(review): accepted for interface
                compatibility but currently not used when building the
                table name -- confirm whether it should become the suffix.
            sql: SQL used to create the table.
            execute: Forwarded to the manager; the table is only tracked
                when this is true and the creation succeeded.
            **kwargs: Extra options forwarded to the manager/engine.

        Returns:
            The generated table name (also returned for dry runs and
            failed creations).
        """
        table_name = self.feature_manager.generate_table_name(base_name, year, month)

        result = self.feature_manager.create_table(table_name, sql, execute, **kwargs)

        created = execute and result.get('status') == 'success'
        # Track each table once: re-creating the same table must not leave a
        # duplicate entry behind that would survive a later drop.
        if created and table_name not in self.created_tables:
            self.created_tables.append(table_name)

        return table_name

    def drop_feature_table(self, table_name: str, execute: bool = False) -> str:
        """Drop a feature table and stop tracking it.

        Args:
            table_name: Name of the table to drop.
            execute: Forwarded to the manager; tracking is only updated
                when this is true and the drop succeeded.

        Returns:
            The ``'sql'`` entry of the manager's result, or ``''`` when the
            result has no such entry.
        """
        result = self.feature_manager.drop_table(table_name, execute)

        if execute and result.get('status') == 'success':
            # Remove every occurrence, in place, so callers holding a
            # reference to the list see the update and no stale duplicate
            # keeps table_exists() returning True.
            self.created_tables[:] = [
                tracked for tracked in self.created_tables if tracked != table_name
            ]

        return result.get('sql', '')

    def get_created_tables(self) -> List[str]:
        """Return a copy of the tracked table names."""
        return self.created_tables.copy()

    def table_exists(self, table_name: str) -> bool:
        """Report whether ``table_name`` is tracked by this instance.

        This is a local check only; the database itself is not queried.
        """
        return table_name in self.created_tables
|
@@ -0,0 +1,193 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
"""
|
5
|
+
表结构定义模块
|
6
|
+
定义数据库表的字段、类型和分析特性
|
7
|
+
"""
|
8
|
+
|
9
|
+
from enum import Enum
|
10
|
+
from typing import Dict, List, Optional, Union
|
11
|
+
from dataclasses import dataclass
|
12
|
+
|
13
|
+
|
14
|
+
class FieldType(Enum):
    """Column data types supported by table schemas."""
    STRING = "string"
    INTEGER = "int"
    BIGINT = "bigint"
    DECIMAL = "decimal"
    DOUBLE = "double"
    FLOAT = "float"
    DATE = "date"
    TIMESTAMP = "timestamp"
    BOOLEAN = "boolean"


@dataclass
class Field:
    """Definition of a single table column.

    ``aggregatable`` may be left unspecified (``None``); it is then resolved
    in ``__post_init__`` to ``True`` for numeric types and ``False`` for
    everything else. An explicit ``True``/``False`` is always respected, so
    after construction the attribute is always a plain ``bool``.
    """
    name: str
    field_type: FieldType
    is_primary_key: bool = False
    is_date_field: bool = False
    # None means "decide from the field type"; resolved in __post_init__.
    aggregatable: Optional[bool] = None
    nullable: bool = True
    comment: str = ""

    # Types that are aggregatable by default (not a dataclass field because
    # it carries no annotation).
    _NUMERIC_TYPES = (
        FieldType.INTEGER,
        FieldType.BIGINT,
        FieldType.DECIMAL,
        FieldType.DOUBLE,
        FieldType.FLOAT,
    )

    def __post_init__(self):
        """Resolve the default for ``aggregatable`` when none was given."""
        # Only fill in the default; an explicit value passed to the
        # constructor (or carried over by dataclasses.replace) must win.
        if self.aggregatable is None:
            self.aggregatable = self.field_type in self._NUMERIC_TYPES

    def set_aggregatable(self, aggregatable: bool):
        """Explicitly set whether the field can be aggregated.

        Returns:
            ``self``, so calls can be chained.
        """
        self.aggregatable = aggregatable
        # Kept for backward compatibility with code that inspects this marker.
        self._aggregatable_set = True
        return self
|
51
|
+
|
52
|
+
|
53
|
+
class TableSchema:
    """Fluent description of one table: its fields, primary key and date field.

    The ``add_*`` and ``set_*`` methods return ``self`` so that a schema can
    be built with chained calls.
    """

    def __init__(self, table_name: str, comment: str = ""):
        """Create an empty schema.

        Args:
            table_name: Name of the table.
            comment: Free-text table comment.
        """
        self.table_name = table_name
        self.comment = comment
        self.fields: Dict[str, Field] = {}
        self.primary_key: Optional[str] = None
        self.date_field: Optional[str] = None
        self.is_monthly_unique: bool = False

    @staticmethod
    def _coerce_field_type(field_type: Union[str, FieldType]) -> FieldType:
        """Turn a type name into a ``FieldType``.

        Strings are matched against the enum values ignoring case and
        surrounding whitespace. Values that are not strings are returned
        unchanged.

        Raises:
            ValueError: If the string does not name a supported type.
        """
        if not isinstance(field_type, str):
            return field_type

        try:
            return FieldType(field_type.strip().lower())
        except ValueError:
            supported = ", ".join(member.value for member in FieldType)
            # Same exception type as the bare enum lookup, but the message
            # tells the caller which names are accepted.
            raise ValueError(
                f"不支持的字段类型 {field_type!r},支持的类型: {supported}"
            ) from None

    def add_field(self, name: str, field_type: Union[str, FieldType],
                  aggregatable: Optional[bool] = None, nullable: bool = True,
                  comment: str = "") -> 'TableSchema':
        """Add (or replace) a regular field.

        Args:
            name: Field name.
            field_type: Field type, as a ``FieldType`` or its string value.
            aggregatable: Whether the field can be aggregated; ``None``
                leaves the decision to ``Field``.
            nullable: Whether the field may be null.
            comment: Field comment.

        Returns:
            ``self``, to allow chained calls.

        Raises:
            ValueError: If ``field_type`` is an unknown type name.
        """
        field = Field(
            name=name,
            field_type=self._coerce_field_type(field_type),
            nullable=nullable,
            comment=comment
        )

        # Only override Field's own default when the caller was explicit.
        if aggregatable is not None:
            field.set_aggregatable(aggregatable)

        self.fields[name] = field
        return self

    def add_primary_key(self, name: str, field_type: Union[str, FieldType],
                        comment: str = "主键") -> 'TableSchema':
        """Add the primary-key field and record it as ``primary_key``.

        The field is non-nullable and never aggregatable.

        Raises:
            ValueError: If ``field_type`` is an unknown type name.
        """
        field = Field(
            name=name,
            field_type=self._coerce_field_type(field_type),
            is_primary_key=True,
            nullable=False,
            comment=comment
        )
        # Keys are identifiers, not measures, even when they are numeric.
        field.set_aggregatable(False)

        self.fields[name] = field
        self.primary_key = name
        return self

    def add_date_field(self, name: str, field_type: Union[str, FieldType] = FieldType.DATE,
                       comment: str = "日期字段") -> 'TableSchema':
        """Add the date field and record it as ``date_field``.

        The field is non-nullable and never aggregatable.

        Raises:
            ValueError: If ``field_type`` is an unknown type name.
        """
        field = Field(
            name=name,
            field_type=self._coerce_field_type(field_type),
            is_date_field=True,
            nullable=False,
            comment=comment
        )
        field.set_aggregatable(False)

        self.fields[name] = field
        self.date_field = name
        return self

    def set_monthly_unique(self, is_unique: bool = True) -> 'TableSchema':
        """Mark whether the data holds one row per person per month."""
        self.is_monthly_unique = is_unique
        return self

    def get_aggregatable_fields(self) -> List[Field]:
        """Return the fields flagged as aggregatable, in insertion order."""
        return [field for field in self.fields.values() if field.aggregatable]

    def get_non_aggregatable_fields(self) -> List[Field]:
        """Return plain non-aggregatable fields (used for raw copies).

        The primary-key and date fields are excluded.
        """
        return [
            field for field in self.fields.values()
            if not (field.aggregatable or field.is_primary_key or field.is_date_field)
        ]

    def validate(self) -> bool:
        """Check that a primary key and a date field are defined and present.

        Returns:
            ``True`` when the schema is valid.

        Raises:
            ValueError: If the primary key or date field is missing or does
                not refer to a known field.
        """
        if not self.primary_key:
            raise ValueError("表必须定义主键")

        if not self.date_field:
            raise ValueError("表必须定义日期字段")

        if self.primary_key not in self.fields:
            raise ValueError(f"主键字段 {self.primary_key} 不存在")

        if self.date_field not in self.fields:
            raise ValueError(f"日期字段 {self.date_field} 不存在")

        return True

    def __str__(self) -> str:
        """Render a multi-line, human-readable summary of the schema."""
        lines = [f"Table: {self.table_name}"]
        if self.comment:
            lines.append(f"Comment: {self.comment}")

        lines.append(f"Primary Key: {self.primary_key}")
        lines.append(f"Date Field: {self.date_field}")
        lines.append(f"Monthly Unique: {self.is_monthly_unique}")
        lines.append("Fields:")

        for field in self.fields.values():
            flag_str = ""
            if field.is_primary_key:
                flag_str += "[PK]"
            if field.is_date_field:
                flag_str += "[DATE]"
            if field.aggregatable:
                flag_str += "[AGG]"

            lines.append(f" {field.name}: {field.field_type.value} {flag_str}")

        return "\n".join(lines)
|
@@ -0,0 +1,28 @@
|
|
1
|
+
"""
|
2
|
+
Staran Schemas模块 - 数据表结构定义与文档生成
|
3
|
+
|
4
|
+
提供标准化的表结构定义、字段管理和文档生成功能。
|
5
|
+
支持根据表结构生成Markdown和PDF文档供业务方使用。
|
6
|
+
|
7
|
+
主要功能:
|
8
|
+
- 表结构标准化定义
|
9
|
+
- 业务字段含义管理
|
10
|
+
- 文档自动生成 (MD/PDF)
|
11
|
+
- 多业务领域支持
|
12
|
+
"""
|
13
|
+
|
14
|
+
from .document_generator import SchemaDocumentGenerator
|
15
|
+
from .aum import *
|
16
|
+
|
17
|
+
__all__ = [
|
18
|
+
'SchemaDocumentGenerator',
|
19
|
+
# AUM业务表
|
20
|
+
'AUMBehaviorSchema',
|
21
|
+
'AUMAssetAvgSchema',
|
22
|
+
'AUMAssetConfigSchema',
|
23
|
+
'AUMMonthlyStatSchema',
|
24
|
+
'get_aum_schemas',
|
25
|
+
'export_aum_docs'
|
26
|
+
]
|
27
|
+
|
28
|
+
__version__ = "0.3.0"
|