staran 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ 特征管理器
6
+ 负责特征工程的核心管理功能,基于新的引擎架构
7
+ """
8
+
9
+ from typing import Optional, Dict, Any, List, Callable
10
+ from datetime import datetime
11
+ from ..engines import BaseEngine, create_engine, DatabaseType
12
+
13
+
14
class FeatureManager:
    """Facade over a database engine for feature-engineering work.

    The constructor builds an engine through ``create_engine`` and every
    public method forwards its arguments to the engine method of the same
    name, returning whatever the engine returns.
    """

    def __init__(self, database_name: str, engine_type: str = "spark",
                 sql_executor: Optional[Callable] = None):
        """Store the connection settings and build the backing engine.

        Args:
            database_name: Name of the database the engine works against.
            engine_type: Engine identifier handed to ``create_engine``
                (the upstream docs list 'spark', 'hive' and 'turing').
            sql_executor: Optional callable forwarded to ``create_engine``
                (upstream docs describe it as used by non-turing engines).
        """
        self.database_name = database_name
        self.engine_type = engine_type

        # Collect the factory arguments first, then build the engine.
        engine_options = {
            "engine_type": engine_type,
            "database_name": database_name,
            "sql_executor": sql_executor,
        }
        self.engine = create_engine(**engine_options)

    # ------------------------------------------------------------------
    # Everything below is a straight pass-through to the engine.
    # ------------------------------------------------------------------
    def execute_sql(self, sql: str, description: str = "") -> Any:
        """Run ``sql`` on the engine and return the engine's result."""
        backend = self.engine
        return backend.execute_sql(sql, description)

    def get_full_table_name(self, table_name: str) -> str:
        """Return the engine's fully qualified name for ``table_name``."""
        backend = self.engine
        return backend.get_full_table_name(table_name)

    def generate_table_name(self, base_name: str, year: int, month: int,
                            suffix: str = "raw") -> str:
        """Ask the engine for a standardized table name.

        The upstream documentation gives the format as
        ``{base_name}_{yyyy}_{MM}_{suffix}``; the formatting itself is done
        by the engine.
        """
        backend = self.engine
        return backend.generate_table_name(base_name, year, month, suffix)

    def create_table(self, table_name: str, select_sql: str,
                     execute: bool = False, **kwargs) -> Dict[str, Any]:
        """Forward a create-table request; extra keywords go to the engine."""
        backend = self.engine
        return backend.create_table(table_name, select_sql, execute, **kwargs)

    def drop_table(self, table_name: str, execute: bool = False) -> Dict[str, Any]:
        """Forward a drop-table request to the engine."""
        backend = self.engine
        return backend.drop_table(table_name, execute)

    def download_table_data(self, table_name: str, output_path: str,
                            **kwargs) -> Dict[str, Any]:
        """Forward a table download request to the engine."""
        backend = self.engine
        return backend.download_table_data(table_name, output_path, **kwargs)

    def download_query_result(self, sql: str, output_path: str,
                              **kwargs) -> Dict[str, Any]:
        """Forward a query-result download request to the engine."""
        backend = self.engine
        return backend.download_query_result(sql, output_path, **kwargs)

    def get_execution_history(self) -> List[Dict]:
        """Return the SQL execution history kept by the engine."""
        backend = self.engine
        return backend.get_execution_history()

    def clear_history(self):
        """Ask the engine to discard its execution history."""
        backend = self.engine
        backend.clear_history()

    def __str__(self):
        return "FeatureManager(engine={})".format(self.engine)
85
+
86
+
87
class FeatureTableManager:
    """Create, drop and track feature tables through a ``FeatureManager``.

    The manager keeps an in-memory list (``created_tables``) of the tables
    it has successfully created. That list is the only source consulted by
    ``table_exists``; the database itself is never queried here.
    """

    def __init__(self, feature_manager: FeatureManager):
        """Bind the table manager to a feature manager.

        Args:
            feature_manager: Object used to generate table names and to
                create / drop tables.
        """
        self.feature_manager = feature_manager
        self.created_tables: List[str] = []

    def create_feature_table(self, base_name: str, year: int, month: int,
                             version: int, sql: str, execute: bool = False,
                             **kwargs) -> str:
        """Create a feature table and remember it when creation succeeds.

        Args:
            base_name: Base table name.
            year: Year used in the generated table name.
            month: Month used in the generated table name.
            version: Version number. NOTE(review): accepted for interface
                compatibility but not used when building the table name -
                confirm whether it should influence the name.
            sql: SELECT statement handed to ``create_table``.
            execute: Whether the statement should be executed immediately.
            **kwargs: Extra options forwarded to ``create_table``.

        Returns:
            The generated table name.
        """
        table_name = self.feature_manager.generate_table_name(base_name, year, month)

        result = self.feature_manager.create_table(table_name, sql, execute, **kwargs)

        succeeded = execute and result.get('status') == 'success'
        # Track each table once: re-creating the same table must not leave
        # a duplicate entry that would survive a later drop.
        if succeeded and table_name not in self.created_tables:
            self.created_tables.append(table_name)

        return table_name

    def drop_feature_table(self, table_name: str, execute: bool = False) -> str:
        """Drop a feature table and forget it when the drop succeeds.

        Args:
            table_name: Name of the table to drop.
            execute: Whether the statement should be executed immediately.

        Returns:
            The ``'sql'`` entry of the drop result, or ``''`` when absent.
        """
        result = self.feature_manager.drop_table(table_name, execute)

        if execute and result.get('status') == 'success':
            # Remove every occurrence in place so the list object callers
            # may hold stays the same and no stale duplicate remains.
            self.created_tables[:] = [
                name for name in self.created_tables if name != table_name
            ]

        return result.get('sql', '')

    def get_created_tables(self) -> List[str]:
        """Return a copy of the names of the tables created so far."""
        return self.created_tables.copy()

    def table_exists(self, table_name: str) -> bool:
        """Report whether this manager has recorded ``table_name`` as created.

        This is a bookkeeping check only; the database is not queried.
        """
        return table_name in self.created_tables
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ 表结构定义模块
6
+ 定义数据库表的字段、类型和分析特性
7
+ """
8
+
9
+ from enum import Enum
10
+ from typing import Dict, List, Optional, Union
11
+ from dataclasses import dataclass
12
+
13
+
14
class FieldType(Enum):
    """Column data types available to table schema definitions.

    Each member's value is the lowercase type keyword; looking a keyword
    up with ``FieldType("int")`` yields the matching member.
    """

    # Text
    STRING = "string"

    # Numeric
    INTEGER = "int"
    BIGINT = "bigint"
    DECIMAL = "decimal"
    DOUBLE = "double"
    FLOAT = "float"

    # Temporal
    DATE = "date"
    TIMESTAMP = "timestamp"

    # Logical
    BOOLEAN = "boolean"
25
+
26
+
27
@dataclass
class Field:
    """Definition of a single table column.

    ``aggregatable`` may be left unspecified (``None``); it is then resolved
    in ``__post_init__`` to ``True`` for numeric field types and ``False``
    for every other type. An explicitly passed value is always kept.
    """
    # Column name.
    name: str
    # Column data type.
    field_type: FieldType
    # Whether the column is the table's primary key.
    is_primary_key: bool = False
    # Whether the column is the table's date column.
    is_date_field: bool = False
    # None means "not specified"; replaced by a bool in __post_init__.
    aggregatable: Optional[bool] = None
    # Whether the column may hold NULL values.
    nullable: bool = True
    # Free-text description of the column.
    comment: str = ""

    def __post_init__(self):
        """Resolve an unspecified ``aggregatable`` from the field type.

        The previous implementation guarded the numeric default with a
        ``hasattr(self, '_aggregatable_set')`` check that could never be
        true during construction, so an explicit ``aggregatable=False`` on
        a numeric column was silently overwritten with ``True``.
        """
        numeric_types = (
            FieldType.INTEGER,
            FieldType.BIGINT,
            FieldType.DECIMAL,
            FieldType.DOUBLE,
            FieldType.FLOAT,
        )
        if self.aggregatable is None:
            # Numeric columns aggregate by default; everything else does not.
            self.aggregatable = self.field_type in numeric_types

    def set_aggregatable(self, aggregatable: bool):
        """Override the aggregatable flag and return ``self`` for chaining.

        Also records ``_aggregatable_set = True`` on the instance, as the
        original implementation did.
        """
        self.aggregatable = aggregatable
        self._aggregatable_set = True
        return self
51
+
52
+
53
class TableSchema:
    """Description of a database table: its columns, primary key and date column.

    The ``add_*`` and ``set_*`` methods return ``self`` so calls can be
    chained. Adding a column under a name that already exists replaces the
    earlier definition.
    """

    def __init__(self, table_name: str, comment: str = ""):
        """Create an empty schema.

        Args:
            table_name: Name of the table.
            comment: Free-text description of the table.
        """
        self.table_name = table_name
        self.comment = comment
        self.fields: Dict[str, Field] = {}
        self.primary_key: Optional[str] = None
        self.date_field: Optional[str] = None
        self.is_monthly_unique: bool = False

    @staticmethod
    def _coerce_field_type(field_type: Union[str, FieldType]) -> FieldType:
        """Turn a type keyword or member name into a ``FieldType``.

        Non-string input is returned unchanged. Strings are matched
        case-insensitively against the enum values first ("int", "date")
        and then against the member names ("INTEGER").

        Raises:
            ValueError: If the string matches neither a value nor a name.
        """
        if not isinstance(field_type, str):
            return field_type

        keyword = field_type.lower()
        try:
            return FieldType(keyword)
        except ValueError:
            # Fall back to the member name so "INTEGER" resolves like "int".
            member = FieldType.__members__.get(keyword.upper())
            if member is None:
                raise
            return member

    def add_field(self, name: str, field_type: Union[str, FieldType],
                  aggregatable: Optional[bool] = None, nullable: bool = True,
                  comment: str = "") -> 'TableSchema':
        """Add a regular column.

        Args:
            name: Column name.
            field_type: Column type, as a ``FieldType`` or a string.
            aggregatable: Explicit aggregatable flag; ``None`` leaves the
                decision to ``Field``.
            nullable: Whether the column may be NULL.
            comment: Column description.

        Returns:
            This schema, to allow chaining.

        Raises:
            ValueError: If ``field_type`` is a string naming no known type.
        """
        field = Field(
            name=name,
            field_type=self._coerce_field_type(field_type),
            nullable=nullable,
            comment=comment
        )

        if aggregatable is not None:
            field.set_aggregatable(aggregatable)

        self.fields[name] = field
        return self

    def add_primary_key(self, name: str, field_type: Union[str, FieldType],
                        comment: str = "主键") -> 'TableSchema':
        """Add a non-nullable, non-aggregatable column and mark it as primary key.

        Returns:
            This schema, to allow chaining.

        Raises:
            ValueError: If ``field_type`` is a string naming no known type.
        """
        field = Field(
            name=name,
            field_type=self._coerce_field_type(field_type),
            is_primary_key=True,
            nullable=False,
            comment=comment
        )
        # Key columns are never aggregated, even when numeric.
        field.set_aggregatable(False)

        self.fields[name] = field
        self.primary_key = name
        return self

    def add_date_field(self, name: str, field_type: Union[str, FieldType] = FieldType.DATE,
                       comment: str = "日期字段") -> 'TableSchema':
        """Add a non-nullable, non-aggregatable column and mark it as the date column.

        Returns:
            This schema, to allow chaining.

        Raises:
            ValueError: If ``field_type`` is a string naming no known type.
        """
        field = Field(
            name=name,
            field_type=self._coerce_field_type(field_type),
            is_date_field=True,
            nullable=False,
            comment=comment
        )
        # The date column is never aggregated.
        field.set_aggregatable(False)

        self.fields[name] = field
        self.date_field = name
        return self

    def set_monthly_unique(self, is_unique: bool = True) -> 'TableSchema':
        """Set the monthly-unique flag and return this schema.

        The upstream documentation describes the flag as "one row per
        person per month".
        """
        self.is_monthly_unique = is_unique
        return self

    def get_aggregatable_fields(self) -> List[Field]:
        """Return the columns whose ``aggregatable`` flag is true."""
        return [field for field in self.fields.values() if field.aggregatable]

    def get_non_aggregatable_fields(self) -> List[Field]:
        """Return non-aggregatable columns, excluding the key and date columns."""
        return [
            field for field in self.fields.values()
            if not (field.aggregatable or field.is_primary_key or field.is_date_field)
        ]

    def validate(self) -> bool:
        """Check that a primary key and a date column are defined and present.

        Returns:
            ``True`` when every check passes.

        Raises:
            ValueError: If the primary key or date column is unset, or
                names a column that is not in ``fields``.
        """
        if not self.primary_key:
            raise ValueError("表必须定义主键")

        if not self.date_field:
            raise ValueError("表必须定义日期字段")

        if self.primary_key not in self.fields:
            raise ValueError(f"主键字段 {self.primary_key} 不存在")

        if self.date_field not in self.fields:
            raise ValueError(f"日期字段 {self.date_field} 不存在")

        return True

    def __str__(self) -> str:
        """Return a multi-line, human-readable summary of the schema."""
        lines = [f"Table: {self.table_name}"]
        if self.comment:
            lines.append(f"Comment: {self.comment}")

        lines.extend([
            f"Primary Key: {self.primary_key}",
            f"Date Field: {self.date_field}",
            f"Monthly Unique: {self.is_monthly_unique}",
            "Fields:",
        ])

        for field in self.fields.values():
            flag_str = ""
            if field.is_primary_key:
                flag_str += "[PK]"
            if field.is_date_field:
                flag_str += "[DATE]"
            if field.aggregatable:
                flag_str += "[AGG]"

            lines.append(f" {field.name}: {field.field_type.value} {flag_str}")

        return "\n".join(lines)
@@ -0,0 +1,28 @@
1
+ """
2
+ Staran Schemas模块 - 数据表结构定义与文档生成
3
+
4
+ 提供标准化的表结构定义、字段管理和文档生成功能。
5
+ 支持根据表结构生成Markdown和PDF文档供业务方使用。
6
+
7
+ 主要功能:
8
+ - 表结构标准化定义
9
+ - 业务字段含义管理
10
+ - 文档自动生成 (MD/PDF)
11
+ - 多业务领域支持
12
+ """
13
+
14
+ from .document_generator import SchemaDocumentGenerator
15
+ from .aum import *
16
+
17
# Names exported by ``from <package> import *``.
__all__ = [
    "SchemaDocumentGenerator",
    # AUM business tables and helpers; presumably supplied by the
    # ``from .aum import *`` statement above - verify against that module.
    "AUMBehaviorSchema",
    "AUMAssetAvgSchema",
    "AUMAssetConfigSchema",
    "AUMMonthlyStatSchema",
    "get_aum_schemas",
    "export_aum_docs",
]

# NOTE(review): the enclosing wheel is released as 0.4.0 - confirm whether
# this module-level version is intentionally kept at 0.3.0.
__version__ = "0.3.0"