staran 0.6.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
staran/engines/turing.py DELETED
@@ -1,439 +0,0 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
-
- """
- Turing platform engine
- Extends the Spark engine, overriding the execution and download methods to use turingPythonLib
- """
-
- from typing import Dict, Any, Optional, List, Callable
- import sys
- import os
- from datetime import datetime
- from .spark import SparkEngine
-
- # Try to import turingPythonLib (present in the Turing platform environment)
- try:
-     sys.path.append("/nfsHome/")
-     import turingPythonLib as tp
-     TURINGLIB_AVAILABLE = True
- except ImportError:
-     tp = None
-     TURINGLIB_AVAILABLE = False
-
-
- class TuringEngine(SparkEngine):
-     """
-     Turing platform engine
-     Extends the Spark engine and uses turingPythonLib for SQL execution and data download
-     """
-
-     def __init__(self, database_name: str, sql_executor: Optional[Callable] = None):
-         # The provided sql_executor is ignored because turingPythonLib is used instead
-         super().__init__(database_name, None)
-
-         # Check whether turingPythonLib is available
-         if not TURINGLIB_AVAILABLE:
-             print("⚠️ Warning: turingPythonLib is unavailable; simulation mode will be used")
-
-     def get_engine_name(self) -> str:
-         return "Turing Platform (Spark)"
-
-     # ==================== Overridden SQL execution methods ====================
-
-     def execute_sql(self, sql: str, description: str = "") -> Any:
-         """
-         Execute SQL via turingPythonLib
-
-         Args:
-             sql: SQL statement
-             description: description of the execution
-
-         Returns:
-             Execution result
-         """
-         if TURINGLIB_AVAILABLE:
-             try:
-                 # Execute the SQL through turingPythonLib
-                 result = tp.execute_sql(sql)
-
-                 self.execution_history.append({
-                     'sql': sql,
-                     'description': description,
-                     'timestamp': datetime.now(),
-                     'result': result,
-                     'platform': 'turingPythonLib'
-                 })
-
-                 return result
-
-             except Exception as e:
-                 error_result = {
-                     'status': 'error',
-                     'message': f"SQL execution failed: {str(e)}",
-                     'error': str(e)
-                 }
-
-                 self.execution_history.append({
-                     'sql': sql,
-                     'description': description,
-                     'timestamp': datetime.now(),
-                     'result': error_result,
-                     'platform': 'turingPythonLib'
-                 })
-
-                 return error_result
-         else:
-             # Simulation mode
-             print(f"Simulated SQL execution: {description or 'SQL statement'}")
-             print(f" {sql[:100]}...")
-
-             mock_result = {
-                 'status': 'simulated',
-                 'message': 'Simulated execution succeeded',
-                 'sql': sql[:100] + '...'
-             }
-
-             self.execution_history.append({
-                 'sql': sql,
-                 'description': description,
-                 'timestamp': datetime.now(),
-                 'result': mock_result,
-                 'platform': 'simulation'
-             })
-
-             return mock_result
-
-     def create_table(self, table_name: str, select_sql: str,
-                      execute: bool = False, mode: str = "cluster",
-                      spark_resource: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
-         """
-         Create a table via turingPythonLib
-
-         Args:
-             table_name: table name
-             select_sql: SELECT statement that defines the table
-             execute: whether to execute immediately
-             mode: run mode ('local' or 'cluster')
-             spark_resource: Spark resource configuration
-
-         Returns:
-             Creation result
-         """
-         full_table_name = self.get_full_table_name(table_name)
-
-         result = {
-             'table_name': table_name,
-             'full_table_name': full_table_name,
-             'select_sql': select_sql,
-             'executed': execute,
-             'mode': mode
-         }
-
-         if execute:
-             if TURINGLIB_AVAILABLE:
-                 # Build the turingPythonLib parameters
-                 params = {
-                     'create_mode': 'by_select',
-                     'table_name': full_table_name,
-                     'sql': select_sql,
-                     'run_mode': mode
-                 }
-
-                 # Cluster mode with an explicit resource configuration
-                 if mode == 'cluster' and spark_resource:
-                     params['spark_resource'] = spark_resource
-                 elif mode == 'cluster':
-                     # Fall back to the default resource configuration
-                     params['spark_resource'] = {
-                         'num_executors': '4',
-                         'driver_cores': '2',
-                         'driver_memory': '4G',
-                         'executor_cores': '2',
-                         'executor_memory': '4G'
-                     }
-
-                 try:
-                     tp_result = tp.create_hive_table(params)
-
-                     result.update({
-                         'status': 'success',
-                         'message': f"Successfully created table: {full_table_name}",
-                         'turinglib_result': tp_result,
-                         'params': params
-                     })
-
-                 except Exception as e:
-                     result.update({
-                         'status': 'error',
-                         'message': f"Failed to create table: {str(e)}",
-                         'error': str(e),
-                         'params': params
-                     })
-             else:
-                 # Simulation mode
-                 result.update({
-                     'status': 'simulated',
-                     'message': f"Simulated table creation: {full_table_name}",
-                     'simulated': True
-                 })
-         else:
-             result['status'] = 'prepared'
-
-         return result
-
-     # ==================== Overridden data download methods ====================
-
-     def download_table_data(self, table_name: str, output_path: str,
-                             source: str = "hadoop", mode: str = "cluster",
-                             columns: str = "*", condition: str = "",
-                             overwrite_path: str = "yes",
-                             spark_resource: Optional[Dict[str, str]] = None,
-                             **kwargs) -> Dict[str, Any]:
-         """
-         Download table data via turingPythonLib
-
-         Args:
-             table_name: name of the table to download
-             output_path: output path, which must start with 'file:///nfsHome/'
-             source: data source type ('hadoop' or 'mppdb')
-             mode: run mode ('local' or 'cluster')
-             columns: columns to select, defaults to "*"
-             condition: WHERE condition
-             overwrite_path: whether to overwrite the path ('yes' or 'no')
-             spark_resource: resource configuration for cluster mode
-             **kwargs: additional parameters
-
-         Returns:
-             Download result
-         """
-         # Validate the output path
-         if not output_path.startswith('file:///nfsHome/'):
-             raise ValueError("The output path must start with 'file:///nfsHome/'")
-
-         full_table_name = self.get_full_table_name(table_name)
-
-         # Build the download SQL
-         sql = f"SELECT {columns} FROM {full_table_name}"
-         if condition.strip():
-             if not condition.upper().strip().startswith('WHERE'):
-                 condition = f"WHERE {condition}"
-             sql += f" {condition}"
-
-         # Build the download parameters
-         params = {
-             'sql': sql,
-             'source': source,
-             'outputPath': output_path,
-             'overwrite_path': overwrite_path,
-             'mode': mode
-         }
-
-         # Cluster mode with an explicit resource configuration
-         if mode == 'cluster' and spark_resource:
-             params['spark_resource'] = spark_resource
-         elif mode == 'cluster':
-             # Fall back to the default resource configuration
-             params['spark_resource'] = {
-                 'num_executors': '4',
-                 'driver_cores': '2',
-                 'driver_memory': '4G',
-                 'executor_cores': '2',
-                 'executor_memory': '4G'
-             }
-
-         try:
-             if TURINGLIB_AVAILABLE:
-                 # Use the real turingPythonLib
-                 tp_result = tp.download(params)
-
-                 # Determine whether the download succeeded
-                 if isinstance(tp_result, dict) and tp_result.get('success') == '0':
-                     return {
-                         'status': 'success',
-                         'message': f'Data downloaded to: {output_path}',
-                         'table_name': table_name,
-                         'output_path': output_path,
-                         'turinglib_result': tp_result,
-                         'params': params
-                     }
-                 else:
-                     return {
-                         'status': 'error',
-                         'message': f"Download failed: {tp_result.get('data', 'unknown error')}",
-                         'table_name': table_name,
-                         'turinglib_result': tp_result,
-                         'params': params
-                     }
-             else:
-                 # Simulation mode
-                 return {
-                     'status': 'simulated',
-                     'message': f'Simulated download to: {output_path}',
-                     'table_name': table_name,
-                     'output_path': output_path,
-                     'turinglib_result': {'success': '0', 'message': 'Simulated download succeeded'},
-                     'params': params,
-                     'simulated': True
-                 }
-
-         except Exception as e:
-             return {
-                 'status': 'error',
-                 'message': f"Download raised an exception: {str(e)}",
-                 'table_name': table_name,
-                 'error': str(e),
-                 'params': params
-             }
-
-     def download_query_result(self, sql: str, output_path: str,
-                               source: str = "hadoop", mode: str = "cluster",
-                               overwrite_path: str = "yes",
-                               spark_resource: Optional[Dict[str, str]] = None,
-                               **kwargs) -> Dict[str, Any]:
-         """
-         Download a query result directly via turingPythonLib
-
-         Args:
-             sql: query SQL
-             output_path: output path
-             source: data source type
-             mode: run mode
-             overwrite_path: whether to overwrite the path
-             spark_resource: resource configuration
-             **kwargs: additional parameters
-
-         Returns:
-             Download result
-         """
-         # Validate the output path
-         if not output_path.startswith('file:///nfsHome/'):
-             raise ValueError("The output path must start with 'file:///nfsHome/'")
-
-         # Build the download parameters
-         params = {
-             'sql': sql,
-             'source': source,
-             'outputPath': output_path,
-             'overwrite_path': overwrite_path,
-             'mode': mode
-         }
-
-         # Cluster mode with an explicit resource configuration
-         if mode == 'cluster' and spark_resource:
-             params['spark_resource'] = spark_resource
-         elif mode == 'cluster':
-             params['spark_resource'] = {
-                 'num_executors': '4',
-                 'driver_cores': '2',
-                 'driver_memory': '4G',
-                 'executor_cores': '2',
-                 'executor_memory': '4G'
-             }
-
-         try:
-             if TURINGLIB_AVAILABLE:
-                 tp_result = tp.download(params)
-
-                 if isinstance(tp_result, dict) and tp_result.get('success') == '0':
-                     return {
-                         'status': 'success',
-                         'message': f'Query result downloaded to: {output_path}',
-                         'output_path': output_path,
-                         'turinglib_result': tp_result,
-                         'params': params
-                     }
-                 else:
-                     return {
-                         'status': 'error',
-                         'message': f"Download failed: {tp_result.get('data', 'unknown error')}",
-                         'turinglib_result': tp_result,
-                         'params': params
-                     }
-             else:
-                 return {
-                     'status': 'simulated',
-                     'message': f'Simulated query-result download to: {output_path}',
-                     'output_path': output_path,
-                     'turinglib_result': {'success': '0', 'message': 'Simulated download succeeded'},
-                     'params': params,
-                     'simulated': True
-                 }
-
-         except Exception as e:
-             return {
-                 'status': 'error',
-                 'message': f"Failed to download query result: {str(e)}",
-                 'error': str(e),
-                 'params': params
-             }
-
-     # ==================== Turing-platform-specific methods ====================
-
-     def install_python_packages(self, packages: List[str]) -> Dict[str, Any]:
-         """
-         Install Python packages (via turingPythonLib)
-
-         Args:
-             packages: list of packages to install
-
-         Returns:
-             Installation result
-         """
-         results = []
-
-         for package in packages:
-             try:
-                 if TURINGLIB_AVAILABLE:
-                     tp.pip_install(package)
-                     results.append({
-                         'package': package,
-                         'status': 'success',
-                         'message': f'Successfully installed {package}'
-                     })
-                 else:
-                     results.append({
-                         'package': package,
-                         'status': 'simulated',
-                         'message': f'Simulated install of {package} (turingPythonLib unavailable)'
-                     })
-             except Exception as e:
-                 results.append({
-                     'package': package,
-                     'status': 'error',
-                     'error': str(e),
-                     'message': f'Failed to install {package}'
-                 })
-
-         return {
-             'total_packages': len(packages),
-             'successful_installs': len([r for r in results if r['status'] == 'success']),
-             'results': results
-         }
-
-     def get_platform_info(self) -> Dict[str, Any]:
-         """Return Turing platform information"""
-         return {
-             'engine_name': self.get_engine_name(),
-             'engine_type': self.get_engine_type().value,
-             'turinglib_available': TURINGLIB_AVAILABLE,
-             'nfs_home_exists': os.path.exists('/nfsHome'),
-             'database_name': self.database_name,
-             'current_working_dir': os.getcwd(),
-             'python_path': sys.path[:3]  # show only the first 3 entries
-         }
-
-
- # Convenience factory function
- def create_turing_engine(database_name: str, **kwargs) -> TuringEngine:
-     """
-     Convenience function: create a Turing engine instance
-
-     Args:
-         database_name: database name
-         **kwargs: additional parameters
-
-     Returns:
-         Turing engine instance
-     """
-     return TuringEngine(database_name, **kwargs)
@@ -1,59 +0,0 @@
- # Feature engineering core module
-
- # Core components
- from .schema import TableSchema, Field, FieldType
- from .generator import FeatureGenerator, FeatureConfig, FeatureType, AggregationType
- from .manager import FeatureManager, FeatureTableManager
-
- # Engine module
- from ..engines import DatabaseType, SparkEngine, HiveEngine, create_engine
-
- # Turing engine (optional import)
- try:
-     from ..engines import TuringEngine, create_turing_engine
-     _TURING_AVAILABLE = True
- except ImportError:
-     TuringEngine = None
-     create_turing_engine = None
-     _TURING_AVAILABLE = False
-
- # Convenience functions
- def create_feature_manager(database_name: str, engine_type: str = "spark",
-                            **kwargs) -> FeatureManager:
-     """Convenience function for creating a feature manager"""
-     return FeatureManager(database_name, engine_type, **kwargs)
-
- def quick_create_and_download(database_name: str, table_name: str, year: int, month: int,
-                               save_path: str, engine_type: str = "turing", **kwargs):
-     """Convenience function for quick creation and download"""
-     if engine_type == "turing" and not _TURING_AVAILABLE:
-         raise ImportError("The Turing engine is unavailable; make sure turingPythonLib is installed")
-
-     manager = FeatureManager(database_name, engine_type, **kwargs)
-     return manager.download_table_data(table_name, save_path)
-
- # Main exports
- __all__ = [
-     'TableSchema',
-     'Field',
-     'FieldType',
-     'FeatureGenerator',
-     'FeatureConfig',
-     'FeatureType',
-     'AggregationType',
-     'FeatureManager',
-     'FeatureTableManager',
-     'DatabaseType',
-     'SparkEngine',
-     'HiveEngine',
-     'create_engine',
-     'create_feature_manager'
- ]
-
- # If the Turing engine is available, add it to the exports
- if _TURING_AVAILABLE:
-     __all__.extend([
-         'TuringEngine',
-         'create_turing_engine',
-         'quick_create_and_download'
-     ])
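
Similarly, a hedged sketch of the removed convenience helpers from this __init__ module; the import path staran.features is an assumption (the diff does not show this file's name), and the argument values are placeholders:

    # Illustrative only: import path assumed, argument values are placeholders.
    from staran.features import create_feature_manager, quick_create_and_download

    manager = create_feature_manager("demo_db", engine_type="spark")
    result = quick_create_and_download(
        "demo_db", "user_features", 2024, 1,
        save_path="file:///nfsHome/exports/user_features",
        engine_type="turing",
    )
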