mdbq 4.2.7__tar.gz → 4.2.9__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mdbq has been flagged as potentially problematic; see the registry page for more details.

Files changed (46)
  1. {mdbq-4.2.7 → mdbq-4.2.9}/PKG-INFO +1 -1
  2. mdbq-4.2.9/mdbq/__version__.py +1 -0
  3. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/mysql/uploader.py +89 -197
  4. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq.egg-info/PKG-INFO +1 -1
  5. mdbq-4.2.7/mdbq/__version__.py +0 -1
  6. {mdbq-4.2.7 → mdbq-4.2.9}/README.txt +0 -0
  7. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/__init__.py +0 -0
  8. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/auth/__init__.py +0 -0
  9. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/auth/auth_backend.py +0 -0
  10. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/auth/crypto.py +0 -0
  11. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/auth/rate_limiter.py +0 -0
  12. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/js/__init__.py +0 -0
  13. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/js/jc.py +0 -0
  14. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/log/__init__.py +0 -0
  15. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/log/mylogger.py +0 -0
  16. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/myconf/__init__.py +0 -0
  17. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/myconf/myconf.py +0 -0
  18. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/mysql/__init__.py +0 -0
  19. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/mysql/deduplicator.py +0 -0
  20. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/mysql/mysql.py +0 -0
  21. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/mysql/s_query.py +0 -0
  22. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/mysql/unique_.py +0 -0
  23. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/other/__init__.py +0 -0
  24. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/other/download_sku_picture.py +0 -0
  25. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/other/error_handler.py +0 -0
  26. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/other/otk.py +0 -0
  27. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/other/pov_city.py +0 -0
  28. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/other/ua_sj.py +0 -0
  29. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/pbix/__init__.py +0 -0
  30. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/pbix/pbix_refresh.py +0 -0
  31. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/pbix/refresh_all.py +0 -0
  32. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/redis/__init__.py +0 -0
  33. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/redis/getredis.py +0 -0
  34. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/redis/redis_cache.py +0 -0
  35. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/route/__init__.py +0 -0
  36. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/route/analytics.py +0 -0
  37. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/route/monitor.py +0 -0
  38. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/route/routes.py +0 -0
  39. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/selenium/__init__.py +0 -0
  40. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/selenium/get_driver.py +0 -0
  41. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq/spider/__init__.py +0 -0
  42. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq.egg-info/SOURCES.txt +0 -0
  43. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq.egg-info/dependency_links.txt +0 -0
  44. {mdbq-4.2.7 → mdbq-4.2.9}/mdbq.egg-info/top_level.txt +0 -0
  45. {mdbq-4.2.7 → mdbq-4.2.9}/setup.cfg +0 -0
  46. {mdbq-4.2.7 → mdbq-4.2.9}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 4.2.7
3
+ Version: 4.2.9
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -0,0 +1 @@
1
+ VERSION = '4.2.9'
@@ -11,8 +11,6 @@ from typing import Union, List, Dict, Optional, Any, Tuple, Iterator
11
11
  from functools import wraps
12
12
  from decimal import Decimal, InvalidOperation
13
13
  import math
14
- import concurrent.futures
15
- import threading
16
14
  import pymysql
17
15
  import pandas as pd
18
16
  import psutil
@@ -64,6 +62,17 @@ class DatabaseConnectionManager:
64
62
  'autocommit': False
65
63
  }
66
64
 
65
+ # 设置时区为北京时间,确保时间戳的一致性
66
+ if 'init_command' not in self.config:
67
+ pool_params['init_command'] = "SET time_zone = '+08:00'"
68
+ else:
69
+ # 如果用户已设置init_command,则追加时区设置
70
+ existing_commands = self.config['init_command']
71
+ if 'time_zone' not in existing_commands.lower():
72
+ pool_params['init_command'] = f"{existing_commands}; SET time_zone = '+08:00'"
73
+ else:
74
+ pool_params['init_command'] = existing_commands
75
+
67
76
  if self.config.get('ssl'):
68
77
  pool_params['ssl'] = self.config['ssl']
69
78
 
@@ -256,18 +265,26 @@ class DataTypeInferrer:
256
265
  # 采样数据进行类型推断
257
266
  sample_data = data[:sample_size] if len(data) > sample_size else data
258
267
 
268
+ # 首先收集所有列名
269
+ all_columns = set()
270
+ for row in sample_data:
271
+ for col in row.keys():
272
+ if col.lower() not in ['id', 'create_at', 'update_at']:
273
+ all_columns.add(col)
274
+
275
+ # 为每个列初始化候选类型列表
276
+ for col in all_columns:
277
+ type_candidates[col] = []
278
+
259
279
  for row in sample_data:
260
280
  for col, value in row.items():
261
281
  # 跳过系统列
262
282
  if col.lower() in ['id', 'create_at', 'update_at']:
263
283
  continue
264
284
 
265
- if value is not None and str(value).strip():
266
- mysql_type = DataTypeInferrer.infer_mysql_type(value)
267
-
268
- if col not in type_candidates:
269
- type_candidates[col] = []
270
- type_candidates[col].append(mysql_type)
285
+ # 即使值为空,也要推断类型
286
+ mysql_type = DataTypeInferrer.infer_mysql_type(value)
287
+ type_candidates[col].append(mysql_type)
271
288
 
272
289
  # 为每列选择最合适的类型
273
290
  for col, types in type_candidates.items():
@@ -673,6 +690,19 @@ class TableManager:
673
690
  db_name = self._sanitize_identifier(db_name)
674
691
  table_name = self._sanitize_identifier(table_name)
675
692
 
693
+ # 验证columns不为空
694
+ if not columns:
695
+ raise ValueError(f"创建表失败:columns不能为空。数据库: {db_name}, 表: {table_name}")
696
+
697
+ # 验证unique_keys中的列是否存在于columns中
698
+ if unique_keys:
699
+ business_columns = {k.lower(): k for k in columns.keys() if k.lower() not in ['id', 'create_at', 'update_at']}
700
+ for i, uk in enumerate(unique_keys):
701
+ for col in uk:
702
+ col_lower = col.lower()
703
+ if col_lower not in business_columns and col not in columns:
704
+ raise ValueError(f"唯一约束中的列 '{col}' 不存在于表定义中。可用列: {list(business_columns.keys())}")
705
+
676
706
  # 构建列定义
677
707
  column_defs = []
678
708
 
@@ -705,8 +735,15 @@ class TableManager:
705
735
  safe_uk_parts = []
706
736
  for col in filtered_uk:
707
737
  safe_col_name = self._sanitize_identifier(col)
708
- # 检查是否需要前缀索引
709
- col_type = columns.get(col, 'varchar(255)').lower()
738
+ # 检查是否需要前缀索引 - 优先使用原始列名,然后尝试小写
739
+ col_lower = col.lower()
740
+ if col in columns:
741
+ col_type = columns[col].lower()
742
+ elif col_lower in columns:
743
+ col_type = columns[col_lower].lower()
744
+ else:
745
+ col_type = 'varchar(255)'
746
+
710
747
  if 'varchar' in col_type:
711
748
  # 提取varchar长度
712
749
  match = re.search(r'varchar\((\d+)\)', col_type)
@@ -716,20 +753,11 @@ class TableManager:
716
753
  if length > 191:
717
754
  prefix_length = 191
718
755
  safe_uk_parts.append(f"`{safe_col_name}`({prefix_length})")
719
- logger.debug('应用前缀索引', {
720
- '列名': col,
721
- '原始长度': length,
722
- '前缀长度': prefix_length
723
- })
724
756
  else:
725
757
  safe_uk_parts.append(f"`{safe_col_name}`")
726
758
  else:
727
759
  # 如果没有指定长度,默认使用前缀索引
728
760
  safe_uk_parts.append(f"`{safe_col_name}`(191)")
729
- logger.debug('应用默认前缀索引', {
730
- '列名': col,
731
- '前缀长度': 191
732
- })
733
761
  else:
734
762
  # 非varchar字段保持原样
735
763
  safe_uk_parts.append(f"`{safe_col_name}`")
@@ -749,9 +777,17 @@ class TableManager:
749
777
 
750
778
  with self.conn_mgr.get_connection() as conn:
751
779
  with conn.cursor() as cursor:
752
- cursor.execute(sql)
753
- conn.commit()
754
- logger.debug('表已创建', {'database': db_name, 'table': table_name})
780
+ try:
781
+ cursor.execute(sql)
782
+ conn.commit()
783
+ logger.debug('表已创建', {'database': db_name, 'table': table_name})
784
+ except Exception as e:
785
+ logger.error('创建表失败', {
786
+ 'database': db_name,
787
+ 'table': table_name,
788
+ 'error': str(e)
789
+ })
790
+ raise
755
791
 
756
792
  def get_partition_table_name(self, base_name: str, date_value: str, partition_by: str) -> str:
757
793
  """获取分表名称"""
@@ -795,8 +831,6 @@ class TableManager:
795
831
  return cleaned
796
832
 
797
833
 
798
-
799
-
800
834
  class DataProcessor:
801
835
  """数据处理器"""
802
836
 
@@ -1063,6 +1097,12 @@ class MySQLUploader:
1063
1097
  - 支持自动建表、分表、数据类型推断
1064
1098
  - 高可用连接池管理和重试机制
1065
1099
  - 流式批量插入优化
1100
+ - 自动设置数据库连接时区为北京时间(+08:00),确保时间戳一致性
1101
+
1102
+ 时区说明:
1103
+ - 所有数据库连接会自动设置为北京时间(+08:00)
1104
+ - create_at和update_at列使用CURRENT_TIMESTAMP,会按照连接时区记录时间
1105
+ - 可使用check_timezone_settings()方法验证时区设置
1066
1106
  """
1067
1107
 
1068
1108
  def __init__(self, username: str, password: str, host: str = 'localhost',
@@ -1152,21 +1192,35 @@ class MySQLUploader:
1152
1192
  normalized_data = DataProcessor.normalize_data(data)
1153
1193
 
1154
1194
  # 推断或验证列类型
1155
- if set_typ is None:
1195
+ if set_typ is None or not set_typ:
1156
1196
  # 取第一个chunk进行类型推断
1157
1197
  first_chunk = next(iter(normalized_data))
1198
+
1199
+ if not first_chunk:
1200
+ raise ValueError("数据为空,无法推断列类型")
1201
+
1158
1202
  set_typ = DataTypeInferrer.infer_types_from_data(first_chunk)
1159
1203
  # 重新创建迭代器
1160
1204
  normalized_data = DataProcessor.normalize_data(data)
1161
1205
  logger.debug('自动推断数据类型', {'类型映射': set_typ})
1206
+
1207
+ # 验证推断结果
1208
+ if not set_typ or not any(col for col in set_typ.keys() if col.lower() not in ['id', 'create_at', 'update_at']):
1209
+ raise ValueError(f"类型推断失败,无有效业务列。推断结果: {set_typ}")
1162
1210
 
1163
1211
  # 将set_typ的键统一转为小写
1164
1212
  set_typ = self.tran_set_typ_to_lower(set_typ)
1165
1213
 
1214
+ # 最终验证:确保有业务列定义
1215
+ business_columns = {k: v for k, v in set_typ.items() if k.lower() not in ['id', 'create_at', 'update_at']}
1216
+ if not business_columns:
1217
+ raise ValueError(f"没有有效的业务列定义。set_typ: {set_typ}")
1218
+
1166
1219
  # 确保数据库存在
1167
1220
  self.table_mgr.ensure_database_exists(db_name)
1168
1221
 
1169
1222
  # 处理分表逻辑
1223
+
1170
1224
  if partition_by:
1171
1225
  upload_result = self._handle_partitioned_upload(
1172
1226
  db_name, table_name, normalized_data, set_typ,
@@ -1372,6 +1426,16 @@ class MySQLUploader:
1372
1426
  main_result['failed_rows'] += partition_result['failed_rows']
1373
1427
  main_result['tables_created'].extend(partition_result['tables_created'])
1374
1428
 
1429
+ def tran_set_typ_to_lower(self, set_typ: Dict[str, str]) -> Dict[str, str]:
1430
+ if not isinstance(set_typ, dict) or set_typ is None:
1431
+ return {}
1432
+
1433
+ set_typ_lower = {}
1434
+ for key, value in set_typ.items():
1435
+ set_typ_lower[key.lower()] = value
1436
+
1437
+ return set_typ_lower
1438
+
1375
1439
  def close(self):
1376
1440
  """关闭连接"""
1377
1441
  if self.conn_mgr:
@@ -1389,178 +1453,6 @@ class MySQLUploader:
1389
1453
  def __exit__(self, exc_type, exc_val, exc_tb):
1390
1454
  self.close()
1391
1455
 
1392
- def upload_data_concurrent(self, db_name: str, table_name: str,
1393
- data: Union[Dict, List[Dict], pd.DataFrame],
1394
- set_typ: Optional[Dict[str, str]] = None,
1395
- allow_null: bool = False,
1396
- partition_by: Optional[str] = None,
1397
- partition_date_column: str = '日期',
1398
- update_on_duplicate: bool = False,
1399
- unique_keys: Optional[List[List[str]]] = None,
1400
- max_workers: int = 3) -> Dict[str, Any]:
1401
- """
1402
- 并发上传数据到MySQL数据库
1403
-
1404
- :param max_workers: 最大并发工作线程数
1405
- :return: 上传结果详情
1406
- """
1407
- db_name = db_name.lower()
1408
- table_name = table_name.lower()
1409
-
1410
- result = {
1411
- 'success': False,
1412
- 'inserted_rows': 0,
1413
- 'skipped_rows': 0,
1414
- 'failed_rows': 0,
1415
- 'tables_created': []
1416
- }
1417
-
1418
- try:
1419
- # 标准化数据为流式迭代器
1420
- normalized_data = DataProcessor.normalize_data(data, chunk_size=2000) # 更小的chunk用于并发
1421
-
1422
- # 推断或验证列类型
1423
- if set_typ is None:
1424
- first_chunk = next(iter(normalized_data))
1425
- set_typ = DataTypeInferrer.infer_types_from_data(first_chunk)
1426
- normalized_data = DataProcessor.normalize_data(data, chunk_size=2000)
1427
- logger.debug('自动推断数据类型', {'类型映射': set_typ})
1428
-
1429
- # 将set_typ的键统一转为小写
1430
- set_typ = self.tran_set_typ_to_lower(set_typ)
1431
-
1432
- # 确保数据库存在
1433
- self.table_mgr.ensure_database_exists(db_name)
1434
-
1435
- # 创建线程锁用于表创建的线程安全
1436
- table_creation_lock = threading.Lock()
1437
- created_tables_set = set()
1438
-
1439
- def process_chunk_worker(chunk_data):
1440
- """工作线程函数"""
1441
- try:
1442
- if partition_by:
1443
- # 分表处理
1444
- partitioned_chunk = DataProcessor.partition_data_by_date(
1445
- chunk_data, partition_date_column, partition_by
1446
- )
1447
-
1448
- chunk_result = {
1449
- 'inserted_rows': 0,
1450
- 'skipped_rows': 0,
1451
- 'failed_rows': 0,
1452
- 'tables_created': []
1453
- }
1454
-
1455
- for partition_suffix, partition_data in partitioned_chunk.items():
1456
- partition_table_name = f"{table_name}_{partition_suffix}"
1457
- table_key = f"{db_name}.{partition_table_name}"
1458
-
1459
- # 确保表存在(线程安全)
1460
- with table_creation_lock:
1461
- if table_key not in created_tables_set:
1462
- if not self.table_mgr.table_exists(db_name, partition_table_name):
1463
- self.table_mgr.create_table(db_name, partition_table_name, set_typ,
1464
- unique_keys=unique_keys, allow_null=allow_null)
1465
- chunk_result['tables_created'].append(table_key)
1466
- else:
1467
- self.table_mgr.ensure_system_columns(db_name, partition_table_name)
1468
- created_tables_set.add(table_key)
1469
-
1470
- # 准备并插入数据
1471
- prepared_data = DataProcessor.prepare_data_for_insert(
1472
- partition_data, set_typ, allow_null
1473
- )
1474
-
1475
- inserted, skipped, failed = self.data_inserter.insert_data(
1476
- db_name, partition_table_name, prepared_data, set_typ, update_on_duplicate
1477
- )
1478
-
1479
- chunk_result['inserted_rows'] += inserted
1480
- chunk_result['skipped_rows'] += skipped
1481
- chunk_result['failed_rows'] += failed
1482
- else:
1483
- # 单表处理
1484
- table_key = f"{db_name}.{table_name}"
1485
- with table_creation_lock:
1486
- if table_key not in created_tables_set:
1487
- if not self.table_mgr.table_exists(db_name, table_name):
1488
- self.table_mgr.create_table(db_name, table_name, set_typ,
1489
- unique_keys=unique_keys, allow_null=allow_null)
1490
- chunk_result = {'tables_created': [table_key]}
1491
- else:
1492
- self.table_mgr.ensure_system_columns(db_name, table_name)
1493
- chunk_result = {'tables_created': []}
1494
- created_tables_set.add(table_key)
1495
- else:
1496
- chunk_result = {'tables_created': []}
1497
-
1498
- prepared_chunk = DataProcessor.prepare_data_for_insert(
1499
- chunk_data, set_typ, allow_null
1500
- )
1501
-
1502
- inserted, skipped, failed = self.data_inserter.insert_data(
1503
- db_name, table_name, prepared_chunk, set_typ, update_on_duplicate
1504
- )
1505
-
1506
- chunk_result.update({
1507
- 'inserted_rows': inserted,
1508
- 'skipped_rows': skipped,
1509
- 'failed_rows': failed
1510
- })
1511
-
1512
- return chunk_result
1513
-
1514
- except Exception as e:
1515
- logger.error('并发处理chunk失败', {'错误': str(e)})
1516
- return {
1517
- 'inserted_rows': 0,
1518
- 'skipped_rows': 0,
1519
- 'failed_rows': len(chunk_data) if chunk_data else 0,
1520
- 'tables_created': []
1521
- }
1522
-
1523
- # 使用线程池执行并发处理
1524
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
1525
- # 提交所有任务
1526
- future_to_chunk = {}
1527
- for chunk in normalized_data:
1528
- if chunk:
1529
- future = executor.submit(process_chunk_worker, chunk)
1530
- future_to_chunk[future] = len(chunk)
1531
-
1532
- # 收集结果
1533
- for future in concurrent.futures.as_completed(future_to_chunk):
1534
- chunk_result = future.result()
1535
- result['inserted_rows'] += chunk_result['inserted_rows']
1536
- result['skipped_rows'] += chunk_result['skipped_rows']
1537
- result['failed_rows'] += chunk_result['failed_rows']
1538
- result['tables_created'].extend(chunk_result['tables_created'])
1539
-
1540
- # 去重tables_created
1541
- result['tables_created'] = list(set(result['tables_created']))
1542
- result['success'] = result['failed_rows'] == 0
1543
-
1544
- except Exception as e:
1545
- logger.error('并发数据上传失败', {
1546
- '数据库': db_name,
1547
- '表名': table_name,
1548
- '错误': str(e)
1549
- })
1550
- result['success'] = False
1551
-
1552
- return result
1553
-
1554
- def tran_set_typ_to_lower(self, set_typ: Dict[str, str]) -> Dict[str, str]:
1555
- if not isinstance(set_typ, dict):
1556
- return set_typ
1557
-
1558
- set_typ_lower = {}
1559
- for key, value in set_typ.items():
1560
- set_typ_lower[key.lower()] = value
1561
-
1562
- return set_typ_lower
1563
-
1564
1456
 
1565
1457
  # 使用示例
1566
1458
  if __name__ == '__main__':
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 4.2.7
3
+ Version: 4.2.9
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1 +0,0 @@
1
- VERSION = '4.2.7'
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes