mdbq 4.0.2__py3-none-any.whl → 4.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/query_data.py +745 -931
- mdbq/mysql/s_query.py +851 -110
- mdbq/mysql/uploader.py +73 -42
- {mdbq-4.0.2.dist-info → mdbq-4.0.4.dist-info}/METADATA +1 -1
- {mdbq-4.0.2.dist-info → mdbq-4.0.4.dist-info}/RECORD +8 -8
- {mdbq-4.0.2.dist-info → mdbq-4.0.4.dist-info}/WHEEL +0 -0
- {mdbq-4.0.2.dist-info → mdbq-4.0.4.dist-info}/top_level.txt +0 -0
mdbq/mysql/uploader.py
CHANGED
@@ -14,6 +14,7 @@ from dbutils.pooled_db import PooledDB
|
|
14
14
|
import json
|
15
15
|
import sys
|
16
16
|
from decimal import Decimal, InvalidOperation
|
17
|
+
import math
|
17
18
|
|
18
19
|
warnings.filterwarnings('ignore')
|
19
20
|
logger = mylogger.MyLogger(
|
@@ -240,8 +241,16 @@ class MySQLUploader:
|
|
240
241
|
conn = self.pool.connection()
|
241
242
|
return conn
|
242
243
|
except Exception as e:
|
243
|
-
logger.error('
|
244
|
-
|
244
|
+
logger.error('从连接池获取数据库连接失败,尝试重建连接池', {'error': str(e)})
|
245
|
+
# 强制重建连接池
|
246
|
+
try:
|
247
|
+
self.pool = self._create_connection_pool()
|
248
|
+
conn = self.pool.connection()
|
249
|
+
logger.info('重建连接池后获取连接成功')
|
250
|
+
return conn
|
251
|
+
except Exception as e2:
|
252
|
+
logger.error('重建连接池后依然获取连接失败', {'error': str(e2)})
|
253
|
+
raise ConnectionError(f'连接数据库失败: {str(e2)}')
|
245
254
|
|
246
255
|
@_execute_with_retry
|
247
256
|
def _check_database_exists(self, db_name: str) -> bool:
|
@@ -407,31 +416,36 @@ class MySQLUploader:
|
|
407
416
|
col_def += " NOT NULL"
|
408
417
|
column_defs.append(col_def)
|
409
418
|
# 主键处理逻辑调整
|
419
|
+
def _index_col_sql(col):
|
420
|
+
col_type = set_typ.get(col, '').lower()
|
421
|
+
if 'varchar' in col_type or 'text' in col_type:
|
422
|
+
return f"`{self._normalize_col(col)}`(100)"
|
423
|
+
return f"`{self._normalize_col(col)}`"
|
410
424
|
if primary_keys and len(primary_keys) > 0:
|
411
|
-
safe_primary_keys = [
|
412
|
-
primary_key_sql = f"PRIMARY KEY (
|
425
|
+
safe_primary_keys = [_index_col_sql(pk) for pk in primary_keys]
|
426
|
+
primary_key_sql = f"PRIMARY KEY ({','.join(safe_primary_keys)})"
|
413
427
|
else:
|
414
|
-
safe_primary_keys = [
|
428
|
+
safe_primary_keys = [_index_col_sql('id')]
|
415
429
|
primary_key_sql = f"PRIMARY KEY (`id`)"
|
416
430
|
# 索引统一在CREATE TABLE中定义
|
417
431
|
index_defs = []
|
418
432
|
if date_column and date_column in set_typ:
|
419
|
-
safe_date_col =
|
420
|
-
index_defs.append(f"INDEX `idx_{
|
433
|
+
safe_date_col = _index_col_sql(date_column)
|
434
|
+
index_defs.append(f"INDEX `idx_{self._normalize_col(date_column)}` ({safe_date_col})")
|
421
435
|
if indexes:
|
422
436
|
for idx_col in indexes:
|
423
437
|
if idx_col in set_typ:
|
424
|
-
safe_idx_col =
|
425
|
-
index_defs.append(f"INDEX `idx_{
|
438
|
+
safe_idx_col = _index_col_sql(idx_col)
|
439
|
+
index_defs.append(f"INDEX `idx_{self._normalize_col(idx_col)}` ({safe_idx_col})")
|
426
440
|
# UNIQUE KEY定义
|
427
441
|
unique_defs = []
|
428
442
|
if unique_keys:
|
429
443
|
for unique_cols in unique_keys:
|
430
444
|
if not unique_cols:
|
431
445
|
continue
|
432
|
-
safe_unique_cols = [
|
433
|
-
unique_name = f"uniq_{'_'.join(
|
434
|
-
unique_defs.append(f"UNIQUE KEY `{unique_name}` (
|
446
|
+
safe_unique_cols = [_index_col_sql(col) for col in unique_cols]
|
447
|
+
unique_name = f"uniq_{'_'.join([self._normalize_col(c) for c in unique_cols])}"
|
448
|
+
unique_defs.append(f"UNIQUE KEY `{unique_name}` ({','.join(safe_unique_cols)})")
|
435
449
|
index_defs = list(set(index_defs))
|
436
450
|
all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
|
437
451
|
sql = f"""
|
@@ -447,7 +461,7 @@ class MySQLUploader:
|
|
447
461
|
conn.commit()
|
448
462
|
logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes, '唯一约束': unique_keys})
|
449
463
|
except Exception as e:
|
450
|
-
logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
464
|
+
logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e), '异常类型': type(e).__name__})
|
451
465
|
if conn is not None:
|
452
466
|
conn.rollback()
|
453
467
|
raise
|
@@ -491,25 +505,45 @@ class MySQLUploader:
|
|
491
505
|
def _validate_value(self, value: Any, column_type: str, allow_null: bool, db_name: str = None, table_name: str = None, col_name: str = None) -> Any:
|
492
506
|
"""
|
493
507
|
根据列类型验证并转换数据值
|
494
|
-
|
495
|
-
:param value: 要验证的值
|
496
|
-
:param column_type: 列的数据类型
|
497
|
-
:param allow_null: 是否允许空值
|
498
|
-
:param db_name: 数据库名(用于日志)
|
499
|
-
:param table_name: 表名(用于日志)
|
500
|
-
:param col_name: 列名(用于日志)
|
501
|
-
:return: 转换后的值
|
502
|
-
:raises ValueError: 当值转换失败时抛出
|
503
508
|
"""
|
509
|
+
column_type_lower = column_type.lower() if column_type else ''
|
510
|
+
# 统一判断None/NaN
|
511
|
+
is_nan = False
|
504
512
|
if value is None:
|
513
|
+
is_nan = True
|
514
|
+
elif isinstance(value, float) and math.isnan(value):
|
515
|
+
is_nan = True
|
516
|
+
elif str(value).lower() in ['nan', 'none']:
|
517
|
+
is_nan = True
|
518
|
+
if is_nan:
|
505
519
|
if not allow_null:
|
506
|
-
|
507
|
-
'
|
508
|
-
|
509
|
-
|
520
|
+
if 'int' in column_type_lower:
|
521
|
+
logger.debug('字段值为None/NaN但不允许空值, 已填充为0', {
|
522
|
+
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
523
|
+
})
|
524
|
+
return 0
|
525
|
+
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
526
|
+
logger.debug('字段值为None/NaN但不允许空值, 已填充为0.0', {
|
527
|
+
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
528
|
+
})
|
529
|
+
return 0.0
|
530
|
+
elif 'date' in column_type_lower or 'time' in column_type_lower:
|
531
|
+
# 判断是date还是datetime/timestamp
|
532
|
+
if 'datetime' in column_type_lower or 'timestamp' in column_type_lower:
|
533
|
+
default_date = '2000-01-01 00:00:00'
|
534
|
+
else:
|
535
|
+
default_date = '2000-01-01'
|
536
|
+
logger.debug('字段值为None/NaN但不允许空值, 已填充为默认日期', {
|
537
|
+
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type, '默认值': default_date
|
538
|
+
})
|
539
|
+
return default_date
|
540
|
+
else:
|
541
|
+
logger.debug('字段值为None/NaN但不允许空值, 已填充为none字符串', {
|
542
|
+
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
543
|
+
})
|
544
|
+
return 'none'
|
510
545
|
return None
|
511
546
|
try:
|
512
|
-
column_type_lower = column_type.lower()
|
513
547
|
if isinstance(value, str) and value.strip().endswith('%'):
|
514
548
|
try:
|
515
549
|
percent_str = value.strip().replace('%', '')
|
@@ -881,22 +915,21 @@ class MySQLUploader:
|
|
881
915
|
# set_typ的键清洗
|
882
916
|
set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
|
883
917
|
|
884
|
-
#
|
885
|
-
data_columns = set()
|
886
|
-
if data and len(data) > 0:
|
887
|
-
data_columns = set(data[0].keys())
|
888
|
-
|
889
|
-
# 过滤set_typ,只保留数据中存在的列
|
918
|
+
# 新实现:严格按set_typ顺序过滤,后补充data中有但set_typ没有的列
|
890
919
|
filtered_set_typ = {}
|
891
|
-
|
892
|
-
|
920
|
+
data_columns = list(data[0].keys()) if data and len(data) > 0 else []
|
921
|
+
# 先按set_typ顺序
|
922
|
+
for col in set_typ:
|
923
|
+
if col in data_columns:
|
893
924
|
filtered_set_typ[col] = set_typ[col]
|
894
|
-
|
895
|
-
|
925
|
+
# 再补充data中有但set_typ没有的列
|
926
|
+
for col in data_columns:
|
927
|
+
if col not in filtered_set_typ:
|
928
|
+
# 推断类型
|
896
929
|
sample_values = [row[col] for row in data if col in row and row[col] is not None][:5]
|
897
930
|
inferred_type = None
|
898
931
|
for val in sample_values:
|
899
|
-
inferred_type = self._infer_data_type(val, no_log=True)
|
932
|
+
inferred_type = self._infer_data_type(val, no_log=True)
|
900
933
|
if inferred_type:
|
901
934
|
break
|
902
935
|
if not inferred_type:
|
@@ -1326,8 +1359,7 @@ class MySQLUploader:
|
|
1326
1359
|
if cached:
|
1327
1360
|
return cached
|
1328
1361
|
# 获取所有列名(排除id)
|
1329
|
-
all_columns = [col for col in set_typ.keys()
|
1330
|
-
if col.lower() != 'id']
|
1362
|
+
all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
|
1331
1363
|
if not check_duplicate:
|
1332
1364
|
sql = self._build_simple_insert_sql(db_name, table_name, all_columns,
|
1333
1365
|
update_on_duplicate)
|
@@ -1364,7 +1396,6 @@ class MySQLUploader:
|
|
1364
1396
|
- 只有遇到严重的数据库错误(如所有行都因唯一约束冲突且没有ON DUPLICATE KEY UPDATE),才会整体回滚。
|
1365
1397
|
- 返回值为(插入行数, 跳过行数, 失败行数)。
|
1366
1398
|
"""
|
1367
|
-
import pymysql # 确保异常类型可用
|
1368
1399
|
def get_optimal_batch_size(total_rows: int) -> int:
|
1369
1400
|
if total_rows <= 100:
|
1370
1401
|
return total_rows
|
@@ -1612,5 +1643,5 @@ def main():
|
|
1612
1643
|
|
1613
1644
|
|
1614
1645
|
if __name__ == '__main__':
|
1615
|
-
main()
|
1646
|
+
# main()
|
1616
1647
|
pass
|
@@ -1,7 +1,7 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=lAYjWBa6ThlPWc3_1b0lWM6fn_3Z9ckuALnMMZXhBbs,17
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/query_data.py,sha256=
|
4
|
+
mdbq/aggregation/query_data.py,sha256=_k6Jg60RaaT056sIaiSO6v84dEnOIOGq-nUJtSr65kI,171861
|
5
5
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
6
6
|
mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
|
7
7
|
mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
@@ -10,9 +10,9 @@ mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,16
|
|
10
10
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
11
11
|
mdbq/mysql/deduplicator.py,sha256=8v3MC6TJ0YEiExWrTP9OXAxTYnL9XbpYL2vWaER1h2M,73099
|
12
12
|
mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
|
13
|
-
mdbq/mysql/s_query.py,sha256=
|
13
|
+
mdbq/mysql/s_query.py,sha256=RnVCwMQ_n9PcAimbMWbHe9k8eil8shtCfa3LwLBZi6c,41909
|
14
14
|
mdbq/mysql/unique_.py,sha256=Wgqq_PjAAD757JTa10wjYaJgssZ_C_ypU6DW56jbuyw,21074
|
15
|
-
mdbq/mysql/uploader.py,sha256=
|
15
|
+
mdbq/mysql/uploader.py,sha256=e49Gk09K766QXaus_p3VOMcH2VbexQzKsqDTCGrWoWQ,74419
|
16
16
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
17
17
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
18
18
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
25
|
mdbq/redis/getredis.py,sha256=l3zBK7wrZl0oO42-_UGylyatnIp_SBw8wDDvof9fht4,23534
|
26
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
27
|
mdbq/spider/aikucun.py,sha256=hPRzLQvFIF4ibN8aP3Dg_ru5meac90faPyzOB22cj-o,20965
|
28
|
-
mdbq-4.0.
|
29
|
-
mdbq-4.0.
|
30
|
-
mdbq-4.0.
|
31
|
-
mdbq-4.0.
|
28
|
+
mdbq-4.0.4.dist-info/METADATA,sha256=VM2dtOiBJ74NlYhq9UWAFAPloayAXPX9bLKnvZJd7Xg,363
|
29
|
+
mdbq-4.0.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
+
mdbq-4.0.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
+
mdbq-4.0.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|