mdbq 3.11.2__py3-none-any.whl → 3.11.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/query_data.py +0 -3
- mdbq/log/mylogger.py +1 -1
- mdbq/mysql/deduplicator.py +1 -1
- mdbq/mysql/mysql.py +3 -4
- mdbq/mysql/uploader.py +182 -335
- mdbq/spider/aikucun.py +2 -20
- {mdbq-3.11.2.dist-info → mdbq-3.11.3.dist-info}/METADATA +1 -1
- {mdbq-3.11.2.dist-info → mdbq-3.11.3.dist-info}/RECORD +11 -11
- {mdbq-3.11.2.dist-info → mdbq-3.11.3.dist-info}/WHEEL +0 -0
- {mdbq-3.11.2.dist-info → mdbq-3.11.3.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.11.2'
|
1
|
+
VERSION = '3.11.3'
|
mdbq/aggregation/query_data.py
CHANGED
@@ -9,14 +9,11 @@ import pandas as pd
|
|
9
9
|
import numpy as np
|
10
10
|
from functools import wraps
|
11
11
|
import platform
|
12
|
-
import json
|
13
12
|
import os
|
14
13
|
import time
|
15
14
|
import calendar
|
16
15
|
import concurrent.futures
|
17
|
-
import traceback
|
18
16
|
import logging
|
19
|
-
import sys
|
20
17
|
|
21
18
|
"""
|
22
19
|
|
mdbq/log/mylogger.py
CHANGED
mdbq/mysql/deduplicator.py
CHANGED
mdbq/mysql/mysql.py
CHANGED
@@ -7,10 +7,9 @@ import warnings
|
|
7
7
|
import pymysql
|
8
8
|
import pandas as pd
|
9
9
|
from sqlalchemy import create_engine
|
10
|
-
import os
|
11
10
|
from mdbq.other import otk
|
12
11
|
from mdbq.log import mylogger
|
13
|
-
import
|
12
|
+
import math
|
14
13
|
|
15
14
|
warnings.filterwarnings('ignore')
|
16
15
|
"""
|
@@ -131,7 +130,7 @@ class MysqlUpload:
|
|
131
130
|
new_dict_data: dict = {}
|
132
131
|
for k, v in dict_data.items():
|
133
132
|
k = str(k).lower()
|
134
|
-
k = re.sub(r'[()\-,,$&~^、 ()\"\'
|
133
|
+
k = re.sub(r'[()\-,,$&~^、 ()\"\'"="·/。》《><!!`]', '_', k, re.IGNORECASE)
|
135
134
|
k = k.replace(')', '')
|
136
135
|
k = re.sub(r'_{2,}', '_', k)
|
137
136
|
k = re.sub(r'_+$', '', k)
|
@@ -526,7 +525,7 @@ class MysqlUpload:
|
|
526
525
|
new_dict_data = {}
|
527
526
|
for k, v in dict_data.items():
|
528
527
|
k = str(k).lower()
|
529
|
-
k = re.sub(r'[()\-,,$&~^、 ()\"\'
|
528
|
+
k = re.sub(r'[()\-,,$&~^、 ()\"\'"="·/。》《><!!`]', '_', k, re.IGNORECASE)
|
530
529
|
k = k.replace(')', '')
|
531
530
|
k = re.sub(r'_{2,}', '_', k)
|
532
531
|
k = re.sub(r'_+$', '', k)
|
mdbq/mysql/uploader.py
CHANGED
@@ -10,8 +10,8 @@ from mdbq.log import mylogger
|
|
10
10
|
from typing import Union, List, Dict, Optional, Any, Tuple, Set
|
11
11
|
from dbutils.pooled_db import PooledDB
|
12
12
|
import json
|
13
|
-
from collections import OrderedDict
|
14
13
|
import sys
|
14
|
+
from decimal import Decimal, InvalidOperation
|
15
15
|
|
16
16
|
warnings.filterwarnings('ignore')
|
17
17
|
logger = mylogger.MyLogger(
|
@@ -28,62 +28,44 @@ logger = mylogger.MyLogger(
|
|
28
28
|
)
|
29
29
|
|
30
30
|
|
31
|
-
def count_decimal_places(num_str):
|
31
|
+
def count_decimal_places(num_str: str) -> Tuple[int, int]:
|
32
32
|
"""
|
33
|
-
|
34
|
-
|
35
|
-
:param num_str: 数字字符串
|
36
|
-
:return: 返回元组(整数位数, 小数位数)
|
37
|
-
:raises: 无显式抛出异常,但正则匹配失败时返回(0, 0)
|
38
|
-
"""
|
39
|
-
match = re.match(r'^[-+]?\d+(\.\d+)?([eE][-+]?\d+)?$', str(num_str))
|
40
|
-
if match:
|
41
|
-
# 如果是科学计数法
|
42
|
-
match = re.findall(r'(\d+)\.(\d+)[eE][-+]?(\d+)$', str(num_str))
|
43
|
-
if match:
|
44
|
-
if len(match[0]) == 3:
|
45
|
-
if int(match[0][2]) < len(match[0][1]):
|
46
|
-
# count_int 清除整数部分开头的 0 并计算整数位数
|
47
|
-
count_int = len(re.sub('^0+', '', str(match[0][0]))) + int(match[0][2])
|
48
|
-
# 计算小数位数
|
49
|
-
count_float = len(match[0][1]) - int(match[0][2])
|
50
|
-
return count_int, count_float
|
51
|
-
# 如果是普通小数
|
52
|
-
match = re.findall(r'(\d+)\.(\d+)$', str(num_str))
|
53
|
-
if match:
|
54
|
-
count_int = len(re.sub('^0+', '', str(match[0][0])))
|
55
|
-
count_float = len(match[0][1])
|
56
|
-
return count_int, count_float # 计算小数位数
|
57
|
-
return 0, 0
|
58
|
-
|
59
|
-
|
60
|
-
class StatementCache(OrderedDict):
|
61
|
-
"""
|
62
|
-
基于OrderedDict实现的LRU缓存策略,用于缓存SQL语句
|
63
|
-
|
64
|
-
这个类继承自OrderedDict,实现了最近最少使用(LRU)的缓存策略。
|
65
|
-
当缓存达到最大容量时,会自动删除最早添加的项。
|
33
|
+
统计小数点前后位数,支持科学计数法。
|
34
|
+
返回:(整数位数, 小数位数)
|
66
35
|
"""
|
36
|
+
try:
|
37
|
+
d = Decimal(str(num_str))
|
38
|
+
sign, digits, exponent = d.as_tuple()
|
39
|
+
int_part = len(digits) + exponent if exponent < 0 else len(digits)
|
40
|
+
dec_part = -exponent if exponent < 0 else 0
|
41
|
+
return max(int_part, 0), max(dec_part, 0)
|
42
|
+
except (InvalidOperation, ValueError, TypeError):
|
43
|
+
return (0, 0)
|
44
|
+
|
45
|
+
|
46
|
+
class StatementCache(dict):
|
47
|
+
"""简单LRU缓存实现,用于SQL语句缓存"""
|
67
48
|
def __init__(self, maxsize=100):
|
68
|
-
"""
|
69
|
-
初始化缓存
|
70
|
-
|
71
|
-
:param maxsize: 最大缓存大小,默认为100条SQL语句
|
72
|
-
"""
|
73
49
|
super().__init__()
|
74
|
-
self.
|
75
|
-
|
50
|
+
self._maxsize = maxsize
|
51
|
+
self._order = []
|
52
|
+
def __getitem__(self, key):
|
53
|
+
value = super().__getitem__(key)
|
54
|
+
self._order.remove(key)
|
55
|
+
self._order.append(key)
|
56
|
+
return value
|
76
57
|
def __setitem__(self, key, value):
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
"""
|
58
|
+
if key in self:
|
59
|
+
self._order.remove(key)
|
60
|
+
elif len(self._order) >= self._maxsize:
|
61
|
+
oldest = self._order.pop(0)
|
62
|
+
super().__delitem__(oldest)
|
83
63
|
super().__setitem__(key, value)
|
84
|
-
|
85
|
-
|
86
|
-
|
64
|
+
self._order.append(key)
|
65
|
+
def get(self, key, default=None):
|
66
|
+
if key in self:
|
67
|
+
return self[key]
|
68
|
+
return default
|
87
69
|
|
88
70
|
class MySQLUploader:
|
89
71
|
"""
|
@@ -194,22 +176,22 @@ class MySQLUploader:
|
|
194
176
|
logger.error('连接池创建失败', {'error': str(e), 'host': self.host, 'port': self.port})
|
195
177
|
raise ConnectionError(f'连接池创建失败: {str(e)}')
|
196
178
|
|
197
|
-
|
179
|
+
@staticmethod
|
180
|
+
def _execute_with_retry(func):
|
198
181
|
"""
|
199
182
|
带重试机制的装饰器,用于数据库操作
|
200
|
-
|
201
183
|
:param func: 被装饰的函数
|
202
184
|
:return: 装饰后的函数
|
203
185
|
:raises: 可能抛出原始异常或最后一次重试的异常
|
204
186
|
"""
|
205
187
|
@wraps(func)
|
206
|
-
def wrapper(*args, **kwargs):
|
188
|
+
def wrapper(self, *args, **kwargs):
|
207
189
|
last_exception = None
|
208
190
|
operation = func.__name__
|
209
191
|
logger.debug(f'开始执行操作: {operation}', {'max_retries': self.max_retries})
|
210
192
|
for attempt in range(self.max_retries):
|
211
193
|
try:
|
212
|
-
result = func(*args, **kwargs)
|
194
|
+
result = func(self, *args, **kwargs)
|
213
195
|
if attempt > 0:
|
214
196
|
logger.info('操作成功(重试后)', {'operation': operation, 'attempts': attempt + 1})
|
215
197
|
else:
|
@@ -255,6 +237,7 @@ class MySQLUploader:
|
|
255
237
|
raise last_exception if last_exception else Exception('发生未知错误')
|
256
238
|
return wrapper
|
257
239
|
|
240
|
+
@_execute_with_retry
|
258
241
|
def _get_connection(self) -> pymysql.connections.Connection:
|
259
242
|
"""
|
260
243
|
从连接池获取数据库连接
|
@@ -270,6 +253,7 @@ class MySQLUploader:
|
|
270
253
|
logger.error('获取数据库连接失败', {'error': str(e)})
|
271
254
|
raise ConnectionError(f'连接数据库失败: {str(e)}')
|
272
255
|
|
256
|
+
@_execute_with_retry
|
273
257
|
def _check_database_exists(self, db_name: str) -> bool:
|
274
258
|
"""
|
275
259
|
检查数据库是否存在
|
@@ -280,6 +264,7 @@ class MySQLUploader:
|
|
280
264
|
"""
|
281
265
|
db_name = self._validate_identifier(db_name)
|
282
266
|
sql = 'SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s'
|
267
|
+
conn = None
|
283
268
|
try:
|
284
269
|
with self._get_connection() as conn:
|
285
270
|
with conn.cursor() as cursor:
|
@@ -291,6 +276,7 @@ class MySQLUploader:
|
|
291
276
|
logger.error('检查数据库是否存在时出错', {'库': db_name, '错误': str(e)})
|
292
277
|
raise
|
293
278
|
|
279
|
+
@_execute_with_retry
|
294
280
|
def _create_database(self, db_name: str) -> None:
|
295
281
|
"""
|
296
282
|
创建数据库
|
@@ -300,6 +286,7 @@ class MySQLUploader:
|
|
300
286
|
"""
|
301
287
|
db_name = self._validate_identifier(db_name)
|
302
288
|
sql = f'CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}'
|
289
|
+
conn = None
|
303
290
|
try:
|
304
291
|
with self._get_connection() as conn:
|
305
292
|
with conn.cursor() as cursor:
|
@@ -308,7 +295,8 @@ class MySQLUploader:
|
|
308
295
|
logger.info('数据库已创建', {'库': db_name})
|
309
296
|
except Exception as e:
|
310
297
|
logger.error('无法创建数据库', {'库': db_name, '错误': str(e)})
|
311
|
-
conn.rollback()
|
298
|
+
if conn is not None:
|
299
|
+
conn.rollback()
|
312
300
|
raise
|
313
301
|
|
314
302
|
def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
|
@@ -345,10 +333,8 @@ class MySQLUploader:
|
|
345
333
|
if not identifier or not isinstance(identifier, str):
|
346
334
|
logger.error('无效的标识符', {'标识符': identifier})
|
347
335
|
raise ValueError(f"无效的标识符: `{identifier}`")
|
348
|
-
|
349
|
-
|
350
|
-
else:
|
351
|
-
cleaned = identifier
|
336
|
+
# 始终做特殊字符清理
|
337
|
+
cleaned = re.sub(r'[^-\uFFFF\w\u4e00-\u9fff$]', '_', identifier)
|
352
338
|
cleaned = re.sub(r'_+', '_', cleaned).strip('_')
|
353
339
|
if not cleaned:
|
354
340
|
logger.error('无法清理异常标识符', {'原始标识符': identifier})
|
@@ -362,6 +348,7 @@ class MySQLUploader:
|
|
362
348
|
return f"`{cleaned}`"
|
363
349
|
return cleaned
|
364
350
|
|
351
|
+
@_execute_with_retry
|
365
352
|
def _check_table_exists(self, db_name: str, table_name: str) -> bool:
|
366
353
|
"""
|
367
354
|
检查表是否存在
|
@@ -396,6 +383,7 @@ class MySQLUploader:
|
|
396
383
|
logger.debug('表存在检查', {'库': db_name, '表': table_name, '存在': result})
|
397
384
|
return result
|
398
385
|
|
386
|
+
@_execute_with_retry
|
399
387
|
def _create_table(
|
400
388
|
self,
|
401
389
|
db_name: str,
|
@@ -407,16 +395,7 @@ class MySQLUploader:
|
|
407
395
|
allow_null: bool = False
|
408
396
|
) -> None:
|
409
397
|
"""
|
410
|
-
|
411
|
-
|
412
|
-
:param db_name: 数据库名
|
413
|
-
:param table_name: 表名
|
414
|
-
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
415
|
-
:param primary_keys: 主键列列表,可选
|
416
|
-
:param date_column: 日期列名,可选,如果存在将设置为索引
|
417
|
-
:param indexes: 需要创建索引的列列表,可选
|
418
|
-
:param allow_null: 是否允许空值,默认为False
|
419
|
-
:raises: 可能抛出数据库相关异常
|
398
|
+
创建数据表,优化索引创建方式
|
420
399
|
"""
|
421
400
|
db_name = self._validate_identifier(db_name)
|
422
401
|
table_name = self._validate_identifier(table_name)
|
@@ -439,40 +418,35 @@ class MySQLUploader:
|
|
439
418
|
primary_keys = ['id']
|
440
419
|
safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
|
441
420
|
primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
|
421
|
+
# 索引统一在CREATE TABLE中定义
|
422
|
+
index_defs = []
|
423
|
+
if date_column and date_column in set_typ:
|
424
|
+
safe_date_col = self._validate_identifier(date_column)
|
425
|
+
index_defs.append(f"INDEX `idx_{safe_date_col}` (`{safe_date_col}`)")
|
426
|
+
if indexes:
|
427
|
+
for idx_col in indexes:
|
428
|
+
if idx_col in set_typ:
|
429
|
+
safe_idx_col = self._validate_identifier(idx_col)
|
430
|
+
index_defs.append(f"INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)")
|
431
|
+
index_sql = (',' + ','.join(index_defs)) if index_defs else ''
|
442
432
|
sql = f"""
|
443
433
|
CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
|
444
434
|
{','.join(column_defs)}
|
445
435
|
{primary_key_sql}
|
436
|
+
{index_sql}
|
446
437
|
) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
|
447
438
|
"""
|
439
|
+
conn = None
|
448
440
|
try:
|
449
441
|
with self._get_connection() as conn:
|
450
442
|
with conn.cursor() as cursor:
|
451
443
|
cursor.execute(sql)
|
452
|
-
logger.info('数据表已创建', {'库': db_name, '表': table_name})
|
453
|
-
index_statements = []
|
454
|
-
if date_column and date_column in set_typ:
|
455
|
-
safe_date_col = self._validate_identifier(date_column)
|
456
|
-
index_statements.append(
|
457
|
-
f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_date_col}` (`{safe_date_col}`)"
|
458
|
-
)
|
459
|
-
if indexes:
|
460
|
-
for idx_col in indexes:
|
461
|
-
if idx_col in set_typ:
|
462
|
-
safe_idx_col = self._validate_identifier(idx_col)
|
463
|
-
index_statements.append(
|
464
|
-
f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)"
|
465
|
-
)
|
466
|
-
if index_statements:
|
467
|
-
with conn.cursor() as cursor:
|
468
|
-
for stmt in index_statements:
|
469
|
-
cursor.execute(stmt)
|
470
|
-
logger.debug('执行索引语句', {'SQL': stmt})
|
471
444
|
conn.commit()
|
472
|
-
logger.info('
|
445
|
+
logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes})
|
473
446
|
except Exception as e:
|
474
447
|
logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
475
|
-
conn.rollback()
|
448
|
+
if conn is not None:
|
449
|
+
conn.rollback()
|
476
450
|
raise
|
477
451
|
|
478
452
|
def _validate_datetime(self, value: str, date_type: bool = False) -> Any:
|
@@ -511,19 +485,24 @@ class MySQLUploader:
|
|
511
485
|
logger.error('无效的日期格式', {'值': value})
|
512
486
|
raise ValueError(f"无效的日期格式: `{value}`")
|
513
487
|
|
514
|
-
def _validate_value(self, value: Any, column_type: str, allow_null: bool) -> Any:
|
488
|
+
def _validate_value(self, value: Any, column_type: str, allow_null: bool, db_name: str = None, table_name: str = None, col_name: str = None) -> Any:
|
515
489
|
"""
|
516
490
|
根据列类型验证并转换数据值
|
517
491
|
|
518
492
|
:param value: 要验证的值
|
519
493
|
:param column_type: 列的数据类型
|
520
494
|
:param allow_null: 是否允许空值
|
495
|
+
:param db_name: 数据库名(用于日志)
|
496
|
+
:param table_name: 表名(用于日志)
|
497
|
+
:param col_name: 列名(用于日志)
|
521
498
|
:return: 转换后的值
|
522
499
|
:raises ValueError: 当值转换失败时抛出
|
523
500
|
"""
|
524
501
|
if value is None:
|
525
502
|
if not allow_null:
|
526
|
-
logger.warning('字段值为None
|
503
|
+
logger.warning('字段值为None但不允许空值, 已填充为none', {
|
504
|
+
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
505
|
+
})
|
527
506
|
return 'none'
|
528
507
|
return None
|
529
508
|
try:
|
@@ -536,14 +515,18 @@ class MySQLUploader:
|
|
536
515
|
logger.debug('百分比字符串转小数', {'原始': value, '结果': decimal_value})
|
537
516
|
return decimal_value
|
538
517
|
except ValueError:
|
539
|
-
logger.warning('百分比字符串转小数失败', {
|
518
|
+
logger.warning('百分比字符串转小数失败', {
|
519
|
+
'库': db_name, '表': table_name, '列': col_name, '原始': value
|
520
|
+
})
|
540
521
|
elif 'int' in column_type_lower:
|
541
522
|
if isinstance(value, str):
|
542
523
|
value = value.replace(',', '').strip()
|
543
524
|
try:
|
544
525
|
return int(float(value))
|
545
526
|
except ValueError:
|
546
|
-
logger.error('字符串转整数失败', {
|
527
|
+
logger.error('字符串转整数失败', {
|
528
|
+
'库': db_name, '表': table_name, '列': col_name, '值': value
|
529
|
+
})
|
547
530
|
raise ValueError(f"`{value}` -> 无法转为整数")
|
548
531
|
return int(value) if value is not None else None
|
549
532
|
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
@@ -557,7 +540,9 @@ class MySQLUploader:
|
|
557
540
|
try:
|
558
541
|
return self._validate_datetime(value)
|
559
542
|
except ValueError as e:
|
560
|
-
logger.error('无效日期格式', {
|
543
|
+
logger.error('无效日期格式', {
|
544
|
+
'库': db_name, '表': table_name, '列': col_name, '值': value, '错误': str(e)
|
545
|
+
})
|
561
546
|
raise ValueError(f"无效日期格式: `{value}` -> {str(e)}")
|
562
547
|
return str(value)
|
563
548
|
elif 'char' in column_type_lower or 'text' in column_type_lower:
|
@@ -569,9 +554,12 @@ class MySQLUploader:
|
|
569
554
|
else:
|
570
555
|
return value
|
571
556
|
except (ValueError, TypeError) as e:
|
572
|
-
logger.error('数据类型转换异常', {
|
557
|
+
logger.error('数据类型转换异常', {
|
558
|
+
'库': db_name, '表': table_name, '列': col_name, '值': value, '目标类型': column_type, '错误': str(e)
|
559
|
+
})
|
573
560
|
raise ValueError(f"转换异常 -> 无法将 `{value}` 的数据类型转为: `{column_type}` -> {str(e)}")
|
574
561
|
|
562
|
+
@_execute_with_retry
|
575
563
|
def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
|
576
564
|
"""
|
577
565
|
获取表的列名和数据类型
|
@@ -748,7 +736,9 @@ class MySQLUploader:
|
|
748
736
|
self,
|
749
737
|
data: Union[Dict, List[Dict], pd.DataFrame],
|
750
738
|
set_typ: Dict[str, str],
|
751
|
-
allow_null: bool = False
|
739
|
+
allow_null: bool = False,
|
740
|
+
db_name: str = None,
|
741
|
+
table_name: str = None,
|
752
742
|
) -> Tuple[List[Dict], Dict[str, str]]:
|
753
743
|
"""
|
754
744
|
准备要上传的数据,验证并转换数据类型
|
@@ -805,15 +795,17 @@ class MySQLUploader:
|
|
805
795
|
if col in set_typ:
|
806
796
|
filtered_set_typ[col] = set_typ[col]
|
807
797
|
else:
|
808
|
-
# 如果列不在set_typ
|
809
|
-
sample_values = [row[col] for row in data if col in row and row[col] is not None][:
|
810
|
-
|
811
|
-
|
812
|
-
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
798
|
+
# 如果列不在set_typ中,采样多个非None值推断类型
|
799
|
+
sample_values = [row[col] for row in data if col in row and row[col] is not None][:5]
|
800
|
+
inferred_type = None
|
801
|
+
for val in sample_values:
|
802
|
+
inferred_type = self._infer_data_type(val)
|
803
|
+
if inferred_type:
|
804
|
+
break
|
805
|
+
if not inferred_type:
|
806
|
+
inferred_type = 'VARCHAR(255)'
|
807
|
+
filtered_set_typ[col] = inferred_type
|
808
|
+
logger.debug(f"自动推断列 `{col}` 的数据类型为: `{inferred_type}`")
|
817
809
|
|
818
810
|
prepared_data = []
|
819
811
|
for row_idx, row in enumerate(data, 1):
|
@@ -830,7 +822,7 @@ class MySQLUploader:
|
|
830
822
|
prepared_row[col_name] = None
|
831
823
|
else:
|
832
824
|
try:
|
833
|
-
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null)
|
825
|
+
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null, db_name, table_name, col_name)
|
834
826
|
except ValueError as e:
|
835
827
|
logger.error('数据验证失败', {
|
836
828
|
'列': col_name,
|
@@ -862,7 +854,7 @@ class MySQLUploader:
|
|
862
854
|
transaction_mode: str = "batch"
|
863
855
|
):
|
864
856
|
"""
|
865
|
-
|
857
|
+
上传数据到数据库的主入口方法,分表逻辑异常处理统计丢弃数据
|
866
858
|
|
867
859
|
:param db_name: 数据库名
|
868
860
|
:param table_name: 表名
|
@@ -888,6 +880,7 @@ class MySQLUploader:
|
|
888
880
|
|
889
881
|
batch_id = f"batch_{int(time.time() * 1000)}"
|
890
882
|
success_flag = False
|
883
|
+
dropped_rows = 0
|
891
884
|
|
892
885
|
logger.info("开始上传", {
|
893
886
|
'库': db_name,
|
@@ -924,7 +917,7 @@ class MySQLUploader:
|
|
924
917
|
raise ValueError("分表方式必须是 'year' 或 'month' 或 'None'")
|
925
918
|
|
926
919
|
# 准备数据
|
927
|
-
prepared_data, filtered_set_typ = self._prepare_data(data, set_typ, allow_null)
|
920
|
+
prepared_data, filtered_set_typ = self._prepare_data(data, set_typ, allow_null, db_name, table_name)
|
928
921
|
|
929
922
|
# 检查数据库是否存在
|
930
923
|
if not self._check_database_exists(db_name):
|
@@ -951,8 +944,8 @@ class MySQLUploader:
|
|
951
944
|
'row': self._shorten_for_log(row),
|
952
945
|
'func': sys._getframe().f_code.co_name,
|
953
946
|
})
|
954
|
-
|
955
|
-
|
947
|
+
dropped_rows += 1
|
948
|
+
continue
|
956
949
|
part_table = self._get_partition_table_name(
|
957
950
|
table_name,
|
958
951
|
str(row[partition_date_column]),
|
@@ -969,7 +962,8 @@ class MySQLUploader:
|
|
969
962
|
'error': str(e),
|
970
963
|
'func': sys._getframe().f_code.co_name,
|
971
964
|
})
|
972
|
-
|
965
|
+
dropped_rows += 1
|
966
|
+
continue
|
973
967
|
|
974
968
|
# 对每个分表执行上传
|
975
969
|
for part_table, part_data in partitioned_data.items():
|
@@ -1016,10 +1010,11 @@ class MySQLUploader:
|
|
1016
1010
|
'表': table_name,
|
1017
1011
|
'批次': batch_id,
|
1018
1012
|
'finish': success_flag,
|
1019
|
-
|
1020
|
-
'
|
1013
|
+
'数据行': initial_row_count,
|
1014
|
+
'丢弃行数': dropped_rows
|
1021
1015
|
})
|
1022
1016
|
|
1017
|
+
@_execute_with_retry
|
1023
1018
|
def _insert_data(
|
1024
1019
|
self,
|
1025
1020
|
db_name: str,
|
@@ -1184,34 +1179,27 @@ class MySQLUploader:
|
|
1184
1179
|
update_on_duplicate: bool
|
1185
1180
|
) -> str:
|
1186
1181
|
"""
|
1187
|
-
准备插入SQL
|
1188
|
-
|
1189
|
-
1. 当 check_duplicate=False 时,忽略 duplicate_columns 和 update_on_duplicate 参数,直接插入全部data。
|
1190
|
-
2. 当 check_duplicate=False 且 update_on_duplicate=True 时,由于 check_duplicate=False,直接插入全部data。
|
1191
|
-
3. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=True 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
|
1192
|
-
4. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=False 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
|
1193
|
-
5. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=True 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
|
1194
|
-
6. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=False 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
|
1195
|
-
|
1182
|
+
准备插入SQL语句, 增加StatementCache缓存
|
1196
1183
|
"""
|
1184
|
+
cache_key = (db_name, table_name, tuple(sorted(set_typ.items())), check_duplicate, tuple(duplicate_columns) if duplicate_columns else (), update_on_duplicate)
|
1185
|
+
cached = self._prepared_statements.get(cache_key)
|
1186
|
+
if cached:
|
1187
|
+
return cached
|
1197
1188
|
# 获取所有列名(排除id)
|
1198
1189
|
all_columns = [col for col in set_typ.keys()
|
1199
1190
|
if col.lower() != 'id']
|
1200
|
-
|
1201
|
-
# 情况1-2:不检查重复
|
1202
1191
|
if not check_duplicate:
|
1203
|
-
|
1192
|
+
sql = self._build_simple_insert_sql(db_name, table_name, all_columns,
|
1204
1193
|
update_on_duplicate)
|
1205
|
-
|
1206
|
-
|
1207
|
-
|
1208
|
-
|
1209
|
-
|
1210
|
-
|
1211
|
-
|
1212
|
-
# 情况3-6:检查重复
|
1213
|
-
return self._build_duplicate_check_sql(db_name, table_name, all_columns,
|
1194
|
+
else:
|
1195
|
+
dup_cols = duplicate_columns if duplicate_columns else [
|
1196
|
+
col for col in all_columns
|
1197
|
+
if col.lower() not in self.base_excute_col
|
1198
|
+
]
|
1199
|
+
sql = self._build_duplicate_check_sql(db_name, table_name, all_columns,
|
1214
1200
|
dup_cols, update_on_duplicate, set_typ)
|
1201
|
+
self._prepared_statements[cache_key] = sql
|
1202
|
+
return sql
|
1215
1203
|
|
1216
1204
|
def _execute_batch_insert(
|
1217
1205
|
self,
|
@@ -1226,10 +1214,8 @@ class MySQLUploader:
|
|
1226
1214
|
transaction_mode: str,
|
1227
1215
|
update_on_duplicate: bool = False
|
1228
1216
|
) -> Tuple[int, int, int]:
|
1229
|
-
"""
|
1230
|
-
|
1217
|
+
"""执行批量插入操作,优化batch和hybrid模式"""
|
1231
1218
|
def get_optimal_batch_size(total_rows: int) -> int:
|
1232
|
-
# 根据数据量调整批量大小
|
1233
1219
|
if total_rows <= 100:
|
1234
1220
|
return total_rows
|
1235
1221
|
elif total_rows <= 1000:
|
@@ -1238,205 +1224,64 @@ class MySQLUploader:
|
|
1238
1224
|
return 1000
|
1239
1225
|
else:
|
1240
1226
|
return 2000
|
1241
|
-
|
1242
1227
|
batch_size = get_optimal_batch_size(len(data))
|
1243
|
-
|
1244
|
-
# 获取所有列名(排除id列)
|
1245
|
-
all_columns = [col for col in set_typ.keys()
|
1246
|
-
if col.lower() != 'id']
|
1247
|
-
|
1228
|
+
all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
|
1248
1229
|
total_inserted = 0
|
1249
1230
|
total_skipped = 0
|
1250
1231
|
total_failed = 0
|
1251
|
-
|
1252
1232
|
with self._get_connection() as conn:
|
1253
1233
|
with conn.cursor() as cursor:
|
1254
|
-
|
1255
|
-
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1265
|
-
|
1266
|
-
|
1234
|
+
if transaction_mode == 'batch':
|
1235
|
+
for i in range(0, len(data), batch_size):
|
1236
|
+
batch = data[i:i + batch_size]
|
1237
|
+
values_list = []
|
1238
|
+
for row in batch:
|
1239
|
+
values = [row.get(col) for col in all_columns]
|
1240
|
+
if check_duplicate and not update_on_duplicate:
|
1241
|
+
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1242
|
+
values += [row.get(col) for col in dup_cols]
|
1243
|
+
values_list.append(values)
|
1244
|
+
try:
|
1245
|
+
cursor.executemany(sql, values_list)
|
1246
|
+
conn.commit()
|
1247
|
+
total_inserted += len(batch)
|
1248
|
+
except Exception as e:
|
1249
|
+
conn.rollback()
|
1250
|
+
total_failed += len(batch)
|
1251
|
+
logger.error('批量插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
1252
|
+
elif transaction_mode == 'hybrid':
|
1253
|
+
hybrid_n = 100 # 可配置
|
1254
|
+
for i in range(0, len(data), hybrid_n):
|
1255
|
+
batch = data[i:i + hybrid_n]
|
1256
|
+
for row in batch:
|
1257
|
+
try:
|
1258
|
+
values = [row.get(col) for col in all_columns]
|
1259
|
+
if check_duplicate and not update_on_duplicate:
|
1260
|
+
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1261
|
+
values += [row.get(col) for col in dup_cols]
|
1262
|
+
cursor.execute(sql, values)
|
1263
|
+
total_inserted += 1
|
1264
|
+
except Exception as e:
|
1265
|
+
conn.rollback()
|
1266
|
+
total_failed += 1
|
1267
|
+
logger.error('hybrid单行插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
1268
|
+
conn.commit()
|
1269
|
+
else: # row模式
|
1270
|
+
for row in data:
|
1271
|
+
try:
|
1272
|
+
values = [row.get(col) for col in all_columns]
|
1273
|
+
if check_duplicate and not update_on_duplicate:
|
1274
|
+
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1275
|
+
values += [row.get(col) for col in dup_cols]
|
1276
|
+
cursor.execute(sql, values)
|
1277
|
+
conn.commit()
|
1278
|
+
total_inserted += 1
|
1279
|
+
except Exception as e:
|
1280
|
+
conn.rollback()
|
1281
|
+
total_failed += 1
|
1282
|
+
logger.error('单行插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
1267
1283
|
return total_inserted, total_skipped, total_failed
|
1268
1284
|
|
1269
|
-
def _process_batch(
|
1270
|
-
self,
|
1271
|
-
conn,
|
1272
|
-
cursor,
|
1273
|
-
db_name: str,
|
1274
|
-
table_name: str,
|
1275
|
-
batch: List[Dict],
|
1276
|
-
all_columns: List[str],
|
1277
|
-
sql: str,
|
1278
|
-
check_duplicate: bool,
|
1279
|
-
duplicate_columns: Optional[List[str]],
|
1280
|
-
batch_id: Optional[str],
|
1281
|
-
transaction_mode: str,
|
1282
|
-
update_on_duplicate: bool = False
|
1283
|
-
) -> Tuple[int, int, int]:
|
1284
|
-
"""
|
1285
|
-
处理单个批次的数据插入
|
1286
|
-
|
1287
|
-
:param conn: 数据库连接对象
|
1288
|
-
:param cursor: 数据库游标对象
|
1289
|
-
:param db_name: 数据库名
|
1290
|
-
:param table_name: 表名
|
1291
|
-
:param batch: 当前批次的数据(字典列表)
|
1292
|
-
:param all_columns: 需要插入的所有列名
|
1293
|
-
:param sql: 执行的SQL语句
|
1294
|
-
:param check_duplicate: 是否检查重复
|
1295
|
-
:param duplicate_columns: 排重列
|
1296
|
-
:param batch_id: 批次ID
|
1297
|
-
:param transaction_mode: 事务模式
|
1298
|
-
:param update_on_duplicate: 遇到重复时是否更新
|
1299
|
-
:return: (插入数, 跳过数, 失败数)
|
1300
|
-
"""
|
1301
|
-
batch_inserted = 0
|
1302
|
-
batch_skipped = 0
|
1303
|
-
batch_failed = 0
|
1304
|
-
batch_size = len(batch)
|
1305
|
-
logger.debug('批次插入开始', {
|
1306
|
-
'库': db_name,
|
1307
|
-
'表': table_name,
|
1308
|
-
'批次ID': batch_id,
|
1309
|
-
'批次大小': batch_size,
|
1310
|
-
'事务模式': transaction_mode,
|
1311
|
-
'SQL预览': sql[:200],
|
1312
|
-
'排重': check_duplicate,
|
1313
|
-
'排重列': duplicate_columns,
|
1314
|
-
'允许更新': update_on_duplicate,
|
1315
|
-
'数据样例': self._shorten_for_log(batch, 2)
|
1316
|
-
})
|
1317
|
-
if transaction_mode == 'batch':
|
1318
|
-
try:
|
1319
|
-
for row_idx, row in enumerate(batch, 1):
|
1320
|
-
result = self._process_single_row(
|
1321
|
-
db_name, table_name, cursor, row, all_columns, sql,
|
1322
|
-
check_duplicate, duplicate_columns, update_on_duplicate
|
1323
|
-
)
|
1324
|
-
if result == 'inserted':
|
1325
|
-
batch_inserted += 1
|
1326
|
-
elif result == 'skipped':
|
1327
|
-
batch_skipped += 1
|
1328
|
-
else:
|
1329
|
-
batch_failed += 1
|
1330
|
-
conn.commit()
|
1331
|
-
logger.debug('批次插入成功', {
|
1332
|
-
'库': db_name,
|
1333
|
-
'表': table_name,
|
1334
|
-
'批次ID': batch_id,
|
1335
|
-
'插入': batch_inserted,
|
1336
|
-
'跳过': batch_skipped,
|
1337
|
-
'失败': batch_failed
|
1338
|
-
})
|
1339
|
-
except Exception as e:
|
1340
|
-
conn.rollback()
|
1341
|
-
batch_failed += len(batch)
|
1342
|
-
logger.error('批次插入失败', {
|
1343
|
-
'库': db_name,
|
1344
|
-
'表': table_name,
|
1345
|
-
'批次ID': batch_id,
|
1346
|
-
'错误': str(e),
|
1347
|
-
'SQL预览': sql[:200],
|
1348
|
-
'数据样例': self._shorten_for_log(batch, 2)
|
1349
|
-
})
|
1350
|
-
else: # row 或 hybrid 模式
|
1351
|
-
for row_idx, row in enumerate(batch, 1):
|
1352
|
-
try:
|
1353
|
-
result = self._process_single_row(
|
1354
|
-
db_name, table_name, cursor, row, all_columns, sql,
|
1355
|
-
check_duplicate, duplicate_columns, update_on_duplicate
|
1356
|
-
)
|
1357
|
-
if result == 'inserted':
|
1358
|
-
batch_inserted += 1
|
1359
|
-
elif result == 'skipped':
|
1360
|
-
batch_skipped += 1
|
1361
|
-
else:
|
1362
|
-
batch_failed += 1
|
1363
|
-
conn.commit()
|
1364
|
-
logger.debug('单行插入成功', {
|
1365
|
-
'库': db_name,
|
1366
|
-
'表': table_name,
|
1367
|
-
'批次ID': batch_id,
|
1368
|
-
'行号': row_idx,
|
1369
|
-
'插入状态': result
|
1370
|
-
})
|
1371
|
-
except Exception as e:
|
1372
|
-
conn.rollback()
|
1373
|
-
batch_failed += 1
|
1374
|
-
logger.error('单行插入失败', {
|
1375
|
-
'库': db_name,
|
1376
|
-
'表': table_name,
|
1377
|
-
'批次ID': batch_id,
|
1378
|
-
'行号': row_idx,
|
1379
|
-
'错误': str(e),
|
1380
|
-
'SQL预览': sql[:200],
|
1381
|
-
'数据': self._shorten_for_log(row)
|
1382
|
-
})
|
1383
|
-
logger.debug('批次插入结束', {
|
1384
|
-
'库': db_name,
|
1385
|
-
'表': table_name,
|
1386
|
-
'批次ID': batch_id,
|
1387
|
-
'插入': batch_inserted,
|
1388
|
-
'跳过': batch_skipped,
|
1389
|
-
'失败': batch_failed,
|
1390
|
-
'数据样例': self._shorten_for_log(batch, 2)
|
1391
|
-
})
|
1392
|
-
return batch_inserted, batch_skipped, batch_failed
|
1393
|
-
|
1394
|
-
def _process_single_row(
|
1395
|
-
self,
|
1396
|
-
db_name: str,
|
1397
|
-
table_name: str,
|
1398
|
-
cursor,
|
1399
|
-
row: Dict,
|
1400
|
-
all_columns: List[str],
|
1401
|
-
sql: str,
|
1402
|
-
check_duplicate: bool,
|
1403
|
-
duplicate_columns: Optional[List[str]],
|
1404
|
-
update_on_duplicate: bool = False
|
1405
|
-
) -> str:
|
1406
|
-
"""
|
1407
|
-
处理单行数据插入
|
1408
|
-
|
1409
|
-
:param db_name: 数据库名
|
1410
|
-
:param table_name: 表名
|
1411
|
-
:param cursor: 数据库游标对象
|
1412
|
-
:param row: 单行数据(字典)
|
1413
|
-
:param all_columns: 需要插入的所有列名
|
1414
|
-
:param sql: 执行的SQL语句
|
1415
|
-
:param check_duplicate: 是否检查重复
|
1416
|
-
:param duplicate_columns: 排重列
|
1417
|
-
:param update_on_duplicate: 遇到重复时是否更新
|
1418
|
-
:return: 'inserted' | 'skipped' | 'failed'
|
1419
|
-
"""
|
1420
|
-
try:
|
1421
|
-
# 构造参数
|
1422
|
-
values = [row.get(col) for col in all_columns]
|
1423
|
-
if check_duplicate:
|
1424
|
-
# 需要为 WHERE NOT EXISTS 语句补充参数
|
1425
|
-
if not update_on_duplicate:
|
1426
|
-
# duplicate_columns 为空时,默认用所有列(排除id/更新时间)
|
1427
|
-
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1428
|
-
values = values + [row.get(col) for col in dup_cols]
|
1429
|
-
cursor.execute(sql, values)
|
1430
|
-
except Exception as e:
|
1431
|
-
logger.error('单行插入失败', {
|
1432
|
-
'库': db_name,
|
1433
|
-
'表': table_name,
|
1434
|
-
'row': self._shorten_for_log(row),
|
1435
|
-
'错误': str(e)
|
1436
|
-
})
|
1437
|
-
return 'failed'
|
1438
|
-
return 'inserted'
|
1439
|
-
|
1440
1285
|
def close(self) -> None:
|
1441
1286
|
"""
|
1442
1287
|
关闭连接池并清理资源
|
@@ -1447,10 +1292,10 @@ class MySQLUploader:
|
|
1447
1292
|
try:
|
1448
1293
|
if hasattr(self, 'pool') and self.pool is not None:
|
1449
1294
|
try:
|
1450
|
-
self.pool.close()
|
1295
|
+
# self.pool.close() # PooledDB 没有 close 方法
|
1296
|
+
self.pool = None
|
1451
1297
|
except Exception as e:
|
1452
1298
|
logger.warning('关闭连接池时出错', {'error': str(e)})
|
1453
|
-
self.pool = None
|
1454
1299
|
logger.info('连接池关闭', {'uploader.py': '连接池关闭'})
|
1455
1300
|
except Exception as e:
|
1456
1301
|
logger.error('关闭连接池失败', {'error': str(e)})
|
@@ -1458,11 +1303,12 @@ class MySQLUploader:
|
|
1458
1303
|
|
1459
1304
|
def _check_pool_health(self) -> bool:
|
1460
1305
|
"""
|
1461
|
-
|
1462
|
-
:return: 连接池健康返回True,否则返回False
|
1306
|
+
检查连接池健康状态,防止连接泄露
|
1463
1307
|
"""
|
1464
1308
|
conn = None
|
1465
1309
|
try:
|
1310
|
+
if not hasattr(self, 'pool') or self.pool is None:
|
1311
|
+
return False
|
1466
1312
|
conn = self.pool.connection()
|
1467
1313
|
conn.ping(reconnect=True)
|
1468
1314
|
logger.debug('连接池健康检查通过')
|
@@ -1471,12 +1317,13 @@ class MySQLUploader:
|
|
1471
1317
|
logger.warning('连接池健康检查失败', {'error': str(e)})
|
1472
1318
|
return False
|
1473
1319
|
finally:
|
1474
|
-
if conn:
|
1320
|
+
if conn is not None:
|
1475
1321
|
try:
|
1476
1322
|
conn.close()
|
1477
1323
|
except Exception as e:
|
1478
1324
|
logger.warning('关闭连接时出错', {'error': str(e)})
|
1479
1325
|
|
1326
|
+
@staticmethod
|
1480
1327
|
def retry_on_failure(max_retries: int = 3, delay: int = 1):
|
1481
1328
|
"""
|
1482
1329
|
通用重试装饰器
|
mdbq/spider/aikucun.py
CHANGED
@@ -3,10 +3,8 @@ import datetime
|
|
3
3
|
import requests
|
4
4
|
import json
|
5
5
|
import os
|
6
|
-
import sys
|
7
6
|
import re
|
8
7
|
import time
|
9
|
-
import warnings
|
10
8
|
import platform
|
11
9
|
import getpass
|
12
10
|
from selenium import webdriver
|
@@ -15,20 +13,18 @@ from selenium.webdriver.common.by import By
|
|
15
13
|
from selenium.webdriver.support import expected_conditions as EC
|
16
14
|
from selenium.webdriver.chrome.service import Service
|
17
15
|
import pymysql
|
18
|
-
|
19
|
-
from mdbq.log import spider_logging
|
20
|
-
from mdbq.mysql import mysql
|
16
|
+
from mdbq.mysql import uploader
|
21
17
|
from mdbq.mysql import s_query
|
22
18
|
from mdbq.config import config
|
23
19
|
from mdbq.other import ua_sj
|
24
20
|
from mdbq.other import otk
|
21
|
+
from mdbq.log import mylogger
|
25
22
|
|
26
23
|
dir_path = os.path.expanduser("~")
|
27
24
|
config_file = os.path.join(dir_path, 'spd.txt')
|
28
25
|
content = config.read_config(file_path=config_file)
|
29
26
|
username, password, host, port = content['username'], content['password'], content['host'], content['port']
|
30
27
|
|
31
|
-
# m_engine = mysql.MysqlUpload(username=username, password=password, host=host, port=port, charset='utf8mb4')
|
32
28
|
uld = uploader.MySQLUploader(username=username, password=password, host=host, port=int(port), pool_size=10)
|
33
29
|
# 实例化一个数据查询类,用来获取 cookies 表数据
|
34
30
|
download = s_query.QueryDatas(username=username, password=password, host=host, port=port)
|
@@ -188,11 +184,6 @@ class AikuCun:
|
|
188
184
|
'更新时间': 'timestamp'
|
189
185
|
}
|
190
186
|
# 更新至数据库记录
|
191
|
-
# m_engine.dict_to_mysql(
|
192
|
-
# db_name=self.db_name,
|
193
|
-
# table_name=self.table_name,
|
194
|
-
# dict_data=self.token,
|
195
|
-
# )
|
196
187
|
uld.upload_data(
|
197
188
|
db_name=self.db_name,
|
198
189
|
table_name=self.table_name,
|
@@ -429,15 +420,6 @@ class AikuCun:
|
|
429
420
|
drop_dup = ['日期', '平台', '店铺名称', '商品款号', '访客量']
|
430
421
|
else:
|
431
422
|
drop_dup = ['日期', '平台', '店铺名称', '条码']
|
432
|
-
# m_engine.insert_many_dict(
|
433
|
-
# db_name=db_name,
|
434
|
-
# table_name=table_name,
|
435
|
-
# dict_data_list=_results,
|
436
|
-
# icm_update=drop_dup, # 唯一组合键
|
437
|
-
# # unique_main_key=['人群id'],
|
438
|
-
# set_typ=set_typ,
|
439
|
-
# allow_not_null=False, # 创建允许插入空值的列
|
440
|
-
# )
|
441
423
|
uld.upload_data(
|
442
424
|
db_name=db_name,
|
443
425
|
table_name=table_name,
|
@@ -1,17 +1,17 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=SerN98H6Mx8rHVh-jf2Nmc7iZHb02NHGVphB1O5jKwE,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/query_data.py,sha256=
|
4
|
+
mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
|
5
5
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
6
6
|
mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
|
7
7
|
mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
8
|
-
mdbq/log/mylogger.py,sha256=
|
8
|
+
mdbq/log/mylogger.py,sha256=HuxLBCXjm6fZrxYE0rdpUCz359WGeqOX0vvg9jTuRY4,24126
|
9
9
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
10
10
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
11
|
-
mdbq/mysql/deduplicator.py,sha256=
|
12
|
-
mdbq/mysql/mysql.py,sha256=
|
11
|
+
mdbq/mysql/deduplicator.py,sha256=bIV010UkFfSUONY6-756x3tDVO4k6q3pqxoY3Z2xT-k,32990
|
12
|
+
mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
|
13
13
|
mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
|
14
|
-
mdbq/mysql/uploader.py,sha256=
|
14
|
+
mdbq/mysql/uploader.py,sha256=3fXyNA0GzBNaadAh6cOgbuUEvY4IAhKn4apgbkToEno,61321
|
15
15
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
16
16
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
17
17
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -23,8 +23,8 @@ mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
|
|
23
23
|
mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
24
24
|
mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
|
25
25
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
26
|
-
mdbq/spider/aikucun.py,sha256=
|
27
|
-
mdbq-3.11.
|
28
|
-
mdbq-3.11.
|
29
|
-
mdbq-3.11.
|
30
|
-
mdbq-3.11.
|
26
|
+
mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
|
27
|
+
mdbq-3.11.3.dist-info/METADATA,sha256=tgDHEyJKxO0ML-gUTBap1b6yP-xv5sEA_SsfVJ_31C0,364
|
28
|
+
mdbq-3.11.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
+
mdbq-3.11.3.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
+
mdbq-3.11.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|