mdbq 4.2.0__py3-none-any.whl → 4.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mdbq might be problematic.
- mdbq/__version__.py +1 -1
- mdbq/mysql/uploader.py +741 -116
- mdbq/other/download_sku_picture.py +15 -36
- {mdbq-4.2.0.dist-info → mdbq-4.2.2.dist-info}/METADATA +1 -1
- {mdbq-4.2.0.dist-info → mdbq-4.2.2.dist-info}/RECORD +7 -7
- {mdbq-4.2.0.dist-info → mdbq-4.2.2.dist-info}/WHEEL +0 -0
- {mdbq-4.2.0.dist-info → mdbq-4.2.2.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
-VERSION = '4.2.0'
+VERSION = '4.2.2'
mdbq/mysql/uploader.py
CHANGED
@@ -1,20 +1,24 @@
 # -*- coding:utf-8 -*-
 """
-MySQL
-提供高可用、易维护的MySQL数据上传功能
+MySQL数据上传
 """
-
 import datetime
 import time
 import json
 import re
-
+import io
+from typing import Union, List, Dict, Optional, Any, Tuple, Iterator
 from functools import wraps
 from decimal import Decimal, InvalidOperation
 import math
-
+import concurrent.futures
+import threading
+from queue import Queue
 import pymysql
 import pandas as pd
+import psutil
+import enum
+import ipaddress
 from dbutils.pooled_db import PooledDB
 from mdbq.log import mylogger
 # from mdbq.myconf import myconf
@@ -87,12 +91,35 @@ class DatabaseConnectionManager:
 class DataTypeInferrer:
     """数据类型推断器"""
 
+    # 自定义类型映射注册表
+    _custom_type_handlers = {}
+
+    @classmethod
+    def register_type_handler(cls, type_name: str, handler_func):
+        """
+        注册自定义类型处理器
+
+        :param type_name: 类型名称
+        :param handler_func: 处理函数,接收value参数,返回MySQL类型字符串或None
+        """
+        cls._custom_type_handlers[type_name] = handler_func
+
     @staticmethod
     def infer_mysql_type(value: Any) -> str:
         """推断MySQL数据类型"""
         if value is None or str(value).lower() in ['', 'none', 'nan']:
             return 'VARCHAR(255)'
 
+        # 检查自定义类型处理器
+        for type_name, handler in DataTypeInferrer._custom_type_handlers.items():
+            try:
+                result = handler(value)
+                if result:
+                    return result
+            except Exception:
+                continue
+
+        # Python基本类型
         if isinstance(value, bool):
             return 'TINYINT(1)'
         elif isinstance(value, int):
@@ -109,42 +136,231 @@ class DataTypeInferrer:
         elif isinstance(value, (list, dict)):
             return 'JSON'
         elif isinstance(value, str):
-
-
-
-
-            #
-
-
-
-
-
-
-
+            return DataTypeInferrer._infer_string_type(value)
+
+        # 处理枚举类型
+        if hasattr(value, '__class__') and hasattr(value.__class__, '__bases__'):
+            # 检查是否是枚举类型
+            if isinstance(value, enum.Enum):
+                # 根据枚举值的类型决定MySQL类型
+                enum_value = value.value
+                if isinstance(enum_value, int):
+                    return 'INT'
+                elif isinstance(enum_value, str):
+                    max_len = max(len(str(item.value)) for item in value.__class__)
+                    return f'VARCHAR({min(max_len * 2, 255)})'
+                else:
+                    return 'VARCHAR(255)'
+
+        # 处理其他特殊类型
+        value_str = str(value)
+
+        # UUID检测
+        if DataTypeInferrer._is_uuid(value_str):
+            return 'CHAR(36)'
+
+        # IP地址检测
+        if DataTypeInferrer._is_ip_address(value_str):
+            return 'VARCHAR(45)'  # 支持IPv6
+
+        # 邮箱检测
+        if DataTypeInferrer._is_email(value_str):
+            return 'VARCHAR(255)'
+
+        # URL检测
+        if DataTypeInferrer._is_url(value_str):
+            return 'TEXT'
 
-
+        # 默认字符串处理
+        return DataTypeInferrer._infer_string_type(value_str)
+
+    @staticmethod
+    def _infer_string_type(value: str) -> str:
+        """推断字符串类型"""
+        # 尝试判断是否是日期时间
+        if DataValidator.is_datetime_string(value):
+            return 'DATETIME'
+
+        # 数值字符串检测
+        if DataTypeInferrer._is_numeric_string(value):
+            if '.' in value or 'e' in value.lower():
+                return 'DECIMAL(20,6)'
+            else:
+                try:
+                    int_val = int(value)
+                    if -2147483648 <= int_val <= 2147483647:
+                        return 'INT'
+                    else:
+                        return 'BIGINT'
+                except ValueError:
+                    pass
+
+        # 根据字符串长度选择类型
+        length = len(value)
+        if length <= 255:
+            return 'VARCHAR(255)'
+        elif length <= 65535:
+            return 'TEXT'
+        else:
+            return 'LONGTEXT'
 
     @staticmethod
-    def
-        """
+    def _is_uuid(value: str) -> bool:
+        """检测是否是UUID格式"""
+        uuid_pattern = r'^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$'
+        return bool(re.match(uuid_pattern, value.lower()))
+
+    @staticmethod
+    def _is_ip_address(value: str) -> bool:
+        """检测是否是IP地址"""
+        try:
+            ipaddress.ip_address(value)
+            return True
+        except ValueError:
+            return False
+
+    @staticmethod
+    def _is_email(value: str) -> bool:
+        """检测是否是邮箱地址"""
+        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
+        return bool(re.match(email_pattern, value))
+
+    @staticmethod
+    def _is_url(value: str) -> bool:
+        """检测是否是URL"""
+        url_pattern = r'^https?://[^\s/$.?#].[^\s]*$'
+        return bool(re.match(url_pattern, value, re.IGNORECASE))
+
+    @staticmethod
+    def _is_numeric_string(value: str) -> bool:
+        """检测是否是数值字符串"""
+        try:
+            float(value)
+            return True
+        except ValueError:
+            return False
+
+    @staticmethod
+    def infer_types_from_data(data: List[Dict], sample_size: int = 100) -> Dict[str, str]:
+        """
+        从数据中推断所有列的类型
+
+        :param data: 数据列表
+        :param sample_size: 采样大小,避免检查过多数据
+        """
         if not data:
             return {}
 
         type_map = {}
-
+        type_candidates = {}  # 存储每列的候选类型
+
+        # 采样数据进行类型推断
+        sample_data = data[:sample_size] if len(data) > sample_size else data
+
+        for row in sample_data:
             for col, value in row.items():
                 # 跳过系统列
                 if col.lower() in ['id', 'create_at', 'update_at']:
                     continue
-
-
+
+                if value is not None and str(value).strip():
+                    mysql_type = DataTypeInferrer.infer_mysql_type(value)
+
+                    if col not in type_candidates:
+                        type_candidates[col] = []
+                    type_candidates[col].append(mysql_type)
+
+        # 为每列选择最合适的类型
+        for col, types in type_candidates.items():
+            type_map[col] = DataTypeInferrer._select_best_type(types)
 
-        #
+        # 自动添加系统列类型定义(id列只在新建表时添加)
         type_map['id'] = 'BIGINT'
         type_map['create_at'] = 'TIMESTAMP'
        type_map['update_at'] = 'TIMESTAMP'
 
         return type_map
+
+    @staticmethod
+    def _select_best_type(type_candidates: List[str]) -> str:
+        """
+        从候选类型中选择最佳类型
+
+        优先级:JSON > LONGTEXT > TEXT > VARCHAR > DECIMAL > BIGINT > INT > DATETIME > DATE
+        """
+        if not type_candidates:
+            return 'VARCHAR(255)'
+
+        # 类型优先级映射
+        type_priority = {
+            'JSON': 10,
+            'LONGTEXT': 9,
+            'TEXT': 8,
+            'VARCHAR': 7,
+            'DECIMAL': 6,
+            'BIGINT': 5,
+            'INT': 4,
+            'DATETIME': 3,
+            'DATE': 2,
+            'TINYINT': 1
+        }
+
+        # 找到优先级最高的类型
+        best_type = 'VARCHAR(255)'
+        best_priority = 0
+
+        for candidate in set(type_candidates):
+            # 提取基础类型名
+            base_type = candidate.split('(')[0].upper()
+            priority = type_priority.get(base_type, 0)
+
+            if priority > best_priority:
+                best_priority = priority
+                best_type = candidate
+
+        return best_type
+
+
+# 注册一些常用的自定义类型处理器
+def register_common_type_handlers():
+    """注册常用的自定义类型处理器"""
+
+    def handle_phone_number(value):
+        """处理电话号码"""
+        if isinstance(value, str):
+            # 中国手机号码格式
+            if re.match(r'^1[3-9]\d{9}$', value):
+                return 'VARCHAR(11)'
+            # 国际电话号码格式
+            if re.match(r'^\+?[1-9]\d{1,14}$', value):
+                return 'VARCHAR(20)'
+        return None
+
+    def handle_id_card(value):
+        """处理身份证号"""
+        if isinstance(value, str):
+            # 中国身份证号码
+            if re.match(r'^\d{17}[\dXx]$', value):
+                return 'CHAR(18)'
+        return None
+
+    def handle_json_string(value):
+        """处理JSON字符串"""
+        if isinstance(value, str):
+            try:
+                json.loads(value)
+                return 'JSON'
+            except (ValueError, TypeError):
+                pass
+        return None
+
+    # 注册处理器
+    DataTypeInferrer.register_type_handler('phone', handle_phone_number)
+    DataTypeInferrer.register_type_handler('id_card', handle_id_card)
+    DataTypeInferrer.register_type_handler('json_string', handle_json_string)
+
+# 自动注册常用类型处理器
+register_common_type_handlers()
 
 
 class DataValidator:
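A note on the new type-handler registry: handlers run before the built-in inference chain, and the first one to return a non-None MySQL type wins. A minimal usage sketch against the `register_type_handler` contract shown above; the MAC-address handler and sample values are illustrative, not part of the package:

import re
from mdbq.mysql.uploader import DataTypeInferrer

# A handler receives the raw value and returns a MySQL type string,
# or None to fall through to the remaining handlers and built-in rules.
def handle_mac_address(value):
    if isinstance(value, str) and re.match(r'^([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}$', value):
        return 'CHAR(17)'
    return None

DataTypeInferrer.register_type_handler('mac', handle_mac_address)

print(DataTypeInferrer.infer_mysql_type('00:1A:2B:3C:4D:5E'))  # CHAR(17)
print(DataTypeInferrer.infer_mysql_type('hello'))              # VARCHAR(255), via the default rules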
@@ -376,6 +592,80 @@ class TableManager:
             )
             return bool(cursor.fetchone())
 
+    def get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
+        """获取表的列信息"""
+        db_name = self._sanitize_identifier(db_name)
+        table_name = self._sanitize_identifier(table_name)
+
+        with self.conn_mgr.get_connection() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    SELECT COLUMN_NAME, COLUMN_TYPE
+                    FROM INFORMATION_SCHEMA.COLUMNS
+                    WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
+                """, (db_name, table_name))
+
+                columns = {}
+                for row in cursor.fetchall():
+                    columns[row['COLUMN_NAME']] = row['COLUMN_TYPE']
+                return columns
+
+    def get_table_primary_key(self, db_name: str, table_name: str) -> Optional[str]:
+        """获取表的主键列名"""
+        db_name = self._sanitize_identifier(db_name)
+        table_name = self._sanitize_identifier(table_name)
+
+        with self.conn_mgr.get_connection() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    SELECT COLUMN_NAME
+                    FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
+                    WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
+                    AND CONSTRAINT_NAME = 'PRIMARY'
+                """, (db_name, table_name))
+
+                result = cursor.fetchone()
+                return result['COLUMN_NAME'] if result else None
+
+    def ensure_system_columns(self, db_name: str, table_name: str):
+        """确保表有系统列,如果没有则添加(保持原有主键结构)"""
+        existing_columns = self.get_table_columns(db_name, table_name)
+        existing_primary_key = self.get_table_primary_key(db_name, table_name)
+
+        with self.conn_mgr.get_connection() as conn:
+            with conn.cursor() as cursor:
+                # 只有在表没有主键且没有id列时,才添加id主键
+                if existing_primary_key is None and 'id' not in existing_columns:
+                    cursor.execute(f"""
+                        ALTER TABLE `{db_name}`.`{table_name}`
+                        ADD COLUMN `id` BIGINT NOT NULL AUTO_INCREMENT PRIMARY KEY FIRST
+                    """)
+                    logger.info('自动添加id主键列', {'database': db_name, 'table': table_name})
+                elif existing_primary_key is not None:
+                    logger.debug('表已有主键,保持原有结构', {
+                        'database': db_name,
+                        'table': table_name,
+                        'primary_key': existing_primary_key
+                    })
+
+                # 检查并添加create_at列
+                if 'create_at' not in existing_columns:
+                    cursor.execute(f"""
+                        ALTER TABLE `{db_name}`.`{table_name}`
+                        ADD COLUMN `create_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
+                    """)
+                    logger.info('自动添加create_at列', {'database': db_name, 'table': table_name})
+
+                # 检查并添加update_at列
+                if 'update_at' not in existing_columns:
+                    cursor.execute(f"""
+                        ALTER TABLE `{db_name}`.`{table_name}`
+                        ADD COLUMN `update_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
+                    """)
+                    logger.info('自动添加update_at列', {'database': db_name, 'table': table_name})
+
+                conn.commit()
+
     def create_table(self, db_name: str, table_name: str, columns: Dict[str, str],
                      primary_keys: Optional[List[str]] = None,
                      unique_keys: Optional[List[List[str]]] = None):
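Because `ensure_system_columns` only adds what is missing and never replaces an existing PRIMARY KEY, re-running uploads against legacy tables is safe. A hedged sketch of inspecting the outcome through the uploader's internal `table_mgr`; credentials and table names are placeholders:

from mdbq.mysql.uploader import MySQLUploader

uploader = MySQLUploader(username='user', password='pass', host='localhost')

# After an upload, the table should expose create_at/update_at alongside
# the data columns; id is added only when the table has no primary key.
cols = uploader.table_mgr.get_table_columns('test_db', 'test_table')
print('create_at' in cols, 'update_at' in cols)

uploader.close()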
@@ -479,24 +769,49 @@ class DataProcessor:
     """数据处理器"""
 
     @staticmethod
-    def normalize_data(data: Union[Dict, List[Dict], pd.DataFrame]
-
+    def normalize_data(data: Union[Dict, List[Dict], pd.DataFrame],
+                       chunk_size: int = 5000,
+                       memory_limit_mb: int = 100) -> Iterator[List[Dict]]:
+        """
+        标准化数据格式为分块迭代器
+
+        :param data: 输入数据
+        :param chunk_size: 每个chunk的大小
+        :param memory_limit_mb: 内存限制(MB),超过时自动调整chunk_size
+        """
+        # 动态调整chunk_size基于可用内存
+        available_memory_mb = psutil.virtual_memory().available / 1024 / 1024
+        if available_memory_mb < memory_limit_mb * 2:
+            chunk_size = min(chunk_size, 1000)  # 内存紧张时减小chunk
+
         if isinstance(data, pd.DataFrame):
-
+            # 对于大DataFrame,使用更高效的分块方式
+            if len(data) > 50000:
+                # 大数据集使用pandas的分块读取
+                for chunk in pd.read_csv(io.StringIO(data.to_csv(index=False)), chunksize=chunk_size):
+                    yield chunk.to_dict('records')
+            else:
+                for i in range(0, len(data), chunk_size):
+                    chunk = data.iloc[i:i + chunk_size]
+                    yield chunk.to_dict('records')
         elif isinstance(data, dict):
-
-        elif isinstance(data, list)
-
+            yield [data]
+        elif isinstance(data, list):
+            if all(isinstance(item, dict) for item in data):
+                for i in range(0, len(data), chunk_size):
+                    yield data[i:i + chunk_size]
+            else:
+                raise ValueError("列表中必须全部是字典")
         else:
             raise ValueError("数据格式必须是字典、字典列表或DataFrame")
 
     @staticmethod
-    def prepare_data_for_insert(
+    def prepare_data_for_insert(data_chunk: List[Dict], set_typ: Dict[str, str],
                                 allow_null: bool = False) -> List[Dict]:
         """准备插入数据"""
         prepared_data = []
 
-        for row_idx, row in enumerate(
+        for row_idx, row in enumerate(data_chunk, 1):
             prepared_row = {}
 
             for col_name, col_type in set_typ.items():
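`normalize_data` is now a generator of `List[Dict]` chunks rather than a single list, so callers iterate it lazily and at most one chunk is materialized at a time. A small sketch of consuming it directly; the frame and chunk size are illustrative:

import pandas as pd
from mdbq.mysql.uploader import DataProcessor

df = pd.DataFrame({'name': ['a', 'b', 'c'], 'qty': [1, 2, 3]})

# Each chunk arrives as a list of row dicts, ready for type inference
# and batch insertion.
for chunk in DataProcessor.normalize_data(df, chunk_size=2):
    print(len(chunk), chunk[0])
# 2 {'name': 'a', 'qty': 1}
# 1 {'name': 'c', 'qty': 3}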
@@ -523,13 +838,13 @@ class DataProcessor:
         return prepared_data
 
     @staticmethod
-    def partition_data_by_date(
+    def partition_data_by_date(data_chunk: List[Dict], date_column: str,
                                partition_by: str) -> Dict[str, List[Dict]]:
-        """
+        """按日期分区数据块"""
         partitioned = {}
         table_manager = TableManager(None, None)  # 只用静态方法
 
-        for row in
+        for row in data_chunk:
             if date_column not in row:
                 logger.warning('缺少分区日期列', {'列名': date_column, '行数据': row})
                 continue
@@ -583,34 +898,59 @@ class DataInserter:
     def _execute_batch_insert(self, sql: str, data: List[Dict],
                               columns: List[str]) -> Tuple[int, int, int]:
         """执行批量插入"""
-
+        # 动态调整批次大小
+        estimated_row_size = len(str(data[0])) if data else 100
+        max_packet_size = 16 * 1024 * 1024  # 16MB MySQL默认限制
+        optimal_batch_size = min(
+            max_packet_size // (estimated_row_size * len(columns)),
+            2000,  # 最大批次
+            len(data)
+        )
+        batch_size = max(100, optimal_batch_size)  # 最小100条
+
         total_inserted = 0
         total_skipped = 0
         total_failed = 0
 
         with self.conn_mgr.get_connection() as conn:
             with conn.cursor() as cursor:
-
-
-
-
-
-
-
+                # 预处理所有数据,减少循环中的处理开销
+                all_values = []
+                for row in data:
+                    values = [self._ensure_basic_type(row.get(col)) for col in columns]
+                    all_values.append(values)
+
+                # 分批处理,使用更大的事务批次
+                transaction_size = min(5000, len(all_values))  # 每个事务处理的记录数
+
+                for tx_start in range(0, len(all_values), transaction_size):
+                    tx_end = min(tx_start + transaction_size, len(all_values))
+                    tx_values = all_values[tx_start:tx_end]
 
                     try:
-
+                        # 开始事务
+                        conn.begin()
+
+                        # 在事务内分批执行,成功后直接累加
+                        for i in range(0, len(tx_values), batch_size):
+                            batch_values = tx_values[i:i + batch_size]
+
+                            try:
+                                cursor.executemany(sql, batch_values)
+                                total_inserted += len(batch_values)
+                            except pymysql.err.IntegrityError:
+                                total_skipped += len(batch_values)
+                                logger.debug('批量插入唯一约束冲突,跳过', {'批次大小': len(batch_values)})
+                            except Exception as e:
+                                logger.error('批量插入失败', {'错误': str(e), '批次大小': len(batch_values)})
+                                raise
+
                         conn.commit()
-
-                        total_inserted += affected
-                    except pymysql.err.IntegrityError:
-                        conn.rollback()
-                        total_skipped += len(batch)
-                        logger.debug('批量插入唯一约束冲突,跳过', {'批次大小': len(batch)})
+
                     except Exception as e:
                         conn.rollback()
-
-
+                        logger.error('事务执行失败,已回滚', {'错误': str(e)})
+                        total_failed += len(tx_values)
 
         return total_inserted, total_skipped, total_failed
 
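The batch sizing above is a conservative heuristic against MySQL's default 16 MB max_allowed_packet: divide the packet budget by an estimated per-row footprint scaled by column count, then clamp to the range [100, 2000]. A worked instance of the same arithmetic, with illustrative sizes:

# A row whose str() form is ~500 bytes, 10 columns, 100k rows to insert.
estimated_row_size = 500
max_packet_size = 16 * 1024 * 1024   # MySQL default max_allowed_packet
optimal_batch_size = min(
    max_packet_size // (estimated_row_size * 10),  # 3355
    2000,       # hard ceiling per executemany() call
    100_000,    # len(data)
)
batch_size = max(100, optimal_batch_size)
print(batch_size)  # 2000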
@@ -655,13 +995,13 @@ def retry_on_failure(max_retries: int = 3, delay: int = 1):
 
 class MySQLUploader:
     """
-    MySQL数据上传器
+    MySQL数据上传器
 
     特性:
     - 自动为每个表添加id(BIGINT自增主键)、create_at、update_at时间戳列
     - 支持自动建表、分表、数据类型推断
     - 高可用连接池管理和重试机制
-    -
+    - 流式批量插入优化
     """
 
     def __init__(self, username: str, password: str, host: str = 'localhost',
@@ -705,7 +1045,7 @@ class MySQLUploader:
                     partition_by: Optional[str] = None,
                     partition_date_column: str = '日期',
                     update_on_duplicate: bool = False,
-                    unique_keys: Optional[List[List[str]]] = None) ->
+                    unique_keys: Optional[List[List[str]]] = None) -> Dict[str, Any]:
         """
         上传数据到MySQL数据库
 
@@ -714,120 +1054,256 @@ class MySQLUploader:
         - create_at: 创建时间戳(插入时自动设置)
         - update_at: 更新时间戳(插入和更新时自动设置)
 
-        :param db_name:
-        :param table_name:
-        :param data:
+        :param db_name: 数据库名
+        :param table_name: 表名
+        :param data: 要上传的数据,支持字典、字典列表、DataFrame
         :param set_typ: 列类型定义,如果为None则自动推断(无需包含系统列)
         :param allow_null: 是否允许空值
         :param partition_by: 分表方式('year'或'month')
         :param partition_date_column: 分表日期列名
         :param update_on_duplicate: 遇到重复数据时是否更新
         :param unique_keys: 唯一约束列表(无需包含系统列)
-        :return:
+        :return: 上传结果详情
         """
         db_name = db_name.lower()
         table_name = table_name.lower()
+
+        result = {
+            'success': False,
+            'inserted_rows': 0,
+            'skipped_rows': 0,
+            'failed_rows': 0,
+            'tables_created': []
+        }
+
         try:
-
+            # 计算原始数据大小
+            original_data_size = 0
+            if isinstance(data, (pd.DataFrame, list)):
+                original_data_size = len(data)
+            elif isinstance(data, dict):
+                original_data_size = 1
 
-            #
+            # 标准化数据为流式迭代器
             normalized_data = DataProcessor.normalize_data(data)
-            if not normalized_data:
-                logger.warning('数据为空,跳过上传')
-                return True
 
             # 推断或验证列类型
             if set_typ is None:
-
-
+                # 取第一个chunk进行类型推断
+                first_chunk = next(iter(normalized_data))
+                set_typ = DataTypeInferrer.infer_types_from_data(first_chunk)
+                # 重新创建迭代器
+                normalized_data = DataProcessor.normalize_data(data)
+                logger.debug('自动推断数据类型', {'类型映射': set_typ})
 
             # 确保数据库存在
             self.table_mgr.ensure_database_exists(db_name)
 
             # 处理分表逻辑
             if partition_by:
-
+                upload_result = self._handle_partitioned_upload(
                     db_name, table_name, normalized_data, set_typ,
                     partition_by, partition_date_column, allow_null,
                     update_on_duplicate, unique_keys
                 )
             else:
-
+                upload_result = self._handle_single_table_upload(
                     db_name, table_name, normalized_data, set_typ,
                     allow_null, update_on_duplicate, unique_keys
                 )
-
+
+            # 合并结果
+            result.update(upload_result)
+            result['success'] = upload_result.get('failed_rows', 0) == 0
+
         except Exception as e:
             logger.error('数据上传失败', {
                 '数据库': db_name,
                 '表名': table_name,
                 '错误': str(e)
             })
-
+            result['success'] = False
+
+        return result
 
     def _handle_single_table_upload(self, db_name: str, table_name: str,
-                                    data: List[Dict],
+                                    data: Iterator[List[Dict]],
+                                    set_typ: Dict[str, str],
                                     allow_null: bool, update_on_duplicate: bool,
-                                    unique_keys: Optional[List[List[str]]]) ->
+                                    unique_keys: Optional[List[List[str]]]) -> Dict[str, Any]:
         """处理单表上传"""
+        result = {
+            'inserted_rows': 0,
+            'skipped_rows': 0,
+            'failed_rows': 0,
+            'tables_created': []
+        }
+
         # 确保表存在
         if not self.table_mgr.table_exists(db_name, table_name):
             self.table_mgr.create_table(db_name, table_name, set_typ,
                                         unique_keys=unique_keys)
+            result['tables_created'].append(f"{db_name}.{table_name}")
+        else:
+            # 表已存在,确保有时间戳列(但保持原有主键结构)
+            self.table_mgr.ensure_system_columns(db_name, table_name)
 
-        #
-
-
-
-
-
-
-
+        # 流式处理每个数据块
+        for chunk in data:
+            if not chunk:
+                continue
+
+            prepared_chunk = DataProcessor.prepare_data_for_insert(
+                chunk, set_typ, allow_null
+            )
+
+            inserted, skipped, failed = self.data_inserter.insert_data(
+                db_name, table_name, prepared_chunk, set_typ, update_on_duplicate
+            )
+
+            result['inserted_rows'] += inserted
+            result['skipped_rows'] += skipped
+            result['failed_rows'] += failed
 
         logger.info('单表上传完成', {
             '数据库': db_name,
             '表名': table_name,
-            '
-            '
-            '
-            '失败': failed
+            '插入': result['inserted_rows'],
+            '跳过': result['skipped_rows'],
+            '失败': result['failed_rows']
         })
 
-        return
+        return result
 
     def _handle_partitioned_upload(self, db_name: str, base_table_name: str,
-                                   data: List[Dict],
+                                   data: Iterator[List[Dict]],
+                                   set_typ: Dict[str, str],
                                    partition_by: str, partition_date_column: str,
                                    allow_null: bool, update_on_duplicate: bool,
-                                   unique_keys: Optional[List[List[str]]]) ->
+                                   unique_keys: Optional[List[List[str]]]) -> Dict[str, Any]:
         """处理分表上传"""
-
-
-
+        result = {
+            'inserted_rows': 0,
+            'skipped_rows': 0,
+            'failed_rows': 0,
+            'tables_created': []
+        }
 
-
+        # 使用更小的缓冲区,更频繁地刷新
+        partition_buffers = {}
+        buffer_limit = 1000  # 减小缓冲区大小
 
-
-
+        # 记录已创建的表,避免重复检查
+        created_tables = set()
+
+        for chunk in data:
+            if not chunk:
+                continue
 
-
-
-
+            # 按日期分区当前chunk
+            partitioned_chunk = DataProcessor.partition_data_by_date(
+                chunk, partition_date_column, partition_by
             )
 
-
-
+            # 将数据添加到对应分区缓冲区
+            for partition_suffix, partition_data in partitioned_chunk.items():
+                if partition_suffix not in partition_buffers:
+                    partition_buffers[partition_suffix] = []
+                partition_buffers[partition_suffix].extend(partition_data)
+
+                # 更频繁地刷新缓冲区
+                if len(partition_buffers[partition_suffix]) >= buffer_limit:
+                    partition_result = self._process_partition_buffer_optimized(
+                        db_name, base_table_name, partition_suffix,
+                        partition_buffers[partition_suffix], set_typ,
+                        allow_null, update_on_duplicate, unique_keys, created_tables
+                    )
+                    self._merge_partition_result(result, partition_result)
+                    partition_buffers[partition_suffix] = []  # 清空缓冲区
+
+            # 定期检查所有缓冲区,防止某些分区数据积累过多
+            total_buffered = sum(len(buffer) for buffer in partition_buffers.values())
+            if total_buffered > 5000:  # 总缓冲超过5000条时强制刷新
+                for partition_suffix in list(partition_buffers.keys()):
+                    if partition_buffers[partition_suffix]:
+                        partition_result = self._process_partition_buffer_optimized(
+                            db_name, base_table_name, partition_suffix,
+                            partition_buffers[partition_suffix], set_typ,
+                            allow_null, update_on_duplicate, unique_keys, created_tables
+                        )
+                        self._merge_partition_result(result, partition_result)
+                        partition_buffers[partition_suffix] = []
+
+        # 处理剩余的缓冲区数据
+        for partition_suffix, buffer_data in partition_buffers.items():
+            if buffer_data:
+                partition_result = self._process_partition_buffer_optimized(
+                    db_name, base_table_name, partition_suffix,
+                    buffer_data, set_typ, allow_null, update_on_duplicate, unique_keys, created_tables
+                )
+                self._merge_partition_result(result, partition_result)
 
         logger.info('分表上传完成', {
             '数据库': db_name,
             '基础表名': base_table_name,
-            '分区数': len(
-            '
+            '分区数': len(created_tables),
+            '插入': result['inserted_rows'],
+            '跳过': result['skipped_rows'],
+            '失败': result['failed_rows']
         })
 
-        return
+        return result
+
+    def _process_partition_buffer_optimized(self, db_name: str, base_table_name: str,
+                                            partition_suffix: str, partition_data: List[Dict],
+                                            set_typ: Dict[str, str], allow_null: bool,
+                                            update_on_duplicate: bool,
+                                            unique_keys: Optional[List[List[str]]],
+                                            created_tables: set) -> Dict[str, Any]:
+        """处理单个分区的缓冲数据"""
+        partition_table_name = f"{base_table_name}_{partition_suffix}"
+
+        result = {
+            'inserted_rows': 0,
+            'skipped_rows': 0,
+            'failed_rows': 0,
+            'tables_created': []
+        }
+
+        # 优化表存在性检查
+        table_key = f"{db_name}.{partition_table_name}"
+        if table_key not in created_tables:
+            if not self.table_mgr.table_exists(db_name, partition_table_name):
+                self.table_mgr.create_table(db_name, partition_table_name, set_typ,
+                                            unique_keys=unique_keys)
+                result['tables_created'].append(table_key)
+            else:
+                # 表已存在,确保有时间戳列(但保持原有主键结构)
+                self.table_mgr.ensure_system_columns(db_name, partition_table_name)
+            created_tables.add(table_key)
+
+        # 准备并插入数据
+        prepared_data = DataProcessor.prepare_data_for_insert(
+            partition_data, set_typ, allow_null
+        )
+
+        inserted, skipped, failed = self.data_inserter.insert_data(
+            db_name, partition_table_name, prepared_data, set_typ, update_on_duplicate
+        )
+
+        result['inserted_rows'] = inserted
+        result['skipped_rows'] = skipped
+        result['failed_rows'] = failed
+
+        return result
+
+    def _merge_partition_result(self, main_result: Dict[str, Any],
+                                partition_result: Dict[str, Any]):
+        """合并分区处理结果"""
+        main_result['inserted_rows'] += partition_result['inserted_rows']
+        main_result['skipped_rows'] += partition_result['skipped_rows']
+        main_result['failed_rows'] += partition_result['failed_rows']
+        main_result['tables_created'].extend(partition_result['tables_created'])
 
     def close(self):
         """关闭连接"""
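With `upload_data` now returning a result dict instead of a bare boolean, callers can branch on per-row outcomes. A short sketch; credentials, names and data are placeholders:

from mdbq.mysql.uploader import MySQLUploader

uploader = MySQLUploader(username='user', password='pass', host='localhost')

result = uploader.upload_data(
    db_name='demo_db',
    table_name='orders',
    data=[{'name': 'Alice', 'qty': 2, '日期': '2023-01-01'}],
    set_typ=None,               # None triggers sampling-based type inference
    update_on_duplicate=True,
    unique_keys=[['name', '日期']],
)

# Keys returned per this diff: success, inserted_rows, skipped_rows,
# failed_rows, tables_created.
if not result['success']:
    print('failed rows:', result['failed_rows'])
print('created:', result['tables_created'])

uploader.close()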
@@ -846,6 +1322,165 @@ class MySQLUploader:
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
 
+    def upload_data_concurrent(self, db_name: str, table_name: str,
+                               data: Union[Dict, List[Dict], pd.DataFrame],
+                               set_typ: Optional[Dict[str, str]] = None,
+                               allow_null: bool = False,
+                               partition_by: Optional[str] = None,
+                               partition_date_column: str = '日期',
+                               update_on_duplicate: bool = False,
+                               unique_keys: Optional[List[List[str]]] = None,
+                               max_workers: int = 3) -> Dict[str, Any]:
+        """
+        并发上传数据到MySQL数据库
+
+        :param max_workers: 最大并发工作线程数
+        :return: 上传结果详情
+        """
+        db_name = db_name.lower()
+        table_name = table_name.lower()
+
+        result = {
+            'success': False,
+            'inserted_rows': 0,
+            'skipped_rows': 0,
+            'failed_rows': 0,
+            'tables_created': []
+        }
+
+        try:
+            # 标准化数据为流式迭代器
+            normalized_data = DataProcessor.normalize_data(data, chunk_size=2000)  # 更小的chunk用于并发
+
+            # 推断或验证列类型
+            if set_typ is None:
+                first_chunk = next(iter(normalized_data))
+                set_typ = DataTypeInferrer.infer_types_from_data(first_chunk)
+                normalized_data = DataProcessor.normalize_data(data, chunk_size=2000)
+                logger.debug('自动推断数据类型', {'类型映射': set_typ})
+
+            # 确保数据库存在
+            self.table_mgr.ensure_database_exists(db_name)
+
+            # 创建线程锁用于表创建的线程安全
+            table_creation_lock = threading.Lock()
+            created_tables_set = set()
+
+            def process_chunk_worker(chunk_data):
+                """工作线程函数"""
+                try:
+                    if partition_by:
+                        # 分表处理
+                        partitioned_chunk = DataProcessor.partition_data_by_date(
+                            chunk_data, partition_date_column, partition_by
+                        )
+
+                        chunk_result = {
+                            'inserted_rows': 0,
+                            'skipped_rows': 0,
+                            'failed_rows': 0,
+                            'tables_created': []
+                        }
+
+                        for partition_suffix, partition_data in partitioned_chunk.items():
+                            partition_table_name = f"{table_name}_{partition_suffix}"
+                            table_key = f"{db_name}.{partition_table_name}"
+
+                            # 确保表存在(线程安全)
+                            with table_creation_lock:
+                                if table_key not in created_tables_set:
+                                    if not self.table_mgr.table_exists(db_name, partition_table_name):
+                                        self.table_mgr.create_table(db_name, partition_table_name, set_typ,
+                                                                    unique_keys=unique_keys)
+                                        chunk_result['tables_created'].append(table_key)
+                                    else:
+                                        self.table_mgr.ensure_system_columns(db_name, partition_table_name)
+                                    created_tables_set.add(table_key)
+
+                            # 准备并插入数据
+                            prepared_data = DataProcessor.prepare_data_for_insert(
+                                partition_data, set_typ, allow_null
+                            )
+
+                            inserted, skipped, failed = self.data_inserter.insert_data(
+                                db_name, partition_table_name, prepared_data, set_typ, update_on_duplicate
+                            )
+
+                            chunk_result['inserted_rows'] += inserted
+                            chunk_result['skipped_rows'] += skipped
+                            chunk_result['failed_rows'] += failed
+                    else:
+                        # 单表处理
+                        table_key = f"{db_name}.{table_name}"
+                        with table_creation_lock:
+                            if table_key not in created_tables_set:
+                                if not self.table_mgr.table_exists(db_name, table_name):
+                                    self.table_mgr.create_table(db_name, table_name, set_typ,
+                                                                unique_keys=unique_keys)
+                                    chunk_result = {'tables_created': [table_key]}
+                                else:
+                                    self.table_mgr.ensure_system_columns(db_name, table_name)
+                                    chunk_result = {'tables_created': []}
+                                created_tables_set.add(table_key)
+                            else:
+                                chunk_result = {'tables_created': []}
+
+                        prepared_chunk = DataProcessor.prepare_data_for_insert(
+                            chunk_data, set_typ, allow_null
+                        )
+
+                        inserted, skipped, failed = self.data_inserter.insert_data(
+                            db_name, table_name, prepared_chunk, set_typ, update_on_duplicate
+                        )
+
+                        chunk_result.update({
+                            'inserted_rows': inserted,
+                            'skipped_rows': skipped,
+                            'failed_rows': failed
+                        })
+
+                    return chunk_result
+
+                except Exception as e:
+                    logger.error('并发处理chunk失败', {'错误': str(e)})
+                    return {
+                        'inserted_rows': 0,
+                        'skipped_rows': 0,
+                        'failed_rows': len(chunk_data) if chunk_data else 0,
+                        'tables_created': []
+                    }
+
+            # 使用线程池执行并发处理
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # 提交所有任务
+                future_to_chunk = {}
+                for chunk in normalized_data:
+                    if chunk:
+                        future = executor.submit(process_chunk_worker, chunk)
+                        future_to_chunk[future] = len(chunk)
+
+                # 收集结果
+                for future in concurrent.futures.as_completed(future_to_chunk):
+                    chunk_result = future.result()
+                    result['inserted_rows'] += chunk_result['inserted_rows']
+                    result['skipped_rows'] += chunk_result['skipped_rows']
+                    result['failed_rows'] += chunk_result['failed_rows']
+                    result['tables_created'].extend(chunk_result['tables_created'])
+
+            # 去重tables_created
+            result['tables_created'] = list(set(result['tables_created']))
+            result['success'] = result['failed_rows'] == 0
+
+        except Exception as e:
+            logger.error('并发数据上传失败', {
+                '数据库': db_name,
+                '表名': table_name,
+                '错误': str(e)
+            })
+            result['success'] = False
+
+        return result
+
 
 # 使用示例
 if __name__ == '__main__':
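The concurrent variant fans the normalized 2000-row chunks out to a thread pool, so insertion order across chunks is not preserved; it pairs naturally with `unique_keys` plus `update_on_duplicate`. A hedged usage sketch with placeholder connection details:

import pandas as pd
from mdbq.mysql.uploader import MySQLUploader

uploader = MySQLUploader(username='user', password='pass', host='localhost')

big_df = pd.DataFrame({
    'name': [f'u{i}' for i in range(10_000)],
    '日期': ['2023-01-01'] * 10_000,
})

# Chunks are split internally; each is inserted by one of max_workers threads.
result = uploader.upload_data_concurrent(
    db_name='demo_db',
    table_name='users',
    data=big_df,
    update_on_duplicate=True,
    unique_keys=[['name', '日期']],
    max_workers=3,
)
print(result['inserted_rows'], result['failed_rows'])

uploader.close()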
@@ -863,24 +1498,14 @@ if __name__ == '__main__':
         {'name': 'Bob', 'age': 30, 'salary': 60000.0, '日期': '2023-01-02'},
     ]
 
-    #
-    column_types = {
-        'name': 'VARCHAR(255)',
-        'age': 'INT',
-        'salary': 'DECIMAL(10,2)',
-        '日期': 'DATE'
-    }
-
-    # 上传数据
-    success = uploader.upload_data(
+    # 上传数据(自动推断类型,流式处理)
+    result = uploader.upload_data(
         db_name='test_db',
         table_name='test_table',
         data=sample_data,
-        set_typ=column_types,
-        allow_null=False,
         update_on_duplicate=True,
         unique_keys=[['name', '日期']]
     )
 
     uploader.close()
-    print(f"上传结果: {success}")
+    print(f"上传结果: {result}")
mdbq/other/download_sku_picture.py
CHANGED

@@ -820,18 +820,11 @@ def main(service_name, database):
             db_name='属性设置2',
             table_name='天猫商品sku信息',
             data=s.df,
-            set_typ=
-
-
-            update_on_duplicate=
-
-            allow_null=False,  # 允许插入空值
-            partition_by=None,  # 按年/月分表
-            partition_date_column='日期',  # 用于分表的日期列名,默认为'日期'
-            auto_create=True,  # 表不存在时自动创建, 默认参数不要更改
-            indexes=[],  # 指定索引列
-            transaction_mode='row',  # 事务模式
-            unique_keys=[[]],  # 唯一约束列表
+            set_typ=None,
+            allow_null=False,
+            partition_by=None,
+            update_on_duplicate=True,
+            unique_keys=None,
         )
 
 
@@ -907,18 +900,11 @@ def download_sku(service_name='company', database='mysql', db_name='属性设置
             db_name=table_name,
             table_name=table_name,
             data=s.df,
-            set_typ=
-
-
-            update_on_duplicate=
-
-            allow_null=False,  # 允许插入空值
-            partition_by=None,  # 按年/月分表
-            partition_date_column='日期',  # 用于分表的日期列名,默认为'日期'
-            auto_create=True,  # 表不存在时自动创建, 默认参数不要更改
-            indexes=[],  # 指定索引列
-            transaction_mode='row',  # 事务模式
-            unique_keys=[[]],  # 唯一约束列表
+            set_typ=None,
+            allow_null=False,
+            partition_by=None,
+            update_on_duplicate=True,
+            unique_keys=None,
         )
 
         # 从数据库中读取数据,并下载素材到本地
@@ -954,18 +940,11 @@ def download_sku(service_name='company', database='mysql', db_name='属性设置
             db_name=db_name,
             table_name=table_name,
             data=df,
-            set_typ=
-
-
-            update_on_duplicate=
-
-            allow_null=False,  # 允许插入空值
-            partition_by=None,  # 按年/月分表
-            partition_date_column='日期',  # 用于分表的日期列名,默认为'日期'
-            auto_create=True,  # 表不存在时自动创建, 默认参数不要更改
-            indexes=[],  # 指定索引列
-            transaction_mode='row',  # 事务模式
-            unique_keys=[[]],  # 唯一约束列表
+            set_typ=None,
+            allow_null=False,
+            partition_by=None,
+            update_on_duplicate=True,
+            unique_keys=None,
         )
 
{mdbq-4.2.0.dist-info → mdbq-4.2.2.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=gNDA6f7PmXcbqB0lTY4HIgD6dEB6SGywjhHa3HAyczA,17
 mdbq/auth/__init__.py,sha256=pnPMAt63sh1B6kEvmutUuro46zVf2v2YDAG7q-jV_To,24
 mdbq/auth/auth_backend.py,sha256=iLN7AqiSq7fQgFtNtge_TIlVOR1hrCSZXH6oId6uGX4,116924
 mdbq/auth/crypto.py,sha256=fcZRFCnrKVVdWDUx_zds51ynFYwS9DBvJOrRQVldrfM,15931
@@ -15,9 +15,9 @@ mdbq/mysql/deduplicator.py,sha256=tzLIm9K9S0lGLlVTI0dDQVYpWX796XCuyufmw1lU26Y,73
 mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
 mdbq/mysql/s_query.py,sha256=N2xHJf2CiUXjXIVBemdst-wamIP3908EGAJOFG13fCU,50475
 mdbq/mysql/unique_.py,sha256=MaztT-WIyEQUs-OOYY4pFulgHVcXR1BfCy3QUz0XM_U,21127
-mdbq/mysql/uploader.py,sha256=
+mdbq/mysql/uploader.py,sha256=2inrXu3PIlvowfm5_0U4Trx_mraApjII8g_5ycFbNJ0,60059
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
-mdbq/other/download_sku_picture.py,sha256=
+mdbq/other/download_sku_picture.py,sha256=MJX47I9jTUMFzO1kyEH-onIzAGa6QpgfmghrmyYnEsc,45111
 mdbq/other/error_handler.py,sha256=4p5haAXSY-P78stp4Xwo_MwAngWYqyKj5ogWIuYXMeY,12631
 mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
 mdbq/other/pov_city.py,sha256=AEOmCOzOwyjHi9LLZWPKi6DUuSC-_M163664I52u9qw,21050
@@ -35,7 +35,7 @@ mdbq/route/routes.py,sha256=QVGfTvDgu0CpcKCvk1ra74H8uojgqTLUav1fnVAqLEA,29433
 mdbq/selenium/__init__.py,sha256=AKzeEceqZyvqn2dEDoJSzDQnbuENkJSHAlbHAD0u0ZI,10
 mdbq/selenium/get_driver.py,sha256=1NTlVUE6QsyjTrVVVqTO2LOnYf578ccFWlWnvIXGtic,20903
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq-4.2.
-mdbq-4.2.
-mdbq-4.2.
-mdbq-4.2.
+mdbq-4.2.2.dist-info/METADATA,sha256=vfhvk7DXQ267-NOPdqKJ_AWCWSEbWKdDjIf7bilbCXo,363
+mdbq-4.2.2.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-4.2.2.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-4.2.2.dist-info/RECORD,,

{mdbq-4.2.0.dist-info → mdbq-4.2.2.dist-info}/WHEEL
File without changes

{mdbq-4.2.0.dist-info → mdbq-4.2.2.dist-info}/top_level.txt
File without changes