crawlo-1.4.5-py3-none-any.whl → crawlo-1.4.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__version__.py +1 -1
- crawlo/downloader/cffi_downloader.py +3 -1
- crawlo/middleware/proxy.py +171 -348
- crawlo/pipelines/mysql_pipeline.py +339 -188
- crawlo/settings/default_settings.py +38 -30
- crawlo/stats_collector.py +10 -1
- crawlo/templates/project/settings.py.tmpl +10 -55
- crawlo/templates/project/settings_distributed.py.tmpl +20 -22
- crawlo/templates/project/settings_gentle.py.tmpl +5 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
- crawlo/templates/project/settings_minimal.py.tmpl +25 -1
- crawlo/templates/project/settings_simple.py.tmpl +5 -0
- crawlo/templates/run.py.tmpl +1 -8
- crawlo/templates/spider/spider.py.tmpl +5 -108
- crawlo/utils/db_helper.py +11 -5
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/METADATA +1 -1
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/RECORD +43 -29
- tests/authenticated_proxy_example.py +10 -6
- tests/explain_mysql_update_behavior.py +77 -0
- tests/simulate_mysql_update_test.py +140 -0
- tests/test_asyncmy_usage.py +57 -0
- tests/test_crawlo_proxy_integration.py +8 -2
- tests/test_downloader_proxy_compatibility.py +24 -20
- tests/test_mysql_pipeline_config.py +165 -0
- tests/test_mysql_pipeline_error.py +99 -0
- tests/test_mysql_pipeline_init_log.py +83 -0
- tests/test_mysql_pipeline_integration.py +133 -0
- tests/test_mysql_pipeline_refactor.py +144 -0
- tests/test_mysql_pipeline_refactor_simple.py +86 -0
- tests/test_mysql_pipeline_robustness.py +196 -0
- tests/test_mysql_pipeline_types.py +89 -0
- tests/test_mysql_update_columns.py +94 -0
- tests/test_proxy_middleware.py +104 -8
- tests/test_proxy_middleware_enhanced.py +1 -5
- tests/test_proxy_middleware_integration.py +7 -2
- tests/test_proxy_middleware_refactored.py +25 -2
- tests/test_proxy_only.py +84 -0
- tests/test_proxy_with_downloader.py +153 -0
- tests/test_real_scenario_proxy.py +17 -17
- tests/verify_mysql_warnings.py +110 -0
- crawlo/middleware/simple_proxy.py +0 -65
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
crawlo/pipelines/mysql_pipeline.py

@@ -2,20 +2,28 @@
 import asyncio
 import aiomysql
 from asyncmy import create_pool
-from typing import Optional, List, Dict
+from typing import Optional, List, Dict, Any
+from abc import ABC, abstractmethod
+import async_timeout

+from crawlo.items import Item
 from crawlo.exceptions import ItemDiscard
 from crawlo.utils.db_helper import SQLBuilder
 from crawlo.utils.log import get_logger
 from . import BasePipeline


-class
+class BaseMySQLPipeline(BasePipeline, ABC):
+    """Base class for MySQL pipelines, encapsulating the shared functionality"""
+
     def __init__(self, crawler):
         self.crawler = crawler
         self.settings = crawler.settings
         self.logger = get_logger(self.__class__.__name__, self.settings.get('LOG_LEVEL'))

+        # Log pipeline initialization
+        self.logger.info(f"MySQL pipeline initialized: {self.__class__.__name__}")
+
         # Use an async lock and an init flag to ensure thread safety
         self._pool_lock = asyncio.Lock()
         self._pool_initialized = False
@@ -30,46 +38,35 @@ class AsyncmyMySQLPipeline:
             spider_table_name or
             self.settings.get('MYSQL_TABLE') or
             getattr(crawler.spider, 'mysql_table', None) or
-            f"{crawler.spider
+            f"{getattr(crawler.spider, 'name', 'default')}_items"
         )
-
+
+        # Validate that the table name is usable
+        if not self.table_name or not isinstance(self.table_name, str):
+            raise ValueError(f"Invalid table name: {self.table_name}. Table name must be a non-empty string.")
+
+        # Sanitize the table name, replacing characters that may be invalid
+        self.table_name = self.table_name.strip().replace(' ', '_').replace('-', '_')
+
         # Batch insert configuration
-        self.batch_size = self.settings.get_int('MYSQL_BATCH_SIZE', 100)
+        self.batch_size = max(1, self.settings.get_int('MYSQL_BATCH_SIZE', 100))  # Ensure it is at least 1
         self.use_batch = self.settings.get_bool('MYSQL_USE_BATCH', False)
         self.batch_buffer: List[Dict] = []  # Batch buffer

+        # SQL generation configuration
+        self.auto_update = self.settings.get_bool('MYSQL_AUTO_UPDATE', False)
+        self.insert_ignore = self.settings.get_bool('MYSQL_INSERT_IGNORE', False)
+        self.update_columns = self.settings.get('MYSQL_UPDATE_COLUMNS', ())
+
+        # Validate that update_columns is a tuple or list
+        if self.update_columns and not isinstance(self.update_columns, (tuple, list)):
+            self.logger.warning(f"update_columns should be a tuple or list, got {type(self.update_columns)}. Converting to tuple.")
+            self.update_columns = (self.update_columns,)
+
         # Register the shutdown event
         crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')

-
-    def from_crawler(cls, crawler):
-        return cls(crawler)
-
-    async def _ensure_pool(self):
-        """Ensure the connection pool is initialized (thread-safe)"""
-        if self._pool_initialized:
-            return
-
-        async with self._pool_lock:
-            if not self._pool_initialized:  # Double-check to avoid race conditions
-                try:
-                    self.pool = await create_pool(
-                        host=self.settings.get('MYSQL_HOST', 'localhost'),
-                        port=self.settings.get_int('MYSQL_PORT', 3306),
-                        user=self.settings.get('MYSQL_USER', 'root'),
-                        password=self.settings.get('MYSQL_PASSWORD', ''),
-                        db=self.settings.get('MYSQL_DB', 'scrapy_db'),
-                        minsize=self.settings.get_int('MYSQL_POOL_MIN', 3),
-                        maxsize=self.settings.get_int('MYSQL_POOL_MAX', 10),
-                        echo=self.settings.get_bool('MYSQL_ECHO', False)
-                    )
-                    self._pool_initialized = True
-                    self.logger.debug(f"MySQL connection pool initialized (table: {self.table_name})")
-                except Exception as e:
-                    self.logger.error(f"Failed to initialize the MySQL connection pool: {e}")
-                    raise
-
-    async def process_item(self, item, spider, kwargs=None) -> Optional[dict]:
+    async def process_item(self, item: Item, spider, kwargs: Dict[str, Any] = None) -> Item:
         """Core method for processing an item"""
         kwargs = kwargs or {}
         spider_name = getattr(spider, 'name', 'unknown')  # Get the spider name
@@ -87,8 +84,13 @@ class AsyncmyMySQLPipeline:
             # Single-insert logic
             try:
                 await self._ensure_pool()
+
+                # Check that the connection pool is valid
+                if not self._pool_initialized or not self.pool:
+                    raise RuntimeError("Database connection pool is not initialized or invalid")
+
                 item_dict = dict(item)
-                sql =
+                sql = await self._make_insert_sql(item_dict, **kwargs)

                 rowcount = await self._execute_sql(sql=sql)
                 if rowcount > 1:
@@ -100,37 +102,38 @@ class AsyncmyMySQLPipeline:
                         f"Spider {spider_name} successfully inserted a single record into table {self.table_name}"
                     )
                 else:
-
-
-
+                    # When MYSQL_UPDATE_COLUMNS is used and the updated field values are identical to the existing record,
+                    # MySQL does not actually update any data, so rowcount will be 0
+                    if self.update_columns:
+                        self.logger.info(
+                            f"Spider {spider_name}: SQL executed with update-column config {self.update_columns}, "
+                            f"data may not have actually been updated (field values unchanged)"
+                        )
+                    else:
+                        self.logger.warning(
+                            f"Spider {spider_name}: SQL executed successfully but no new record was inserted"
+                        )

                 # Stats counting moved here to stay consistent with AiomysqlMySQLPipeline
                 self.crawler.stats.inc_value('mysql/insert_success')
                 return item

             except Exception as e:
-
+                # Add more debugging information
+                error_msg = f"Processing failed: {str(e)}"
+                self.logger.error(f"Error while processing the item: {error_msg}")
                 self.crawler.stats.inc_value('mysql/insert_failed')
-                raise ItemDiscard(
+                raise ItemDiscard(error_msg)

+    @abstractmethod
     async def _execute_sql(self, sql: str, values: list = None) -> int:
-        """Execute a SQL statement and handle the result"""
-
-        async with conn.cursor() as cursor:
-            try:
-                # Choose the execution method depending on whether parameter values are provided
-                if values is not None:
-                    rowcount = await cursor.execute(sql, values)
-                else:
-                    rowcount = await cursor.execute(sql)
+        """Execute a SQL statement and handle the result - subclasses must override this method"""
+        raise NotImplementedError("Subclasses must implement the _execute_sql method")

-
-
-
-
-                await conn.rollback()
-                # The stats counting here was removed
-                raise ItemDiscard(f"MySQL insert failed: {e}")
+    @abstractmethod
+    async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+        """Execute a batch SQL statement - subclasses must override this method"""
+        raise NotImplementedError("Subclasses must implement the _execute_batch_sql method")

     async def _flush_batch(self, spider_name: str):
         """Flush the batch buffer and perform a batch insert"""
@@ -140,81 +143,260 @@ class AsyncmyMySQLPipeline:
         try:
             await self._ensure_pool()

-            #
-
-
-
-
-
+            # Check that the connection pool is valid
+            if not self._pool_initialized or not self.pool:
+                raise RuntimeError("Database connection pool is not initialized or invalid")
+
+            # Use SQLBuilder to generate the batch-insert SQL
+            batch_result = SQLBuilder.make_batch(
+                table=self.table_name,
+                datas=self.batch_buffer,
+                auto_update=self.auto_update,
+                update_columns=self.update_columns
+            )
+
+            if batch_result:
+                sql, values_list = batch_result
+                rowcount = await self._execute_batch_sql(sql=sql, values_list=values_list)
+
+                if rowcount > 0:
+                    self.logger.info(
+                        f"Spider {spider_name} batch-inserted {len(self.batch_buffer)} records into table {self.table_name}, {rowcount} rows actually affected"
+                    )
+                else:
+                    # When MYSQL_UPDATE_COLUMNS is used and the updated field values are identical to the existing record,
+                    # MySQL does not actually update any data, so rowcount will be 0
+                    if self.update_columns:
+                        self.logger.debug(
+                            f"Spider {spider_name}: batch SQL executed with update-column config {self.update_columns}, "
+                            f"data may not have actually been updated (field values unchanged)"
+                        )
+                    else:
+                        self.logger.warning(
+                            f"Spider {spider_name}: batch SQL executed but no new records were inserted"
+                        )

-
+                # Clear the buffer
+                self.batch_buffer.clear()
+                self.crawler.stats.inc_value('mysql/batch_insert_success')
+            else:
+                self.logger.warning(f"Spider {spider_name}: batch data is empty, skipping the insert")

-            async with self.pool.acquire() as conn:
-                async with conn.cursor() as cursor:
-                    try:
-                        # Perform the batch insert
-                        rowcount = await cursor.executemany(sql, values_list)
-                        await conn.commit()
-
-                        self.logger.info(
-                            f"Spider {spider_name} batch-inserted {rowcount} records into table {self.table_name}"
-                        )
-                        # Update the stats counters
-                        self.crawler.stats.inc_value('mysql/insert_success', rowcount)
-                        self.batch_buffer.clear()
-                    except Exception as e:
-                        await conn.rollback()
-                        self.crawler.stats.inc_value('mysql/insert_failed', len(self.batch_buffer))
-                        self.logger.error(f"Batch insert failed: {e}")
-                        raise ItemDiscard(f"Batch insert failed: {e}")
         except Exception as e:
-
-
+            # Add more debugging information
+            error_msg = f"Batch insert failed: {str(e)}"
+            self.logger.error(f"Error during batch processing: {error_msg}")
+            self.crawler.stats.inc_value('mysql/batch_insert_failed')
+            # Do not clear the buffer, so that a retry remains possible
+            # But if the error was caused by a data problem, clear the buffer to avoid endless retries
+            if "Duplicate entry" in str(e) or "Data too long" in str(e):
+                self.logger.warning("Error caused by a data problem; clearing the buffer to avoid endless retries")
+                self.batch_buffer.clear()
+            raise ItemDiscard(error_msg)

     async def spider_closed(self):
         """Clean up resources when the spider closes"""
         # Flush any remaining batch data before closing
         if self.use_batch and self.batch_buffer:
             spider_name = getattr(self.crawler.spider, 'name', 'unknown')
-
+            try:
+                await self._flush_batch(spider_name)
+            except Exception as e:
+                self.logger.error(f"Failed to flush batch data while closing the spider: {e}")

         if self.pool:
-
-
-
+            try:
+                pool_stats = {
+                    'size': getattr(self.pool, 'size', 'unknown'),
+                    'minsize': getattr(self.pool, 'minsize', 'unknown'),
+                    'maxsize': getattr(self.pool, 'maxsize', 'unknown')
+                }
+                self.logger.info(f"Closing the MySQL connection pool, current state: {pool_stats}")
+                self.pool.close()
+                await self.pool.wait_closed()
+                self.logger.info("MySQL connection pool closed")
+            except Exception as e:
+                self.logger.error(f"Error while closing the MySQL connection pool: {e}")
+
+    async def _make_insert_sql(self, item_dict: Dict, **kwargs) -> str:
+        """Generate the insert SQL statement; subclasses may override this method"""
+        # Merge the pipeline configuration with the incoming kwargs
+        sql_kwargs = {
+            'auto_update': self.auto_update,
+            'insert_ignore': self.insert_ignore,
+            'update_columns': self.update_columns
+        }
+        sql_kwargs.update(kwargs)
+
+        return SQLBuilder.make_insert(
+            table=self.table_name,
+            data=item_dict,
+            **sql_kwargs
+        )
+
+    @abstractmethod
+    async def _ensure_pool(self):
+        """Ensure the connection pool is initialized (thread-safe); subclasses must implement this method"""
+        pass


-class
+class AsyncmyMySQLPipeline(BaseMySQLPipeline):
+    """MySQL pipeline implementation based on the asyncmy library"""
+
     def __init__(self, crawler):
-
-        self.settings
-        self.logger = get_logger(self.__class__.__name__, self.settings.get('LOG_LEVEL'))
+        super().__init__(crawler)
+        self.logger.info(f"AsyncmyMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")

-
-
-
-        self.pool = None
-        self.table_name = (
-            self.settings.get('MYSQL_TABLE') or
-            getattr(crawler.spider, 'mysql_table', None) or
-            f"{crawler.spider.name}_items"
-        )
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)

-
-
-
-
+    async def _ensure_pool(self):
+        """Ensure the connection pool is initialized (thread-safe)"""
+        if self._pool_initialized:
+            # Check whether the connection pool is still valid
+            if self.pool and hasattr(self.pool, 'closed') and not self.pool.closed:
+                return
+            else:
+                self.logger.warning("Connection pool was initialized but is invalid, reinitializing")
+
+        async with self._pool_lock:
+            if not self._pool_initialized:  # Double-check to avoid race conditions
+                try:
+                    self.pool = await create_pool(
+                        host=self.settings.get('MYSQL_HOST', 'localhost'),
+                        port=self.settings.get_int('MYSQL_PORT', 3306),
+                        user=self.settings.get('MYSQL_USER', 'root'),
+                        password=self.settings.get('MYSQL_PASSWORD', ''),
+                        db=self.settings.get('MYSQL_DB', 'scrapy_db'),
+                        minsize=self.settings.get_int('MYSQL_POOL_MIN', 3),
+                        maxsize=self.settings.get_int('MYSQL_POOL_MAX', 10),
+                        echo=self.settings.get_bool('MYSQL_ECHO', False)
+                    )
+                    self._pool_initialized = True
+                    pool_stats = {
+                        'minsize': getattr(self.pool, 'minsize', 'unknown'),
+                        'maxsize': getattr(self.pool, 'maxsize', 'unknown')
+                    }
+                    self.logger.info(f"MySQL connection pool initialized (table: {self.table_name}, config: {pool_stats})")
+                except Exception as e:
+                    self.logger.error(f"Failed to initialize the MySQL connection pool: {e}")
+                    # Reset state so that a retry is possible
+                    self._pool_initialized = False
+                    self.pool = None
+                    raise
+
+    async def _execute_sql(self, sql: str, values: list = None) -> int:
+        """Execute a SQL statement and handle the result, with a deadlock retry mechanism"""
+        max_retries = 3
+        timeout = 30  # 30-second timeout
+
+        for attempt in range(max_retries):
+            try:
+                # Check the connection pool state
+                if not self.pool:
+                    raise RuntimeError("Database connection pool is not available")
+
+                # Use asyncmy's connection pattern, with a timeout
+                async with async_timeout.timeout(timeout):
+                    async with self.pool.acquire() as conn:
+                        async with conn.cursor() as cursor:
+                            # Choose the execution method depending on whether parameter values are provided
+                            if values is not None:
+                                rowcount = await cursor.execute(sql, values)
+                            else:
+                                rowcount = await cursor.execute(sql)
+
+                            await conn.commit()
+                            return rowcount
+            except asyncio.TimeoutError:
+                self.logger.error(f"SQL execution timed out ({timeout}s): {sql[:100]}...")
+                raise ItemDiscard(f"MySQL operation timed out: {sql[:100]}...")
+            except Exception as e:
+                # Check whether this is a deadlock error
+                if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                    self.logger.warning(f"Deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                    await asyncio.sleep(0.1 * (2 ** attempt))  # Exponential backoff
+                    continue
+                # Check whether this is a connection error and try to reinitialize the connection pool
+                elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
+                    self.logger.warning(f"Connection error detected, reinitializing the connection pool and retrying: {str(e)}")
+                    self._pool_initialized = False
+                    self.pool = None
+                    await asyncio.sleep(0.5 * (attempt + 1))  # Simple backoff
+                    continue
+                else:
+                    # Add more debugging information
+                    error_msg = f"MySQL insert failed: {str(e)}"
+                    self.logger.error(f"Error while executing SQL: {error_msg}")
+                    # For parameterized operations, log the SQL and values for debugging
+                    if values:
+                        self.logger.debug(f"SQL: {sql[:200]}..., Values: {values[:5] if isinstance(values, list) else '...'}")
+                    raise ItemDiscard(error_msg)
+
+    async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+        """Execute a batch SQL statement, with a deadlock retry mechanism"""
+        max_retries = 3
+        timeout = 60  # 60-second timeout; batch operations may take longer
+
+        for attempt in range(max_retries):
+            try:
+                # Check the connection pool state
+                if not self.pool:
+                    raise RuntimeError("Database connection pool is not available")
+
+                # Batch execution with a timeout
+                async with async_timeout.timeout(timeout):
+                    async with self.pool.acquire() as conn:
+                        async with conn.cursor() as cursor:
+                            # Perform the batch insert
+                            rowcount = await cursor.executemany(sql, values_list)
+                            await conn.commit()
+                            return rowcount
+            except asyncio.TimeoutError:
+                self.logger.error(f"Batch SQL execution timed out ({timeout}s)")
+                raise ItemDiscard(f"MySQL batch operation timed out")
+            except Exception as e:
+                # Check whether this is a deadlock error
+                if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                    self.logger.warning(f"Batch-insert deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                    await asyncio.sleep(0.1 * (2 ** attempt))  # Exponential backoff
+                    continue
+                # Check whether this is a connection error and try to reinitialize the connection pool
+                elif ("Connection closed" in str(e) or "Lost connection" in str(e)) and attempt < max_retries - 1:
+                    self.logger.warning(f"Connection error detected, reinitializing the connection pool and retrying: {str(e)}")
+                    self._pool_initialized = False
+                    self.pool = None
+                    await asyncio.sleep(0.5 * (attempt + 1))  # Simple backoff
+                    continue
+                else:
+                    # Add more debugging information
+                    error_msg = f"MySQL batch insert failed: {str(e)}"
+                    self.logger.error(f"Error while executing batch SQL: {error_msg}")
+                    # Log a summary of the SQL and values for debugging
+                    self.logger.debug(f"SQL: {sql[:200]}..., Values count: {len(values_list) if isinstance(values_list, list) else 'unknown'}")
+                    raise ItemDiscard(error_msg)

-
+
+class AiomysqlMySQLPipeline(BaseMySQLPipeline):
+    """MySQL pipeline implementation based on the aiomysql library"""
+
+    def __init__(self, crawler):
+        super().__init__(crawler)
+        self.logger.info(f"AiomysqlMySQLPipeline instance created, config - host: {self.settings.get('MYSQL_HOST', 'localhost')}, database: {self.settings.get('MYSQL_DB', 'scrapy_db')}, table: {self.table_name}")

     @classmethod
     def from_crawler(cls, crawler):
         return cls(crawler)

-    async def
+    async def _ensure_pool(self):
         """Lazily initialize the connection pool (thread-safe)"""
         if self._pool_initialized:
-
+            # Check whether the connection pool is still valid
+            if self.pool and hasattr(self.pool, 'closed') and not self.pool.closed:
+                return
+            else:
+                self.logger.warning("Connection pool was initialized but is invalid, reinitializing")

         async with self._pool_lock:
             if not self._pool_initialized:
@@ -231,96 +413,65 @@ class AiomysqlMySQLPipeline:
                         autocommit=False
                     )
                     self._pool_initialized = True
-
+                    pool_stats = {
+                        'minsize': getattr(self.pool, 'minsize', 'unknown'),
+                        'maxsize': getattr(self.pool, 'maxsize', 'unknown')
+                    }
+                    self.logger.info(f"aiomysql connection pool initialized (table: {self.table_name}, config: {pool_stats})")
                 except Exception as e:
                     self.logger.error(f"Failed to initialize the aiomysql connection pool: {e}")
+                    # Reset state so that a retry is possible
+                    self._pool_initialized = False
+                    self.pool = None
                     raise

-    async def
-        """
-
-
-            self.batch_buffer.append(dict(item))
-
-            # If the buffer reaches the batch size, perform a batch insert
-            if len(self.batch_buffer) >= self.batch_size:
-                spider_name = getattr(spider, 'name', 'unknown')
-                await self._flush_batch(spider_name)
-
-            return item
-        else:
-            # Single-insert logic
+    async def _execute_sql(self, sql: str, values: list = None) -> int:
+        """Execute a SQL statement and handle the result, with a deadlock retry mechanism"""
+        max_retries = 3
+        for attempt in range(max_retries):
            try:
-
-
-                item_dict = dict(item)
-                # Use SQLBuilder to generate the SQL
-                sql = SQLBuilder.make_insert(table=self.table_name, data=item_dict)
-
+                # Use aiomysql's asynchronous context-manager pattern
                async with self.pool.acquire() as conn:
                    async with conn.cursor() as cursor:
-
-
-                        await
-
-
-                        await conn.rollback()
-                        self.crawler.stats.inc_value('mysql/insert_failed')
-                        raise ItemDiscard(f"MySQL error: {e.args[1]}")
-
-                return item
+                        # Choose the execution method depending on whether parameter values are provided
+                        if values is not None:
+                            rowcount = await cursor.execute(sql, values)
+                        else:
+                            rowcount = await cursor.execute(sql)

+                        await conn.commit()
+                        return rowcount
            except Exception as e:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            sql, values_list = batch_result
-
-            async with self.pool.acquire() as conn:
-                async with conn.cursor() as cursor:
-                    try:
+                # Check whether this is a deadlock error
+                if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                    self.logger.warning(f"Deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                    await asyncio.sleep(0.1 * (2 ** attempt))  # Exponential backoff
+                    continue
+                else:
+                    # Add more debugging information
+                    error_msg = f"MySQL insert failed: {str(e)}"
+                    self.logger.error(f"Error while executing SQL: {error_msg}")
+                    raise ItemDiscard(error_msg)
+
+    async def _execute_batch_sql(self, sql: str, values_list: list) -> int:
+        """Execute a batch SQL statement, with a deadlock retry mechanism"""
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                async with self.pool.acquire() as conn:
+                    async with conn.cursor() as cursor:
                         # Perform the batch insert
                         rowcount = await cursor.executemany(sql, values_list)
                         await conn.commit()
-
-
-
-
-
-
-
-
-
-
-
-
-        except Exception as e:
-            self.logger.error(f"An error occurred during the batch insert: {e}")
-            raise ItemDiscard(f"Batch insert handling failed: {e}")
-
-    async def spider_closed(self):
-        """Resource cleanup"""
-        # Flush any remaining batch data before closing
-        if self.use_batch and self.batch_buffer:
-            spider_name = getattr(self.crawler.spider, 'name', 'unknown')
-            await self._flush_batch(spider_name)
-
-        if self.pool:
-            self.pool.close()
-            await self.pool.wait_closed()
-            self.logger.info("aiomysql connection pool released")
+                        return rowcount
+            except Exception as e:
+                # Check whether this is a deadlock error
+                if "Deadlock found" in str(e) and attempt < max_retries - 1:
+                    self.logger.warning(f"Batch-insert deadlock detected, starting retry {attempt + 1}: {str(e)}")
+                    await asyncio.sleep(0.1 * (2 ** attempt))  # Exponential backoff
+                    continue
+                else:
+                    # Add more debugging information
+                    error_msg = f"MySQL batch insert failed: {str(e)}"
+                    self.logger.error(f"Error while executing batch SQL: {error_msg}")
+                    raise ItemDiscard(error_msg)
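
For orientation, the refactored pipelines above read all of their behaviour from crawler settings. The sketch below is illustrative only and is not part of this diff: it lists the setting keys visible in the pipeline code, with the default values shown in the diff; MYSQL_TABLE and the update-column names are hypothetical placeholders.

# Illustrative sketch of the settings keys read by BaseMySQLPipeline and its
# AsyncmyMySQLPipeline / AiomysqlMySQLPipeline subclasses (not part of this diff).
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_DB = 'scrapy_db'
MYSQL_TABLE = 'example_items'      # placeholder; falls back to "<spider name>_items" when unset
MYSQL_POOL_MIN = 3
MYSQL_POOL_MAX = 10
MYSQL_ECHO = False

# Batch insertion
MYSQL_USE_BATCH = True
MYSQL_BATCH_SIZE = 100             # clamped to at least 1 by the pipeline

# SQL generation options passed through to SQLBuilder
MYSQL_AUTO_UPDATE = False
MYSQL_INSERT_IGNORE = False
MYSQL_UPDATE_COLUMNS = ('field_a', 'field_b')   # placeholder names; must be a tuple or list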