crawlo-1.3.2-py3-none-any.whl → crawlo-1.3.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (105)
  1. crawlo/__init__.py +24 -0
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/run.py +58 -32
  4. crawlo/core/__init__.py +44 -0
  5. crawlo/core/engine.py +119 -45
  6. crawlo/core/scheduler.py +4 -3
  7. crawlo/crawler.py +603 -1133
  8. crawlo/downloader/aiohttp_downloader.py +4 -2
  9. crawlo/extension/__init__.py +1 -1
  10. crawlo/extension/logging_extension.py +23 -7
  11. crawlo/factories/__init__.py +28 -0
  12. crawlo/factories/base.py +69 -0
  13. crawlo/factories/crawler.py +104 -0
  14. crawlo/factories/registry.py +85 -0
  15. crawlo/filters/aioredis_filter.py +25 -2
  16. crawlo/framework.py +292 -0
  17. crawlo/initialization/__init__.py +40 -0
  18. crawlo/initialization/built_in.py +426 -0
  19. crawlo/initialization/context.py +142 -0
  20. crawlo/initialization/core.py +194 -0
  21. crawlo/initialization/phases.py +149 -0
  22. crawlo/initialization/registry.py +146 -0
  23. crawlo/items/base.py +2 -1
  24. crawlo/logging/__init__.py +38 -0
  25. crawlo/logging/config.py +97 -0
  26. crawlo/logging/factory.py +129 -0
  27. crawlo/logging/manager.py +112 -0
  28. crawlo/middleware/middleware_manager.py +1 -1
  29. crawlo/middleware/offsite.py +1 -1
  30. crawlo/mode_manager.py +26 -1
  31. crawlo/pipelines/pipeline_manager.py +2 -1
  32. crawlo/project.py +76 -46
  33. crawlo/queue/pqueue.py +11 -5
  34. crawlo/queue/queue_manager.py +143 -19
  35. crawlo/queue/redis_priority_queue.py +69 -49
  36. crawlo/settings/default_settings.py +110 -14
  37. crawlo/settings/setting_manager.py +29 -13
  38. crawlo/spider/__init__.py +34 -16
  39. crawlo/stats_collector.py +17 -3
  40. crawlo/task_manager.py +112 -3
  41. crawlo/templates/project/settings.py.tmpl +103 -202
  42. crawlo/templates/project/settings_distributed.py.tmpl +122 -135
  43. crawlo/templates/project/settings_gentle.py.tmpl +149 -43
  44. crawlo/templates/project/settings_high_performance.py.tmpl +127 -90
  45. crawlo/templates/project/settings_minimal.py.tmpl +46 -15
  46. crawlo/templates/project/settings_simple.py.tmpl +138 -75
  47. crawlo/templates/project/spiders/__init__.py.tmpl +5 -1
  48. crawlo/templates/run.py.tmpl +10 -14
  49. crawlo/templates/spiders_init.py.tmpl +10 -0
  50. crawlo/tools/network_diagnostic.py +365 -0
  51. crawlo/utils/class_loader.py +26 -0
  52. crawlo/utils/error_handler.py +76 -35
  53. crawlo/utils/log.py +41 -144
  54. crawlo/utils/redis_connection_pool.py +43 -6
  55. crawlo/utils/request_serializer.py +8 -1
  56. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/METADATA +120 -14
  57. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/RECORD +104 -45
  58. tests/authenticated_proxy_example.py +2 -2
  59. tests/baidu_performance_test.py +109 -0
  60. tests/baidu_test.py +60 -0
  61. tests/comprehensive_framework_test.py +213 -0
  62. tests/comprehensive_test.py +82 -0
  63. tests/comprehensive_testing_summary.md +187 -0
  64. tests/debug_configure.py +70 -0
  65. tests/debug_framework_logger.py +85 -0
  66. tests/debug_log_levels.py +64 -0
  67. tests/distributed_test.py +67 -0
  68. tests/distributed_test_debug.py +77 -0
  69. tests/final_command_test_report.md +0 -0
  70. tests/final_comprehensive_test.py +152 -0
  71. tests/final_validation_test.py +183 -0
  72. tests/framework_performance_test.py +203 -0
  73. tests/optimized_performance_test.py +212 -0
  74. tests/performance_comparison.py +246 -0
  75. tests/queue_blocking_test.py +114 -0
  76. tests/queue_test.py +90 -0
  77. tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  78. tests/scrapy_comparison/scrapy_test.py +134 -0
  79. tests/simple_command_test.py +120 -0
  80. tests/simple_crawlo_test.py +128 -0
  81. tests/simple_log_test.py +58 -0
  82. tests/simple_optimization_test.py +129 -0
  83. tests/simple_spider_test.py +50 -0
  84. tests/simple_test.py +48 -0
  85. tests/test_all_commands.py +231 -0
  86. tests/test_batch_processor.py +179 -0
  87. tests/test_component_factory.py +175 -0
  88. tests/test_controlled_spider_mixin.py +80 -0
  89. tests/test_enhanced_error_handler_comprehensive.py +246 -0
  90. tests/test_factories.py +253 -0
  91. tests/test_framework_logger.py +67 -0
  92. tests/test_framework_startup.py +65 -0
  93. tests/test_large_scale_config.py +113 -0
  94. tests/test_large_scale_helper.py +236 -0
  95. tests/test_mode_change.py +73 -0
  96. tests/test_mode_consistency.py +1 -1
  97. tests/test_performance_monitor.py +116 -0
  98. tests/test_queue_empty_check.py +42 -0
  99. tests/untested_features_report.md +139 -0
  100. tests/verify_debug.py +52 -0
  101. tests/verify_log_fix.py +112 -0
  102. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  103. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
  104. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
  105. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
crawlo/settings/default_settings.py CHANGED
@@ -6,18 +6,32 @@
  # Import the environment variable configuration utilities
  from crawlo.utils.env_config import get_redis_config, get_runtime_config, get_version

+ # Framework initialization control
+ FRAMEWORK_INIT_ORDER = [
+     'log_system',           # logging system
+     'settings_system',      # settings system
+     'core_components',      # core components
+     'extensions',           # extension components
+     'full_initialization'   # full initialization
+ ]
+ FRAMEWORK_INIT_STATE = 'uninitialized'
+
  # ============================== Project basics ==============================

  # Project name (used for logs, Redis keys and other identifiers)
  PROJECT_NAME = get_runtime_config()['PROJECT_NAME']

+ # Make sure the project name is not empty
+ if not PROJECT_NAME or PROJECT_NAME == 'None':
+     PROJECT_NAME = 'crawlo'
+
  # Project version - read from the framework's __version__.py, falling back to a default if missing
  VERSION = get_version()

  # Run mode: standalone/distributed/auto
  RUN_MODE = get_runtime_config()['CRAWLO_MODE']

- # Concurrency configuration
+ # Concurrency configuration - default tuned for better performance
  CONCURRENCY = get_runtime_config()['CONCURRENCY']

  # ============================== Crawler core configuration ==============================
@@ -25,8 +39,8 @@ CONCURRENCY = get_runtime_config()['CONCURRENCY']
  # Default downloader
  DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"

- # Request delay (seconds)
- DOWNLOAD_DELAY = 1
+ # Request delay (seconds) - default tuned for better performance
+ DOWNLOAD_DELAY = 0.5

  # Random delay configuration
  RANDOMNESS = False  # Whether to enable random delay
@@ -35,8 +49,15 @@ RANDOM_RANGE = [0.5, 1.5]  # Random delay range factor; actual delay = DOWNLOAD_
  # Depth priority (negative = depth-first, positive = breadth-first)
  DEPTH_PRIORITY = 1

- # Maximum scheduler queue size
- SCHEDULER_MAX_QUEUE_SIZE = 1000
+ # Maximum scheduler queue size - default tuned for better performance
+ SCHEDULER_MAX_QUEUE_SIZE = 5000
+ # Backpressure control configuration - default tuned for better performance
+ BACKPRESSURE_RATIO = 0.9  # Backpressure trigger threshold (kicks in when the queue reaches 90% of its maximum size)
+
+ # Request generation control
+ REQUEST_GENERATION_BATCH_SIZE = 10            # Request generation batch size
+ REQUEST_GENERATION_INTERVAL = 0.01            # Request generation interval (seconds)
+ ENABLE_CONTROLLED_REQUEST_GENERATION = False  # Whether to enable controlled request generation

  # Scheduler queue name (follows the unified naming convention)
  SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
@@ -44,13 +65,15 @@ SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
  # Queue type: memory/redis/auto
  QUEUE_TYPE = 'auto'

+ # Queue configuration
+ QUEUE_MAX_RETRIES = 3  # Maximum retries for queue operations
+ QUEUE_TIMEOUT = 300    # Queue operation timeout (seconds)

  # Use the in-memory filter and dedup pipeline by default so the framework also runs without Redis
  # In auto mode, if Redis is available, the framework automatically switches to the Redis implementations for better deduplication
- DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
+ DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
  FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'

-
  MYSQL_HOST = '127.0.0.1'
  MYSQL_PORT = 3306
  MYSQL_USER = 'root'
@@ -60,7 +83,6 @@ MYSQL_TABLE = 'crawlo'
  MYSQL_BATCH_SIZE = 100
  MYSQL_USE_BATCH = False  # Whether to enable batch inserts

-
  # --- Redis filter configuration ---
  # Fetch the Redis configuration via the environment variable configuration utilities
  redis_config = get_redis_config()
@@ -111,10 +133,6 @@ PIPELINES = [
      'crawlo.pipelines.console_pipeline.ConsolePipeline',
  ]

- # Explicitly add the default dedup pipeline to the head of the pipeline list
- # Note: this has been moved into SettingManager to avoid duplicate insertion
- # PIPELINES.insert(0, DEFAULT_DEDUP_PIPELINE)
-
  # ============================== Framework default extension configuration ==============================

  # Framework extension component list (framework defaults + user-defined extensions)
@@ -126,9 +144,9 @@ EXTENSIONS = [

  # ============================== Logging and monitoring ==============================

- LOG_LEVEL = 'DEBUG'  # Log level: DEBUG/INFO/WARNING/ERROR
+ LOG_LEVEL = None  # Log level: DEBUG/INFO/WARNING/ERROR; defaults to None, to be set by the user in the project settings
  STATS_DUMP = True  # Whether to periodically dump statistics
- LOG_FILE = f'logs/{PROJECT_NAME}.log'  # Log file path
+ LOG_FILE = None  # Log file path; set in the project configuration
  LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
  LOG_ENCODING = 'utf-8'

@@ -151,6 +169,8 @@ PROXY_EXTRACTOR = "proxy"
  # Proxy refresh control
  PROXY_REFRESH_INTERVAL = 60  # Proxy refresh interval (seconds)
  PROXY_API_TIMEOUT = 10       # Timeout for proxy API requests
+ PROXY_POOL_SIZE = 5          # Proxy pool size
+ PROXY_HEALTH_CHECK_THRESHOLD = 0.5  # Proxy health-check threshold

  # ============================== Curl-Cffi specific configuration ==============================

@@ -183,6 +203,17 @@ HTTPX_FOLLOW_REDIRECTS = True  # Whether to follow redirects automatically
  AIOHTTP_AUTO_DECOMPRESS = True  # Whether to automatically decompress responses
  AIOHTTP_FORCE_CLOSE = False     # Whether to force-close connections

+ # Common downloader configuration
+ DOWNLOAD_TIMEOUT = 30                # Download timeout (seconds)
+ VERIFY_SSL = True                    # Whether to verify SSL certificates
+ CONNECTION_POOL_LIMIT = 100          # Connection pool size limit
+ CONNECTION_POOL_LIMIT_PER_HOST = 20  # Per-host connection pool size limit
+ DOWNLOAD_MAXSIZE = 10 * 1024 * 1024  # Maximum download size (bytes)
+ DOWNLOAD_STATS = True                # Whether to enable download statistics
+ DOWNLOAD_WARN_SIZE = 1024 * 1024     # Download size warning threshold (bytes)
+ DOWNLOAD_RETRY_TIMES = 3             # Number of download retries
+ MAX_RETRY_TIMES = 3                  # Maximum number of retries
+
  # ============================== Selenium downloader configuration ==============================

  # Selenium base configuration
@@ -223,3 +254,68 @@ MEMORY_MONITOR_ENABLED = False  # Whether to enable memory monitoring
  MEMORY_MONITOR_INTERVAL = 60      # Memory monitoring check interval (seconds)
  MEMORY_WARNING_THRESHOLD = 80.0   # Memory usage warning threshold (percent)
  MEMORY_CRITICAL_THRESHOLD = 90.0  # Memory usage critical threshold (percent)
+
+ # ============================== Performance profiling configuration ==============================
+
+ # The performance profiler extension is disabled by default
+ PERFORMANCE_PROFILER_ENABLED = False           # Whether to enable performance profiling
+ PERFORMANCE_PROFILER_OUTPUT_DIR = 'profiling'  # Profiling output directory
+ PERFORMANCE_PROFILER_INTERVAL = 300            # Profiling interval (seconds)
+
+ # ============================== Health check configuration ==============================
+
+ # The health check extension is enabled by default
+ HEALTH_CHECK_ENABLED = True  # Whether to enable health checks
+
+ # ============================== Log interval configuration ==============================
+
+ # Log interval extension configuration
+ INTERVAL = 60  # Log output interval (seconds)
+
+ # ============================== Custom logging configuration ==============================
+
+ # Custom logging extension configuration
+ LOG_ENABLE_CUSTOM = False  # Whether to enable custom logging
+
+ # ============================== Default request header configuration ==============================
+
+ # Default request header configuration
+ DEFAULT_REQUEST_HEADERS = {}       # Default request headers
+ USER_AGENT = None                  # User agent
+ USER_AGENTS = []                   # User agent list
+ RANDOM_HEADERS = {}                # Random request headers
+ RANDOM_USER_AGENT_ENABLED = False  # Whether to enable random user agents
+ USER_AGENT_DEVICE_TYPE = "all"     # User agent device type
+
+ # ============================== Offsite filtering configuration ==============================
+
+ # Offsite filtering configuration
+ ALLOWED_DOMAINS = []  # Allowed domain list
+
+ # ============================== Bloom filter configuration ==============================
+
+ # Bloom filter configuration
+ BLOOM_FILTER_CAPACITY = 1000000  # Bloom filter capacity
+ BLOOM_FILTER_ERROR_RATE = 0.001  # Bloom filter error rate
+
+ # ============================== CSV pipeline configuration ==============================
+
+ # CSV pipeline configuration
+ CSV_DELIMITER = ','          # CSV delimiter
+ CSV_QUOTECHAR = '"'          # CSV quote character
+ CSV_INCLUDE_HEADERS = True   # Whether to include a header row
+ CSV_EXTRASACTION = 'ignore'  # How to handle extra fields: ignore, raise
+ CSV_FIELDNAMES = None        # Field name list
+ CSV_FILE = None              # CSV file path
+ CSV_DICT_FILE = None         # CSV dict file path
+ CSV_BATCH_SIZE = 100         # CSV batch size
+ CSV_BATCH_FILE = None        # CSV batch file path
+
+ # ============================== Database dedup pipeline configuration ==============================
+
+ # Database dedup pipeline configuration
+ DB_HOST = 'localhost'  # Database host
+ DB_PORT = 3306         # Database port
+ DB_USER = 'root'       # Database user
+ DB_PASSWORD = ''       # Database password
+ DB_NAME = 'crawlo'     # Database name
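
For orientation, the tuned queue defaults interact simply: with SCHEDULER_MAX_QUEUE_SIZE = 5000 and BACKPRESSURE_RATIO = 0.9, backpressure would trigger at roughly 4500 queued requests. Below is a minimal sketch of how a project might override these defaults in its own settings.py; the setting names come from the diff above, while the file itself and the chosen values are illustrative assumptions, not part of this release.

    # Hypothetical project settings.py overriding tuned defaults from crawlo/settings/default_settings.py.
    # Setting names are taken from the diff above; the values are examples only.
    SCHEDULER_MAX_QUEUE_SIZE = 2000
    BACKPRESSURE_RATIO = 0.8   # backpressure would then trigger at 2000 * 0.8 = 1600 queued requests
    DOWNLOAD_DELAY = 1.0       # revert to the pre-1.3.4 default if 0.5 s proves too aggressive
    LOG_LEVEL = 'INFO'         # 1.3.4 leaves LOG_LEVEL as None for the project to set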
crawlo/settings/setting_manager.py CHANGED
@@ -15,6 +15,8 @@ class SettingManager(MutableMapping):
          self.set_settings(default_settings)
          # Merge the configuration at initialization time
          self._merge_config(values)
+         # Process dynamic configuration
+         self._process_dynamic_config()

      def _merge_config(self, user_config):
          """Merge the default configuration with the user configuration."""
@@ -81,6 +83,33 @@
              if key not in ['MIDDLEWARES', 'PIPELINES', 'EXTENSIONS']:
                  self.attributes[key] = value

+     def set_settings(self, module):
+         if isinstance(module, str):
+             module = import_module(module)
+
+         # Collect every configuration item defined in the module
+         module_settings = {}
+         for key in dir(module):
+             if key.isupper():
+                 value = getattr(module, key)
+                 module_settings[key] = value
+
+         # Use the merge logic instead of setting values directly
+         self._merge_config(module_settings)
+
+         # Process dynamic configuration items (such as LOG_FILE)
+         self._process_dynamic_config()
+
+     def _process_dynamic_config(self):
+         """
+         Process dynamic configuration items.
+         Some items need to be computed from the values of other items.
+         """
+         # Handle the LOG_FILE setting
+         if self.attributes.get('LOG_FILE') is None:
+             project_name = self.attributes.get('PROJECT_NAME', 'crawlo')
+             self.attributes['LOG_FILE'] = f'logs/{project_name}.log'
+
      def get(self, key, default=None):
          """Safely fetch a value without triggering recursion."""
          value = self.attributes.get(key, default)
@@ -133,19 +162,6 @@
      def set(self, key, value):
          self.attributes[key] = value

-     def set_settings(self, module):
-         if isinstance(module, str):
-             module = import_module(module)
-
-         # Collect every configuration item defined in the module
-         module_settings = {}
-         for key in dir(module):
-             if key.isupper():
-                 module_settings[key] = getattr(module, key)
-
-         # Use the merge logic instead of setting values directly
-         self._merge_config(module_settings)
-
      # Methods required by MutableMapping
      def __getitem__(self, item):
          return self.attributes[item]
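
The new _process_dynamic_config fills in LOG_FILE from PROJECT_NAME whenever the default of None is left in place. A minimal self-contained sketch of that resolution rule, mirroring only the logic visible in the hunk above (the real method operates on the manager's attributes dict; the helper name here is illustrative):

    def resolve_log_file(attributes: dict) -> str:
        # Mirror of SettingManager._process_dynamic_config as shown in the diff:
        # when LOG_FILE is unset, derive it from PROJECT_NAME.
        if attributes.get('LOG_FILE') is None:
            project_name = attributes.get('PROJECT_NAME', 'crawlo')
            attributes['LOG_FILE'] = f'logs/{project_name}.log'
        return attributes['LOG_FILE']

    # e.g. resolve_log_file({'PROJECT_NAME': 'my_spider'}) -> 'logs/my_spider.log'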
crawlo/spider/__init__.py CHANGED
@@ -77,7 +77,13 @@ class SpiderMeta(type):

          # Register the spider
          _DEFAULT_SPIDER_REGISTRY[spider_name] = cls
-         get_logger(__name__).debug(f"自动注册爬虫: {spider_name} -> {cls.__name__}")
+         # Lazily initialize the logger to avoid blocking at module level
+         try:
+             from crawlo.utils.log import get_logger
+             get_logger(__name__).debug(f"自动注册爬虫: {spider_name} -> {cls.__name__}")
+         except:
+             # Fail silently if the logging system has not been initialized yet
+             pass

          return cls

@@ -153,12 +159,21 @@ class Spider(metaclass=SpiderMeta):

          # Initialize other attributes
          self.crawler = None
-         self.logger = get_logger(self.name)
+         # Lazily initialize the logger to avoid blocking
+         self._logger = None
          self.stats = None

          # Apply extra keyword arguments
          for key, value in kwargs.items():
              setattr(self, key, value)
+
+     @property
+     def logger(self):
+         """Lazily initialize the logger."""
+         if self._logger is None:
+             from crawlo.utils.log import get_logger
+             self._logger = get_logger(self.name)
+         return self._logger

      @classmethod
      def create_instance(cls, crawler) -> 'Spider':
@@ -172,13 +187,23 @@ class Spider(metaclass=SpiderMeta):
          spider.crawler = crawler
          spider.stats = getattr(crawler, 'stats', None)

-         # Merge custom settings
+         # Merge custom settings - apply lazily to avoid circular dependencies during initialization
          if hasattr(spider, 'custom_settings') and spider.custom_settings:
-             for key, value in spider.custom_settings.items():
-                 crawler.settings.set(key, value)
-                 spider.logger.debug(f"应用自定义设置: {key} = {value}")
+             # Defer applying the settings until they are actually needed
+             spider._pending_settings = spider.custom_settings.copy()
+             spider.logger.debug(f"准备应用 {len(spider.custom_settings)} 项自定义设置")

          return spider
+
+     def apply_pending_settings(self):
+         """Apply the pending settings (called after initialization has completed)."""
+         if hasattr(self, '_pending_settings') and self._pending_settings:
+             for key, value in self._pending_settings.items():
+                 if self.crawler and hasattr(self.crawler, 'settings'):
+                     self.crawler.settings.set(key, value)
+                     self.logger.debug(f"应用自定义设置: {key} = {value}")
+             # Clear the pending settings
+             delattr(self, '_pending_settings')

      def start_requests(self) -> Iterator[Request]:
          """
@@ -349,17 +374,10 @@ class Spider(metaclass=SpiderMeta):
          Can be used to:
          - clean up resources
          - close database connections
-         - dump statistics
          """
-         if self.stats:
-             stats_summary = {
-                 'total_requests': self.stats.get('total_requests', 0),
-                 'total_items': self.stats.get('total_items', 0),
-                 'success_rate': self.stats.get('success_rate', 'N/A')
-             }
-             self.logger.info(f"Spider {self.name} closed, stats: {stats_summary}")
-         else:
-             self.logger.info(f"Spider {self.name} closed")
+         # No longer log anything here, to avoid duplicating the statistics output
+         # Statistics are reported by the StatsCollector
+         pass

      def __str__(self) -> str:
          return f"{self.__class__.__name__}(name='{self.name}')"
crawlo/stats_collector.py CHANGED
@@ -46,9 +46,6 @@ class StatsCollector(object):

          self._stats['spider_name'] = spider_name

-         if self._dump:
-             self.logger.info(f'{spider_name} stats: \n{pformat(self._stats)}')
-
      def __getitem__(self, item):
          return self._stats[item]

@@ -57,3 +54,20 @@

      def __delitem__(self, key):
          del self._stats[key]
+
+     def close(self):
+         """Close the stats collector and dump the statistics."""
+         if self._dump:
+             # Get the spider name
+             spider_name = self._stats.get('spider_name', 'unknown')
+
+             # If the spider name has not been set yet, try to get it from the crawler
+             if spider_name == 'unknown' and hasattr(self, 'crawler') and self.crawler:
+                 spider = getattr(self.crawler, 'spider', None)
+                 if spider and hasattr(spider, 'name'):
+                     spider_name = spider.name
+                     # Also update spider_name in _stats
+                     self._stats['spider_name'] = spider_name
+
+             # Dump the statistics (this is the only place where they are reported)
+             self.logger.info(f'{spider_name} stats: \n{pformat(self._stats)}')
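
With this change the statistics are emitted exactly once, when the collector is closed, rather than every time the spider name is set. A hedged sketch of the resulting shutdown sequence; only close() itself appears in the hunk above, so where and how the framework invokes it is an assumption:

    def shutdown(stats_collector):
        # Assumed engine-side hook: flush statistics exactly once at the end of a run.
        try:
            stats_collector.close()  # dumps "<spider> stats: ..." if STATS_DUMP is enabled
        except Exception:
            pass  # statistics dumping must not block shutdown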
crawlo/task_manager.py CHANGED
@@ -3,13 +3,79 @@
  import asyncio
  from asyncio import Task, Future, Semaphore
  from typing import Set, Final
+ from collections import deque
+ import time
+ from crawlo.utils.log import get_logger
+
+
+ class DynamicSemaphore(Semaphore):
+     """A semaphore that supports dynamic adjustment."""
+
+     def __init__(self, initial_value: int = 8):
+         super().__init__(initial_value)
+         self._initial_value = initial_value
+         self._current_value = initial_value
+         self._response_times = deque(maxlen=10)  # Keep the 10 most recent response times
+         self._last_adjust_time = time.time()
+
+     def record_response_time(self, response_time: float):
+         """Record a response time."""
+         self._response_times.append(response_time)
+
+     def adjust_concurrency(self):
+         """Dynamically adjust the concurrency based on response times."""
+         current_time = time.time()
+         # Rate-limit adjustments to at least 1 second apart (reduced from 2 seconds)
+         if current_time - self._last_adjust_time < 1:
+             return
+
+         self._last_adjust_time = current_time
+
+         if len(self._response_times) < 2:  # reduced from 3 to 2
+             return
+
+         # Compute the average response time
+         avg_response_time = sum(self._response_times) / len(self._response_times)
+
+         # Adjust the concurrency based on the response time
+         if avg_response_time < 0.2:  # Fast responses: increase concurrency (threshold lowered from 0.3 to 0.2)
+             new_concurrency = min(self._current_value + 5, self._initial_value * 3)  # Step raised from 3 to 5; cap raised from 2x to 3x the initial value
+         elif avg_response_time > 1.0:  # Slow responses: decrease concurrency (threshold lowered from 1.5 to 1.0)
+             new_concurrency = max(self._current_value - 5, max(1, self._initial_value // 3))  # Step raised from 3 to 5; floor lowered from one half to one third of the initial value
+         else:
+             return  # Keep the current concurrency
+
+         # Only adjust when the change is significant
+         if abs(new_concurrency - self._current_value) > 1:
+             self._adjust_semaphore_value(new_concurrency)
+
+     def _adjust_semaphore_value(self, new_value: int):
+         """Adjust the semaphore's value."""
+         if new_value > self._current_value:
+             # Increase the semaphore
+             for _ in range(new_value - self._current_value):
+                 self.release()
+         elif new_value < self._current_value:
+             # Decrease the semaphore; only the new target value is recorded here
+             # The actual decrease is enforced at acquire time
+             pass
+
+         self._current_value = new_value
+         # Note: Python's Semaphore offers no way to directly modify its internal counter,
+         # so increases go through release() and decreases have to be enforced at acquire time


  class TaskManager:

      def __init__(self, total_concurrency: int = 8):
          self.current_task: Final[Set] = set()
-         self.semaphore: Semaphore = Semaphore(total_concurrency)
+         # Use a dynamic semaphore instead of a plain semaphore
+         self.semaphore: DynamicSemaphore = DynamicSemaphore(max(1, total_concurrency))
+         self.logger = get_logger(self.__class__.__name__)
+
+         # Exception statistics
+         self._exception_count = 0
+         self._total_tasks = 0

      async def create_task(self, coroutine) -> Task:
          # Wait on the semaphore to limit concurrency
@@ -17,10 +83,39 @@ class TaskManager:

          task = asyncio.create_task(coroutine)
          self.current_task.add(task)
+         self._total_tasks += 1

          def done_callback(_future: Future) -> None:
-             self.current_task.remove(task)
-             self.semaphore.release()
+             try:
+                 self.current_task.discard(task)  # Use discard instead of remove to avoid a KeyError
+
+                 # Retrieve the task result or exception - this is the key step: result() or exception() must be called so the exception counts as "retrieved"
+                 try:
+                     # Try to fetch the result; any exception is re-raised here
+                     result = _future.result()
+                     # On success, success statistics could be recorded here
+                 except Exception as exception:
+                     # The exception has now been retrieved, so no "never retrieved" warning will be emitted
+                     self._exception_count += 1
+
+                     # Log the exception details
+                     self.logger.error(
+                         f"Task completed with exception: {type(exception).__name__}: {exception}"
+                     )
+                     self.logger.debug("Task exception details:", exc_info=exception)
+
+                     # Further exception handling could go here, e.g. reporting to a monitoring system
+
+             except Exception as e:
+                 # Guard against exceptions raised by the callback itself
+                 self.logger.error(f"Error in task done callback: {e}")
+             finally:
+                 # Make sure the semaphore is always released
+                 self.semaphore.release()
+
+                 # Periodically adjust the concurrency (changed from every 3 tasks to every 2)
+                 if self._total_tasks % 2 == 0:
+                     self.semaphore.adjust_concurrency()

          task.add_done_callback(done_callback)

@@ -28,3 +123,17 @@

      def all_done(self) -> bool:
          return len(self.current_task) == 0
+
+     def record_response_time(self, response_time: float):
+         """Record a task's response time, used for dynamic concurrency adjustment."""
+         self.semaphore.record_response_time(response_time)
+
+     def get_stats(self) -> dict:
+         """Return task manager statistics."""
+         return {
+             'active_tasks': len(self.current_task),
+             'total_tasks': self._total_tasks,
+             'exception_count': self._exception_count,
+             'success_rate': (self._total_tasks - self._exception_count) / max(1, self._total_tasks) * 100,
+             'current_concurrency': self.semaphore._current_value
+         }
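
The concurrency-adjustment rule above can be summarized numerically. A stand-alone sketch of the same thresholds, reimplemented here purely for illustration (it mirrors the decision rule visible in the diff; the real method additionally skips changes of 1 or less and rate-limits adjustments to once per second):

    def next_concurrency(avg_response_time: float, current: int, initial: int) -> int:
        """Mirror of DynamicSemaphore.adjust_concurrency's decision rule as shown in the diff."""
        if avg_response_time < 0.2:   # fast responses: grow by 5, capped at 3x the initial value
            return min(current + 5, initial * 3)
        if avg_response_time > 1.0:   # slow responses: shrink by 5, floored at max(1, initial // 3)
            return max(current - 5, max(1, initial // 3))
        return current                # otherwise keep the current value

    # With initial=8: repeated 0.1 s averages give 8 -> 13 -> 18 -> 23 -> 24 (cap);
    # repeated 2.0 s averages give 8 -> 3 -> 2 (floor of max(1, 8 // 3)).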