crawlo 1.3.9__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = '1.3.9'
+__version__ = '1.4.0'
crawlo/core/processor.py CHANGED
@@ -1,10 +1,12 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
-from asyncio import Queue
+from asyncio import Queue, create_task
 from typing import Union, Optional
 
 from crawlo import Request, Item
 from crawlo.pipelines.pipeline_manager import PipelineManager
+from crawlo.exceptions import ItemDiscard
+from crawlo.event import item_discard
 
 
 class Processor(object):
@@ -27,7 +29,13 @@ class Processor(object):
         await self._process_item(result)
 
     async def _process_item(self, item):
-        await self.pipelines.process_item(item=item)
+        try:
+            await self.pipelines.process_item(item=item)
+        except ItemDiscard as exc:
+            # Item was discarded by a pipeline (e.g., deduplication pipeline)
+            # We simply ignore this item and don't pass it to subsequent pipelines
+            # The statistics system has already been notified in PipelineManager, so we don't need to notify again
+            pass
 
     async def enqueue(self, output: Union[Request, Item]):
         await self.queue.put(output)
@@ -37,4 +45,4 @@ class Processor(object):
         return len(self) == 0
 
     def __len__(self):
-        return self.queue.qsize()
+        return self.queue.qsize()
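
With this release, Processor._process_item catches ItemDiscard instead of letting it escape the processing loop. A minimal sketch of a pipeline that exercises this path, assuming the usual process_item(item, spider) pipeline interface and dict-style access on Item; the pipeline class and field name are hypothetical:

from crawlo.exceptions import ItemDiscard


class RequireTitlePipeline:
    # Hypothetical pipeline: drop any item that lacks a 'title' field.
    def process_item(self, item, spider):
        if not item.get('title'):
            # PipelineManager catches this, fires the item_discard event,
            # and re-raises; Processor._process_item then swallows it.
            raise ItemDiscard("missing required field: title")
        return item
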
crawlo/crawler.py CHANGED
@@ -308,6 +308,18 @@ class ModernCrawler:
         except Exception as e:
             self._logger.warning(f"Spider cleanup failed: {e}")
 
+        # Call StatsCollector's close_spider method to set the reason and spider_name
+        if self._stats and hasattr(self._stats, 'close_spider'):
+            try:
+                # Use the default 'finished' as the reason
+                self._stats.close_spider(self._spider, reason='finished')
+            except Exception as e:
+                self._logger.warning(f"Stats close_spider failed: {e}")
+
+        # Fire the spider_closed event to notify all subscribers (including extensions),
+        # passing the default 'finished' as the reason
+        await self.subscriber.notify("spider_closed", reason='finished')
+
         if self._stats and hasattr(self._stats, 'close'):
             try:
                 close_result = self._stats.close()
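
Because the crawler now calls subscriber.notify("spider_closed", reason='finished'), any handler subscribed to that event should accept a reason keyword argument, as LogStats does later in this diff. A minimal sketch of a compatible handler (the class name and body are illustrative):

class MyExtension:
    async def spider_closed(self, reason: str = 'finished') -> None:
        # reason arrives as a keyword argument from the crawler's close path
        print(f"spider closed, reason={reason}")
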
@@ -16,6 +16,7 @@ class ExtensionManager(object):
         extensions = self.crawler.settings.get_list('EXTENSIONS')
         self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
         self._add_extensions(extensions)
+        self._subscribe_extensions()
 
     @classmethod
     def create_instance(cls, *args: Any, **kwargs: Any) -> 'ExtensionManager':
@@ -37,3 +38,27 @@ class ExtensionManager(object):
         if extensions:
             # Restore INFO-level logging and keep the key "enabled" message
             self.logger.info(f"Enabled extensions: \n{pformat(extensions)}")
+
+    def _subscribe_extensions(self) -> None:
+        """Subscribe extension methods to the corresponding events."""
+        for extension in self.extensions:
+            # Subscribe the spider_closed method
+            if hasattr(extension, 'spider_closed'):
+                self.crawler.subscriber.subscribe(extension.spider_closed, event="spider_closed")
+
+            # Subscribe the item_successful method
+            if hasattr(extension, 'item_successful'):
+                self.crawler.subscriber.subscribe(extension.item_successful, event="item_successful")
+
+            # Subscribe the item_discard method
+            if hasattr(extension, 'item_discard'):
+                self.crawler.subscriber.subscribe(extension.item_discard, event="item_discard")
+
+            # Subscribe the response_received method
+            if hasattr(extension, 'response_received'):
+                # Fix: corrected the event name from "request_received" to "response_received"
+                self.crawler.subscriber.subscribe(extension.response_received, event="response_received")
+
+            # Subscribe the request_scheduled method
+            if hasattr(extension, 'request_scheduled'):
+                self.crawler.subscriber.subscribe(extension.request_scheduled, event="request_scheduled")
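
ExtensionManager._subscribe_extensions wires hooks purely by attribute name, so an extension only needs to define methods with the expected names to receive events. A sketch of such an extension, assuming extensions are constructed with the crawler (as LogStats is) and that handler signatures match the notify calls shown elsewhere in this diff; the class itself is hypothetical:

class DiscardLoggerExtension:
    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def create_instance(cls, crawler):
        return cls(crawler)

    async def item_discard(self, item, exc, spider):
        # Auto-subscribed to the "item_discard" event; receives the arguments
        # passed by subscriber.notify(item_discard, item, exc, spider).
        print(f"discarded item: {exc}")

    async def spider_closed(self, reason: str = 'finished'):
        # Auto-subscribed to the "spider_closed" event.
        print(f"spider closed: {reason}")
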
@@ -1,52 +1,43 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
+"""
+Log statistics extension
+Provides detailed statistics about the crawl run
+"""
+import asyncio
 from typing import Any
 
-from crawlo import event
-from crawlo.tools.date_tools import now, time_diff
+from crawlo.utils.log import get_logger
+from crawlo.utils import now, time_diff
 
 
-class LogStats(object):
+class LogStats:
+    """
+    Log statistics extension that records and reports statistics gathered while the spider runs
+    """
 
-    def __init__(self, stats: Any):
-        self._stats = stats
+    def __init__(self, crawler):
+        self.crawler = crawler
+        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+        self._stats = crawler.stats
+        self._stats['start_time'] = now(fmt='%Y-%m-%d %H:%M:%S')
 
     @classmethod
-    def create_instance(cls, crawler: Any) -> 'LogStats':
-        o = cls(crawler.stats)
-        # Subscribe to all required events
-        event_subscriptions = [
-            (o.spider_opened, event.spider_opened),
-            (o.spider_closed, event.spider_closed),
-            (o.item_successful, event.item_successful),
-            (o.item_discard, event.item_discard),
-            (o.response_received, event.response_received),
-            (o.request_scheduled, event.request_scheduled),
-        ]
-
-        for handler, evt in event_subscriptions:
-            try:
-                crawler.subscriber.subscribe(handler, event=evt)
-            except Exception as e:
-                # Get a logger and record the error
-                from crawlo.utils.log import get_logger
-                logger = get_logger(cls.__name__)
-                logger.error(f"Failed to subscribe to event {evt}: {e}")
+    def from_crawler(cls, crawler):
+        return cls(crawler)
 
-        return o
-
-    async def spider_opened(self) -> None:
-        try:
-            self._stats['start_time'] = now(fmt='%Y-%m-%d %H:%M:%S')
-        except Exception as e:
-            # Handle silently to avoid affecting the crawl
-            pass
+    @classmethod
+    def create_instance(cls, crawler):
+        return cls.from_crawler(crawler)
 
-    async def spider_closed(self) -> None:
+    async def spider_closed(self, reason: str = 'finished') -> None:
         try:
             self._stats['end_time'] = now(fmt='%Y-%m-%d %H:%M:%S')
             self._stats['cost_time(s)'] = time_diff(start=self._stats['start_time'], end=self._stats['end_time'])
+            self._stats['reason'] = reason
         except Exception as e:
+            # Log the error to aid debugging
+            self.logger.error(f"Error in spider_closed: {e}")
             # Handle silently to avoid affecting the crawl
             pass
 
@@ -59,10 +50,8 @@ class LogStats(object):
 
     async def item_discard(self, _item: Any, exc: Any, _spider: Any) -> None:
         try:
+            # Only increase the overall discard count; do not record per-item discard reasons
             self._stats.inc_value('item_discard_count')
-            reason = getattr(exc, 'msg', None)  # Fetch the attribute more safely
-            if reason:
-                self._stats.inc_value(f"item_discard/{reason}")
         except Exception as e:
             # Handle silently to avoid affecting the crawl
             pass
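
LogStats now takes the whole crawler and records start_time in __init__ rather than in a spider_opened handler. It is loaded through the EXTENSIONS setting that ExtensionManager reads with settings.get_list('EXTENSIONS'); a hedged configuration sketch (the exact settings layout is assumed):

# settings.py (sketch): register the extension so ExtensionManager loads it
# and _subscribe_extensions auto-wires its spider_closed / item_* handlers.
EXTENSIONS = [
    'crawlo.extension.log_stats.LogStats',
]
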
@@ -127,9 +127,7 @@ class ResponseCodeMiddleware(object):
         """
         status_code = response.status_code
 
-        # Record per-status-code counts
-        self.stats.inc_value(f'response_status_code/count/{status_code}')
-
+        # Only record aggregate statistics, not per-domain or per-status-code details
         # Record status-code category statistics
         category = self._get_status_category(status_code)
         self.stats.inc_value(f'response_status_code/category/{category}')
@@ -144,17 +142,6 @@ class ResponseCodeMiddleware(object):
         if hasattr(response, 'content_length') and response.content_length:
             self.stats.inc_value('response_total_bytes', response.content_length)
 
-        # Record per-domain statistics
-        try:
-            from urllib.parse import urlparse
-            parsed_url = urlparse(response.url)
-            domain = parsed_url.netloc
-            if domain:
-                self.stats.inc_value(f'response_status_code/domain/{domain}/count/{status_code}')
-                self.stats.inc_value(f'response_status_code/domain/{domain}/category/{category}')
-        except Exception:
-            self.stats.inc_value('response_status_code/domain/invalid_url/count/{status_code}')
-
         # Detailed logging
         self.logger.debug(
             f'收到响应: {status_code} {response.url} '
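
Only the aggregate category counters remain after this change. The diff does not show _get_status_category itself; the helper below is purely an assumed sketch of how status codes might be bucketed into the response_status_code/category/... keys:

def _get_status_category(self, status_code: int) -> str:
    # Assumed mapping: group status codes by their hundreds digit (2xx, 3xx, ...).
    if 200 <= status_code < 300:
        return '2xx'
    if 300 <= status_code < 400:
        return '3xx'
    if 400 <= status_code < 500:
        return '4xx'
    if 500 <= status_code < 600:
        return '5xx'
    return 'other'
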
@@ -66,11 +66,19 @@ class PipelineManager:
 
     async def process_item(self, item):
         try:
-            for method in self.methods:
-                item = await common_call(method, item, self.crawler.spider)
-                if item is None:
-                    raise InvalidOutputError(f"{method.__qualname__} return None is not supported.")
-        except (ItemDiscard, DropItem) as exc:  # catch both exception types
-            create_task(self.crawler.subscriber.notify(item_discard, item, exc, self.crawler.spider))
+            for i, method in enumerate(self.methods):
+                self.logger.debug(f"Processing item with pipeline method {i}: {method.__qualname__}")
+                try:
+                    item = await common_call(method, item, self.crawler.spider)
+                    if item is None:
+                        raise InvalidOutputError(f"{method.__qualname__} return None is not supported.")
+                except (ItemDiscard, DropItem) as exc:  # catch both exception types
+                    self.logger.debug(f"Item discarded by pipeline: {exc}")
+                    create_task(self.crawler.subscriber.notify(item_discard, item, exc, self.crawler.spider))
+                    # Re-raise so the caller is also notified and subsequent pipelines are not executed
+                    raise
+        except (ItemDiscard, DropItem):
+            # The exception has already been handled and notified; just re-raise it here
+            raise
         else:
-            create_task(self.crawler.subscriber.notify(item_successful, item, self.crawler.spider))
+            create_task(self.crawler.subscriber.notify(item_successful, item, self.crawler.spider))
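
The manager now re-raises ItemDiscard/DropItem after firing the item_discard event, so both event subscribers and the calling Processor see the discard. A minimal sketch of a subscriber for that event, matching the arguments of notify(item_discard, item, exc, spider) above; the handler name is illustrative and the string event form mirrors ExtensionManager:

async def on_item_discard(item, exc, spider):
    # Receives the positional arguments passed by
    # subscriber.notify(item_discard, item, exc, spider).
    print(f"{spider} discarded an item: {exc}")

# Registration (sketch): crawler.subscriber.subscribe(on_item_discard, event="item_discard")
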
@@ -104,17 +104,20 @@ class RedisDedupPipeline:
             if not is_new:
                 # The fingerprint already exists, so discard this item
                 self.dropped_count += 1
-                # self.logger.debug(f"Dropping duplicate item: {fingerprint[:20]}...")  # duplicate log line, commented out
+                self.logger.info(f"Dropping duplicate item: {fingerprint}")
                 raise ItemDiscard(f"Duplicate item: {fingerprint}")
             else:
                 # This is a new item, continue processing
-                # self.logger.debug(f"Processing new item: {fingerprint[:20]}...")  # duplicate log line, commented out
+                self.logger.debug(f"Processing new item: {fingerprint}")
                 return item
 
         except redis.RedisError as e:
             self.logger.error(f"Redis error: {e}")
             # Keep processing on Redis errors to avoid losing data
             return item
+        except ItemDiscard:
+            # Re-raise ItemDiscard so the pipeline manager can handle it correctly
+            raise
         except Exception as e:
             self.logger.error(f"Error processing item: {e}")
             # Continue processing on other errors
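
RedisDedupPipeline decides via is_new whether a fingerprint has been seen before and raises ItemDiscard for duplicates, which the manager now propagates. A hedged sketch of that membership check with a Redis set; the key name, fingerprint helper, and use of SADD are assumptions, not the package's actual implementation:

import hashlib
import json

import redis


def item_fingerprint(item) -> str:
    # Assumption: fingerprint an item by hashing its (dict-convertible) fields.
    data = json.dumps(dict(item), sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(data.encode('utf-8')).hexdigest()


def is_new_fingerprint(client: redis.Redis, key: str, fingerprint: str) -> bool:
    # SADD returns the number of members actually added: 1 means the
    # fingerprint was new, 0 means it was already present.
    return bool(client.sadd(key, fingerprint))
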
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.3.9
+Version: 1.4.0
 Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder
@@ -1,9 +1,9 @@
 crawlo/__init__.py,sha256=rCeDq1OoX6mmcBxuK60eUpEp1cIg5T8Zgic3FUQAOkA,2318
-crawlo/__version__.py,sha256=w1HvwXrREPyUQwcUNaOv25LesFD0cwTBQjuG4ym_vww,22
+crawlo/__version__.py,sha256=EyMGX1ADFzN6XVXHWbJUtKPONYKeFkvWoKIFPDDB2I8,22
 crawlo/cli.py,sha256=OXprmcTUbFK02ptw_Gq8Gk4-ZCU-WEMJgzU1ztgP6Bk,2327
 crawlo/config.py,sha256=dNoNyTkXLe2msQ7bZx3YTQItk1m49nIg5-g89FQDNwE,9486
 crawlo/config_validator.py,sha256=gsiLqf5swWd9ISDvoLqCdG7iSXr-ZdBPD4iT6ug1ua4,11239
-crawlo/crawler.py,sha256=wd8_jrfUBwlIw4NiaNeCwMj-CXS7F2ngeUhQ74P0wJE,25656
+crawlo/crawler.py,sha256=E83JhClOe58XVX1ma0f-HAF1BJ7Ej9Zs0w51ERs3fgA,26348
 crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
 crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
 crawlo/framework.py,sha256=1RVBwj_VBzfJiMB3lq6XcfFHCjRBHyT4D_T2X4fU_6g,9166
@@ -23,7 +23,7 @@ crawlo/commands/stats.py,sha256=8wTubR1RQ1JPTlpOKArcGcQ39bM-0cuH27lDpndnwPQ,6014
 crawlo/commands/utils.py,sha256=Psfu2tKrmDloMq0WnfXLaxx0lJFitMZ-FWS3HAIrziQ,5382
 crawlo/core/__init__.py,sha256=nikMDqFwnDfE8ugqwAIfycBtIqIVZpeprjEYW-H5Dkw,1272
 crawlo/core/engine.py,sha256=0l7TVNf2R8EHJAZ4ktj71j-qysrq84cYqf_7LEzzYJM,19096
-crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
+crawlo/core/processor.py,sha256=wO6DMU-Azr0yaMLJw8LSTG19a6ZAvPuT3J7wNLfbpu4,1577
 crawlo/core/scheduler.py,sha256=By1JB0iukcss5j0nrj1rq1Lk-VmmUHIiGl0RLCH9YUs,12630
 crawlo/data/__init__.py,sha256=8MgDxcMhx-emFARcLAw_ODOZNz0neYBcx7kEbzothd8,116
 crawlo/data/user_agents.py,sha256=6V34lYHREWV5ZR5wH-1pCnr1Y3ZYC7iMLfC6vZHyhZQ,9697
@@ -34,10 +34,10 @@ crawlo/downloader/httpx_downloader.py,sha256=7jfQfvAtfk8yD_mvwUbWLhYOxMM7r1nudiU
 crawlo/downloader/hybrid_downloader.py,sha256=4SzOPEwBlSZVzUAWR3DyxMx2Tsx15YrpBvQS4it4Vps,8028
 crawlo/downloader/playwright_downloader.py,sha256=Lnc7k5cXhVnURXSxgZFCYMJkBxLg5F_OE67rtf3G7Ig,16261
 crawlo/downloader/selenium_downloader.py,sha256=B_0muNi-GQ_hgoYHcf7wgu01V68q7xKnSh-0kzlUiio,21036
-crawlo/extension/__init__.py,sha256=Y3GOEmT7YtRoWf6fxEGCnRXgn_yaXsXCJ1Y6uwjFnM8,1605
+crawlo/extension/__init__.py,sha256=7HxWQKBuiVphZUBLIBVCtIjgFIbzTa5QDOQp6WH4HhU,2923
 crawlo/extension/health_check.py,sha256=0GveZgUtFwjYEKlm3qbwIvCmb4FR0qrIKc8cEF1yQV8,5516
 crawlo/extension/log_interval.py,sha256=VCIeNqXcWDnxj4m6l77cjqgRzV8LfsPMb22X0Xc1Vwc,2417
-crawlo/extension/log_stats.py,sha256=vrChs3bj_Dvay3kxxkBOp4-w0K-IG-2XZ0PoSUahTPs,2908
+crawlo/extension/log_stats.py,sha256=X90Y_E6USAdm55yvRN5t59HNLmyN9QMKglhbPPxtehA,2382
 crawlo/extension/logging_extension.py,sha256=8KT-WJRK5tocS2kBOiSquree53L11qD1vLg-P8ob40U,2354
 crawlo/extension/memory_monitor.py,sha256=4aszl3C0GMQbqFhGZjZq5iQuXQR1sOz06VHjjEHgkyE,4290
 crawlo/extension/performance_profiler.py,sha256=EPiNuXuPPDU0Jtgy8arYHpr_8ASK13cCI2BytdJnu_I,4899
@@ -70,7 +70,7 @@ crawlo/middleware/middleware_manager.py,sha256=bQuIxn-i2oud-0hDkv890sa3YvNMbuJIR
 crawlo/middleware/offsite.py,sha256=FIWZvkkzlDJfvQc7Ud7BdfDZ78Sa85qlEEwAR76hSBk,4559
 crawlo/middleware/proxy.py,sha256=NquB6tqHAgHs3-2_1_5220kJYfjNG5JyHRJyo_2j4wo,15636
 crawlo/middleware/request_ignore.py,sha256=xcyZ1c7r_HhbzR3r9pfjsLGW7L7FBVeYvlNt8cpP2wY,2577
-crawlo/middleware/response_code.py,sha256=-Aa9Mm9nJN-WdddN7iTanJRMA83_LYYgSEz3XLQGvMo,4934
+crawlo/middleware/response_code.py,sha256=sHT-Xe9ZKLsjLso6SYAVqcD0r_4CptD82C605rjFWSs,4383
 crawlo/middleware/response_filter.py,sha256=6VBUe04mu8C7XxmOak6XyhGMWZPYEm3AMo5Kt_r1SXY,4248
 crawlo/middleware/retry.py,sha256=HxeIf7DibeLCpZ_y4rNARWMyzlrsdq5UR2CaFZInA3s,4124
 crawlo/middleware/simple_proxy.py,sha256=V_v28L-faiMJtt8vi-u5O4za-aU77_JTqNTCYSfWzCE,2191
@@ -86,8 +86,8 @@ crawlo/pipelines/json_pipeline.py,sha256=vlu1nqbD2mtqtExt9cL5nibx1CwJM1RNqd4WGjZ
 crawlo/pipelines/memory_dedup_pipeline.py,sha256=9KuUA1S0uHWSB3qJntPdg9ifPdRXwc8ju4j9tWe8qTo,3853
 crawlo/pipelines/mongo_pipeline.py,sha256=k7gNqAO-g2MtIfArphC6z5ZzkKVRkBKcv-2ImziPFA0,5706
 crawlo/pipelines/mysql_pipeline.py,sha256=_oRfIvlEiOsTKkr4v-yPTcL8nG9O9coRmke2ZSkkKII,13871
-crawlo/pipelines/pipeline_manager.py,sha256=AZPOjm7N1WcjyfIoyZpzVEchmAfZP0uFSZ_WicKL5co,3171
-crawlo/pipelines/redis_dedup_pipeline.py,sha256=2Esl-Yh6nhNzYlUsrpvT0fV8Wx4cNNU9jpwIxqOrgCM,6358
+crawlo/pipelines/pipeline_manager.py,sha256=BX17CU9JK2xJeIdzQ4FeK7kwpwew1k-BEVMk9oviqTQ,3682
+crawlo/pipelines/redis_dedup_pipeline.py,sha256=6fkHt7O-R2TTMlULgxyqPdyKBjsRzYh_GL-Juye4ZQ0,6410
 crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlo/queue/pqueue.py,sha256=j2ISmyays5t1tuI36xM6EcELwSpq2xIjAScSBWSRZms,1220
 crawlo/queue/queue_manager.py,sha256=JfkjtOD04e_OZZvEEvp3O_W3lfGXhHslZHrCgw90amY,20693
@@ -286,8 +286,8 @@ tests/verify_distributed.py,sha256=krnYYA5Qx9xXDMWc9YF5DxPSplGvawDg2n0l-3CAqoM,3
 tests/verify_log_fix.py,sha256=TD7M1R22NxLqQPufvgE-H33u9tUjyz-rSR2ayIXozRU,4225
 tests/scrapy_comparison/ofweek_scrapy.py,sha256=2Hvpi6DRTubUxBy6RyJApQxMQONPLc1zWjKTQO_i5U4,5652
 tests/scrapy_comparison/scrapy_test.py,sha256=5sw7jOHhaTmQ8bsUd1TiolAUTRQYQOe-f49HPfysqbI,5466
-crawlo-1.3.9.dist-info/METADATA,sha256=a6FUc4WoaMyek0LyyW55pQIMjeAaeRgxyDnWJXvA62I,33235
-crawlo-1.3.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-crawlo-1.3.9.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
-crawlo-1.3.9.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
-crawlo-1.3.9.dist-info/RECORD,,
+crawlo-1.4.0.dist-info/METADATA,sha256=WIambmjeZxudgM_Ej7lv8CKd1EyoQ4f-Z4CugfsHTkY,33235
+crawlo-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+crawlo-1.4.0.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+crawlo-1.4.0.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+crawlo-1.4.0.dist-info/RECORD,,
File without changes