crawlo 1.3.9__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = '1.3.9'
+__version__ = '1.4.0'
crawlo/core/processor.py CHANGED
@@ -1,10 +1,12 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
-from asyncio import Queue
+from asyncio import Queue, create_task
 from typing import Union, Optional
 
 from crawlo import Request, Item
 from crawlo.pipelines.pipeline_manager import PipelineManager
+from crawlo.exceptions import ItemDiscard
+from crawlo.event import item_discard
 
 
 class Processor(object):
@@ -27,7 +29,13 @@ class Processor(object):
         await self._process_item(result)
 
     async def _process_item(self, item):
-        await self.pipelines.process_item(item=item)
+        try:
+            await self.pipelines.process_item(item=item)
+        except ItemDiscard as exc:
+            # Item was discarded by a pipeline (e.g., deduplication pipeline)
+            # We simply ignore this item and don't pass it to subsequent pipelines
+            # The statistics system has already been notified in PipelineManager, so we don't need to notify again
+            pass
 
     async def enqueue(self, output: Union[Request, Item]):
         await self.queue.put(output)
@@ -37,4 +45,4 @@ class Processor(object):
         return len(self) == 0
 
     def __len__(self):
-        return self.queue.qsize()
+        return self.queue.qsize()
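
With this release, Processor._process_item catches ItemDiscard instead of letting it escape the processing loop. A minimal sketch of a pipeline that exercises this path, assuming the usual process_item(item, spider) pipeline interface and dict-style access on Item; the pipeline class and field name are hypothetical:

from crawlo.exceptions import ItemDiscard


class RequireTitlePipeline:
    # Hypothetical pipeline: drop any item that lacks a 'title' field.
    def process_item(self, item, spider):
        if not item.get('title'):
            # PipelineManager catches this, fires the item_discard event,
            # and re-raises; Processor._process_item then swallows it.
            raise ItemDiscard("missing required field: title")
        return item
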
crawlo/crawler.py CHANGED
@@ -308,6 +308,18 @@ class ModernCrawler:
         except Exception as e:
             self._logger.warning(f"Spider cleanup failed: {e}")
 
+        # Call StatsCollector's close_spider method to set the reason and spider_name
+        if self._stats and hasattr(self._stats, 'close_spider'):
+            try:
+                # Use the default 'finished' as the reason
+                self._stats.close_spider(self._spider, reason='finished')
+            except Exception as e:
+                self._logger.warning(f"Stats close_spider failed: {e}")
+
+        # Fire the spider_closed event to notify all subscribers (including extensions),
+        # passing the default 'finished' as the reason
+        await self.subscriber.notify("spider_closed", reason='finished')
+
         if self._stats and hasattr(self._stats, 'close'):
             try:
                 close_result = self._stats.close()
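
Because the crawler now calls subscriber.notify("spider_closed", reason='finished'), any handler subscribed to that event should accept a reason keyword argument, as LogStats does later in this diff. A minimal sketch of a compatible handler (the class name and body are illustrative):

class MyExtension:
    async def spider_closed(self, reason: str = 'finished') -> None:
        # reason arrives as a keyword argument from the crawler's close path
        print(f"spider closed, reason={reason}")
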
@@ -16,6 +16,7 @@ class ExtensionManager(object):
         extensions = self.crawler.settings.get_list('EXTENSIONS')
         self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
         self._add_extensions(extensions)
+        self._subscribe_extensions()
 
     @classmethod
     def create_instance(cls, *args: Any, **kwargs: Any) -> 'ExtensionManager':
@@ -37,3 +38,27 @@ class ExtensionManager(object):
         if extensions:
             # Restore INFO-level logging and keep the key "enabled" message
             self.logger.info(f"Enabled extensions: \n{pformat(extensions)}")
+
+    def _subscribe_extensions(self) -> None:
+        """Subscribe extension methods to the corresponding events."""
+        for extension in self.extensions:
+            # Subscribe the spider_closed method
+            if hasattr(extension, 'spider_closed'):
+                self.crawler.subscriber.subscribe(extension.spider_closed, event="spider_closed")
+
+            # Subscribe the item_successful method
+            if hasattr(extension, 'item_successful'):
+                self.crawler.subscriber.subscribe(extension.item_successful, event="item_successful")
+
+            # Subscribe the item_discard method
+            if hasattr(extension, 'item_discard'):
+                self.crawler.subscriber.subscribe(extension.item_discard, event="item_discard")
+
+            # Subscribe the response_received method
+            if hasattr(extension, 'response_received'):
+                # Fix: corrected the event name from "request_received" to "response_received"
+                self.crawler.subscriber.subscribe(extension.response_received, event="response_received")
+
+            # Subscribe the request_scheduled method
+            if hasattr(extension, 'request_scheduled'):
+                self.crawler.subscriber.subscribe(extension.request_scheduled, event="request_scheduled")
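
ExtensionManager._subscribe_extensions wires hooks purely by attribute name, so an extension only needs to define methods with the expected names to receive events. A sketch of such an extension, assuming extensions are constructed with the crawler (as LogStats is) and that handler signatures match the notify calls shown elsewhere in this diff; the class itself is hypothetical:

class DiscardLoggerExtension:
    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def create_instance(cls, crawler):
        return cls(crawler)

    async def item_discard(self, item, exc, spider):
        # Auto-subscribed to the "item_discard" event; receives the arguments
        # passed by subscriber.notify(item_discard, item, exc, spider).
        print(f"discarded item: {exc}")

    async def spider_closed(self, reason: str = 'finished'):
        # Auto-subscribed to the "spider_closed" event.
        print(f"spider closed: {reason}")
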
@@ -1,52 +1,43 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
+"""
+Log statistics extension
+Provides detailed statistics about the crawl run
+"""
+import asyncio
 from typing import Any
 
-from crawlo import event
-from crawlo.tools.date_tools import now, time_diff
+from crawlo.utils.log import get_logger
+from crawlo.utils import now, time_diff
 
 
-class LogStats(object):
+class LogStats:
+    """
+    Log statistics extension that records and reports statistics gathered while the spider runs
+    """
 
-    def __init__(self, stats: Any):
-        self._stats = stats
+    def __init__(self, crawler):
+        self.crawler = crawler
+        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+        self._stats = crawler.stats
+        self._stats['start_time'] = now(fmt='%Y-%m-%d %H:%M:%S')
 
     @classmethod
-    def create_instance(cls, crawler: Any) -> 'LogStats':
-        o = cls(crawler.stats)
-        # Subscribe to all required events
-        event_subscriptions = [
-            (o.spider_opened, event.spider_opened),
-            (o.spider_closed, event.spider_closed),
-            (o.item_successful, event.item_successful),
-            (o.item_discard, event.item_discard),
-            (o.response_received, event.response_received),
-            (o.request_scheduled, event.request_scheduled),
-        ]
-
-        for handler, evt in event_subscriptions:
-            try:
-                crawler.subscriber.subscribe(handler, event=evt)
-            except Exception as e:
-                # Get a logger and record the error
-                from crawlo.utils.log import get_logger
-                logger = get_logger(cls.__name__)
-                logger.error(f"Failed to subscribe to event {evt}: {e}")
+    def from_crawler(cls, crawler):
+        return cls(crawler)
 
-        return o
-
-    async def spider_opened(self) -> None:
-        try:
-            self._stats['start_time'] = now(fmt='%Y-%m-%d %H:%M:%S')
-        except Exception as e:
-            # Handle silently to avoid affecting the crawl
-            pass
+    @classmethod
+    def create_instance(cls, crawler):
+        return cls.from_crawler(crawler)
 
-    async def spider_closed(self) -> None:
+    async def spider_closed(self, reason: str = 'finished') -> None:
         try:
             self._stats['end_time'] = now(fmt='%Y-%m-%d %H:%M:%S')
             self._stats['cost_time(s)'] = time_diff(start=self._stats['start_time'], end=self._stats['end_time'])
+            self._stats['reason'] = reason
         except Exception as e:
+            # Log the error to aid debugging
+            self.logger.error(f"Error in spider_closed: {e}")
             # Handle silently to avoid affecting the crawl
             pass
 
@@ -59,10 +50,8 @@ class LogStats(object):
 
     async def item_discard(self, _item: Any, exc: Any, _spider: Any) -> None:
         try:
+            # Only increase the overall discard count; do not record per-item discard reasons
             self._stats.inc_value('item_discard_count')
-            reason = getattr(exc, 'msg', None)  # Fetch the attribute more safely
-            if reason:
-                self._stats.inc_value(f"item_discard/{reason}")
         except Exception as e:
             # Handle silently to avoid affecting the crawl
             pass
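
LogStats now takes the whole crawler and records start_time in __init__ rather than in a spider_opened handler. It is loaded through the EXTENSIONS setting that ExtensionManager reads with settings.get_list('EXTENSIONS'); a hedged configuration sketch (the exact settings layout is assumed):

# settings.py (sketch): register the extension so ExtensionManager loads it
# and _subscribe_extensions auto-wires its spider_closed / item_* handlers.
EXTENSIONS = [
    'crawlo.extension.log_stats.LogStats',
]
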
@@ -127,9 +127,7 @@ class ResponseCodeMiddleware(object):
         """
         status_code = response.status_code
 
-        # Record per-status-code counts
-        self.stats.inc_value(f'response_status_code/count/{status_code}')
-
+        # Only record aggregate statistics, not per-domain or per-status-code details
         # Record status-code category statistics
         category = self._get_status_category(status_code)
         self.stats.inc_value(f'response_status_code/category/{category}')
@@ -144,17 +142,6 @@ class ResponseCodeMiddleware(object):
         if hasattr(response, 'content_length') and response.content_length:
             self.stats.inc_value('response_total_bytes', response.content_length)
 
-        # Record per-domain statistics
-        try:
-            from urllib.parse import urlparse
-            parsed_url = urlparse(response.url)
-            domain = parsed_url.netloc
-            if domain:
-                self.stats.inc_value(f'response_status_code/domain/{domain}/count/{status_code}')
-                self.stats.inc_value(f'response_status_code/domain/{domain}/category/{category}')
-        except Exception:
-            self.stats.inc_value('response_status_code/domain/invalid_url/count/{status_code}')
-
         # Detailed logging
         self.logger.debug(
             f'收到响应: {status_code} {response.url} '
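
Only the aggregate category counters remain after this change. The diff does not show _get_status_category itself; the helper below is purely an assumed sketch of how status codes might be bucketed into the response_status_code/category/... keys:

def _get_status_category(self, status_code: int) -> str:
    # Assumed mapping: group status codes by their hundreds digit (2xx, 3xx, ...).
    if 200 <= status_code < 300:
        return '2xx'
    if 300 <= status_code < 400:
        return '3xx'
    if 400 <= status_code < 500:
        return '4xx'
    if 500 <= status_code < 600:
        return '5xx'
    return 'other'
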
@@ -66,11 +66,19 @@ class PipelineManager:
 
     async def process_item(self, item):
         try:
-            for method in self.methods:
-                item = await common_call(method, item, self.crawler.spider)
-                if item is None:
-                    raise InvalidOutputError(f"{method.__qualname__} return None is not supported.")
-        except (ItemDiscard, DropItem) as exc:  # catch both exception types
-            create_task(self.crawler.subscriber.notify(item_discard, item, exc, self.crawler.spider))
+            for i, method in enumerate(self.methods):
+                self.logger.debug(f"Processing item with pipeline method {i}: {method.__qualname__}")
+                try:
+                    item = await common_call(method, item, self.crawler.spider)
+                    if item is None:
+                        raise InvalidOutputError(f"{method.__qualname__} return None is not supported.")
+                except (ItemDiscard, DropItem) as exc:  # catch both exception types
+                    self.logger.debug(f"Item discarded by pipeline: {exc}")
+                    create_task(self.crawler.subscriber.notify(item_discard, item, exc, self.crawler.spider))
+                    # Re-raise so the caller is also notified and subsequent pipelines are not executed
+                    raise
+        except (ItemDiscard, DropItem):
+            # The exception has already been handled and notified; just re-raise it here
+            raise
         else:
-            create_task(self.crawler.subscriber.notify(item_successful, item, self.crawler.spider))
+            create_task(self.crawler.subscriber.notify(item_successful, item, self.crawler.spider))
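
The manager now re-raises ItemDiscard/DropItem after firing the item_discard event, so both event subscribers and the calling Processor see the discard. A minimal sketch of a subscriber for that event, matching the arguments of notify(item_discard, item, exc, spider) above; the handler name is illustrative and the string event form mirrors ExtensionManager:

async def on_item_discard(item, exc, spider):
    # Receives the positional arguments passed by
    # subscriber.notify(item_discard, item, exc, spider).
    print(f"{spider} discarded an item: {exc}")

# Registration (sketch): crawler.subscriber.subscribe(on_item_discard, event="item_discard")
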
@@ -104,17 +104,20 @@ class RedisDedupPipeline:
             if not is_new:
                 # The fingerprint already exists, so discard this item
                 self.dropped_count += 1
-                # self.logger.debug(f"Dropping duplicate item: {fingerprint[:20]}...")  # duplicate log line, commented out
+                self.logger.info(f"Dropping duplicate item: {fingerprint}")
                 raise ItemDiscard(f"Duplicate item: {fingerprint}")
             else:
                 # This is a new item, continue processing
-                # self.logger.debug(f"Processing new item: {fingerprint[:20]}...")  # duplicate log line, commented out
+                self.logger.debug(f"Processing new item: {fingerprint}")
                 return item
 
         except redis.RedisError as e:
             self.logger.error(f"Redis error: {e}")
             # Keep processing on Redis errors to avoid losing data
             return item
+        except ItemDiscard:
+            # Re-raise ItemDiscard so the pipeline manager can handle it correctly
+            raise
         except Exception as e:
             self.logger.error(f"Error processing item: {e}")
             # Continue processing on other errors
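
RedisDedupPipeline decides via is_new whether a fingerprint has been seen before and raises ItemDiscard for duplicates, which the manager now propagates. A hedged sketch of that membership check with a Redis set; the key name, fingerprint helper, and use of SADD are assumptions, not the package's actual implementation:

import hashlib
import json

import redis


def item_fingerprint(item) -> str:
    # Assumption: fingerprint an item by hashing its (dict-convertible) fields.
    data = json.dumps(dict(item), sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(data.encode('utf-8')).hexdigest()


def is_new_fingerprint(client: redis.Redis, key: str, fingerprint: str) -> bool:
    # SADD returns the number of members actually added: 1 means the
    # fingerprint was new, 0 means it was already present.
    return bool(client.sadd(key, fingerprint))
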
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.3.9
+Version: 1.4.0
 Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder
@@ -1,9 +1,9 @@
 crawlo/__init__.py,sha256=rCeDq1OoX6mmcBxuK60eUpEp1cIg5T8Zgic3FUQAOkA,2318
-crawlo/__version__.py,sha256=w1HvwXrREPyUQwcUNaOv25LesFD0cwTBQjuG4ym_vww,22
+crawlo/__version__.py,sha256=EyMGX1ADFzN6XVXHWbJUtKPONYKeFkvWoKIFPDDB2I8,22
 crawlo/cli.py,sha256=OXprmcTUbFK02ptw_Gq8Gk4-ZCU-WEMJgzU1ztgP6Bk,2327
 crawlo/config.py,sha256=dNoNyTkXLe2msQ7bZx3YTQItk1m49nIg5-g89FQDNwE,9486
 crawlo/config_validator.py,sha256=gsiLqf5swWd9ISDvoLqCdG7iSXr-ZdBPD4iT6ug1ua4,11239
-crawlo/crawler.py,sha256=wd8_jrfUBwlIw4NiaNeCwMj-CXS7F2ngeUhQ74P0wJE,25656
+crawlo/crawler.py,sha256=E83JhClOe58XVX1ma0f-HAF1BJ7Ej9Zs0w51ERs3fgA,26348
 crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
 crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
 crawlo/framework.py,sha256=1RVBwj_VBzfJiMB3lq6XcfFHCjRBHyT4D_T2X4fU_6g,9166
@@ -23,7 +23,7 @@ crawlo/commands/stats.py,sha256=8wTubR1RQ1JPTlpOKArcGcQ39bM-0cuH27lDpndnwPQ,6014
 crawlo/commands/utils.py,sha256=Psfu2tKrmDloMq0WnfXLaxx0lJFitMZ-FWS3HAIrziQ,5382
 crawlo/core/__init__.py,sha256=nikMDqFwnDfE8ugqwAIfycBtIqIVZpeprjEYW-H5Dkw,1272
 crawlo/core/engine.py,sha256=0l7TVNf2R8EHJAZ4ktj71j-qysrq84cYqf_7LEzzYJM,19096
-crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
+crawlo/core/processor.py,sha256=wO6DMU-Azr0yaMLJw8LSTG19a6ZAvPuT3J7wNLfbpu4,1577
 crawlo/core/scheduler.py,sha256=By1JB0iukcss5j0nrj1rq1Lk-VmmUHIiGl0RLCH9YUs,12630
 crawlo/data/__init__.py,sha256=8MgDxcMhx-emFARcLAw_ODOZNz0neYBcx7kEbzothd8,116
 crawlo/data/user_agents.py,sha256=6V34lYHREWV5ZR5wH-1pCnr1Y3ZYC7iMLfC6vZHyhZQ,9697
@@ -34,10 +34,10 @@ crawlo/downloader/httpx_downloader.py,sha256=7jfQfvAtfk8yD_mvwUbWLhYOxMM7r1nudiU
 crawlo/downloader/hybrid_downloader.py,sha256=4SzOPEwBlSZVzUAWR3DyxMx2Tsx15YrpBvQS4it4Vps,8028
 crawlo/downloader/playwright_downloader.py,sha256=Lnc7k5cXhVnURXSxgZFCYMJkBxLg5F_OE67rtf3G7Ig,16261
 crawlo/downloader/selenium_downloader.py,sha256=B_0muNi-GQ_hgoYHcf7wgu01V68q7xKnSh-0kzlUiio,21036
-crawlo/extension/__init__.py,sha256=Y3GOEmT7YtRoWf6fxEGCnRXgn_yaXsXCJ1Y6uwjFnM8,1605
+crawlo/extension/__init__.py,sha256=7HxWQKBuiVphZUBLIBVCtIjgFIbzTa5QDOQp6WH4HhU,2923
 crawlo/extension/health_check.py,sha256=0GveZgUtFwjYEKlm3qbwIvCmb4FR0qrIKc8cEF1yQV8,5516
 crawlo/extension/log_interval.py,sha256=VCIeNqXcWDnxj4m6l77cjqgRzV8LfsPMb22X0Xc1Vwc,2417
-crawlo/extension/log_stats.py,sha256=vrChs3bj_Dvay3kxxkBOp4-w0K-IG-2XZ0PoSUahTPs,2908
+crawlo/extension/log_stats.py,sha256=X90Y_E6USAdm55yvRN5t59HNLmyN9QMKglhbPPxtehA,2382
 crawlo/extension/logging_extension.py,sha256=8KT-WJRK5tocS2kBOiSquree53L11qD1vLg-P8ob40U,2354
 crawlo/extension/memory_monitor.py,sha256=4aszl3C0GMQbqFhGZjZq5iQuXQR1sOz06VHjjEHgkyE,4290
 crawlo/extension/performance_profiler.py,sha256=EPiNuXuPPDU0Jtgy8arYHpr_8ASK13cCI2BytdJnu_I,4899
@@ -70,7 +70,7 @@ crawlo/middleware/middleware_manager.py,sha256=bQuIxn-i2oud-0hDkv890sa3YvNMbuJIR
 crawlo/middleware/offsite.py,sha256=FIWZvkkzlDJfvQc7Ud7BdfDZ78Sa85qlEEwAR76hSBk,4559
 crawlo/middleware/proxy.py,sha256=NquB6tqHAgHs3-2_1_5220kJYfjNG5JyHRJyo_2j4wo,15636
 crawlo/middleware/request_ignore.py,sha256=xcyZ1c7r_HhbzR3r9pfjsLGW7L7FBVeYvlNt8cpP2wY,2577
-crawlo/middleware/response_code.py,sha256=-Aa9Mm9nJN-WdddN7iTanJRMA83_LYYgSEz3XLQGvMo,4934
+crawlo/middleware/response_code.py,sha256=sHT-Xe9ZKLsjLso6SYAVqcD0r_4CptD82C605rjFWSs,4383
 crawlo/middleware/response_filter.py,sha256=6VBUe04mu8C7XxmOak6XyhGMWZPYEm3AMo5Kt_r1SXY,4248
 crawlo/middleware/retry.py,sha256=HxeIf7DibeLCpZ_y4rNARWMyzlrsdq5UR2CaFZInA3s,4124
 crawlo/middleware/simple_proxy.py,sha256=V_v28L-faiMJtt8vi-u5O4za-aU77_JTqNTCYSfWzCE,2191
@@ -86,8 +86,8 @@ crawlo/pipelines/json_pipeline.py,sha256=vlu1nqbD2mtqtExt9cL5nibx1CwJM1RNqd4WGjZ
 crawlo/pipelines/memory_dedup_pipeline.py,sha256=9KuUA1S0uHWSB3qJntPdg9ifPdRXwc8ju4j9tWe8qTo,3853
 crawlo/pipelines/mongo_pipeline.py,sha256=k7gNqAO-g2MtIfArphC6z5ZzkKVRkBKcv-2ImziPFA0,5706
 crawlo/pipelines/mysql_pipeline.py,sha256=_oRfIvlEiOsTKkr4v-yPTcL8nG9O9coRmke2ZSkkKII,13871
-crawlo/pipelines/pipeline_manager.py,sha256=AZPOjm7N1WcjyfIoyZpzVEchmAfZP0uFSZ_WicKL5co,3171
-crawlo/pipelines/redis_dedup_pipeline.py,sha256=2Esl-Yh6nhNzYlUsrpvT0fV8Wx4cNNU9jpwIxqOrgCM,6358
+crawlo/pipelines/pipeline_manager.py,sha256=BX17CU9JK2xJeIdzQ4FeK7kwpwew1k-BEVMk9oviqTQ,3682
+crawlo/pipelines/redis_dedup_pipeline.py,sha256=6fkHt7O-R2TTMlULgxyqPdyKBjsRzYh_GL-Juye4ZQ0,6410
 crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlo/queue/pqueue.py,sha256=j2ISmyays5t1tuI36xM6EcELwSpq2xIjAScSBWSRZms,1220
 crawlo/queue/queue_manager.py,sha256=JfkjtOD04e_OZZvEEvp3O_W3lfGXhHslZHrCgw90amY,20693
@@ -286,8 +286,8 @@ tests/verify_distributed.py,sha256=krnYYA5Qx9xXDMWc9YF5DxPSplGvawDg2n0l-3CAqoM,3
 tests/verify_log_fix.py,sha256=TD7M1R22NxLqQPufvgE-H33u9tUjyz-rSR2ayIXozRU,4225
 tests/scrapy_comparison/ofweek_scrapy.py,sha256=2Hvpi6DRTubUxBy6RyJApQxMQONPLc1zWjKTQO_i5U4,5652
 tests/scrapy_comparison/scrapy_test.py,sha256=5sw7jOHhaTmQ8bsUd1TiolAUTRQYQOe-f49HPfysqbI,5466
-crawlo-1.3.9.dist-info/METADATA,sha256=a6FUc4WoaMyek0LyyW55pQIMjeAaeRgxyDnWJXvA62I,33235
-crawlo-1.3.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-crawlo-1.3.9.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
-crawlo-1.3.9.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
-crawlo-1.3.9.dist-info/RECORD,,
+crawlo-1.4.0.dist-info/METADATA,sha256=WIambmjeZxudgM_Ej7lv8CKd1EyoQ4f-Z4CugfsHTkY,33235
+crawlo-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+crawlo-1.4.0.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+crawlo-1.4.0.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+crawlo-1.4.0.dist-info/RECORD,,
File without changes