crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/queue/redis_priority_queue.py
CHANGED

@@ -1,19 +1,26 @@
 import asyncio
-import asyncio
 import pickle
 import time
 import traceback
-from typing import Optional, TYPE_CHECKING
+from typing import Optional, TYPE_CHECKING, List, Union, Any

 import redis.asyncio as aioredis

+# Try to import Redis cluster support
+try:
+    from redis.asyncio.cluster import RedisCluster
+    REDIS_CLUSTER_AVAILABLE = True
+except ImportError:
+    RedisCluster = None
+    REDIS_CLUSTER_AVAILABLE = False
+
 # Use TYPE_CHECKING to avoid circular imports at runtime
 if TYPE_CHECKING:
     from crawlo import Request

-from crawlo.utils.error_handler import ErrorHandler
-from crawlo.
-from crawlo.utils.redis_connection_pool import get_redis_pool,
+from crawlo.utils.error_handler import ErrorHandler, ErrorContext
+from crawlo.logging import get_logger
+from crawlo.utils.redis_connection_pool import get_redis_pool, RedisConnectionPool
 from crawlo.utils.request_serializer import RequestSerializer

 # Lazy initialization to avoid circular dependencies
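The try/except import guard above makes cluster support optional. A minimal, self-contained sketch of how such an optional-dependency flag is typically consumed when building a client (illustrative only; the make_client helper below is hypothetical and not part of crawlo, and it assumes redis-py >= 4.3 with asyncio support):

# Illustrative sketch only - not crawlo code.
import redis.asyncio as aioredis

try:
    from redis.asyncio.cluster import RedisCluster, ClusterNode
    REDIS_CLUSTER_AVAILABLE = True
except ImportError:
    RedisCluster = None
    ClusterNode = None
    REDIS_CLUSTER_AVAILABLE = False


def make_client(redis_url, cluster_nodes=None):
    """Return a cluster client when nodes are given and cluster support is installed,
    otherwise fall back to a single-instance client."""
    if cluster_nodes and REDIS_CLUSTER_AVAILABLE:
        startup = [ClusterNode(host, int(port))
                   for host, port in (node.split(":") for node in cluster_nodes)]
        return RedisCluster(startup_nodes=startup, decode_responses=False)
    return aioredis.from_url(redis_url, decode_responses=False)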
@@ -42,14 +49,16 @@ class RedisPriorityQueue:

     def __init__(
         self,
-        redis_url: str = None,
-        queue_name: str = None,  # default changed to None
-        processing_queue: str = None,  # default changed to None
-        failed_queue: str = None,  # default changed to None
+        redis_url: Optional[str] = None,
+        queue_name: Optional[str] = None,  # default changed to None
+        processing_queue: Optional[str] = None,  # default changed to None
+        failed_queue: Optional[str] = None,  # default changed to None
         max_retries: int = 3,
         timeout: int = 300,  # task processing timeout (seconds)
         max_connections: int = 10,  # connection pool size
-        module_name: str = "default"  # module_name parameter
+        module_name: str = "default",  # module_name parameter
+        is_cluster: bool = False,  # whether to run in cluster mode
+        cluster_nodes: Optional[List[str]] = None  # list of cluster nodes
     ):
         # Do not read os.getenv() directly; redis_url must be passed in as a parameter
         if redis_url is None:

@@ -58,6 +67,8 @@ class RedisPriorityQueue:

         self.redis_url = redis_url
         self.module_name = module_name  # store module_name
+        self.is_cluster = is_cluster
+        self.cluster_nodes = cluster_nodes

         # If queue_name is not provided, derive it from module_name
         if queue_name is None:

@@ -87,8 +98,8 @@ class RedisPriorityQueue:
         self.max_retries = max_retries
         self.timeout = timeout
         self.max_connections = max_connections
-        self._redis_pool: Optional[
-        self._redis: Optional[
+        self._redis_pool: Optional[RedisConnectionPool] = None
+        self._redis: Optional[Any] = None
         self._lock = asyncio.Lock()  # lock used for connection initialization
         self.request_serializer = RequestSerializer()  # handles serialization

@@ -150,6 +161,8 @@ class RedisPriorityQueue:
             # Use the optimized connection pool; decode_responses=False avoids encoding issues
             self._redis_pool = get_redis_pool(
                 self.redis_url,
+                is_cluster=self.is_cluster,
+                cluster_nodes=self.cluster_nodes,
                 max_connections=self.max_connections,
                 socket_connect_timeout=5,
                 socket_timeout=30,

@@ -162,9 +175,8 @@ class RedisPriorityQueue:
                 self._redis = await self._redis_pool.get_connection()

                 # Test the connection
-
-
-                # get_module_logger().debug(f"Redis 连接成功 (Module: {self.module_name})")  # duplicate log entry, commented out
+                if self._redis:
+                    await self._redis.ping()
                 return self._redis
             except Exception as e:
                 error_msg = f"Redis 连接失败 (尝试 {attempt + 1}/{max_retries}, Module: {self.module_name}): {e}"

@@ -180,16 +192,28 @@ class RedisPriorityQueue:
         if self._redis is None:
             await self.connect()
         try:
-
+            if self._redis:
+                await self._redis.ping()
         except Exception as e:
             get_module_logger().warning(f"Redis 连接失效 (Module: {self.module_name}),尝试重连...: {e}")
             self._redis = None
             await self.connect()

+    def _is_cluster_mode(self) -> bool:
+        """Check whether the client is running in cluster mode"""
+        if REDIS_CLUSTER_AVAILABLE and RedisCluster is not None:
+            # Check whether _redis is a RedisCluster instance
+            if self._redis is not None and isinstance(self._redis, RedisCluster):
+                return True
+        return False
+
     async def put(self, request, priority: int = 0) -> bool:
         """Put a request onto the queue"""
         try:
             await self._ensure_connection()
+            if not self._redis:
+                return False
+
             # Fix priority ordering consistency
             # Before: score = -priority (requests with a larger priority value were popped first)
             # Now: score = priority (smaller priority values pop first, consistent with the memory queue)

@@ -208,18 +232,34 @@ class RedisPriorityQueue:
                 get_module_logger().error(f"请求序列化验证失败 (Module: {self.module_name}): {serialize_error}")
                 return False

-
-
-
-
+            # Cluster-mode handling
+            if self._is_cluster_mode():
+                # In cluster mode all keys must land in the same slot,
+                # which is achieved by adding the same hash tag to every key name
+                hash_tag = "{queue}"  # the hash tag keeps the keys in one slot
+                queue_name_with_tag = f"{self.queue_name}{hash_tag}"
+                data_key_with_tag = f"{self.queue_name}:data{hash_tag}"
+
+                pipe = self._redis.pipeline()
+                pipe.zadd(queue_name_with_tag, {key: score})
+                pipe.hset(data_key_with_tag, key, serialized)
+                result = await pipe.execute()
+            else:
+                pipe = self._redis.pipeline()
+                pipe.zadd(self.queue_name, {key: score})
+                pipe.hset(f"{self.queue_name}:data", key, serialized)
+                result = await pipe.execute()

             if result[0] > 0:
-                get_module_logger().debug(f"成功入队 (Module: {self.module_name}): {request.url}")
+                get_module_logger().debug(f"成功入队 (Module: {self.module_name}): {request.url}")
             return result[0] > 0
         except Exception as e:
+            error_context = ErrorContext(
+                context=f"放入队列失败 (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )
             return False
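The "{queue}" hash tag above is what makes the ZADD/HSET pipeline legal on a cluster: Redis Cluster only allows multi-key operations when all keys hash to the same slot, and a shared "{...}" tag forces exactly that. A standalone sketch of the same technique (illustrative only; key names and payload are made up, and it assumes redis-py's asyncio client):

# Standalone illustration of the hash-tag trick; assumes redis-py's asyncio client.
import asyncio
import redis.asyncio as aioredis


async def enqueue(url: str) -> None:
    r = aioredis.from_url("redis://localhost:6379/0", decode_responses=False)
    # Both keys share the literal "{queue}" tag, so Redis Cluster hashes them
    # to the same slot and the pipelined multi-key write is allowed.
    zset_key = "demo:requests{queue}"
    hash_key = "demo:requests:data{queue}"
    pipe = r.pipeline()
    pipe.zadd(zset_key, {url: 0})          # priority score
    pipe.hset(hash_key, url, b"payload")   # serialized request body
    await pipe.execute()
    await r.close()


asyncio.run(enqueue("https://example.com"))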
@@ -231,24 +271,54 @@ class RedisPriorityQueue:
         """
         try:
             await self._ensure_connection()
+            if not self._redis:
+                return None
+
             start_time = asyncio.get_event_loop().time()

             while True:
                 # Try to fetch a task
-
+                if self._is_cluster_mode():
+                    # Cluster-mode handling
+                    hash_tag = "{queue}"
+                    queue_name_with_tag = f"{self.queue_name}{hash_tag}"
+                    result = await self._redis.zpopmin(queue_name_with_tag, count=1)
+                else:
+                    result = await self._redis.zpopmin(self.queue_name, count=1)
+
                 if result:
                     key, score = result[0]
-
+                    data_key = f"{self.queue_name}:data"
+                    if self._is_cluster_mode():
+                        hash_tag = "{queue}"
+                        data_key = f"{self.queue_name}:data{hash_tag}"
+
+                    serialized = await self._redis.hget(data_key, key)
                     if not serialized:
                         continue

                     # Move the task to the processing queue
                     processing_key = f"{key}:{int(time.time())}"
-
-
-
-
-
+                    processing_queue = self.processing_queue
+                    processing_data_key = f"{self.processing_queue}:data"
+
+                    if self._is_cluster_mode():
+                        hash_tag = "{queue}"
+                        processing_queue = f"{self.processing_queue}{hash_tag}"
+                        processing_data_key = f"{self.processing_queue}:data{hash_tag}"
+
+                    if self._is_cluster_mode():
+                        pipe = self._redis.pipeline()
+                        pipe.zadd(processing_queue, {processing_key: time.time() + self.timeout})
+                        pipe.hset(processing_data_key, processing_key, serialized)
+                        pipe.hdel(data_key, key)
+                        await pipe.execute()
+                    else:
+                        pipe = self._redis.pipeline()
+                        pipe.zadd(processing_queue, {processing_key: time.time() + self.timeout})
+                        pipe.hset(processing_data_key, processing_key, serialized)
+                        pipe.hdel(data_key, key)
+                        await pipe.execute()

                     # Safer deserialization
                     try:

@@ -263,8 +333,12 @@ class RedisPriorityQueue:
                     # If pickle deserialization fails, log the error and skip this task
                         get_module_logger().error(f"无法反序列化请求数据 (Module: {self.module_name}): {pickle_error}")
                         # Remove this invalid task from the processing queue
-
-
+                        if self._is_cluster_mode():
+                            await self._redis.zrem(processing_queue, processing_key)
+                            await self._redis.hdel(processing_data_key, processing_key)
+                        else:
+                            await self._redis.zrem(processing_queue, processing_key)
+                            await self._redis.hdel(processing_data_key, processing_key)
                         # Move on to the next task
                         continue

@@ -276,9 +350,12 @@ class RedisPriorityQueue:
                 await asyncio.sleep(0.001)  # reduced from 0.01 to 0.001

         except Exception as e:
+            error_context = ErrorContext(
+                context=f"获取队列任务失败 (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )
             return None

@@ -287,22 +364,46 @@ class RedisPriorityQueue:
         """Acknowledge that a task has completed"""
         try:
             await self._ensure_connection()
+            if not self._redis:
+                return
+
             key = self._get_request_key(request)
+            processing_queue = self.processing_queue
+            processing_data_key = f"{self.processing_queue}:data"
+
+            if self._is_cluster_mode():
+                hash_tag = "{queue}"
+                processing_queue = f"{self.processing_queue}{hash_tag}"
+                processing_data_key = f"{self.processing_queue}:data{hash_tag}"
+
             cursor = 0
             while True:
-
+                if self._is_cluster_mode():
+                    cursor, keys = await self._redis.zscan(processing_queue, cursor, match=f"{key}:*")
+                else:
+                    cursor, keys = await self._redis.zscan(processing_queue, cursor, match=f"{key}:*")
                 if keys:
-
-
-
-
+                    if self._is_cluster_mode():
+                        pipe = self._redis.pipeline()
+                        for k in keys:
+                            pipe.zrem(processing_queue, k)
+                            pipe.hdel(processing_data_key, k)
+                        await pipe.execute()
+                    else:
+                        pipe = self._redis.pipeline()
+                        for k in keys:
+                            pipe.zrem(processing_queue, k)
+                            pipe.hdel(processing_data_key, k)
+                        await pipe.execute()
                 if cursor == 0:
                     break
         except Exception as e:
+            error_context = ErrorContext(
+                context=f"确认任务完成失败 (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )

@@ -310,10 +411,20 @@ class RedisPriorityQueue:
         """Mark a task as failed"""
         try:
             await self._ensure_connection()
+            if not self._redis:
+                return
+
             key = self._get_request_key(request)
             await self.ack(request)

             retry_key = f"{self.failed_queue}:retries:{key}"
+            failed_queue = self.failed_queue
+
+            if self._is_cluster_mode():
+                hash_tag = "{queue}"
+                retry_key = f"{self.failed_queue}:retries:{key}{hash_tag}"
+                failed_queue = f"{self.failed_queue}{hash_tag}"
+
             retries = await self._redis.incr(retry_key)
             await self._redis.expire(retry_key, 86400)

@@ -329,12 +440,15 @@ class RedisPriorityQueue:
                     "failed_at": time.time(),
                     "request_pickle": pickle.dumps(request).hex(),  # optional: keep the full request
                 }
-                await self._redis.lpush(
+                await self._redis.lpush(failed_queue, pickle.dumps(failed_data))
                 get_module_logger().error(f"任务彻底失败 [{retries}次] (Module: {self.module_name}): {request.url}")
         except Exception as e:
+            error_context = ErrorContext(
+                context=f"标记任务失败失败 (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )

@@ -346,11 +460,22 @@ class RedisPriorityQueue:
         """Get queue size"""
         try:
             await self._ensure_connection()
-
+            if not self._redis:
+                return 0
+
+            if self._is_cluster_mode():
+                hash_tag = "{queue}"
+                queue_name_with_tag = f"{self.queue_name}{hash_tag}"
+                return await self._redis.zcard(queue_name_with_tag)
+            else:
+                return await self._redis.zcard(self.queue_name)
         except Exception as e:
+            error_context = ErrorContext(
+                context=f"Failed to get queue size (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
             )
             return 0

@@ -358,13 +483,37 @@ class RedisPriorityQueue:
     async def close(self):
         """Close the connection"""
         try:
-            #
-            self._redis
+            # Explicitly close the Redis connection
+            if self._redis is not None:
+                try:
+                    # Try to close the connection
+                    if hasattr(self._redis, 'close'):
+                        close_result = self._redis.close()
+                        if asyncio.iscoroutine(close_result):
+                            await close_result
+
+                    # Wait for the connection to finish closing
+                    if hasattr(self._redis, 'wait_closed'):
+                        wait_result = self._redis.wait_closed()
+                        if asyncio.iscoroutine(wait_result):
+                            await wait_result
+                except Exception as close_error:
+                    get_module_logger().warning(
+                        f"Error closing Redis connection (Module: {self.module_name}): {close_error}"
+                    )
+                finally:
+                    self._redis = None
+
+            # Drop the connection-pool reference (the pool itself is owned by the global manager)
             self._redis_pool = None
+
             get_module_logger().debug(f"Redis 连接已释放 (Module: {self.module_name})")
         except Exception as e:
+            error_context = ErrorContext(
+                context=f"释放 Redis 连接失败 (Module: {self.module_name})"
+            )
             get_module_error_handler().handle_error(
                 e,
-                context=
+                context=error_context,
                 raise_error=False
-            )
+            )
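The close() rewrite above guards every call with hasattr and asyncio.iscoroutine so it works whether the underlying client exposes a synchronous or an asynchronous close()/wait_closed(). A compact sketch of that defensive pattern in isolation (illustrative only, not crawlo code):

# Generic "maybe await" helper: call an optional method and await it only if it
# returned a coroutine. Works with both sync and async client implementations.
import asyncio


async def call_and_maybe_await(obj, method_name: str) -> None:
    method = getattr(obj, method_name, None)
    if method is None:
        return                      # the client simply doesn't have this method
    result = method()
    if asyncio.iscoroutine(result):
        await result                # async client: wait for the coroutine to finish


async def shutdown(client) -> None:
    # Mirrors the order used above: close first, then wait for the close to settle.
    await call_and_maybe_await(client, "close")
    await call_and_maybe_await(client, "wait_closed")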
crawlo/settings/default_settings.py
CHANGED

@@ -4,7 +4,7 @@
 Contains all default settings of the Crawlo framework
 """
 # Import the environment-variable configuration helper
-from crawlo.utils.
+from crawlo.utils.config_manager import EnvConfigManager

 # --------------------------------- 1. Framework base configuration ------------------------------------

@@ -19,9 +19,9 @@ FRAMEWORK_INIT_ORDER = [
 FRAMEWORK_INIT_STATE = 'uninitialized'

 # Project base configuration
-runtime_config = get_runtime_config()
+runtime_config = EnvConfigManager.get_runtime_config()
 PROJECT_NAME = runtime_config['PROJECT_NAME']  # project name (used for logging, Redis keys, etc.)
-VERSION = get_version()  # project version - read from the framework's __version__.py, falling back to a default
+VERSION = EnvConfigManager.get_version()  # project version - read from the framework's __version__.py, falling back to a default
 RUN_MODE = runtime_config['CRAWLO_MODE']  # run mode: standalone/distributed/auto
 CONCURRENCY = runtime_config['CONCURRENCY']  # concurrency setting

@@ -70,12 +70,19 @@ MYSQL_INSERT_IGNORE = False  # whether to use INSERT IGNORE (ignore duplicate rows)
 MYSQL_UPDATE_COLUMNS = ()  # columns to update on conflict; when set, MYSQL_AUTO_UPDATE is ignored

 # Redis configuration
-redis_config = get_redis_config()
+redis_config = EnvConfigManager.get_redis_config()
 REDIS_HOST = redis_config['REDIS_HOST']
 REDIS_PORT = redis_config['REDIS_PORT']
 REDIS_PASSWORD = redis_config['REDIS_PASSWORD']
 REDIS_DB = redis_config['REDIS_DB']

+# Redis cluster support:
+# The Crawlo framework switches intelligently between Redis single-instance and cluster mode.
+# Cluster mode can be configured in two ways:
+# 1. A comma-separated node list: '192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
+# 2. A cluster URL: 'redis-cluster://192.168.1.100:7000,192.168.1.101:7000,192.168.1.102:7000'
+# The framework detects the URL format automatically and picks the appropriate mode.
+
 # Build a different URL format depending on whether a password is configured
 if REDIS_PASSWORD:
     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
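The cluster note above describes two accepted spellings for the node list. A small sketch of how a 'redis-cluster://' style value could be reduced to a host:port list (the helper below is hypothetical and only illustrates the format; it is not crawlo's detection code):

# Hypothetical parser for the two cluster formats described above.
from typing import List, Tuple


def parse_cluster_nodes(value: str) -> Tuple[bool, List[str]]:
    """Return (is_cluster, nodes) for a comma-separated list or a redis-cluster:// URL."""
    if value.startswith("redis-cluster://"):
        value = value[len("redis-cluster://"):]
        is_cluster = True
    else:
        # A plain comma-separated list is also treated as a cluster definition.
        is_cluster = "," in value
    nodes = [node.strip() for node in value.split(",") if node.strip()]
    return is_cluster, nodes


# Example:
# parse_cluster_nodes('redis-cluster://192.168.1.100:7000,192.168.1.101:7000')
# -> (True, ['192.168.1.100:7000', '192.168.1.101:7000'])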
@@ -142,8 +149,11 @@ STATS_DUMP = True  # whether to dump statistics periodically
 LOG_FILE = None  # log file path, set in the project configuration
 LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
 LOG_ENCODING = 'utf-8'
-LOG_MAX_BYTES = 10 * 1024 * 1024  #
-LOG_BACKUP_COUNT = 5  #
+LOG_MAX_BYTES = 10 * 1024 * 1024  # log rotation size in bytes; 20MB recommended in production
+LOG_BACKUP_COUNT = 5  # number of log backups; 10 recommended in production
+# To disable log rotation entirely, set LOG_MAX_BYTES = 0.
+# Note: when LOG_MAX_BYTES or LOG_BACKUP_COUNT is 0, rotation never happens and the log file keeps growing;
+# manage disk space by other means, such as a system-level rotation tool (logrotate, etc.).

 # Log interval configuration
 INTERVAL = 60  # log output interval (seconds)
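The rotation semantics described above match Python's standard logging.handlers.RotatingFileHandler, where rollover never occurs if maxBytes or backupCount is zero. A minimal sketch wiring these settings to such a handler (assumed mapping for illustration; the framework's actual wiring lives in its logging package):

# Sketch: map the LOG_* settings onto a stdlib RotatingFileHandler.
import logging
import os
from logging.handlers import RotatingFileHandler

LOG_FILE = "logs/crawlo.log"
LOG_FORMAT = "%(asctime)s - [%(name)s] - %(levelname)s: %(message)s"
LOG_ENCODING = "utf-8"
LOG_MAX_BYTES = 20 * 1024 * 1024
LOG_BACKUP_COUNT = 10

os.makedirs("logs", exist_ok=True)
handler = RotatingFileHandler(
    LOG_FILE,
    maxBytes=LOG_MAX_BYTES,        # 0 disables size-based rollover
    backupCount=LOG_BACKUP_COUNT,  # 0 keeps a single, ever-growing file
    encoding=LOG_ENCODING,
)
handler.setFormatter(logging.Formatter(LOG_FORMAT))
logging.getLogger("crawlo").addHandler(handler)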
|
crawlo/spider/__init__.py
CHANGED
|
@@ -28,10 +28,11 @@ Crawlo Spider Module
|
|
|
28
28
|
yield Item(data=response.json())
|
|
29
29
|
"""
|
|
30
30
|
from __future__ import annotations
|
|
31
|
-
from typing import Type, Any, Optional, List, Dict, Union, Iterator, AsyncIterator
|
|
32
|
-
from ..network.request import Request
|
|
33
|
-
from ..utils.log import get_logger
|
|
34
31
|
|
|
32
|
+
from typing import Type, Any, Optional, List, Dict, Iterator
|
|
33
|
+
|
|
34
|
+
from ..logging import get_logger
|
|
35
|
+
from ..network.request import Request
|
|
35
36
|
|
|
36
37
|
# 全局爬虫注册表
|
|
37
38
|
_DEFAULT_SPIDER_REGISTRY: dict[str, Type[Spider]] = {}
|
|
@@ -79,7 +80,7 @@ class SpiderMeta(type):
|
|
|
79
80
|
_DEFAULT_SPIDER_REGISTRY[spider_name] = cls
|
|
80
81
|
# 延迟初始化logger避免模块级别阻塞
|
|
81
82
|
try:
|
|
82
|
-
from crawlo.
|
|
83
|
+
from crawlo.logging import get_logger
|
|
83
84
|
get_logger(__name__).debug(f"自动注册爬虫: {spider_name} -> {cls.__name__}")
|
|
84
85
|
except:
|
|
85
86
|
# 如果日志系统未初始化,静默失败
|
|
@@ -171,7 +172,7 @@ class Spider(metaclass=SpiderMeta):
|
|
|
171
172
|
def logger(self):
|
|
172
173
|
"""延迟初始化logger"""
|
|
173
174
|
if self._logger is None:
|
|
174
|
-
from crawlo.
|
|
175
|
+
from crawlo.logging import get_logger
|
|
175
176
|
self._logger = get_logger(self.name)
|
|
176
177
|
return self._logger
|
|
177
178
|
|
crawlo/stats_collector.py
CHANGED

@@ -6,7 +6,7 @@
 # @Desc   : statistics collector
 """
 from pprint import pformat
-from crawlo.
+from crawlo.logging import get_logger


 class StatsCollector(object):

@@ -15,7 +15,7 @@ class StatsCollector(object):
         self.crawler = crawler
         self._dump = self.crawler.settings.get_bool('STATS_DUMP')
         self._stats = {}
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)

     def inc_value(self, key, count=1, start=0):
         self._stats[key] = self._stats.setdefault(key, start) + count
crawlo/task_manager.py
CHANGED
crawlo/templates/crawlo.cfg.tmpl
CHANGED

crawlo/templates/project/items.py.tmpl
CHANGED

@@ -1,17 +1,13 @@
 # -*- coding: UTF-8 -*-
 """
-
-======================
-Define the data structures you scrape.
+Item definitions
 """

 from crawlo.items import Item, Field


 class {{project_name|title}}Item(Item):
-    """
-    Items for the {{project_name}} project.
-    """
+    """Item definition"""
     id = Field()
     # price = Field()
     # description = Field()
crawlo/templates/project/settings.py.tmpl
CHANGED

@@ -4,13 +4,13 @@
 =============================
 Spider project configuration based on the Crawlo framework.

-This configuration uses CrawloConfig.
-
+This configuration is created with the CrawloConfig.auto() factory in auto-detect mode:
+the framework probes Redis availability and uses distributed mode when Redis is reachable, otherwise standalone mode.
 """

 from crawlo.config import CrawloConfig

-#
+# Create the configuration with the auto-detect configuration factory
 config = CrawloConfig.auto(
     project_name='{{project_name}}',
     concurrency=8,
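For readers unfamiliar with the auto-detect idea, the sketch below shows the general shape of such a probe: try to reach Redis and fall back to standalone mode on failure. It is purely illustrative and does not reflect CrawloConfig's actual implementation:

# Illustrative only - a generic "is Redis reachable?" probe, not CrawloConfig internals.
import redis


def detect_run_mode(redis_url: str = "redis://127.0.0.1:6379/0") -> str:
    """Return 'distributed' when Redis answers a PING, otherwise 'standalone'."""
    client = redis.Redis.from_url(redis_url, socket_connect_timeout=2)
    try:
        client.ping()
        return "distributed"
    except redis.exceptions.RedisError:
        return "standalone"
    finally:
        client.close()


print(detect_run_mode())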
@@ -25,12 +25,10 @@ locals().update(config.to_dict())
 # Spider module configuration
 SPIDER_MODULES = ['{{project_name}}.spiders']

-#
-# Default request headers for DefaultHeaderMiddleware
+# Default request headers
 # DEFAULT_REQUEST_HEADERS = {}

 # Allowed domains
-# Allowed domains for OffsiteMiddleware
 # ALLOWED_DOMAINS = []

 # Data pipelines

@@ -58,6 +56,10 @@ SPIDER_MODULES = ['{{project_name}}.spiders']
 LOG_LEVEL = 'INFO'
 LOG_FILE = 'logs/{{project_name}}.log'
 LOG_ENCODING = 'utf-8'  # explicitly set the log file encoding
+LOG_MAX_BYTES = 20 * 1024 * 1024  # 20MB, recommended value
+LOG_BACKUP_COUNT = 10  # 10 backup files, recommended value
+# To disable log rotation, set LOG_MAX_BYTES = 0
+# When LOG_MAX_BYTES or LOG_BACKUP_COUNT is 0, rotation is disabled and the log file keeps growing
 STATS_DUMP = True

 # Output configuration

@@ -103,10 +105,10 @@ MONGO_USE_BATCH = False  # whether to enable batch inserts

 # =================================== Proxy configuration ===================================

-#
-#
+# Simple proxy (SimpleProxyMiddleware)
+# The middleware is enabled automatically once a proxy list is configured
 # PROXY_LIST = ["http://proxy1:8080", "http://proxy2:8080"]

-#
-#
+# Dynamic proxy (ProxyMiddleware)
+# The middleware is enabled automatically once a proxy API URL is configured
 # PROXY_API_URL = "http://your-proxy-api.com/get-proxy"