crawlo-1.4.4-py3-none-any.whl → crawlo-1.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (85)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/startproject.py +24 -0
  4. crawlo/core/engine.py +2 -2
  5. crawlo/core/scheduler.py +4 -4
  6. crawlo/crawler.py +8 -7
  7. crawlo/downloader/__init__.py +5 -2
  8. crawlo/extension/__init__.py +2 -2
  9. crawlo/filters/aioredis_filter.py +8 -1
  10. crawlo/filters/memory_filter.py +8 -1
  11. crawlo/initialization/built_in.py +13 -4
  12. crawlo/initialization/core.py +5 -4
  13. crawlo/interfaces.py +24 -0
  14. crawlo/middleware/__init__.py +7 -4
  15. crawlo/middleware/middleware_manager.py +15 -8
  16. crawlo/mode_manager.py +45 -11
  17. crawlo/network/response.py +374 -69
  18. crawlo/pipelines/mysql_pipeline.py +6 -6
  19. crawlo/pipelines/pipeline_manager.py +2 -2
  20. crawlo/project.py +2 -4
  21. crawlo/settings/default_settings.py +4 -0
  22. crawlo/task_manager.py +2 -2
  23. crawlo/templates/project/items.py.tmpl +2 -2
  24. crawlo/templates/project/middlewares.py.tmpl +9 -89
  25. crawlo/templates/project/pipelines.py.tmpl +8 -68
  26. crawlo/tools/__init__.py +0 -11
  27. crawlo/utils/__init__.py +17 -1
  28. crawlo/utils/db_helper.py +220 -319
  29. crawlo/utils/error_handler.py +313 -67
  30. crawlo/utils/fingerprint.py +3 -4
  31. crawlo/utils/misc.py +82 -0
  32. crawlo/utils/request.py +55 -66
  33. crawlo/utils/selector_helper.py +138 -0
  34. crawlo/utils/spider_loader.py +185 -45
  35. crawlo/utils/text_helper.py +95 -0
  36. crawlo-1.4.5.dist-info/METADATA +329 -0
  37. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/RECORD +76 -49
  38. tests/bug_check_test.py +251 -0
  39. tests/direct_selector_helper_test.py +97 -0
  40. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  41. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  42. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  43. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  44. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  45. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  46. tests/ofweek_scrapy/scrapy.cfg +11 -0
  47. tests/performance_comparison.py +4 -5
  48. tests/simple_crawlo_test.py +1 -2
  49. tests/simple_follow_test.py +39 -0
  50. tests/simple_response_selector_test.py +95 -0
  51. tests/simple_selector_helper_test.py +155 -0
  52. tests/simple_selector_test.py +208 -0
  53. tests/simple_url_test.py +74 -0
  54. tests/test_crawler_process_import.py +39 -0
  55. tests/test_crawler_process_spider_modules.py +48 -0
  56. tests/test_edge_cases.py +7 -5
  57. tests/test_encoding_core.py +57 -0
  58. tests/test_encoding_detection.py +127 -0
  59. tests/test_factory_compatibility.py +197 -0
  60. tests/test_optimized_selector_naming.py +101 -0
  61. tests/test_priority_behavior.py +18 -18
  62. tests/test_response_follow.py +105 -0
  63. tests/test_response_selector_methods.py +93 -0
  64. tests/test_response_url_methods.py +71 -0
  65. tests/test_response_urljoin.py +87 -0
  66. tests/test_scrapy_style_encoding.py +113 -0
  67. tests/test_selector_helper.py +101 -0
  68. tests/test_selector_optimizations.py +147 -0
  69. tests/test_spider_loader.py +50 -0
  70. tests/test_spider_loader_comprehensive.py +70 -0
  71. tests/test_spiders/__init__.py +1 -0
  72. tests/test_spiders/test_spider.py +10 -0
  73. crawlo/tools/anti_crawler.py +0 -269
  74. crawlo/utils/class_loader.py +0 -26
  75. crawlo/utils/enhanced_error_handler.py +0 -357
  76. crawlo-1.4.4.dist-info/METADATA +0 -190
  77. tests/simple_log_test.py +0 -58
  78. tests/simple_test.py +0 -48
  79. tests/test_framework_logger.py +0 -67
  80. tests/test_framework_startup.py +0 -65
  81. tests/test_mode_change.py +0 -73
  82. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
  83. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
  84. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
  85. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
crawlo/__init__.py CHANGED
@@ -3,14 +3,17 @@
  """
  Crawlo - 一个异步爬虫框架
  """
- from typing import TYPE_CHECKING
 
- from crawlo.spider import Spider
+ # 为了向后兼容,从tools中导入cleaners相关的功能
+ import crawlo.tools as cleaners
+ from crawlo import tools
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.downloader import DownloaderBase
  from crawlo.items import Item, Field
+ from crawlo.middleware import BaseMiddleware
  from crawlo.network.request import Request
  from crawlo.network.response import Response
- from crawlo.downloader import DownloaderBase
- from crawlo.middleware import BaseMiddleware
+ from crawlo.spider import Spider
  from crawlo.utils import (
      TimeUtils,
      parse_time,
@@ -24,21 +27,13 @@ from crawlo.utils import (
      to_local,
      from_timestamp_with_tz
  )
- from crawlo import tools
-
- # 框架核心模块 - 使用TYPE_CHECKING避免循环导入
- if TYPE_CHECKING:
-     from crawlo.initialization import get_framework_initializer, initialize_framework
-
- # 为了向后兼容,从tools中导入cleaners相关的功能
- import crawlo.tools as cleaners
 
 
 # 延迟导入的辅助函数
 def get_framework_initializer():
-     """延迟导入get_framework_initializer以避免循环依赖"""
-     from crawlo.initialization import get_framework_initializer as _get_framework_initializer
-     return _get_framework_initializer()
+     """延迟导入CoreInitializer以避免循环依赖"""
+     from crawlo.initialization import CoreInitializer
+     return CoreInitializer()
 
 
 def initialize_framework(custom_settings=None):
@@ -87,6 +82,7 @@ __all__ = [
      'from_timestamp_with_tz',
      'cleaners',
      'tools',
+     'CrawlerProcess',
      'get_framework_initializer',
      'get_bootstrap_manager',
      '__version__',
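
Note on the export change above: CrawlerProcess is now imported in crawlo/__init__.py and listed in __all__, so it can be imported from the package root as well as from its original module. A minimal sketch (the constructor arguments are the ones shown in the crawlo/crawler.py hunk further down):

    from crawlo import CrawlerProcess            # new top-level export in 1.4.5
    # from crawlo.crawler import CrawlerProcess  # original import path still works

    process = CrawlerProcess(max_concurrency=3)  # settings default to initialize_framework()
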
crawlo/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = '1.4.4'
+ __version__ = '1.4.5'
crawlo/commands/startproject.py CHANGED
@@ -92,8 +92,32 @@ def _render_template(tmpl_path, context):
      """读取模板文件,替换 {{key}} 为 context 中的值"""
      with open(tmpl_path, 'r', encoding='utf-8') as f:
          content = f.read()
+
+     # 处理简单的过滤器语法 {{key|filter}}
+     import re
+
+     def apply_filter(value, filter_name):
+         if filter_name == 'title':
+             # 将 snake_case 转换为 TitleCase
+             words = value.replace('_', ' ').split()
+             return ''.join(word.capitalize() for word in words)
+         return value
+
+     # 查找并替换 {{key|filter}} 格式的占位符
+     pattern = r'\{\{([^}|]+)\|([^}]+)\}\}'
+     def replace_filter_match(match):
+         key = match.group(1).strip()
+         filter_name = match.group(2).strip()
+         if key in context:
+             return str(apply_filter(context[key], filter_name))
+         return match.group(0)  # 如果找不到key,保持原样
+
+     content = re.sub(pattern, replace_filter_match, content)
+
+     # 处理普通的 {{key}} 占位符
      for key, value in context.items():
          content = content.replace(f'{{{{{key}}}}}', str(value))
+
      return content
 
 
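
The new {{key|filter}} handling above only implements a title filter (snake_case to TitleCase) and then falls back to plain {{key}} substitution. A self-contained sketch of the same rendering logic, shown purely to illustrate the behaviour; the function name render and the sample template are not part of the package:

    import re

    def render(content: str, context: dict) -> str:
        # Mirror of the filter-aware substitution added to _render_template (illustrative).
        def apply_filter(value, filter_name):
            if filter_name == 'title':
                # snake_case -> TitleCase
                return ''.join(w.capitalize() for w in value.replace('_', ' ').split())
            return value

        def replace_filter_match(match):
            key, filter_name = match.group(1).strip(), match.group(2).strip()
            return str(apply_filter(context[key], filter_name)) if key in context else match.group(0)

        content = re.sub(r'\{\{([^}|]+)\|([^}]+)\}\}', replace_filter_match, content)
        for key, value in context.items():
            content = content.replace(f'{{{{{key}}}}}', str(value))
        return content

    print(render("class {{project_name|title}}Spider:  # {{project_name}}",
                 {"project_name": "my_crawler"}))
    # class MyCrawlerSpider:  # my_crawler
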
crawlo/core/engine.py CHANGED
@@ -11,7 +11,7 @@ from crawlo.core.scheduler import Scheduler
  from crawlo.downloader import DownloaderBase
  from crawlo.event import spider_opened, spider_error, request_scheduled
  from crawlo.exceptions import OutputError
- from crawlo.utils.class_loader import load_class
+ from crawlo.utils.misc import load_object
  from crawlo.spider import Spider
  from crawlo.task_manager import TaskManager
  from crawlo.utils.func_tools import transform
@@ -62,7 +62,7 @@ class Engine(object):
              self.logger.warning(f"无法使用下载器类型 '{downloader_type}': {e},回退到默认配置")
 
          # 方式2: 使用 DOWNLOADER 完整类路径(兼容旧版本)
-         downloader_cls = load_class(self.settings.get('DOWNLOADER'))
+         downloader_cls = load_object(self.settings.get('DOWNLOADER'))
          if not issubclass(downloader_cls, DownloaderBase):
              raise TypeError(f'下载器 {downloader_cls.__name__} 不是 DownloaderBase 的子类。')
          return downloader_cls
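
This release replaces load_class from the removed crawlo/utils/class_loader.py with load_object from the new crawlo/utils/misc.py throughout. The helper's body is not part of this diff; a dotted-path loader of this kind is usually just importlib plus getattr, roughly as sketched below (an assumption, not the actual implementation):

    from importlib import import_module

    def load_object(path: str):
        # Resolve 'package.module.Name' to the object it names (illustrative sketch only).
        module_path, _, name = path.rpartition('.')
        if not module_path:
            raise ValueError(f"'{path}' is not a full dotted path")
        module = import_module(module_path)
        try:
            return getattr(module, name)
        except AttributeError:
            raise NameError(f"module {module_path!r} has no attribute {name!r}")

    # e.g. the engine resolves the configured downloader class this way:
    # downloader_cls = load_object(settings.get('DOWNLOADER'))
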
crawlo/core/scheduler.py CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, Callable
  from crawlo.utils.log import get_logger
  from crawlo.utils.request import set_request
  from crawlo.utils.error_handler import ErrorHandler
- from crawlo.utils.class_loader import load_class
+ from crawlo.utils.misc import load_object
  from crawlo.project import common_call
  from crawlo.utils.request_serializer import RequestSerializer
  from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
@@ -26,7 +26,7 @@ class Scheduler:
 
      @classmethod
      def create_instance(cls, crawler):
-         filter_cls = load_class(crawler.settings.get('FILTER_CLASS'))
+         filter_cls = load_object(crawler.settings.get('FILTER_CLASS'))
          o = cls(
              crawler=crawler,
              dupe_filter=filter_cls.create_instance(crawler),
@@ -120,7 +120,7 @@ class Scheduler:
          # 如果需要更新配置,则执行更新
          if needs_config_update:
              # 重新创建过滤器实例,确保使用更新后的配置
-             filter_cls = load_class(self.crawler.settings.get('FILTER_CLASS'))
+             filter_cls = load_object(self.crawler.settings.get('FILTER_CLASS'))
              self.dupe_filter = filter_cls.create_instance(self.crawler)
 
              # 记录警告信息
@@ -136,7 +136,7 @@ class Scheduler:
              self._switch_to_memory_config()
 
          # 重新创建过滤器实例
-         filter_cls = load_class(self.crawler.settings.get('FILTER_CLASS'))
+         filter_cls = load_object(self.crawler.settings.get('FILTER_CLASS'))
          self.dupe_filter = filter_cls.create_instance(self.crawler)
 
      def _is_filter_matching_queue_type(self, current_filter_class):
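
Because the scheduler resolves its dedup filter from the FILTER_CLASS setting at runtime, switching between the in-memory and Redis-backed filters is a configuration change only. A brief sketch using the class paths that appear in the mode presets later in this diff:

    # standalone preset
    settings = {'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter'}
    # distributed preset
    # settings = {'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter'}

    # as in Scheduler.create_instance above:
    # filter_cls = load_object(crawler.settings.get('FILTER_CLASS'))
    # dupe_filter = filter_cls.create_instance(crawler)
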
crawlo/crawler.py CHANGED
@@ -13,14 +13,14 @@
 
 import asyncio
 import time
- from contextlib import asynccontextmanager
- from dataclasses import dataclass
 from enum import Enum
+ from dataclasses import dataclass
+ from contextlib import asynccontextmanager
 from typing import Optional, Type, Dict, Any, List
 
+ from crawlo.logging import get_logger
 from crawlo.factories import get_component_registry
 from crawlo.initialization import initialize_framework, is_framework_ready
- from crawlo.logging import get_logger
 
 
 class CrawlerState(Enum):
@@ -345,6 +345,7 @@ class CrawlerProcess:
      """
 
      def __init__(self, settings=None, max_concurrency: int = 3, spider_modules=None):
+         # 初始化框架配置
          self._settings = settings or initialize_framework()
          self._max_concurrency = max_concurrency
          self._crawlers: List[ModernCrawler] = []
@@ -353,14 +354,14 @@ class CrawlerProcess:
 
          # 如果没有显式提供spider_modules,则从settings中获取
          if spider_modules is None and self._settings:
-             spider_modules = self._settings.get('SPIDER_MODULES')
+             spider_modules = self._settings.get('SPIDER_MODULES', [])
              self._logger.debug(f"从settings中获取SPIDER_MODULES: {spider_modules}")
 
-         self._spider_modules = spider_modules  # 保存spider_modules
+         self._spider_modules = spider_modules or []  # 保存spider_modules
 
          # 如果提供了spider_modules,自动注册这些模块中的爬虫
-         if spider_modules:
-             self._register_spider_modules(spider_modules)
+         if self._spider_modules:
+             self._register_spider_modules(self._spider_modules)
 
          # 指标
          self._start_time: Optional[float] = None
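
With the changes above, spider_modules falls back to SPIDER_MODULES from settings and is normalised to an empty list, so passing nothing no longer trips over None. A usage sketch; the module path 'myproject.spiders' is a placeholder, not something defined by crawlo:

    from crawlo import CrawlerProcess

    # Explicit module list: the listed packages are scanned and their spiders registered.
    process = CrawlerProcess(spider_modules=['myproject.spiders'])

    # No explicit list: SPIDER_MODULES is read from settings; if it is absent,
    # the process simply starts with no auto-registered spiders.
    process = CrawlerProcess()
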
crawlo/downloader/__init__.py CHANGED
@@ -15,12 +15,15 @@ Crawlo Downloader Module
  - ActivateRequestManager: 活跃请求管理器
  """
 from abc import abstractmethod, ABCMeta
- from typing import Final, Set, Optional
+ from typing import Final, Set, Optional, TYPE_CHECKING
 from contextlib import asynccontextmanager
 
 from crawlo.utils.log import get_logger
 from crawlo.middleware.middleware_manager import MiddlewareManager
 
+ if TYPE_CHECKING:
+     from crawlo import Response
+
 
 class ActivateRequestManager:
      """活跃请求管理器 - 跟踪和管理正在处理的请求"""
@@ -134,7 +137,7 @@ class DownloaderBase(metaclass=DownloaderMeta):
              self.logger.error(f"中间件初始化失败: {e}")
              raise
 
-     async def fetch(self, request) -> Optional['Response']:
+     async def fetch(self, request) -> 'Optional[Response]':
          """获取请求响应(经过中间件处理)"""
          if self._closed:
              raise RuntimeError(f"{self.__class__.__name__} 已关闭")
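
The TYPE_CHECKING guard plus quoted annotation used here (and again in the middleware modules below) keeps the Response type hint without importing crawlo at runtime, which is what previously created the circular import. The pattern in isolation, as a sketch:

    from typing import TYPE_CHECKING, Optional

    if TYPE_CHECKING:
        # Only evaluated by static type checkers, never at runtime.
        from crawlo import Response

    class ExampleDownloader:
        async def fetch(self, request) -> 'Optional[Response]':
            # The string annotation is resolved lazily, so no import cycle is triggered.
            ...
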
crawlo/extension/__init__.py CHANGED
@@ -4,7 +4,7 @@ from typing import List, Any
 from pprint import pformat
 
 from crawlo.utils.log import get_logger
- from crawlo.utils.class_loader import load_class
+ from crawlo.utils.misc import load_object
 from crawlo.exceptions import ExtensionInitError
 
 
@@ -25,7 +25,7 @@ class ExtensionManager(object):
      def _add_extensions(self, extensions: List[str]) -> None:
          for extension_path in extensions:
              try:
-                 extension_cls = load_class(extension_path)
+                 extension_cls = load_object(extension_path)
                  if not hasattr(extension_cls, 'create_instance'):
                      raise ExtensionInitError(
                          f"Extension '{extension_path}' init failed: Must have method 'create_instance()'"
crawlo/filters/aioredis_filter.py CHANGED
@@ -142,7 +142,14 @@ class AioRedisFilter(BaseFilter):
          if redis_client is None:
              return False
 
-         fp = str(request_fingerprint(request))
+         # 使用统一的指纹生成器
+         from crawlo.utils.fingerprint import FingerprintGenerator
+         fp = str(FingerprintGenerator.request_fingerprint(
+             request.method,
+             request.url,
+             request.body or b'',
+             dict(request.headers) if hasattr(request, 'headers') else None
+         ))
          self._redis_operations += 1
 
          # 使用 pipeline 优化性能
crawlo/filters/memory_filter.py CHANGED
@@ -102,7 +102,14 @@ class MemoryFilter(BaseFilter):
          :return: 是否重复
          """
          with self._lock:
-             fp = request_fingerprint(request)
+             # 使用统一的指纹生成器
+             from crawlo.utils.fingerprint import FingerprintGenerator
+             fp = FingerprintGenerator.request_fingerprint(
+                 request.method,
+                 request.url,
+                 request.body or b'',
+                 dict(request.headers) if hasattr(request, 'headers') else None
+             )
              if fp in self.fingerprints:
                  self._dupe_count += 1
                  # if self.debug:
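
Both filters now delegate to FingerprintGenerator.request_fingerprint(method, url, body, headers) instead of the old request_fingerprint(request). The generator's implementation is not included in this diff; the sketch below shows one plausible shape (a hash over the normalised request parts) purely for orientation, and the hash algorithm and field order are assumptions:

    import hashlib
    from typing import Optional

    def request_fingerprint(method: str, url: str, body: bytes = b'',
                            headers: Optional[dict] = None) -> str:
        # Illustrative only: a stable digest over the parts the filters pass in.
        h = hashlib.sha256()
        h.update(method.upper().encode())
        h.update(url.encode())
        h.update(body or b'')
        for key in sorted(headers or {}):
            h.update(f'{key.lower()}:{headers[key]}'.encode())
        return h.hexdigest()

    # The filters then only ask whether the digest was seen before
    # (a Python set for MemoryFilter, a Redis set for AioRedisFilter).
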
crawlo/initialization/built_in.py CHANGED
@@ -210,8 +210,17 @@ class SettingsInitializer(BaseInitializer):
          from crawlo.settings.setting_manager import SettingManager
          from crawlo.project import _load_project_settings
 
-         # 创建配置管理器并加载项目配置
-         settings = _load_project_settings(context.custom_settings)
+         # 如果上下文中已有设置,则使用它作为基础配置
+         if context.settings:
+             # 使用用户传递的设置作为基础配置
+             settings = context.settings
+             # 加载项目配置并合并
+             project_settings = _load_project_settings(context.custom_settings)
+             # 合并配置,用户配置优先
+             settings.update_attributes(project_settings.attributes)
+         else:
+             # 创建配置管理器并加载项目配置
+             settings = _load_project_settings(context.custom_settings)
 
          # 存储到上下文
          context.settings = settings
@@ -346,8 +355,8 @@ class ExtensionsInitializer(BaseInitializer):
          initialized_extensions = []
          for extension_path in extensions:
              try:
-                 from crawlo.utils.class_loader import load_class
-                 extension_class = load_class(extension_path)
+                 from crawlo.utils.misc import load_object
+                 extension_class = load_object(extension_path)
                  extension_instance = extension_class()
                  initialized_extensions.append(extension_instance)
              except Exception as e:
crawlo/initialization/core.py CHANGED
@@ -4,14 +4,14 @@
 核心初始化器 - 协调整个初始化过程
 """
 
- import time
 import threading
+ import time
 from typing import Optional, Any
 
- from .context import InitializationContext
- from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition
- from .registry import get_global_registry, BaseInitializer, register_initializer
 from .built_in import register_built_in_initializers
+ from .context import InitializationContext
+ from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition
+ from .registry import get_global_registry
 
 
 class CoreInitializer:
@@ -78,6 +78,7 @@ class CoreInitializer:
          # 创建初始化上下文
          context = InitializationContext()
          context.custom_settings = kwargs
+         context.settings = settings
          self._context = context
 
          try:
crawlo/interfaces.py ADDED
@@ -0,0 +1,24 @@
+ from abc import ABC, abstractmethod
+ from typing import List, Type, Protocol
+
+ from crawlo.spider import Spider
+ from crawlo.network.request import Request
+
+
+ class ISpiderLoader(Protocol):
+     """Spider loader interface"""
+
+     @abstractmethod
+     def load(self, spider_name: str) -> Type[Spider]:
+         """Load a spider by name"""
+         pass
+
+     @abstractmethod
+     def list(self) -> List[str]:
+         """List all available spider names"""
+         pass
+
+     @abstractmethod
+     def find_by_request(self, request: Request) -> List[str]:
+         """Find spider names that can handle the given request"""
+         pass
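
ISpiderLoader is a typing.Protocol, so any object with structurally matching load/list/find_by_request methods satisfies it without inheriting from it. A minimal in-memory sketch; the registry dict and the allowed_domains-based matching are assumptions for illustration:

    from typing import Dict, List, Type

    from crawlo.network.request import Request
    from crawlo.spider import Spider

    class DictSpiderLoader:
        # Structurally satisfies ISpiderLoader; a real loader would scan SPIDER_MODULES.

        def __init__(self, registry: Dict[str, Type[Spider]]):
            self._registry = registry

        def load(self, spider_name: str) -> Type[Spider]:
            return self._registry[spider_name]

        def list(self) -> List[str]:
            return sorted(self._registry)

        def find_by_request(self, request: Request) -> List[str]:
            # Naive match on an assumed allowed_domains attribute.
            host = request.url.split('//', 1)[-1].split('/', 1)[0]
            return [name for name, cls in self._registry.items()
                    if any(host.endswith(d) for d in getattr(cls, 'allowed_domains', []) or [host])]
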
crawlo/middleware/__init__.py CHANGED
@@ -1,18 +1,21 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
- from crawlo import Request, Response
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from crawlo import Request, Response
 
 
 class BaseMiddleware(object):
-     def process_request(self, request, spider) -> None | Request | Response:
+     def process_request(self, request, spider) -> 'None | Request | Response':
          # 请求预处理
          pass
 
-     def process_response(self, request, response, spider) -> Request | Response:
+     def process_response(self, request, response, spider) -> 'Request | Response':
          # 响应预处理
          pass
 
-     def process_exception(self, request, exp, spider) -> None | Request | Response:
+     def process_exception(self, request, exp, spider) -> 'None | Request | Response':
          # 异常预处理
          pass
 
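
The quoted return annotations change nothing for subclasses; a middleware is still written by overriding the three hooks and returning None to pass control along. A short sketch, assuming request.headers behaves like a dict (the header value and spider attribute are illustrative):

    from crawlo.middleware import BaseMiddleware

    class DefaultHeadersMiddleware(BaseMiddleware):
        # Illustrative middleware: inject a User-Agent, pass everything else through.

        def process_request(self, request, spider):
            request.headers.setdefault('User-Agent', getattr(spider, 'user_agent', 'crawlo'))
            return None  # None -> continue with the next middleware

        def process_response(self, request, response, spider):
            return response  # must hand back a Request or Response

        def process_exception(self, request, exp, spider):
            return None  # None -> let the next handler (or the caller) deal with it
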
crawlo/middleware/middleware_manager.py CHANGED
@@ -4,11 +4,18 @@ from pprint import pformat
 from types import MethodType
 from asyncio import create_task
 from collections import defaultdict
- from typing import List, Dict, Callable, Optional
+ from typing import List, Dict, Callable, Optional, TYPE_CHECKING
 
- from crawlo import Request, Response
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from crawlo import Request, Response
+ else:
+     # 为 isinstance 检查导入实际的类
+     from crawlo.network.request import Request
+     from crawlo.network.response import Response
 from crawlo.utils.log import get_logger
- from crawlo.utils.class_loader import load_class
+ from crawlo.utils.misc import load_object
 from crawlo.middleware import BaseMiddleware
 from crawlo.project import common_call
 from crawlo.event import ignore_request, response_received
@@ -30,7 +37,7 @@ class MiddlewareManager:
          self.download_method: Callable = crawler.engine.downloader.download
          self._stats = crawler.stats
 
-     async def _process_request(self, request: Request):
+     async def _process_request(self, request: 'Request'):
          for method in self.methods['process_request']:
              result = await common_call(method, request, self.crawler.spider)
              if result is None:
@@ -42,7 +49,7 @@ class MiddlewareManager:
              )
          return await self.download_method(request)
 
-     async def _process_response(self, request: Request, response: Response):
+     async def _process_response(self, request: 'Request', response: 'Response'):
          for method in reversed(self.methods['process_response']):
              try:
                  response = await common_call(method, request, response, self.crawler.spider)
@@ -57,7 +64,7 @@ class MiddlewareManager:
              )
          return response
 
-     async def _process_exception(self, request: Request, exp: Exception):
+     async def _process_exception(self, request: 'Request', exp: Exception):
          for method in self.methods['process_exception']:
              response = await common_call(method, request, exp, self.crawler.spider)
              if response is None:
@@ -72,7 +79,7 @@ class MiddlewareManager:
          else:
              raise exp
 
-     async def download(self, request) -> Optional[Response]:
+     async def download(self, request) -> 'Optional[Response]':
          """ called in the download method. """
          try:
              response = await self._process_request(request)
@@ -105,7 +112,7 @@ class MiddlewareManager:
          self.logger.info(f'Enabled middlewares:\n {pformat(enabled_middlewares)}')
 
      def _validate_middleware(self, middleware):
-         middleware_cls = load_class(middleware)
+         middleware_cls = load_object(middleware)
          if not hasattr(middleware_cls, 'create_instance'):
              raise MiddlewareInitError(
                  f"Middleware init failed, must inherit from `BaseMiddleware` or have a `create_instance` method"
crawlo/mode_manager.py CHANGED
@@ -51,9 +51,11 @@ class ModeManager:
      def get_standalone_settings() -> Dict[str, Any]:
          """获取单机模式配置"""
          return {
+             'RUN_MODE': 'standalone',
              'QUEUE_TYPE': 'memory',
              'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
              'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
+             'PROJECT_NAME': 'crawlo',
              'CONCURRENCY': 8,
              'MAX_RUNNING_SPIDERS': 1,
              'DOWNLOAD_DELAY': 1.0,
@@ -75,6 +77,7 @@
          redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
 
          return {
+             'RUN_MODE': 'distributed',
              'QUEUE_TYPE': 'redis',
              'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
              'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline',
@@ -95,6 +98,7 @@
          """获取自动检测模式配置"""
          # 默认使用内存队列和过滤器
          settings = ModeManager.get_standalone_settings()
+         settings['RUN_MODE'] = 'auto'
          settings['QUEUE_TYPE'] = 'auto'
          return settings
 
@@ -143,13 +147,29 @@
              raise ValueError(f"不支持的运行模式: {mode}")
 
          # 合并用户自定义配置
-         user_settings = {
-             k: v for k,
-             v in kwargs.items() if k not in [
-                 'redis_host',
-                 'redis_port',
-                 'redis_password',
-                 'project_name']}
+         # 对于分布式模式,过滤掉特定参数
+         if mode == RunMode.DISTRIBUTED:
+             user_settings = {
+                 k.upper(): v for k,
+                 v in kwargs.items() if k not in [
+                     'redis_host',
+                     'redis_port',
+                     'redis_password',
+                     'project_name']}
+             # 特别处理project_name
+             if 'project_name' in kwargs:
+                 settings['PROJECT_NAME'] = kwargs['project_name']
+         else:
+             # 对于单机模式和自动模式,只过滤Redis相关参数
+             user_settings = {
+                 k.upper(): v for k,
+                 v in kwargs.items() if k not in [
+                     'redis_host',
+                     'redis_port',
+                     'redis_password']}
+             # 特别处理project_name
+             if 'project_name' in kwargs:
+                 settings['PROJECT_NAME'] = kwargs['project_name']
          settings.update(user_settings)
          self._debug(f"合并用户自定义配置: {list(user_settings.keys())}")
 
@@ -182,9 +202,16 @@
 
 
 # 便利函数
- def standalone_mode(**kwargs) -> Dict[str, Any]:
+ def standalone_mode(
+     project_name: str = 'crawlo',
+     **kwargs
+ ) -> Dict[str, Any]:
      """快速创建单机模式配置"""
-     return ModeManager().resolve_mode_settings('standalone', **kwargs)
+     return ModeManager().resolve_mode_settings(
+         'standalone',
+         project_name=project_name,
+         **kwargs
+     )
 
 
 def distributed_mode(
@@ -207,9 +234,16 @@ def distributed_mode(
      )
 
 
- def auto_mode(**kwargs) -> Dict[str, Any]:
+ def auto_mode(
+     project_name: str = 'crawlo',
+     **kwargs
+ ) -> Dict[str, Any]:
      """快速创建自动检测模式配置"""
-     return ModeManager().resolve_mode_settings('auto', **kwargs)
+     return ModeManager().resolve_mode_settings(
+         'auto',
+         project_name=project_name,
+         **kwargs
+     )
 
 
 # 环境变量支持
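
With RUN_MODE and PROJECT_NAME now part of the presets, project_name promoted to a named argument, and remaining keyword arguments upper-cased before merging, the convenience helpers can be used roughly as below (the project name and CONCURRENCY value are examples; distributed_mode's full signature is not visible in this hunk):

    from crawlo.mode_manager import standalone_mode, auto_mode

    # Standalone: memory queue + MemoryFilter, RUN_MODE='standalone'
    settings = standalone_mode(project_name='news_crawler', concurrency=16)
    # settings['PROJECT_NAME'] == 'news_crawler'
    # settings['CONCURRENCY'] == 16   (lower-case kwargs are upper-cased on merge)

    # Auto-detect: standalone defaults with RUN_MODE='auto' and QUEUE_TYPE='auto'
    settings = auto_mode(project_name='news_crawler')
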