crawlo 1.4.3__py3-none-any.whl → 1.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (107)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/genspider.py +52 -17
  4. crawlo/commands/startproject.py +24 -0
  5. crawlo/core/engine.py +2 -2
  6. crawlo/core/scheduler.py +4 -4
  7. crawlo/crawler.py +13 -6
  8. crawlo/downloader/__init__.py +5 -2
  9. crawlo/extension/__init__.py +2 -2
  10. crawlo/filters/aioredis_filter.py +8 -1
  11. crawlo/filters/memory_filter.py +8 -1
  12. crawlo/initialization/built_in.py +13 -4
  13. crawlo/initialization/core.py +5 -4
  14. crawlo/interfaces.py +24 -0
  15. crawlo/middleware/__init__.py +7 -4
  16. crawlo/middleware/middleware_manager.py +15 -8
  17. crawlo/mode_manager.py +45 -11
  18. crawlo/network/response.py +374 -69
  19. crawlo/pipelines/mysql_pipeline.py +6 -6
  20. crawlo/pipelines/pipeline_manager.py +2 -2
  21. crawlo/project.py +2 -4
  22. crawlo/queue/pqueue.py +2 -6
  23. crawlo/queue/queue_manager.py +1 -2
  24. crawlo/settings/default_settings.py +15 -30
  25. crawlo/task_manager.py +2 -2
  26. crawlo/templates/project/items.py.tmpl +2 -2
  27. crawlo/templates/project/middlewares.py.tmpl +9 -89
  28. crawlo/templates/project/pipelines.py.tmpl +8 -68
  29. crawlo/templates/project/settings.py.tmpl +51 -65
  30. crawlo/templates/project/settings_distributed.py.tmpl +59 -67
  31. crawlo/templates/project/settings_gentle.py.tmpl +45 -40
  32. crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
  33. crawlo/templates/project/settings_minimal.py.tmpl +37 -26
  34. crawlo/templates/project/settings_simple.py.tmpl +45 -40
  35. crawlo/templates/run.py.tmpl +3 -7
  36. crawlo/tools/__init__.py +0 -11
  37. crawlo/utils/__init__.py +17 -1
  38. crawlo/utils/db_helper.py +220 -319
  39. crawlo/utils/error_handler.py +313 -67
  40. crawlo/utils/fingerprint.py +3 -4
  41. crawlo/utils/misc.py +82 -0
  42. crawlo/utils/request.py +55 -66
  43. crawlo/utils/selector_helper.py +138 -0
  44. crawlo/utils/spider_loader.py +185 -45
  45. crawlo/utils/text_helper.py +95 -0
  46. crawlo-1.4.5.dist-info/METADATA +329 -0
  47. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/RECORD +89 -68
  48. tests/bug_check_test.py +251 -0
  49. tests/direct_selector_helper_test.py +97 -0
  50. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  51. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  52. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  53. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  54. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  55. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  56. tests/ofweek_scrapy/scrapy.cfg +11 -0
  57. tests/performance_comparison.py +4 -5
  58. tests/simple_crawlo_test.py +1 -2
  59. tests/simple_follow_test.py +39 -0
  60. tests/simple_response_selector_test.py +95 -0
  61. tests/simple_selector_helper_test.py +155 -0
  62. tests/simple_selector_test.py +208 -0
  63. tests/simple_url_test.py +74 -0
  64. tests/test_crawler_process_import.py +39 -0
  65. tests/test_crawler_process_spider_modules.py +48 -0
  66. tests/test_edge_cases.py +7 -5
  67. tests/test_encoding_core.py +57 -0
  68. tests/test_encoding_detection.py +127 -0
  69. tests/test_factory_compatibility.py +197 -0
  70. tests/test_multi_directory.py +68 -0
  71. tests/test_multiple_spider_modules.py +81 -0
  72. tests/test_optimized_selector_naming.py +101 -0
  73. tests/test_priority_behavior.py +18 -18
  74. tests/test_response_follow.py +105 -0
  75. tests/test_response_selector_methods.py +93 -0
  76. tests/test_response_url_methods.py +71 -0
  77. tests/test_response_urljoin.py +87 -0
  78. tests/test_scrapy_style_encoding.py +113 -0
  79. tests/test_selector_helper.py +101 -0
  80. tests/test_selector_optimizations.py +147 -0
  81. tests/test_spider_loader.py +50 -0
  82. tests/test_spider_loader_comprehensive.py +70 -0
  83. tests/test_spider_modules.py +85 -0
  84. tests/test_spiders/__init__.py +1 -0
  85. tests/test_spiders/test_spider.py +10 -0
  86. crawlo/tools/anti_crawler.py +0 -269
  87. crawlo/utils/class_loader.py +0 -26
  88. crawlo/utils/enhanced_error_handler.py +0 -357
  89. crawlo-1.4.3.dist-info/METADATA +0 -190
  90. examples/test_project/__init__.py +0 -7
  91. examples/test_project/run.py +0 -35
  92. examples/test_project/test_project/__init__.py +0 -4
  93. examples/test_project/test_project/items.py +0 -18
  94. examples/test_project/test_project/middlewares.py +0 -119
  95. examples/test_project/test_project/pipelines.py +0 -97
  96. examples/test_project/test_project/settings.py +0 -170
  97. examples/test_project/test_project/spiders/__init__.py +0 -10
  98. examples/test_project/test_project/spiders/of_week_dis.py +0 -144
  99. tests/simple_log_test.py +0 -58
  100. tests/simple_test.py +0 -48
  101. tests/test_framework_logger.py +0 -67
  102. tests/test_framework_startup.py +0 -65
  103. tests/test_mode_change.py +0 -73
  104. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
  105. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
  106. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
  107. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
crawlo/__init__.py CHANGED
@@ -3,14 +3,17 @@
 """
 Crawlo - 一个异步爬虫框架
 """
-from typing import TYPE_CHECKING
 
-from crawlo.spider import Spider
+# 为了向后兼容,从tools中导入cleaners相关的功能
+import crawlo.tools as cleaners
+from crawlo import tools
+from crawlo.crawler import CrawlerProcess
+from crawlo.downloader import DownloaderBase
 from crawlo.items import Item, Field
+from crawlo.middleware import BaseMiddleware
 from crawlo.network.request import Request
 from crawlo.network.response import Response
-from crawlo.downloader import DownloaderBase
-from crawlo.middleware import BaseMiddleware
+from crawlo.spider import Spider
 from crawlo.utils import (
     TimeUtils,
     parse_time,
@@ -24,21 +27,13 @@ from crawlo.utils import (
     to_local,
     from_timestamp_with_tz
 )
-from crawlo import tools
-
-# 框架核心模块 - 使用TYPE_CHECKING避免循环导入
-if TYPE_CHECKING:
-    from crawlo.initialization import get_framework_initializer, initialize_framework
-
-# 为了向后兼容,从tools中导入cleaners相关的功能
-import crawlo.tools as cleaners
 
 
 # 延迟导入的辅助函数
 def get_framework_initializer():
-    """延迟导入get_framework_initializer以避免循环依赖"""
-    from crawlo.initialization import get_framework_initializer as _get_framework_initializer
-    return _get_framework_initializer()
+    """延迟导入CoreInitializer以避免循环依赖"""
+    from crawlo.initialization import CoreInitializer
+    return CoreInitializer()
 
 
 def initialize_framework(custom_settings=None):
@@ -87,6 +82,7 @@ __all__ = [
     'from_timestamp_with_tz',
     'cleaners',
     'tools',
+    'CrawlerProcess',
     'get_framework_initializer',
     'get_bootstrap_manager',
     '__version__',
crawlo/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = '1.4.3'
+__version__ = '1.4.5'
crawlo/commands/genspider.py CHANGED
@@ -6,15 +6,16 @@
 # @Desc : 命令行入口:crawlo genspider baidu,创建爬虫。
 """
 import sys
+import re
 from pathlib import Path
 import configparser
 import importlib
 from rich.console import Console
 
 from .utils import (
-    get_project_root,
-    validate_project_environment,
-    show_error_panel,
+    get_project_root,
+    validate_project_environment,
+    show_error_panel,
     show_success_panel,
     validate_spider_name,
     is_valid_domain
@@ -35,6 +36,39 @@ def _render_template(tmpl_path, context):
     return content
 
 
+def generate_class_name(spider_name):
+    """
+    根据爬虫名称生成类名
+    规则:蛇形命名 → 大驼峰命名 + 'Spider'
+    示例:
+        'news_spider' → 'NewsSpider'
+        'ofweek_standalone' → 'OfweekStandaloneSpider'
+        'baidu' → 'BaiduSpider'
+    """
+    # 如果名称已包含 'spider' 后缀,先去除
+    name_clean = spider_name
+
+    # 定义要移除的后缀列表
+    spider_suffixes = ['_spider', 'spider']
+
+    # 检查并移除后缀
+    for suffix in spider_suffixes:
+        if spider_name.endswith(suffix):
+            name_clean = spider_name[:-len(suffix)]
+            break
+
+    # 按分隔符拆分单词
+    words = re.split(r'[_-]', name_clean)
+
+    # 将每个单词首字母大写
+    capitalized_words = [word.capitalize() for word in words if word]
+
+    # 组合成类名
+    class_name = ''.join(capitalized_words) + 'Spider'
+
+    return class_name
+
+
 def main(args):
     if len(args) < 2:
         console.print("[bold red]错误:[/bold red] 用法: [blue]crawlo genspider[/blue] <爬虫名称> <域名>")
@@ -45,11 +79,11 @@
 
     spider_name = args[0]
     domain = args[1]
-
+
     # 验证爬虫名称
     if not validate_spider_name(spider_name):
         show_error_panel(
-            "无效的爬虫名称",
+            "无效的爬虫名称",
             f"爬虫名称 '[cyan]{spider_name}[/cyan]' 无效。\n"
             "爬虫名称应:\n"
             " • 以小写字母开头\n"
@@ -57,11 +91,11 @@
             " • 是有效的Python标识符"
         )
         return 1
-
+
     # 验证域名格式
     if not is_valid_domain(domain):
         show_error_panel(
-            "无效的域名",
+            "无效的域名",
             f"域名 '[cyan]{domain}[/cyan]' 格式无效。\n"
             "请提供有效的域名,如 'example.com'"
         )
@@ -72,7 +106,7 @@
     if not is_valid:
         show_error_panel("非Crawlo项目", error_msg)
         return 1
-
+
     project_root = get_project_root()
 
     # 确定 items 模块的路径
@@ -91,7 +125,8 @@
         if item_classes:
             default_item_class = item_classes[0].__name__
         else:
-            console.print("[yellow]警告:[/yellow] 在 [cyan]items.py[/cyan] 中未找到项目类,使用 [green]ExampleItem[/green]。")
+            console.print(
+                "[yellow]警告:[/yellow] 在 [cyan]items.py[/cyan] 中未找到项目类,使用 [green]ExampleItem[/green]。")
 
     except ImportError as e:
         console.print(f"[yellow]警告:[/yellow] 导入 [cyan]{items_module_path}[/cyan] 失败: {e}")
@@ -104,7 +139,7 @@
     spider_file = spiders_dir / f'{spider_name}.py'
     if spider_file.exists():
         show_error_panel(
-            "爬虫已存在",
+            "爬虫已存在",
             f"爬虫 '[cyan]{spider_name}[/cyan]' 已存在于\n[green]{spider_file}[/green]"
         )
         return 1
@@ -113,13 +148,13 @@
     tmpl_path = TEMPLATES_DIR / 'spider' / 'spider.py.tmpl'
     if not tmpl_path.exists():
         show_error_panel(
-            "模板未找到",
+            "模板未找到",
             f"模板文件未找到于 [cyan]{tmpl_path}[/cyan]"
         )
         return 1
 
-    # 生成类名
-    class_name = f"{spider_name.replace('_', '').capitalize()}Spider"
+    # 生成类名(使用新的转换函数)
+    class_name = generate_class_name(spider_name)
 
     context = {
         'spider_name': spider_name,
@@ -133,7 +168,7 @@
         content = _render_template(tmpl_path, context)
         with open(spider_file, 'w', encoding='utf-8') as f:
             f.write(content)
-
+
         console.print(f"[green]爬虫 '[bold]{spider_name}[/bold]' 创建成功![/green]")
         console.print(f" → 位置: [cyan]{spider_file}[/cyan]")
         console.print(f" → 类名: [yellow]{class_name}[/yellow]")
@@ -141,12 +176,12 @@
         console.print("\n[bold]下一步操作:[/bold]")
         console.print(f" [blue]crawlo run[/blue] {spider_name}")
         console.print(f" [blue]crawlo check[/blue] {spider_name}")
-
+
         return 0
-
+
     except Exception as e:
         show_error_panel(
-            "创建失败",
+            "创建失败",
             f"创建爬虫失败: {e}"
        )
         return 1
crawlo/commands/startproject.py CHANGED
@@ -92,8 +92,32 @@
     """读取模板文件,替换 {{key}} 为 context 中的值"""
     with open(tmpl_path, 'r', encoding='utf-8') as f:
         content = f.read()
+
+    # 处理简单的过滤器语法 {{key|filter}}
+    import re
+
+    def apply_filter(value, filter_name):
+        if filter_name == 'title':
+            # 将 snake_case 转换为 TitleCase
+            words = value.replace('_', ' ').split()
+            return ''.join(word.capitalize() for word in words)
+        return value
+
+    # 查找并替换 {{key|filter}} 格式的占位符
+    pattern = r'\{\{([^}|]+)\|([^}]+)\}\}'
+    def replace_filter_match(match):
+        key = match.group(1).strip()
+        filter_name = match.group(2).strip()
+        if key in context:
+            return str(apply_filter(context[key], filter_name))
+        return match.group(0)  # 如果找不到key,保持原样
+
+    content = re.sub(pattern, replace_filter_match, content)
+
+    # 处理普通的 {{key}} 占位符
     for key, value in context.items():
         content = content.replace(f'{{{{{key}}}}}', str(value))
+
     return content
 
 
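To show what the new {{key|filter}} handling in _render_template does, here is a small self-contained sketch that runs the same substitution logic against a made-up template string; the template text and context below are illustrative, not taken from the package templates:

    import re

    def render(content, context):
        # {{key|title}} -> apply the 'title' filter (snake_case -> TitleCase)
        def repl(match):
            key, filt = match.group(1).strip(), match.group(2).strip()
            if key in context and filt == 'title':
                return ''.join(w.capitalize() for w in context[key].replace('_', ' ').split())
            return match.group(0)
        content = re.sub(r'\{\{([^}|]+)\|([^}]+)\}\}', repl, content)
        # plain {{key}} placeholders
        for key, value in context.items():
            content = content.replace('{{' + key + '}}', str(value))
        return content

    print(render('class {{project_name|title}}Item:  # project: {{project_name}}',
                 {'project_name': 'news_crawler'}))
    # -> class NewsCrawlerItem:  # project: news_crawler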
crawlo/core/engine.py CHANGED
@@ -11,7 +11,7 @@ from crawlo.core.scheduler import Scheduler
 from crawlo.downloader import DownloaderBase
 from crawlo.event import spider_opened, spider_error, request_scheduled
 from crawlo.exceptions import OutputError
-from crawlo.utils.class_loader import load_class
+from crawlo.utils.misc import load_object
 from crawlo.spider import Spider
 from crawlo.task_manager import TaskManager
 from crawlo.utils.func_tools import transform
@@ -62,7 +62,7 @@ class Engine(object):
             self.logger.warning(f"无法使用下载器类型 '{downloader_type}': {e},回退到默认配置")
 
         # 方式2: 使用 DOWNLOADER 完整类路径(兼容旧版本)
-        downloader_cls = load_class(self.settings.get('DOWNLOADER'))
+        downloader_cls = load_object(self.settings.get('DOWNLOADER'))
         if not issubclass(downloader_cls, DownloaderBase):
             raise TypeError(f'下载器 {downloader_cls.__name__} 不是 DownloaderBase 的子类。')
         return downloader_cls
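This release replaces crawlo.utils.class_loader.load_class with crawlo.utils.misc.load_object throughout (class_loader.py is deleted and misc.py is new, per the file list above). The body of misc.py is not shown in this diff; a typical dotted-path loader of this kind looks roughly like the following sketch, which is an assumption about the shape of the API rather than the actual implementation:

    from importlib import import_module

    def load_object(path: str):
        # split 'pkg.module.Name' into a module path and an attribute name,
        # import the module, then return the attribute (class, function, ...)
        module_path, _, name = path.rpartition('.')
        if not module_path:
            raise ValueError(f"not a full dotted path: {path!r}")
        module = import_module(module_path)
        return getattr(module, name)

    # e.g. load_object('crawlo.downloader.DownloaderBase') would return the class object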
crawlo/core/scheduler.py CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, Callable
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import set_request
 from crawlo.utils.error_handler import ErrorHandler
-from crawlo.utils.class_loader import load_class
+from crawlo.utils.misc import load_object
 from crawlo.project import common_call
 from crawlo.utils.request_serializer import RequestSerializer
 from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
@@ -26,7 +26,7 @@ class Scheduler:
 
     @classmethod
     def create_instance(cls, crawler):
-        filter_cls = load_class(crawler.settings.get('FILTER_CLASS'))
+        filter_cls = load_object(crawler.settings.get('FILTER_CLASS'))
         o = cls(
             crawler=crawler,
             dupe_filter=filter_cls.create_instance(crawler),
@@ -120,7 +120,7 @@ class Scheduler:
         # 如果需要更新配置,则执行更新
         if needs_config_update:
             # 重新创建过滤器实例,确保使用更新后的配置
-            filter_cls = load_class(self.crawler.settings.get('FILTER_CLASS'))
+            filter_cls = load_object(self.crawler.settings.get('FILTER_CLASS'))
             self.dupe_filter = filter_cls.create_instance(self.crawler)
 
             # 记录警告信息
@@ -136,7 +136,7 @@ class Scheduler:
             self._switch_to_memory_config()
 
         # 重新创建过滤器实例
-        filter_cls = load_class(self.crawler.settings.get('FILTER_CLASS'))
+        filter_cls = load_object(self.crawler.settings.get('FILTER_CLASS'))
         self.dupe_filter = filter_cls.create_instance(self.crawler)
 
     def _is_filter_matching_queue_type(self, current_filter_class):
crawlo/crawler.py CHANGED
@@ -13,14 +13,14 @@
 
 import asyncio
 import time
-from contextlib import asynccontextmanager
-from dataclasses import dataclass
 from enum import Enum
+from dataclasses import dataclass
+from contextlib import asynccontextmanager
 from typing import Optional, Type, Dict, Any, List
 
+from crawlo.logging import get_logger
 from crawlo.factories import get_component_registry
 from crawlo.initialization import initialize_framework, is_framework_ready
-from crawlo.logging import get_logger
 
 
 class CrawlerState(Enum):
@@ -345,16 +345,23 @@ class CrawlerProcess:
     """
 
     def __init__(self, settings=None, max_concurrency: int = 3, spider_modules=None):
+        # 初始化框架配置
         self._settings = settings or initialize_framework()
         self._max_concurrency = max_concurrency
         self._crawlers: List[ModernCrawler] = []
         self._semaphore = asyncio.Semaphore(max_concurrency)
         self._logger = get_logger('crawler.process')
-        self._spider_modules = spider_modules # 保存spider_modules
+
+        # 如果没有显式提供spider_modules,则从settings中获取
+        if spider_modules is None and self._settings:
+            spider_modules = self._settings.get('SPIDER_MODULES', [])
+            self._logger.debug(f"从settings中获取SPIDER_MODULES: {spider_modules}")
+
+        self._spider_modules = spider_modules or [] # 保存spider_modules
 
         # 如果提供了spider_modules,自动注册这些模块中的爬虫
-        if spider_modules:
-            self._register_spider_modules(spider_modules)
+        if self._spider_modules:
+            self._register_spider_modules(self._spider_modules)
 
         # 指标
         self._start_time: Optional[float] = None
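With this change, CrawlerProcess falls back to the SPIDER_MODULES setting when no spider_modules argument is passed, so a run script only needs a settings object. A hedged usage sketch based on the constructor above (module name is illustrative, and the run API is not shown in this diff):

    from crawlo import CrawlerProcess, initialize_framework

    # SPIDER_MODULES is now read from settings when spider_modules is not given explicitly
    settings = initialize_framework(custom_settings={'SPIDER_MODULES': ['myproject.spiders']})
    process = CrawlerProcess(settings)   # spiders under myproject.spiders are auto-registered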
crawlo/downloader/__init__.py CHANGED
@@ -15,12 +15,15 @@ Crawlo Downloader Module
 - ActivateRequestManager: 活跃请求管理器
 """
 from abc import abstractmethod, ABCMeta
-from typing import Final, Set, Optional
+from typing import Final, Set, Optional, TYPE_CHECKING
 from contextlib import asynccontextmanager
 
 from crawlo.utils.log import get_logger
 from crawlo.middleware.middleware_manager import MiddlewareManager
 
+if TYPE_CHECKING:
+    from crawlo import Response
+
 
 class ActivateRequestManager:
     """活跃请求管理器 - 跟踪和管理正在处理的请求"""
@@ -134,7 +137,7 @@ class DownloaderBase(metaclass=DownloaderMeta):
             self.logger.error(f"中间件初始化失败: {e}")
             raise
 
-    async def fetch(self, request) -> Optional['Response']:
+    async def fetch(self, request) -> 'Optional[Response]':
         """获取请求响应(经过中间件处理)"""
         if self._closed:
             raise RuntimeError(f"{self.__class__.__name__} 已关闭")
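This file, crawlo/middleware/__init__.py, and crawlo/middleware/middleware_manager.py all adopt the same pattern in this release: Request/Response are imported only under typing.TYPE_CHECKING and referenced through string annotations, so type checkers still see the names while nothing is imported at runtime, which avoids the circular import with crawlo/__init__.py. The general shape of the pattern (class and method names below are illustrative):

    from typing import TYPE_CHECKING, Optional

    if TYPE_CHECKING:  # evaluated by type checkers only, never at runtime
        from crawlo import Request, Response

    class ExampleDownloader:
        async def fetch(self, request: 'Request') -> 'Optional[Response]':
            # quoted annotations are never evaluated at runtime,
            # so Request/Response do not need a runtime import here
            ...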
crawlo/extension/__init__.py CHANGED
@@ -4,7 +4,7 @@ from typing import List, Any
 from pprint import pformat
 
 from crawlo.utils.log import get_logger
-from crawlo.utils.class_loader import load_class
+from crawlo.utils.misc import load_object
 from crawlo.exceptions import ExtensionInitError
 
 
@@ -25,7 +25,7 @@ class ExtensionManager(object):
     def _add_extensions(self, extensions: List[str]) -> None:
         for extension_path in extensions:
             try:
-                extension_cls = load_class(extension_path)
+                extension_cls = load_object(extension_path)
                 if not hasattr(extension_cls, 'create_instance'):
                     raise ExtensionInitError(
                         f"Extension '{extension_path}' init failed: Must have method 'create_instance()'"
crawlo/filters/aioredis_filter.py CHANGED
@@ -142,7 +142,14 @@ class AioRedisFilter(BaseFilter):
         if redis_client is None:
             return False
 
-        fp = str(request_fingerprint(request))
+        # 使用统一的指纹生成器
+        from crawlo.utils.fingerprint import FingerprintGenerator
+        fp = str(FingerprintGenerator.request_fingerprint(
+            request.method,
+            request.url,
+            request.body or b'',
+            dict(request.headers) if hasattr(request, 'headers') else None
+        ))
         self._redis_operations += 1
 
         # 使用 pipeline 优化性能
crawlo/filters/memory_filter.py CHANGED
@@ -102,7 +102,14 @@ class MemoryFilter(BaseFilter):
         :return: 是否重复
         """
         with self._lock:
-            fp = request_fingerprint(request)
+            # 使用统一的指纹生成器
+            from crawlo.utils.fingerprint import FingerprintGenerator
+            fp = FingerprintGenerator.request_fingerprint(
+                request.method,
+                request.url,
+                request.body or b'',
+                dict(request.headers) if hasattr(request, 'headers') else None
+            )
             if fp in self.fingerprints:
                 self._dupe_count += 1
                 # if self.debug:
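Both the Redis-backed and in-memory filters now build fingerprints through the same FingerprintGenerator.request_fingerprint call, so the two dedup backends agree on what counts as a duplicate request. Going only by the call sites above (argument order: method, URL, body, optional headers dict), a usage sketch:

    from crawlo.utils.fingerprint import FingerprintGenerator

    fp1 = FingerprintGenerator.request_fingerprint('GET', 'https://example.com/page?id=1', b'', None)
    fp2 = FingerprintGenerator.request_fingerprint('GET', 'https://example.com/page?id=1', b'', None)
    assert fp1 == fp2   # identical requests are expected to hash to the same fingerprint in either filter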
crawlo/initialization/built_in.py CHANGED
@@ -210,8 +210,17 @@ class SettingsInitializer(BaseInitializer):
         from crawlo.settings.setting_manager import SettingManager
         from crawlo.project import _load_project_settings
 
-        # 创建配置管理器并加载项目配置
-        settings = _load_project_settings(context.custom_settings)
+        # 如果上下文中已有设置,则使用它作为基础配置
+        if context.settings:
+            # 使用用户传递的设置作为基础配置
+            settings = context.settings
+            # 加载项目配置并合并
+            project_settings = _load_project_settings(context.custom_settings)
+            # 合并配置,用户配置优先
+            settings.update_attributes(project_settings.attributes)
+        else:
+            # 创建配置管理器并加载项目配置
+            settings = _load_project_settings(context.custom_settings)
 
         # 存储到上下文
         context.settings = settings
@@ -346,8 +355,8 @@ class ExtensionsInitializer(BaseInitializer):
         initialized_extensions = []
         for extension_path in extensions:
             try:
-                from crawlo.utils.class_loader import load_class
-                extension_class = load_class(extension_path)
+                from crawlo.utils.misc import load_object
+                extension_class = load_object(extension_path)
                 extension_instance = extension_class()
                 initialized_extensions.append(extension_instance)
             except Exception as e:
crawlo/initialization/core.py CHANGED
@@ -4,14 +4,14 @@
 核心初始化器 - 协调整个初始化过程
 """
 
-import time
 import threading
+import time
 from typing import Optional, Any
 
-from .context import InitializationContext
-from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition
-from .registry import get_global_registry, BaseInitializer, register_initializer
 from .built_in import register_built_in_initializers
+from .context import InitializationContext
+from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition
+from .registry import get_global_registry
 
 
 class CoreInitializer:
@@ -78,6 +78,7 @@ class CoreInitializer:
         # 创建初始化上下文
         context = InitializationContext()
         context.custom_settings = kwargs
+        context.settings = settings
         self._context = context
 
         try:
crawlo/interfaces.py ADDED
@@ -0,0 +1,24 @@
+from abc import ABC, abstractmethod
+from typing import List, Type, Protocol
+
+from crawlo.spider import Spider
+from crawlo.network.request import Request
+
+
+class ISpiderLoader(Protocol):
+    """Spider loader interface"""
+
+    @abstractmethod
+    def load(self, spider_name: str) -> Type[Spider]:
+        """Load a spider by name"""
+        pass
+
+    @abstractmethod
+    def list(self) -> List[str]:
+        """List all available spider names"""
+        pass
+
+    @abstractmethod
+    def find_by_request(self, request: Request) -> List[str]:
+        """Find spider names that can handle the given request"""
+        pass
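Because ISpiderLoader is declared as a typing.Protocol, a loader only has to expose these three methods to satisfy it; no inheritance is required. A small hypothetical implementation for illustration (everything below is made up, not part of the package):

    from typing import Dict, List, Type

    from crawlo.spider import Spider
    from crawlo.network.request import Request

    class DictSpiderLoader:
        """Toy loader backed by an in-memory mapping; structurally satisfies ISpiderLoader."""

        def __init__(self, spiders: Dict[str, Type[Spider]]):
            self._spiders = spiders

        def load(self, spider_name: str) -> Type[Spider]:
            return self._spiders[spider_name]

        def list(self) -> List[str]:
            return sorted(self._spiders)

        def find_by_request(self, request: Request) -> List[str]:
            # naive match: return every spider whose name appears in the request URL
            return [name for name in self._spiders if name in request.url]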
crawlo/middleware/__init__.py CHANGED
@@ -1,18 +1,21 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
-from crawlo import Request, Response
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from crawlo import Request, Response
 
 
 class BaseMiddleware(object):
-    def process_request(self, request, spider) -> None | Request | Response:
+    def process_request(self, request, spider) -> 'None | Request | Response':
         # 请求预处理
         pass
 
-    def process_response(self, request, response, spider) -> Request | Response:
+    def process_response(self, request, response, spider) -> 'Request | Response':
         # 响应预处理
         pass
 
-    def process_exception(self, request, exp, spider) -> None | Request | Response:
+    def process_exception(self, request, exp, spider) -> 'None | Request | Response':
         # 异常预处理
         pass
 
crawlo/middleware/middleware_manager.py CHANGED
@@ -4,11 +4,18 @@ from pprint import pformat
 from types import MethodType
 from asyncio import create_task
 from collections import defaultdict
-from typing import List, Dict, Callable, Optional
+from typing import List, Dict, Callable, Optional, TYPE_CHECKING
 
-from crawlo import Request, Response
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from crawlo import Request, Response
+else:
+    # 为 isinstance 检查导入实际的类
+    from crawlo.network.request import Request
+    from crawlo.network.response import Response
 from crawlo.utils.log import get_logger
-from crawlo.utils.class_loader import load_class
+from crawlo.utils.misc import load_object
 from crawlo.middleware import BaseMiddleware
 from crawlo.project import common_call
 from crawlo.event import ignore_request, response_received
@@ -30,7 +37,7 @@ class MiddlewareManager:
         self.download_method: Callable = crawler.engine.downloader.download
         self._stats = crawler.stats
 
-    async def _process_request(self, request: Request):
+    async def _process_request(self, request: 'Request'):
         for method in self.methods['process_request']:
             result = await common_call(method, request, self.crawler.spider)
             if result is None:
@@ -42,7 +49,7 @@ class MiddlewareManager:
             )
         return await self.download_method(request)
 
-    async def _process_response(self, request: Request, response: Response):
+    async def _process_response(self, request: 'Request', response: 'Response'):
         for method in reversed(self.methods['process_response']):
             try:
                 response = await common_call(method, request, response, self.crawler.spider)
@@ -57,7 +64,7 @@ class MiddlewareManager:
             )
         return response
 
-    async def _process_exception(self, request: Request, exp: Exception):
+    async def _process_exception(self, request: 'Request', exp: Exception):
         for method in self.methods['process_exception']:
             response = await common_call(method, request, exp, self.crawler.spider)
             if response is None:
@@ -72,7 +79,7 @@ class MiddlewareManager:
             else:
                 raise exp
 
-    async def download(self, request) -> Optional[Response]:
+    async def download(self, request) -> 'Optional[Response]':
         """ called in the download method. """
         try:
             response = await self._process_request(request)
@@ -105,7 +112,7 @@ class MiddlewareManager:
         self.logger.info(f'Enabled middlewares:\n {pformat(enabled_middlewares)}')
 
     def _validate_middleware(self, middleware):
-        middleware_cls = load_class(middleware)
+        middleware_cls = load_object(middleware)
         if not hasattr(middleware_cls, 'create_instance'):
             raise MiddlewareInitError(
                 f"Middleware init failed, must inherit from `BaseMiddleware` or have a `create_instance` method"