crawlo 1.4.3__py3-none-any.whl → 1.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo may be problematic.
- crawlo/__init__.py +11 -15
- crawlo/__version__.py +1 -1
- crawlo/commands/genspider.py +52 -17
- crawlo/commands/startproject.py +24 -0
- crawlo/core/engine.py +2 -2
- crawlo/core/scheduler.py +4 -4
- crawlo/crawler.py +13 -6
- crawlo/downloader/__init__.py +5 -2
- crawlo/extension/__init__.py +2 -2
- crawlo/filters/aioredis_filter.py +8 -1
- crawlo/filters/memory_filter.py +8 -1
- crawlo/initialization/built_in.py +13 -4
- crawlo/initialization/core.py +5 -4
- crawlo/interfaces.py +24 -0
- crawlo/middleware/__init__.py +7 -4
- crawlo/middleware/middleware_manager.py +15 -8
- crawlo/mode_manager.py +45 -11
- crawlo/network/response.py +374 -69
- crawlo/pipelines/mysql_pipeline.py +6 -6
- crawlo/pipelines/pipeline_manager.py +2 -2
- crawlo/project.py +2 -4
- crawlo/queue/pqueue.py +2 -6
- crawlo/queue/queue_manager.py +1 -2
- crawlo/settings/default_settings.py +15 -30
- crawlo/task_manager.py +2 -2
- crawlo/templates/project/items.py.tmpl +2 -2
- crawlo/templates/project/middlewares.py.tmpl +9 -89
- crawlo/templates/project/pipelines.py.tmpl +8 -68
- crawlo/templates/project/settings.py.tmpl +51 -65
- crawlo/templates/project/settings_distributed.py.tmpl +59 -67
- crawlo/templates/project/settings_gentle.py.tmpl +45 -40
- crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
- crawlo/templates/project/settings_minimal.py.tmpl +37 -26
- crawlo/templates/project/settings_simple.py.tmpl +45 -40
- crawlo/templates/run.py.tmpl +3 -7
- crawlo/tools/__init__.py +0 -11
- crawlo/utils/__init__.py +17 -1
- crawlo/utils/db_helper.py +220 -319
- crawlo/utils/error_handler.py +313 -67
- crawlo/utils/fingerprint.py +3 -4
- crawlo/utils/misc.py +82 -0
- crawlo/utils/request.py +55 -66
- crawlo/utils/selector_helper.py +138 -0
- crawlo/utils/spider_loader.py +185 -45
- crawlo/utils/text_helper.py +95 -0
- crawlo-1.4.5.dist-info/METADATA +329 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/RECORD +89 -68
- tests/bug_check_test.py +251 -0
- tests/direct_selector_helper_test.py +97 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
- tests/ofweek_scrapy/scrapy.cfg +11 -0
- tests/performance_comparison.py +4 -5
- tests/simple_crawlo_test.py +1 -2
- tests/simple_follow_test.py +39 -0
- tests/simple_response_selector_test.py +95 -0
- tests/simple_selector_helper_test.py +155 -0
- tests/simple_selector_test.py +208 -0
- tests/simple_url_test.py +74 -0
- tests/test_crawler_process_import.py +39 -0
- tests/test_crawler_process_spider_modules.py +48 -0
- tests/test_edge_cases.py +7 -5
- tests/test_encoding_core.py +57 -0
- tests/test_encoding_detection.py +127 -0
- tests/test_factory_compatibility.py +197 -0
- tests/test_multi_directory.py +68 -0
- tests/test_multiple_spider_modules.py +81 -0
- tests/test_optimized_selector_naming.py +101 -0
- tests/test_priority_behavior.py +18 -18
- tests/test_response_follow.py +105 -0
- tests/test_response_selector_methods.py +93 -0
- tests/test_response_url_methods.py +71 -0
- tests/test_response_urljoin.py +87 -0
- tests/test_scrapy_style_encoding.py +113 -0
- tests/test_selector_helper.py +101 -0
- tests/test_selector_optimizations.py +147 -0
- tests/test_spider_loader.py +50 -0
- tests/test_spider_loader_comprehensive.py +70 -0
- tests/test_spider_modules.py +85 -0
- tests/test_spiders/__init__.py +1 -0
- tests/test_spiders/test_spider.py +10 -0
- crawlo/tools/anti_crawler.py +0 -269
- crawlo/utils/class_loader.py +0 -26
- crawlo/utils/enhanced_error_handler.py +0 -357
- crawlo-1.4.3.dist-info/METADATA +0 -190
- examples/test_project/__init__.py +0 -7
- examples/test_project/run.py +0 -35
- examples/test_project/test_project/__init__.py +0 -4
- examples/test_project/test_project/items.py +0 -18
- examples/test_project/test_project/middlewares.py +0 -119
- examples/test_project/test_project/pipelines.py +0 -97
- examples/test_project/test_project/settings.py +0 -170
- examples/test_project/test_project/spiders/__init__.py +0 -10
- examples/test_project/test_project/spiders/of_week_dis.py +0 -144
- tests/simple_log_test.py +0 -58
- tests/simple_test.py +0 -48
- tests/test_framework_logger.py +0 -67
- tests/test_framework_startup.py +0 -65
- tests/test_mode_change.py +0 -73
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
- /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
crawlo/__init__.py
CHANGED
@@ -3,14 +3,17 @@
 """
 Crawlo - 一个异步爬虫框架
 """
-from typing import TYPE_CHECKING
 
-
+# 为了向后兼容,从tools中导入cleaners相关的功能
+import crawlo.tools as cleaners
+from crawlo import tools
+from crawlo.crawler import CrawlerProcess
+from crawlo.downloader import DownloaderBase
 from crawlo.items import Item, Field
+from crawlo.middleware import BaseMiddleware
 from crawlo.network.request import Request
 from crawlo.network.response import Response
-from crawlo.
-from crawlo.middleware import BaseMiddleware
+from crawlo.spider import Spider
 from crawlo.utils import (
     TimeUtils,
     parse_time,
@@ -24,21 +27,13 @@ from crawlo.utils import (
     to_local,
     from_timestamp_with_tz
 )
-from crawlo import tools
-
-# 框架核心模块 - 使用TYPE_CHECKING避免循环导入
-if TYPE_CHECKING:
-    from crawlo.initialization import get_framework_initializer, initialize_framework
-
-# 为了向后兼容,从tools中导入cleaners相关的功能
-import crawlo.tools as cleaners
 
 
 # 延迟导入的辅助函数
 def get_framework_initializer():
-    """延迟导入
-    from crawlo.initialization import
-    return
+    """延迟导入CoreInitializer以避免循环依赖"""
+    from crawlo.initialization import CoreInitializer
+    return CoreInitializer()
 
 
 def initialize_framework(custom_settings=None):
@@ -87,6 +82,7 @@ __all__ = [
     'from_timestamp_with_tz',
     'cleaners',
     'tools',
+    'CrawlerProcess',
     'get_framework_initializer',
     'get_bootstrap_manager',
     '__version__',
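Taken together, the 1.4.5 __init__.py re-exports the main user-facing classes at the package root; CrawlerProcess is the new addition. A quick import sketch of the surface visible in the hunks above (nothing here beyond what the diff shows):

# Names visible in the crawlo/__init__.py diff above.
from crawlo import (
    CrawlerProcess,   # new re-export in 1.4.5
    DownloaderBase,
    BaseMiddleware,
    Item, Field,
    Request, Response,
    Spider,
    tools, cleaners,
)
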
crawlo/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = '1.4.3'
+__version__ = '1.4.5'
crawlo/commands/genspider.py
CHANGED
@@ -6,15 +6,16 @@
 # @Desc : 命令行入口:crawlo genspider baidu,创建爬虫。
 """
 import sys
+import re
 from pathlib import Path
 import configparser
 import importlib
 from rich.console import Console
 
 from .utils import (
-    get_project_root,
-    validate_project_environment,
-    show_error_panel,
+    get_project_root,
+    validate_project_environment,
+    show_error_panel,
     show_success_panel,
     validate_spider_name,
     is_valid_domain
@@ -35,6 +36,39 @@ def _render_template(tmpl_path, context):
     return content
 
 
+def generate_class_name(spider_name):
+    """
+    根据爬虫名称生成类名
+    规则:蛇形命名 → 大驼峰命名 + 'Spider'
+    示例:
+        'news_spider' → 'NewsSpider'
+        'ofweek_standalone' → 'OfweekStandaloneSpider'
+        'baidu' → 'BaiduSpider'
+    """
+    # 如果名称已包含 'spider' 后缀,先去除
+    name_clean = spider_name
+
+    # 定义要移除的后缀列表
+    spider_suffixes = ['_spider', 'spider']
+
+    # 检查并移除后缀
+    for suffix in spider_suffixes:
+        if spider_name.endswith(suffix):
+            name_clean = spider_name[:-len(suffix)]
+            break
+
+    # 按分隔符拆分单词
+    words = re.split(r'[_-]', name_clean)
+
+    # 将每个单词首字母大写
+    capitalized_words = [word.capitalize() for word in words if word]
+
+    # 组合成类名
+    class_name = ''.join(capitalized_words) + 'Spider'
+
+    return class_name
+
+
 def main(args):
     if len(args) < 2:
         console.print("[bold red]错误:[/bold red] 用法: [blue]crawlo genspider[/blue] <爬虫名称> <域名>")
@@ -45,11 +79,11 @@ def main(args):
 
     spider_name = args[0]
     domain = args[1]
-
+
     # 验证爬虫名称
     if not validate_spider_name(spider_name):
         show_error_panel(
-            "无效的爬虫名称",
+            "无效的爬虫名称",
             f"爬虫名称 '[cyan]{spider_name}[/cyan]' 无效。\n"
             "爬虫名称应:\n"
             " • 以小写字母开头\n"
@@ -57,11 +91,11 @@ def main(args):
             " • 是有效的Python标识符"
         )
         return 1
-
+
     # 验证域名格式
     if not is_valid_domain(domain):
         show_error_panel(
-            "无效的域名",
+            "无效的域名",
             f"域名 '[cyan]{domain}[/cyan]' 格式无效。\n"
             "请提供有效的域名,如 'example.com'"
         )
@@ -72,7 +106,7 @@ def main(args):
     if not is_valid:
         show_error_panel("非Crawlo项目", error_msg)
         return 1
-
+
     project_root = get_project_root()
 
     # 确定 items 模块的路径
@@ -91,7 +125,8 @@ def main(args):
         if item_classes:
             default_item_class = item_classes[0].__name__
         else:
-            console.print(
+            console.print(
+                "[yellow]警告:[/yellow] 在 [cyan]items.py[/cyan] 中未找到项目类,使用 [green]ExampleItem[/green]。")
 
     except ImportError as e:
         console.print(f"[yellow]警告:[/yellow] 导入 [cyan]{items_module_path}[/cyan] 失败: {e}")
@@ -104,7 +139,7 @@ def main(args):
     spider_file = spiders_dir / f'{spider_name}.py'
     if spider_file.exists():
         show_error_panel(
-            "爬虫已存在",
+            "爬虫已存在",
             f"爬虫 '[cyan]{spider_name}[/cyan]' 已存在于\n[green]{spider_file}[/green]"
         )
         return 1
@@ -113,13 +148,13 @@ def main(args):
     tmpl_path = TEMPLATES_DIR / 'spider' / 'spider.py.tmpl'
     if not tmpl_path.exists():
         show_error_panel(
-            "模板未找到",
+            "模板未找到",
             f"模板文件未找到于 [cyan]{tmpl_path}[/cyan]"
         )
         return 1
 
-    #
-    class_name =
+    # 生成类名(使用新的转换函数)
+    class_name = generate_class_name(spider_name)
 
     context = {
         'spider_name': spider_name,
@@ -133,7 +168,7 @@ def main(args):
         content = _render_template(tmpl_path, context)
         with open(spider_file, 'w', encoding='utf-8') as f:
             f.write(content)
-
+
         console.print(f"[green]爬虫 '[bold]{spider_name}[/bold]' 创建成功![/green]")
         console.print(f" → 位置: [cyan]{spider_file}[/cyan]")
         console.print(f" → 类名: [yellow]{class_name}[/yellow]")
@@ -141,12 +176,12 @@ def main(args):
         console.print("\n[bold]下一步操作:[/bold]")
         console.print(f" [blue]crawlo run[/blue] {spider_name}")
         console.print(f" [blue]crawlo check[/blue] {spider_name}")
-
+
         return 0
-
+
     except Exception as e:
         show_error_panel(
-            "创建失败",
+            "创建失败",
             f"创建爬虫失败: {e}"
         )
         return 1
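The new generate_class_name helper turns snake_case spider names into PascalCase class names, stripping a trailing 'spider'/'_spider' suffix first. Expected behaviour, per the docstring in the hunk above (a small illustrative check; assumes the module imports as crawlo.commands.genspider, and the hyphenated input is a made-up example):

from crawlo.commands.genspider import generate_class_name

assert generate_class_name('news_spider') == 'NewsSpider'                   # '_spider' suffix stripped
assert generate_class_name('ofweek_standalone') == 'OfweekStandaloneSpider'
assert generate_class_name('baidu') == 'BaiduSpider'
assert generate_class_name('price-watch') == 'PriceWatchSpider'             # hyphens also split words
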
crawlo/commands/startproject.py
CHANGED
@@ -92,8 +92,32 @@ def _render_template(tmpl_path, context):
     """读取模板文件,替换 {{key}} 为 context 中的值"""
     with open(tmpl_path, 'r', encoding='utf-8') as f:
         content = f.read()
+
+    # 处理简单的过滤器语法 {{key|filter}}
+    import re
+
+    def apply_filter(value, filter_name):
+        if filter_name == 'title':
+            # 将 snake_case 转换为 TitleCase
+            words = value.replace('_', ' ').split()
+            return ''.join(word.capitalize() for word in words)
+        return value
+
+    # 查找并替换 {{key|filter}} 格式的占位符
+    pattern = r'\{\{([^}|]+)\|([^}]+)\}\}'
+    def replace_filter_match(match):
+        key = match.group(1).strip()
+        filter_name = match.group(2).strip()
+        if key in context:
+            return str(apply_filter(context[key], filter_name))
+        return match.group(0)  # 如果找不到key,保持原样
+
+    content = re.sub(pattern, replace_filter_match, content)
+
+    # 处理普通的 {{key}} 占位符
     for key, value in context.items():
         content = content.replace(f'{{{{{key}}}}}', str(value))
+
     return content
 
 
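startproject's _render_template now expands {{key|title}} placeholders (a snake_case value rendered as TitleCase) before the plain {{key}} pass, which is what the reworked settings and project templates rely on. A standalone re-implementation of the same logic, for illustration only (not the packaged function):

import re

def render(content: str, context: dict) -> str:
    # {{key|title}} -> TitleCase value; unknown keys are left untouched.
    def repl(m):
        key, flt = m.group(1).strip(), m.group(2).strip()
        if key not in context:
            return m.group(0)
        value = str(context[key])
        if flt == 'title':
            return ''.join(w.capitalize() for w in value.replace('_', ' ').split())
        return value

    content = re.sub(r'\{\{([^}|]+)\|([^}]+)\}\}', repl, content)
    # Plain {{key}} substitution, as before.
    for key, value in context.items():
        content = content.replace(f'{{{{{key}}}}}', str(value))
    return content

print(render("class {{project_name|title}}Item: pass  # {{project_name}}",
             {"project_name": "my_crawler"}))
# -> class MyCrawlerItem: pass  # my_crawler
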
crawlo/core/engine.py
CHANGED
@@ -11,7 +11,7 @@ from crawlo.core.scheduler import Scheduler
 from crawlo.downloader import DownloaderBase
 from crawlo.event import spider_opened, spider_error, request_scheduled
 from crawlo.exceptions import OutputError
-from crawlo.utils.
+from crawlo.utils.misc import load_object
 from crawlo.spider import Spider
 from crawlo.task_manager import TaskManager
 from crawlo.utils.func_tools import transform
@@ -62,7 +62,7 @@ class Engine(object):
                 self.logger.warning(f"无法使用下载器类型 '{downloader_type}': {e},回退到默认配置")
 
         # 方式2: 使用 DOWNLOADER 完整类路径(兼容旧版本)
-        downloader_cls =
+        downloader_cls = load_object(self.settings.get('DOWNLOADER'))
         if not issubclass(downloader_cls, DownloaderBase):
             raise TypeError(f'下载器 {downloader_cls.__name__} 不是 DownloaderBase 的子类。')
         return downloader_cls
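The engine (and, below, the scheduler, extension manager, initializers, and middleware manager) now resolves dotted class paths from settings via load_object from the new crawlo.utils.misc module (+82 lines in this release). The helper's body is not shown in this diff; a dotted-path loader of this kind is typically a thin importlib wrapper, roughly:

import importlib

def load_object(path: str):
    """Sketch only: resolve 'package.module.Name' to the object it names."""
    module_path, _, name = path.rpartition('.')
    if not module_path:
        raise ValueError(f"{path!r} is not a full dotted path")
    module = importlib.import_module(module_path)
    return getattr(module, name)

# e.g. downloader_cls = load_object(settings.get('DOWNLOADER'))
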
crawlo/core/scheduler.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, Callable
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import set_request
 from crawlo.utils.error_handler import ErrorHandler
-from crawlo.utils.
+from crawlo.utils.misc import load_object
 from crawlo.project import common_call
 from crawlo.utils.request_serializer import RequestSerializer
 from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
@@ -26,7 +26,7 @@ class Scheduler:
 
     @classmethod
     def create_instance(cls, crawler):
-        filter_cls =
+        filter_cls = load_object(crawler.settings.get('FILTER_CLASS'))
         o = cls(
             crawler=crawler,
             dupe_filter=filter_cls.create_instance(crawler),
@@ -120,7 +120,7 @@ class Scheduler:
         # 如果需要更新配置,则执行更新
         if needs_config_update:
             # 重新创建过滤器实例,确保使用更新后的配置
-            filter_cls =
+            filter_cls = load_object(self.crawler.settings.get('FILTER_CLASS'))
             self.dupe_filter = filter_cls.create_instance(self.crawler)
 
             # 记录警告信息
@@ -136,7 +136,7 @@ class Scheduler:
             self._switch_to_memory_config()
 
         # 重新创建过滤器实例
-        filter_cls =
+        filter_cls = load_object(self.crawler.settings.get('FILTER_CLASS'))
        self.dupe_filter = filter_cls.create_instance(self.crawler)
 
     def _is_filter_matching_queue_type(self, current_filter_class):
crawlo/crawler.py
CHANGED
@@ -13,14 +13,14 @@
 
 import asyncio
 import time
-from contextlib import asynccontextmanager
-from dataclasses import dataclass
 from enum import Enum
+from dataclasses import dataclass
+from contextlib import asynccontextmanager
 from typing import Optional, Type, Dict, Any, List
 
+from crawlo.logging import get_logger
 from crawlo.factories import get_component_registry
 from crawlo.initialization import initialize_framework, is_framework_ready
-from crawlo.logging import get_logger
 
 
 class CrawlerState(Enum):
@@ -345,16 +345,23 @@ class CrawlerProcess:
     """
 
     def __init__(self, settings=None, max_concurrency: int = 3, spider_modules=None):
+        # 初始化框架配置
         self._settings = settings or initialize_framework()
         self._max_concurrency = max_concurrency
         self._crawlers: List[ModernCrawler] = []
         self._semaphore = asyncio.Semaphore(max_concurrency)
        self._logger = get_logger('crawler.process')
-
+
+        # 如果没有显式提供spider_modules,则从settings中获取
+        if spider_modules is None and self._settings:
+            spider_modules = self._settings.get('SPIDER_MODULES', [])
+            self._logger.debug(f"从settings中获取SPIDER_MODULES: {spider_modules}")
+
+        self._spider_modules = spider_modules or []  # 保存spider_modules
 
         # 如果提供了spider_modules,自动注册这些模块中的爬虫
-        if
-        self._register_spider_modules(
+        if self._spider_modules:
+            self._register_spider_modules(self._spider_modules)
 
         # 指标
         self._start_time: Optional[float] = None
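With the second hunk, CrawlerProcess reads SPIDER_MODULES from the initialized settings whenever no spider_modules argument is given, and auto-registers the spiders it finds in those modules. A construction sketch (the module path is a placeholder; the run/crawl API is not part of this hunk and is not shown):

from crawlo import CrawlerProcess  # re-exported at the package root in 1.4.5

# Explicit list wins over settings:
process = CrawlerProcess(spider_modules=['myproject.spiders'])

# No argument: SPIDER_MODULES is pulled from settings and the spiders
# found in those modules are registered automatically.
process = CrawlerProcess()
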
crawlo/downloader/__init__.py
CHANGED
@@ -15,12 +15,15 @@ Crawlo Downloader Module
 - ActivateRequestManager: 活跃请求管理器
 """
 from abc import abstractmethod, ABCMeta
-from typing import Final, Set, Optional
+from typing import Final, Set, Optional, TYPE_CHECKING
 from contextlib import asynccontextmanager
 
 from crawlo.utils.log import get_logger
 from crawlo.middleware.middleware_manager import MiddlewareManager
 
+if TYPE_CHECKING:
+    from crawlo import Response
+
 
 class ActivateRequestManager:
     """活跃请求管理器 - 跟踪和管理正在处理的请求"""
@@ -134,7 +137,7 @@ class DownloaderBase(metaclass=DownloaderMeta):
             self.logger.error(f"中间件初始化失败: {e}")
             raise
 
-    async def fetch(self, request) -> Optional[
+    async def fetch(self, request) -> 'Optional[Response]':
         """获取请求响应(经过中间件处理)"""
         if self._closed:
             raise RuntimeError(f"{self.__class__.__name__} 已关闭")
crawlo/extension/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from typing import List, Any
 from pprint import pformat
 
 from crawlo.utils.log import get_logger
-from crawlo.utils.
+from crawlo.utils.misc import load_object
 from crawlo.exceptions import ExtensionInitError
 
 
@@ -25,7 +25,7 @@ class ExtensionManager(object):
     def _add_extensions(self, extensions: List[str]) -> None:
         for extension_path in extensions:
             try:
-                extension_cls =
+                extension_cls = load_object(extension_path)
                 if not hasattr(extension_cls, 'create_instance'):
                     raise ExtensionInitError(
                         f"Extension '{extension_path}' init failed: Must have method 'create_instance()'"
crawlo/filters/aioredis_filter.py
CHANGED
@@ -142,7 +142,14 @@ class AioRedisFilter(BaseFilter):
         if redis_client is None:
             return False
 
-
+        # 使用统一的指纹生成器
+        from crawlo.utils.fingerprint import FingerprintGenerator
+        fp = str(FingerprintGenerator.request_fingerprint(
+            request.method,
+            request.url,
+            request.body or b'',
+            dict(request.headers) if hasattr(request, 'headers') else None
+        ))
         self._redis_operations += 1
 
         # 使用 pipeline 优化性能
crawlo/filters/memory_filter.py
CHANGED
@@ -102,7 +102,14 @@ class MemoryFilter(BaseFilter):
         :return: 是否重复
         """
         with self._lock:
-
+            # 使用统一的指纹生成器
+            from crawlo.utils.fingerprint import FingerprintGenerator
+            fp = FingerprintGenerator.request_fingerprint(
+                request.method,
+                request.url,
+                request.body or b'',
+                dict(request.headers) if hasattr(request, 'headers') else None
+            )
             if fp in self.fingerprints:
                 self._dupe_count += 1
                 # if self.debug:
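Both the Redis-backed and the in-memory filter now build fingerprints through the same FingerprintGenerator.request_fingerprint(method, url, body, headers) call, so the two backends deduplicate requests identically. The shared call, lifted from the two hunks above:

from crawlo.utils.fingerprint import FingerprintGenerator

def fingerprint_of(request):
    # Mirrors the call both filters make; AioRedisFilter stores str(...) of the
    # result in Redis, MemoryFilter keeps it in an in-process set.
    return FingerprintGenerator.request_fingerprint(
        request.method,
        request.url,
        request.body or b'',
        dict(request.headers) if hasattr(request, 'headers') else None,
    )
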
crawlo/initialization/built_in.py
CHANGED
@@ -210,8 +210,17 @@ class SettingsInitializer(BaseInitializer):
         from crawlo.settings.setting_manager import SettingManager
         from crawlo.project import _load_project_settings
 
-        #
-
+        # 如果上下文中已有设置,则使用它作为基础配置
+        if context.settings:
+            # 使用用户传递的设置作为基础配置
+            settings = context.settings
+            # 加载项目配置并合并
+            project_settings = _load_project_settings(context.custom_settings)
+            # 合并配置,用户配置优先
+            settings.update_attributes(project_settings.attributes)
+        else:
+            # 创建配置管理器并加载项目配置
+            settings = _load_project_settings(context.custom_settings)
 
         # 存储到上下文
         context.settings = settings
@@ -346,8 +355,8 @@ class ExtensionsInitializer(BaseInitializer):
         initialized_extensions = []
         for extension_path in extensions:
             try:
-                from crawlo.utils.
-                extension_class =
+                from crawlo.utils.misc import load_object
+                extension_class = load_object(extension_path)
                 extension_instance = extension_class()
                 initialized_extensions.append(extension_instance)
             except Exception as e:
crawlo/initialization/core.py
CHANGED
@@ -4,14 +4,14 @@
 核心初始化器 - 协调整个初始化过程
 """
 
-import time
 import threading
+import time
 from typing import Optional, Any
 
-from .context import InitializationContext
-from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition
-from .registry import get_global_registry, BaseInitializer, register_initializer
 from .built_in import register_built_in_initializers
+from .context import InitializationContext
+from .phases import InitializationPhase, PhaseResult, get_execution_order, get_phase_definition
+from .registry import get_global_registry
 
 
 class CoreInitializer:
@@ -78,6 +78,7 @@ class CoreInitializer:
         # 创建初始化上下文
         context = InitializationContext()
         context.custom_settings = kwargs
+        context.settings = settings
         self._context = context
 
         try:
crawlo/interfaces.py
ADDED
@@ -0,0 +1,24 @@
+from abc import ABC, abstractmethod
+from typing import List, Type, Protocol
+
+from crawlo.spider import Spider
+from crawlo.network.request import Request
+
+
+class ISpiderLoader(Protocol):
+    """Spider loader interface"""
+
+    @abstractmethod
+    def load(self, spider_name: str) -> Type[Spider]:
+        """Load a spider by name"""
+        pass
+
+    @abstractmethod
+    def list(self) -> List[str]:
+        """List all available spider names"""
+        pass
+
+    @abstractmethod
+    def find_by_request(self, request: Request) -> List[str]:
+        """Find spider names that can handle the given request"""
+        pass
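ISpiderLoader is a typing.Protocol, so any loader that exposes load/list/find_by_request with these signatures satisfies it structurally, without inheriting from it. A toy in-memory implementation for illustration only (not the packaged SpiderLoader in crawlo/utils/spider_loader.py):

from typing import Dict, List, Type

from crawlo.network.request import Request
from crawlo.spider import Spider


class DictSpiderLoader:
    """Toy loader backed by a plain dict; satisfies ISpiderLoader structurally."""

    def __init__(self, spiders: Dict[str, Type[Spider]]):
        self._spiders = spiders

    def load(self, spider_name: str) -> Type[Spider]:
        return self._spiders[spider_name]

    def list(self) -> List[str]:
        return sorted(self._spiders)

    def find_by_request(self, request: Request) -> List[str]:
        # Naive match: any spider whose name appears in the request URL.
        return [name for name in self._spiders if name in request.url]
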
crawlo/middleware/__init__.py
CHANGED
@@ -1,18 +1,21 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
-from
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from crawlo import Request, Response
 
 
 class BaseMiddleware(object):
-    def process_request(self, request, spider) -> None | Request | Response:
+    def process_request(self, request, spider) -> 'None | Request | Response':
         # 请求预处理
         pass
 
-    def process_response(self, request, response, spider) -> Request | Response:
+    def process_response(self, request, response, spider) -> 'Request | Response':
         # 响应预处理
         pass
 
-    def process_exception(self, request, exp, spider) -> None | Request | Response:
+    def process_exception(self, request, exp, spider) -> 'None | Request | Response':
         # 异常预处理
         pass
 
crawlo/middleware/middleware_manager.py
CHANGED
@@ -4,11 +4,18 @@ from pprint import pformat
 from types import MethodType
 from asyncio import create_task
 from collections import defaultdict
-from typing import List, Dict, Callable, Optional
+from typing import List, Dict, Callable, Optional, TYPE_CHECKING
 
-from
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from crawlo import Request, Response
+else:
+    # 为 isinstance 检查导入实际的类
+    from crawlo.network.request import Request
+    from crawlo.network.response import Response
 from crawlo.utils.log import get_logger
-from crawlo.utils.
+from crawlo.utils.misc import load_object
 from crawlo.middleware import BaseMiddleware
 from crawlo.project import common_call
 from crawlo.event import ignore_request, response_received
@@ -30,7 +37,7 @@ class MiddlewareManager:
         self.download_method: Callable = crawler.engine.downloader.download
         self._stats = crawler.stats
 
-    async def _process_request(self, request: Request):
+    async def _process_request(self, request: 'Request'):
         for method in self.methods['process_request']:
             result = await common_call(method, request, self.crawler.spider)
             if result is None:
@@ -42,7 +49,7 @@ class MiddlewareManager:
             )
         return await self.download_method(request)
 
-    async def _process_response(self, request: Request, response: Response):
+    async def _process_response(self, request: 'Request', response: 'Response'):
         for method in reversed(self.methods['process_response']):
             try:
                 response = await common_call(method, request, response, self.crawler.spider)
@@ -57,7 +64,7 @@ class MiddlewareManager:
             )
         return response
 
-    async def _process_exception(self, request: Request, exp: Exception):
+    async def _process_exception(self, request: 'Request', exp: Exception):
         for method in self.methods['process_exception']:
             response = await common_call(method, request, exp, self.crawler.spider)
             if response is None:
@@ -72,7 +79,7 @@ class MiddlewareManager:
         else:
             raise exp
 
-    async def download(self, request) -> Optional[Response]:
+    async def download(self, request) -> 'Optional[Response]':
         """ called in the download method. """
         try:
             response = await self._process_request(request)
@@ -105,7 +112,7 @@ class MiddlewareManager:
         self.logger.info(f'Enabled middlewares:\n {pformat(enabled_middlewares)}')
 
     def _validate_middleware(self, middleware):
-        middleware_cls =
+        middleware_cls = load_object(middleware)
         if not hasattr(middleware_cls, 'create_instance'):
             raise MiddlewareInitError(
                 f"Middleware init failed, must inherit from `BaseMiddleware` or have a `create_instance` method"