crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
@@ -4,9 +4,9 @@ import asyncio
 import psutil
 from typing import Any, Optional
 
-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.utils.error_handler import ErrorHandler
-from crawlo.event import
+from crawlo.event import CrawlerEvent
 
 
 class MemoryMonitorExtension:
@@ -19,7 +19,7 @@ class MemoryMonitorExtension:
         self.task: Optional[asyncio.Task] = None
         self.process = psutil.Process()
         self.settings = crawler.settings
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
         self.error_handler = ErrorHandler(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
 
         # 获取配置参数
@@ -35,8 +35,8 @@ class MemoryMonitorExtension:
             raise NotConfigured("MemoryMonitorExtension: MEMORY_MONITOR_ENABLED is False")
 
         o = cls(crawler)
-        crawler.subscriber.subscribe(o.spider_opened, event=
-        crawler.subscriber.subscribe(o.spider_closed, event=
+        crawler.subscriber.subscribe(o.spider_opened, event=CrawlerEvent.SPIDER_OPENED)
+        crawler.subscriber.subscribe(o.spider_closed, event=CrawlerEvent.SPIDER_CLOSED)
         return o
 
     async def spider_opened(self) -> None:
crawlo/extension/performance_profiler.py
CHANGED
@@ -7,9 +7,9 @@ import asyncio
 import cProfile
 from typing import Any, Optional
 
-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.utils.error_handler import ErrorHandler
-from crawlo.event import
+from crawlo.event import CrawlerEvent
 
 
 class PerformanceProfilerExtension:
@@ -20,7 +20,7 @@ class PerformanceProfilerExtension:
 
     def __init__(self, crawler: Any):
        self.settings = crawler.settings
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
         self.error_handler = ErrorHandler(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
 
         # 获取配置参数
@@ -44,8 +44,8 @@ class PerformanceProfilerExtension:
 
         o = cls(crawler)
         if o.enabled:
-            crawler.subscriber.subscribe(o.spider_opened, event=
-            crawler.subscriber.subscribe(o.spider_closed, event=
+            crawler.subscriber.subscribe(o.spider_opened, event=CrawlerEvent.SPIDER_OPENED)
+            crawler.subscriber.subscribe(o.spider_closed, event=CrawlerEvent.SPIDER_CLOSED)
         return o
 
     async def spider_opened(self) -> None:
crawlo/extension/request_recorder.py
CHANGED
@@ -5,8 +5,8 @@ import json
 from typing import Any
 from datetime import datetime
 
-from crawlo import
-from crawlo.
+from crawlo.event import CrawlerEvent
+from crawlo.logging import get_logger
 
 
 class RequestRecorderExtension:
@@ -17,7 +17,7 @@ class RequestRecorderExtension:
 
     def __init__(self, crawler: Any):
         self.settings = crawler.settings
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
 
         # 获取配置参数
         self.enabled = self.settings.get_bool('REQUEST_RECORDER_ENABLED', False)
@@ -40,9 +40,9 @@
 
         o = cls(crawler)
         if o.enabled:
-            crawler.subscriber.subscribe(o.request_scheduled, event=
-            crawler.subscriber.subscribe(o.response_received, event=
-            crawler.subscriber.subscribe(o.spider_closed, event=
+            crawler.subscriber.subscribe(o.request_scheduled, event=CrawlerEvent.REQUEST_SCHEDULED)
+            crawler.subscriber.subscribe(o.response_received, event=CrawlerEvent.RESPONSE_RECEIVED)
+            crawler.subscriber.subscribe(o.spider_closed, event=CrawlerEvent.SPIDER_CLOSED)
         return o
 
     async def request_scheduled(self, request: Any, spider: Any) -> None:
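Across the three extension diffs above the pattern is the same: the truncated `from crawlo...` imports become `from crawlo.logging import get_logger` and `from crawlo.event import CrawlerEvent`, the unbalanced `get_logger(self.__class__.__name__` call gains its closing parenthesis, and subscriptions now reference members of the `CrawlerEvent` enum. The sketch below restates that pattern as a standalone extension; it is illustrative only. The `MyExtension` class, the `MY_EXTENSION_ENABLED` setting, and the `create_instance` classmethod name are assumptions, while `get_logger`, `CrawlerEvent`, `settings.get_bool`, and `crawler.subscriber.subscribe` appear in the hunks above.

from typing import Any

from crawlo.event import CrawlerEvent
from crawlo.logging import get_logger


class MyExtension:
    """Illustrative extension following the subscription pattern shown above."""

    def __init__(self, crawler: Any):
        self.settings = crawler.settings
        self.logger = get_logger(self.__class__.__name__)
        # MY_EXTENSION_ENABLED is a hypothetical setting name.
        self.enabled = self.settings.get_bool('MY_EXTENSION_ENABLED', False)

    @classmethod
    def create_instance(cls, crawler: Any) -> 'MyExtension':
        # Mirrors the `o = cls(crawler); ...; return o` shape in the diffs above;
        # the classmethod name itself is an assumption.
        o = cls(crawler)
        if o.enabled:
            crawler.subscriber.subscribe(o.spider_opened, event=CrawlerEvent.SPIDER_OPENED)
            crawler.subscriber.subscribe(o.spider_closed, event=CrawlerEvent.SPIDER_CLOSED)
        return o

    async def spider_opened(self) -> None:
        self.logger.info('spider opened')

    async def spider_closed(self) -> None:
        self.logger.info('spider closed')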
crawlo/factories/base.py
CHANGED
crawlo/factories/crawler.py
CHANGED
@@ -31,73 +31,74 @@ class CrawlerComponentFactory(ComponentFactory):
         return component_type.__name__ in supported_types
 
 
+# Engine组件
+def create_engine(crawler, **kwargs):
+    from crawlo.core.engine import Engine
+    return Engine(crawler)
+
+# Scheduler组件
+def create_scheduler(crawler, **kwargs):
+    from crawlo.core.scheduler import Scheduler
+    return Scheduler.create_instance(crawler)
+
+# StatsCollector组件
+def create_stats(crawler, **kwargs):
+    from crawlo.stats_collector import StatsCollector
+    return StatsCollector(crawler)
+
+# Subscriber组件
+def create_subscriber(**kwargs):
+    from crawlo.subscriber import Subscriber
+    return Subscriber()
+
+# ExtensionManager组件
+def create_extension_manager(crawler, **kwargs):
+    from crawlo.extension import ExtensionManager
+    return ExtensionManager.create_instance(crawler)
+
 def register_crawler_components():
     """注册Crawler相关组件"""
-
+    from .utils import register_components
 
     # 注册工厂
+    registry = get_component_registry()
     registry.register_factory(CrawlerComponentFactory())
 
-    #
-    … (removed lines 42-73 not rendered in the diff view)
-        component_type=type('StatsCollector', (), {}),
-        factory_func=create_stats,
-        dependencies=['crawler']
-    ))
-
-    # Subscriber组件
-    def create_subscriber(**kwargs):
-        from crawlo.subscriber import Subscriber
-        return Subscriber()
-
-    registry.register(ComponentSpec(
-        name='subscriber',
-        component_type=type('Subscriber', (), {}),
-        factory_func=create_subscriber
-    ))
-
-    # ExtensionManager组件
-    def create_extension_manager(crawler, **kwargs):
-        from crawlo.extension import ExtensionManager
-        return ExtensionManager.create_instance(crawler)
+    # 批量注册组件
+    component_list = [
+        {
+            'name': 'engine',
+            'component_type': 'Engine',
+            'factory_func': create_engine,
+            'dependencies': ['crawler']
+        },
+        {
+            'name': 'scheduler',
+            'component_type': 'Scheduler',
+            'factory_func': create_scheduler,
+            'dependencies': ['crawler']
+        },
+        {
+            'name': 'stats',
+            'component_type': 'StatsCollector',
+            'factory_func': create_stats,
+            'dependencies': ['crawler']
+        },
+        {
+            'name': 'subscriber',
+            'component_type': 'Subscriber',
+            'factory_func': create_subscriber,
+            'dependencies': []
+        },
+        {
+            'name': 'extension_manager',
+            'component_type': 'ExtensionManager',
+            'factory_func': create_extension_manager,
+            'dependencies': ['crawler']
+        }
+    ]
 
-
-        name='extension_manager',
-        component_type=type('ExtensionManager', (), {}),
-        factory_func=create_extension_manager,
-        dependencies=['crawler']
-    ))
+    register_components(component_list)
 
 
 # 自动注册
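The rewritten `register_crawler_components()` above trades per-component `registry.register(ComponentSpec(...))` calls for a flat list of dicts passed to `register_components()`. A minimal sketch of adding one more entry under the same conventions; the `monitor` component and `myproject.monitor.Monitor` class are hypothetical, and only the dict layout and the `register_components()` call come from the diff.

from crawlo.factories.utils import register_components


def create_monitor(crawler, **kwargs):
    # Placeholder factory: Monitor is not a crawlo component.
    from myproject.monitor import Monitor
    return Monitor(crawler)


# Same dict layout as the entries in register_crawler_components().
extra_components = [
    {
        'name': 'monitor',
        'component_type': 'Monitor',      # a string is turned into a dynamic type by register_component()
        'factory_func': create_monitor,
        'dependencies': ['crawler'],
    },
]

register_components(extra_components)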
crawlo/factories/utils.py
ADDED
@@ -0,0 +1,135 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+工厂工具模块 - 提供通用的组件注册和创建工具
+"""
+
+from typing import Any, Callable, List, Optional, Type, Union
+from .base import ComponentSpec
+from .registry import get_component_registry
+
+
+def register_component(
+    name: str,
+    component_type: Union[Type, str],
+    factory_func: Callable[..., Any],
+    dependencies: Optional[List[str]] = None,
+    singleton: bool = False,
+    config_key: Optional[str] = None
+) -> None:
+    """
+    注册组件的便捷函数
+
+    Args:
+        name: 组件名称
+        component_type: 组件类型
+        factory_func: 工厂函数
+        dependencies: 依赖列表
+        singleton: 是否单例
+        config_key: 配置键名
+    """
+    registry = get_component_registry()
+
+    # 如果component_type是字符串,创建一个动态类型
+    if isinstance(component_type, str):
+        component_type = type(component_type, (), {})
+
+    spec_kwargs = {
+        'name': name,
+        'component_type': component_type,
+        'factory_func': factory_func,
+        'dependencies': dependencies or [],
+        'singleton': singleton
+    }
+
+    # 只有当config_key不为None时才添加
+    if config_key is not None:
+        spec_kwargs['config_key'] = config_key
+
+    spec = ComponentSpec(**spec_kwargs)
+
+    registry.register(spec)
+
+
+def register_components(component_list: List[dict]) -> None:
+    """
+    批量注册组件
+
+    Args:
+        component_list: 组件定义列表,每个元素是一个包含组件信息的字典
+    """
+    for component_info in component_list:
+        register_component(**component_info)
+
+
+def create_component_factory(
+    component_name: str,
+    module_path: str,
+    class_name: str,
+    dependencies: Optional[List[str]] = None,
+    singleton: bool = False
+) -> Callable[..., Any]:
+    """
+    创建组件工厂函数的便捷函数
+
+    Args:
+        component_name: 组件名称(用于错误信息)
+        module_path: 模块路径
+        class_name: 类名
+        dependencies: 依赖列表
+        singleton: 是否单例
+
+    Returns:
+        工厂函数
+    """
+    def factory_func(*args, **kwargs):
+        try:
+            # 动态导入模块
+            module = __import__(module_path, fromlist=[class_name])
+            component_class = getattr(module, class_name)
+
+            # 检查是否需要调用create_instance方法
+            if hasattr(component_class, 'create_instance'):
+                return component_class.create_instance(*args, **kwargs)
+            else:
+                return component_class(*args, **kwargs)
+        except Exception as e:
+            raise RuntimeError(f"Failed to create {component_name}: {e}")
+
+    return factory_func
+
+
+def create_crawler_component_factory(
+    component_name: str,
+    module_path: str,
+    class_name: str
+) -> Callable[..., Any]:
+    """
+    创建需要crawler依赖的组件工厂函数
+
+    Args:
+        component_name: 组件名称
+        module_path: 模块路径
+        class_name: 类名
+
+    Returns:
+        工厂函数
+    """
+    def factory_func(crawler=None, **kwargs):
+        if crawler is None:
+            raise ValueError(f"Crawler instance required for component {component_name}")
+
+        try:
+            # 动态导入模块
+            module = __import__(module_path, fromlist=[class_name])
+            component_class = getattr(module, class_name)
+
+            # 检查是否需要调用create_instance方法
+            if hasattr(component_class, 'create_instance'):
+                return component_class.create_instance(crawler, **kwargs)
+            else:
+                return component_class(crawler, **kwargs)
+        except Exception as e:
+            raise RuntimeError(f"Failed to create {component_name}: {e}")
+
+    return factory_func
crawlo/filters/__init__.py
CHANGED
@@ -17,7 +17,7 @@ Crawlo过滤器模块
 from abc import ABC, abstractmethod
 from typing import Optional
 
-from crawlo.utils.
+from crawlo.utils.fingerprint import FingerprintGenerator
 
 
 class BaseFilter(ABC):
@@ -46,6 +46,23 @@ class BaseFilter(ABC):
     def create_instance(cls, *args, **kwargs) -> 'BaseFilter':
         return cls(*args, **kwargs)
 
+    def _get_fingerprint(self, request) -> str:
+        """
+        获取请求指纹(内部辅助方法)
+
+        使用统一的 FingerprintGenerator 生成请求指纹。
+        子类可以直接调用此方法,避免重复实现。
+
+        :param request: 请求对象
+        :return: 请求指纹字符串
+        """
+        return FingerprintGenerator.request_fingerprint(
+            request.method,
+            request.url,
+            request.body or b'',
+            dict(request.headers) if hasattr(request, 'headers') else {}
+        )
+
     def requested(self, request) -> bool:
         """
         检查请求是否重复(主要接口)
@@ -54,7 +71,7 @@ class BaseFilter(ABC):
         :return: True 表示重复,False 表示新请求
         """
         self._request_count += 1
-        fp =
+        fp = self._get_fingerprint(request)
 
         if fp in self:
             self._duplicate_count += 1