crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/middleware/__init__.py
CHANGED
@@ -1,24 +1,87 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional, Union

 if TYPE_CHECKING:
     from crawlo import Request, Response


-class BaseMiddleware
-
-
+class BaseMiddleware:
+    """Middleware base class.
+
+    Defines the standard middleware interface; every custom middleware
+    should inherit from this class.
+
+    Processing flow:
+    1. process_request: runs before a request is sent
+    2. process_response: runs after a response is received
+    3. process_exception: runs when an exception is raised
+    """
+
+    def process_request(
+        self,
+        request: 'Request',
+        spider
+    ) -> Optional[Union['Request', 'Response']]:
+        """Process a request.
+
+        Args:
+            request: the request about to be processed
+            spider: the current spider instance
+
+        Returns:
+            None: continue processing
+            Request: replace the original request
+            Response: skip the download and return this response directly
+        """
         pass

-    def process_response(
-
-
+    def process_response(
+        self,
+        request: 'Request',
+        response: 'Response',
+        spider
+    ) -> Union['Request', 'Response']:
+        """Process a response.
+
+        Args:
+            request: the original request
+            response: the received response
+            spider: the current spider instance
+
+        Returns:
+            Request: re-schedule the request
+            Response: return the (possibly modified) response
+        """
+        return response

-    def process_exception(
-
+    def process_exception(
+        self,
+        request: 'Request',
+        exp: Exception,
+        spider
+    ) -> Optional[Union['Request', 'Response']]:
+        """Handle an exception.
+
+        Args:
+            request: the request that raised the exception
+            exp: the captured exception
+            spider: the current spider instance
+
+        Returns:
+            None: keep propagating the exception
+            Request: re-schedule the request
+            Response: return a response
+        """
         pass

     @classmethod
     def create_instance(cls, crawler):
+        """Create a middleware instance.
+
+        Args:
+            crawler: the Crawler instance, carrying settings and other configuration
+
+        Returns:
+            The middleware instance
+        """
         return cls()
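The hunk above turns BaseMiddleware into a documented interface. As a rough sketch of how a custom middleware would plug into it, the example below attaches a header to every request; the AUTH_TOKEN setting and the Authorization header are hypothetical, while BaseMiddleware, create_instance, and crawler.settings.get come from the diff itself.

# Sketch of a custom middleware against the BaseMiddleware interface above.
# The AUTH_TOKEN setting and the Authorization header are hypothetical
# examples; they are not part of crawlo.
from crawlo.middleware import BaseMiddleware


class AuthHeaderMiddleware(BaseMiddleware):
    """Attach a bearer token to every outgoing request."""

    def __init__(self, token):
        self.token = token

    @classmethod
    def create_instance(cls, crawler):
        # crawler.settings.get(...) mirrors the pattern the built-in
        # middlewares in this diff use to read configuration.
        return cls(token=crawler.settings.get('AUTH_TOKEN'))

    def process_request(self, request, spider):
        # Returning None lets the request continue down the chain.
        request.headers['Authorization'] = f'Bearer {self.token}'
        return None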
crawlo/middleware/default_header.py
CHANGED
@@ -6,7 +6,7 @@ DefaultHeaderMiddleware middleware
 """

 import random
-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.exceptions import NotConfiguredError
 # Import the User-Agent data
 from crawlo.data.user_agents import get_user_agents
@@ -22,7 +22,7 @@ class DefaultHeaderMiddleware(object):
         """
         Initialize the middleware
         """
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)

         # Fetch the default request header configuration
         self.headers = settings.get_dict('DEFAULT_REQUEST_HEADERS', {})
crawlo/middleware/download_delay.py
CHANGED
@@ -7,7 +7,7 @@ DownloadDelayMiddleware middleware

 from asyncio import sleep
 from random import uniform
-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.exceptions import NotConfiguredError


@@ -51,7 +51,7 @@ class DownloadDelayMiddleware(object):
             # If the configuration is incomplete, use defaults
             self.floor, self.upper = 0.5, 1.5

-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
         self.stats = stats

     @classmethod
crawlo/middleware/middleware_manager.py
CHANGED
@@ -14,11 +14,11 @@ else:
     # Import the real classes for isinstance checks
     from crawlo.network.request import Request
     from crawlo.network.response import Response
-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.utils.misc import load_object
 from crawlo.middleware import BaseMiddleware
 from crawlo.project import common_call
-from crawlo.event import
+from crawlo.event import CrawlerEvent
 from crawlo.exceptions import MiddlewareInitError, InvalidOutputError, RequestMethodError, IgnoreRequestError, \
     NotConfiguredError

@@ -27,7 +27,7 @@ class MiddlewareManager:

     def __init__(self, crawler):
         self.crawler = crawler
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
         self.middlewares: List = []
         self.methods: Dict[str, List[MethodType]] = defaultdict(list)
         middlewares = self.crawler.settings.get_list('MIDDLEWARES')
@@ -54,7 +54,7 @@ class MiddlewareManager:
         try:
             response = await common_call(method, request, response, self.crawler.spider)
         except IgnoreRequestError as exp:
-            create_task(self.crawler.subscriber.notify(
+            create_task(self.crawler.subscriber.notify(CrawlerEvent.IGNORE_REQUEST, exp, request, self.crawler.spider))
         if isinstance(response, Request):
             return response
         if isinstance(response, Response):
@@ -86,13 +86,13 @@ class MiddlewareManager:
         except KeyError:
             raise RequestMethodError(f"{request.method.lower()} is not supported")
         except IgnoreRequestError as exp:
-            create_task(self.crawler.subscriber.notify(
+            create_task(self.crawler.subscriber.notify(CrawlerEvent.IGNORE_REQUEST, exp, request, self.crawler.spider))
             response = await self._process_exception(request, exp)
         except Exception as exp:
             self._stats.inc_value(f'download_error/{exp.__class__.__name__}')
             response = await self._process_exception(request, exp)
         else:
-            create_task(self.crawler.subscriber.notify(
+            create_task(self.crawler.subscriber.notify(CrawlerEvent.RESPONSE_RECEIVED, response, self.crawler.spider))
             self._stats.inc_value('response_received_count')
             if isinstance(response, Response):
                 response = await self._process_response(request, response)
crawlo/middleware/offsite.py
CHANGED
@@ -7,7 +7,7 @@ OffsiteMiddleware middleware
 import re
 from urllib.parse import urlparse

-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.exceptions import IgnoreRequestError


@@ -18,7 +18,7 @@ class OffsiteMiddleware:
     """

     def __init__(self, stats, log_level, allowed_domains=None):
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
         self.stats = stats
         self.allowed_domains = allowed_domains or []
crawlo/middleware/proxy.py
CHANGED
@@ -9,14 +9,14 @@ from urllib.parse import urlparse
 from typing import Optional, List

 from crawlo.network import Request, Response
-from crawlo.
+from crawlo.logging import get_logger


 class ProxyMiddleware:
     """Generic proxy middleware"""

     def __init__(self, settings, log_level):
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)

         # Fetch the proxy list and API URL
         self.proxies: List[str] = settings.get("PROXY_LIST", [])
crawlo/middleware/request_ignore.py
CHANGED
@@ -4,9 +4,9 @@
 RequestIgnoreMiddleware middleware
 Handles and records ignored requests
 """
-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.exceptions import IgnoreRequestError
-from crawlo.event import
+from crawlo.event import CrawlerEvent


 class RequestIgnoreMiddleware(object):
@@ -23,7 +23,7 @@ class RequestIgnoreMiddleware(object):
             stats: statistics collector
             log_level: log level
         """
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
         self.stats = stats

     @classmethod
@@ -38,7 +38,7 @@ class RequestIgnoreMiddleware(object):
             RequestIgnoreMiddleware: the middleware instance
         """
         o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
-        crawler.subscriber.subscribe(o.request_ignore, event=
+        crawler.subscriber.subscribe(o.request_ignore, event=CrawlerEvent.IGNORE_REQUEST)
         return o

     async def request_ignore(self, exc, request, _spider):
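The create_instance hunk above also illustrates the event wiring that the CrawlerEvent members enable: a coroutine handler is registered on the crawler's subscriber for a given event. A minimal sketch of the same pattern follows, assuming the (exc, request, spider) handler signature shown in request_ignore; the stats key is illustrative.

# Sketch of the CrawlerEvent subscription pattern from the hunk above.
# subscribe() and CrawlerEvent.IGNORE_REQUEST come from the diff; the
# counter key is a made-up example.
from crawlo.event import CrawlerEvent


class IgnoredRequestCounter:

    @classmethod
    def create_instance(cls, crawler):
        o = cls()
        o.stats = crawler.stats
        crawler.subscriber.subscribe(o.on_ignore, event=CrawlerEvent.IGNORE_REQUEST)
        return o

    async def on_ignore(self, exc, request, _spider):
        # Same (exc, request, spider) shape as request_ignore above.
        self.stats.inc_value('request_ignored/custom')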
crawlo/middleware/response_code.py
CHANGED
@@ -4,7 +4,7 @@
 ResponseCodeMiddleware middleware
 Handles HTTP response status codes, records statistics, and supports special-case status handling
 """
-from crawlo.
+from crawlo.logging import get_logger


 class ResponseCodeMiddleware(object):
@@ -27,7 +27,7 @@ class ResponseCodeMiddleware(object):
             stats: statistics collector
             log_level: log level
         """
-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)
         self.stats = stats

     @classmethod
crawlo/middleware/response_filter.py
CHANGED
@@ -4,7 +4,7 @@
 ResponseFilterMiddleware middleware
 Filters out HTTP responses that do not meet requirements; the allowed status codes are customizable
 """
-from crawlo.
+from crawlo.logging import get_logger
 from crawlo.exceptions import IgnoreRequestError


@@ -47,7 +47,7 @@ class ResponseFilterMiddleware:
         except (ValueError, TypeError):
             pass  # Ignore invalid status codes

-        self.logger = get_logger(self.__class__.__name__
+        self.logger = get_logger(self.__class__.__name__)

     @classmethod
     def create_instance(cls, crawler):
crawlo/middleware/retry.py
CHANGED
crawlo/mode_manager.py
CHANGED
@@ -34,7 +34,7 @@ class ModeManager:
         """Fetch the logger instance lazily"""
         if self._logger is None:
             try:
-                from crawlo.
+                from crawlo.logging import get_logger
                 self._logger = get_logger(__name__)
             except Exception:
                 # If the logging system is not initialized yet, return None
@@ -248,6 +248,40 @@ def auto_mode(

 # Environment variable support
 def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
-    """Create configuration from environment variables
-
-
+    """Create configuration from environment variables.
+
+    Supported environment variables:
+    - CRAWLO_MODE: run mode (standalone/distributed/auto)
+    - CRAWLO_REDIS_HOST: Redis host
+    - CRAWLO_REDIS_PORT: Redis port
+    - CRAWLO_REDIS_PASSWORD: Redis password
+    - CRAWLO_REDIS_DB: Redis database number
+    - CRAWLO_PROJECT_NAME: project name
+    - CRAWLO_CONCURRENCY: concurrency level
+
+    Args:
+        default_mode: run mode used when CRAWLO_MODE is unset
+
+    Returns:
+        A configuration dictionary
+    """
+    mode = os.getenv('CRAWLO_MODE', default_mode).lower()
+
+    kwargs = {}
+
+    # Settings specific to distributed mode
+    if mode == 'distributed':
+        kwargs['redis_host'] = os.getenv('CRAWLO_REDIS_HOST', '127.0.0.1')
+        kwargs['redis_port'] = int(os.getenv('CRAWLO_REDIS_PORT', '6379'))
+        if password := os.getenv('CRAWLO_REDIS_PASSWORD'):
+            kwargs['redis_password'] = password
+        kwargs['redis_db'] = int(os.getenv('CRAWLO_REDIS_DB', '0'))
+
+    # Common settings
+    if project_name := os.getenv('CRAWLO_PROJECT_NAME'):
+        kwargs['project_name'] = project_name
+
+    if concurrency := os.getenv('CRAWLO_CONCURRENCY'):
+        kwargs['CONCURRENCY'] = int(concurrency)
+
+    return ModeManager().resolve_mode_settings(mode, **kwargs)
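A minimal driver for the new from_env helper might look like the sketch below; the environment values are examples only, and the exact keys of the returned dictionary depend on what resolve_mode_settings produces.

# Sketch of configuring a run through the environment variables that
# from_env (above) reads. Values are examples only.
import os

from crawlo.mode_manager import from_env

os.environ['CRAWLO_MODE'] = 'distributed'
os.environ['CRAWLO_REDIS_HOST'] = '10.0.0.5'
os.environ['CRAWLO_REDIS_PORT'] = '6379'
os.environ['CRAWLO_CONCURRENCY'] = '16'

settings = from_env(default_mode='standalone')
# settings now carries the resolved distributed-mode configuration.
print(settings)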
crawlo/network/request.py
CHANGED
@@ -12,42 +12,32 @@ HTTP Request wrapper module
 """
 import json
 from copy import deepcopy
-from
-from
+from enum import IntEnum
+from urllib.parse import urldefrag, urlencode, urlparse, urlunparse, parse_qsl
+from w3lib.url import safe_url_string, add_or_replace_parameter
 from typing import Dict, Optional, Callable, Union, Any, TypeVar, List



 _Request = TypeVar("_Request", bound="Request")


-class RequestPriority:
-    """
+class RequestPriority(IntEnum):
+    """
+    Request priority enum.
+
+    Lower values mean higher priority. IntEnum is used so members can be
+    used directly as integers.
+
+    Examples:
+        >>> request = Request(url, priority=RequestPriority.HIGH)
+        >>> request.priority = RequestPriority.URGENT
+    """
     URGENT = -200       # urgent tasks
     HIGH = -100         # high priority
     NORMAL = 0          # normal priority (default)
     LOW = 100           # low priority
     BACKGROUND = 200    # background tasks
-
-    @classmethod
-    def get_all_priorities(cls) -> Dict[str, int]:
-        """Return all priority constants"""
-        return {
-            'URGENT': cls.URGENT,
-            'HIGH': cls.HIGH,
-            'NORMAL': cls.NORMAL,
-            'LOW': cls.LOW,
-            'BACKGROUND': cls.BACKGROUND
-        }
-
-    @classmethod
-    def from_string(cls, priority_str: str) -> int:
-        """Resolve a priority value from its name"""
-        priorities = cls.get_all_priorities()
-        if priority_str.upper() not in priorities:
-            raise ValueError(f"Unsupported priority: {priority_str}, supported: {list(priorities.keys())}")
-        return priorities[priority_str.upper()]


 class Request:
@@ -87,6 +77,7 @@ class Request:
         self,
         url: str,
         callback: Optional[Callable] = None,
+        err_back: Optional[Callable] = None,
         method: Optional[str] = 'GET',
         headers: Optional[Dict[str, str]] = None,
         body: Optional[Union[bytes, str, Dict[Any, Any]]] = None,
@@ -114,6 +105,7 @@ class Request:

         :param url: request URL (required)
         :param callback: success callback
+        :param err_back: error callback
         :param method: HTTP method, defaults to GET
         :param headers: request headers
         :param body: raw request body (bytes/str); a dict is converted to JSON automatically when json_body/form_data are not used
@@ -134,6 +126,7 @@ class Request:
         :param encoding: character encoding, defaults to utf-8
         """
         self.callback = callback
+        self.err_back = err_back
         self.method = str(method).upper()
         self.headers = headers or {}
         self.cookies = cookies or {}
@@ -229,7 +222,7 @@ class Request:
         """Safely deepcopy meta: strip loggers before copying"""
         import logging

-        def clean_logger_recursive(obj):
+        def clean_logger_recursive(obj: Any) -> Any:
             """Recursively strip logger objects"""
             if isinstance(obj, logging.Logger):
                 return None
@@ -251,6 +244,9 @@ class Request:

         # Strip loggers first, then deepcopy
         cleaned_meta = clean_logger_recursive(meta)
+        # Make sure a dictionary is returned
+        if not isinstance(cleaned_meta, dict):
+            return {}
         return deepcopy(cleaned_meta)

     def copy(self: _Request) -> _Request:
@@ -377,4 +373,36 @@ class Request:

     def __lt__(self, other: _Request) -> bool:
         """Used for ordering by priority"""
-        return self.priority < other.priority
+        return self.priority < other.priority
+
+
+def escape_ajax(url: str) -> str:
+    """
+    Transform a URL according to Google's AJAX crawling scheme (handles #! fragments):
+    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
+
+    Rules:
+    1. Only URLs containing `#!` are transformed (they mark an AJAX-crawlable page)
+    2. `#!key=value` becomes `?_escaped_fragment_=key%3Dvalue`
+    3. Existing query parameters are preserved
+
+    Examples:
+        >>> escape_ajax("www.example.com/ajax.html#!key=value")
+        'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
+        >>> escape_ajax("www.example.com/ajax.html?k1=v1#!key=value")
+        'www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue'
+        >>> escape_ajax("www.example.com/ajax.html#!")
+        'www.example.com/ajax.html?_escaped_fragment_='
+
+    URLs that are not AJAX-crawlable (no #!) are returned unchanged:
+        >>> escape_ajax("www.example.com/ajax.html#normal")
+        'www.example.com/ajax.html#normal'
+    """
+    # Split the URL into its base part and fragment
+    de_frag, frag = urldefrag(url)
+
+    # Only handle fragments that start with "!" (per the Google scheme)
+    if not frag.startswith("!"):
+        return url  # return unchanged when the rule does not apply
+
+    # Add the `_escaped_fragment_` parameter via the helper
+    return add_or_replace_parameter(de_frag, "_escaped_fragment_", frag[1:])