crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/utils/log.py
DELETED
@@ -1,80 +0,0 @@
-# ==================== Backward-compatible logging interface ====================
-# The main functionality has been migrated to the crawlo.logging module
-# This file provides a backward-compatible interface while also supporting the new logging system
-
-import logging
-from typing import Optional, Any
-
-# Backward compatibility: import the new logging system
-try:
-    from crawlo.logging import get_logger as new_get_logger, configure_logging
-
-    _NEW_LOGGING_AVAILABLE = True
-except ImportError:
-    _NEW_LOGGING_AVAILABLE = False
-    new_get_logger = None
-    configure_logging = None
-
-LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
-
-
-# Backward-compatible logging functions
-def get_logger(name: str = 'default', level: Optional[int] = None):
-    """Get a Logger instance - backward-compatible function"""
-    if _NEW_LOGGING_AVAILABLE and new_get_logger:
-        # Use the new logging system
-        return new_get_logger(name)
-    else:
-        # Fall back to plain Python logging
-        logger = logging.getLogger(name)
-        if not logger.handlers:
-            handler = logging.StreamHandler()
-            formatter = logging.Formatter(LOG_FORMAT)
-            handler.setFormatter(formatter)
-            logger.addHandler(handler)
-            logger.setLevel(level or logging.INFO)
-        return logger
-
-
-def get_component_logger(component_class: Any, settings: Optional[Any] = None, level: Optional[str] = None):
-    """
-    Get a component Logger - the recommended way to create loggers for components
-
-    Args:
-        component_class: the component class
-        settings: settings object used to read log-level configuration
-        level: log level (lower priority than the value configured in settings)
-
-    Returns:
-        logging.Logger: a configured Logger instance
-    """
-    # Determine the component name
-    if hasattr(component_class, '__name__'):
-        component_name = component_class.__name__
-    else:
-        component_name = str(component_class)
-
-    # If the new logging system is available, use it
-    if _NEW_LOGGING_AVAILABLE and new_get_logger:
-        return new_get_logger(component_name)
-
-    # Otherwise use the backward-compatible path
-    # Read the log level from settings (if provided)
-    if settings is not None:
-        # Try to get a component-specific log level from settings
-        if hasattr(settings, 'get'):
-            # Check for a component-specific log-level setting
-            component_level = settings.get(f'LOG_LEVEL_{component_name}')
-            if component_level is not None:
-                level = component_level
-            else:
-                # Check the general log level
-                general_level = settings.get('LOG_LEVEL')
-                if general_level is not None:
-                    level = general_level
-
-    # Convert the log level
-    if isinstance(level, str):
-        level = getattr(logging, level.upper(), logging.INFO)
-
-    return get_logger(component_name, level)
crawlo/utils/system.py
DELETED
crawlo/utils/tools.py
DELETED
crawlo/utils/url.py
DELETED
@@ -1,40 +0,0 @@
-from urllib.parse import urldefrag
-from w3lib.url import add_or_replace_parameter
-
-
-def escape_ajax(url: str) -> str:
-    """
-    Convert a URL according to Google's AJAX crawling specification (handles the #! hash fragment):
-    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
-
-    Rules:
-    1. Only convert when the URL contains `#!` (which marks an AJAX-crawlable page)
-    2. Rewrite `#!key=value` as `?_escaped_fragment_=key%3Dvalue`
-    3. Preserve any original query parameters (if present)
-
-    Examples:
-    >>> escape_ajax("www.example.com/ajax.html#!key=value")
-    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
-    >>> escape_ajax("www.example.com/ajax.html?k1=v1#!key=value")
-    'www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue'
-    >>> escape_ajax("www.example.com/ajax.html#!")
-    'www.example.com/ajax.html?_escaped_fragment_='
-
-    URLs that are not AJAX-crawlable (no #!) are returned unchanged:
-    >>> escape_ajax("www.example.com/ajax.html#normal")
-    'www.example.com/ajax.html#normal'
-    """
-    # Split the URL into its base part and hash fragment
-    de_frag, frag = urldefrag(url)
-
-    # Only handle hash fragments that start with "!" (per the Google spec)
-    if not frag.startswith("!"):
-        return url  # return unchanged if the rule does not apply
-
-    # Use the helper to add the `_escaped_fragment_` parameter
-    return add_or_replace_parameter(de_frag, "_escaped_fragment_", frag[1:])
-
-
-if __name__ == '__main__':
-    f = escape_ajax('http://example.com/page#!')
-    print(f)
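No replacement for the removed `escape_ajax` helper appears in this diff. A rough standard-library stand-in, sketched from the docstring above; behaviour may differ from `w3lib.url.add_or_replace_parameter` in edge cases such as parameter ordering or an existing `_escaped_fragment_` value:

```python
# Standard-library sketch of the removed escape_ajax helper (see docstring above).
from urllib.parse import parse_qsl, urldefrag, urlencode, urlsplit, urlunsplit

def escape_ajax(url: str) -> str:
    base, frag = urldefrag(url)
    if not frag.startswith("!"):
        return url  # not AJAX-crawlable, return unchanged
    scheme, netloc, path, query, _ = urlsplit(base)
    params = [(k, v) for k, v in parse_qsl(query, keep_blank_values=True)
              if k != "_escaped_fragment_"]
    params.append(("_escaped_fragment_", frag[1:]))
    return urlunsplit((scheme, netloc, path, urlencode(params), ""))

print(escape_ajax("www.example.com/ajax.html?k1=v1#!key=value"))
# www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue
```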
crawlo-1.4.6.dist-info/METADATA
DELETED
@@ -1,329 +0,0 @@
-Metadata-Version: 2.4
-Name: crawlo
-Version: 1.4.6
-Summary: Crawlo is a high-performance Python crawler framework based on asynchronous I/O, with support for distributed crawling.
-Home-page: https://github.com/crawl-coder/Crawlo.git
-Author: crawl-coder
-Author-email: crawlo@qq.com
-License: MIT
-Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.6
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.12.14
-Requires-Dist: aiomysql>=0.2.0
-Requires-Dist: aioredis>=2.0.1
-Requires-Dist: asyncmy>=0.2.10
-Requires-Dist: cssselect>=1.2.0
-Requires-Dist: dateparser>=1.2.2
-Requires-Dist: httpx[http2]>=0.27.0
-Requires-Dist: curl-cffi>=0.13.0
-Requires-Dist: lxml>=5.2.1
-Requires-Dist: motor>=3.7.0
-Requires-Dist: parsel>=1.9.1
-Requires-Dist: pydantic>=2.11.7
-Requires-Dist: pymongo>=4.11
-Requires-Dist: PyMySQL>=1.1.1
-Requires-Dist: python-dateutil>=2.9.0.post0
-Requires-Dist: redis>=6.2.0
-Requires-Dist: requests>=2.32.4
-Requires-Dist: six>=1.17.0
-Requires-Dist: ujson>=5.9.0
-Requires-Dist: urllib3>=2.5.0
-Requires-Dist: w3lib>=2.1.2
-Requires-Dist: rich>=14.1.0
-Requires-Dist: astor>=0.8.1
-Requires-Dist: watchdog>=6.0.0
-Provides-Extra: render
-Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
-Requires-Dist: playwright; extra == "render"
-Requires-Dist: selenium>=3.141.0; extra == "render"
-Provides-Extra: all
-Requires-Dist: bitarray>=1.5.3; extra == "all"
-Requires-Dist: PyExecJS>=1.5.1; extra == "all"
-Requires-Dist: pymongo>=3.10.1; extra == "all"
-Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
-Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
-Requires-Dist: playwright; extra == "all"
-Requires-Dist: selenium>=3.141.0; extra == "all"
-
-# Crawlo Crawler Framework
-
-Crawlo is a high-performance, extensible Python crawler framework that supports both standalone and distributed deployment.
-
-## Features
-
-- High-performance asynchronous crawling
-- Multiple downloaders (aiohttp, httpx, curl-cffi)
-- Built-in data cleaning and validation
-- Distributed crawling support
-- Flexible middleware system
-- Powerful configuration management
-- Detailed logging and monitoring
-- Windows and Linux compatible
-
-## Installation
-
-```bash
-pip install crawlo
-```
-
-Or install from source:
-
-```bash
-git clone git@github.com:crawl-coder/Crawlo.git
-cd crawlo
-pip install -r requirements.txt
-pip install .
-```
-
-## Quick Start
-
-```python
-from crawlo import Spider
-
-class MySpider(Spider):
-    name = 'example'
-
-    def parse(self, response):
-        # parsing logic
-        pass
-
-# Run the spider
-# crawlo run example
-```
-
-## Response Object Features
-
-The Crawlo framework enhances the Response object with additional convenience methods:
-
-### URL Handling
-
-The URL helpers wrapped on the Response object make common URL operations easy, without manually importing functions from `urllib.parse`:
-
-```python
-class MySpider(Spider):
-    def parse(self, response):
-        # 1. Resolve relative and absolute URLs
-        absolute_url = response.urljoin('/relative/path')
-
-        # 2. Parse URL components
-        parsed = response.urlparse()  # parse the current response URL
-        scheme = parsed.scheme
-        domain = parsed.netloc
-        path = parsed.path
-
-        # 3. Parse query parameters
-        query_params = response.parse_qs()  # parse the current URL's query parameters
-
-        # 4. Encode query parameters
-        new_query = response.urlencode({'key': 'value', 'name': '测试'})
-
-        # 5. URL encode/decode
-        encoded = response.quote('hello world 你好')
-        decoded = response.unquote(encoded)
-
-        # 6. Strip the URL fragment
-        url_without_fragment, fragment = response.urldefrag('http://example.com/path#section')
-
-        yield Request(url=absolute_url, callback=self.parse_detail)
-```
-
-### Improved Encoding Detection
-
-Following Scrapy's design, the Crawlo framework improves the Response object's encoding detection for more accurate and reliable results:
-
-```python
-class MySpider(Spider):
-    def parse(self, response):
-        # Automatically detect the response encoding
-        encoding = response.encoding
-
-        # Get the declared encoding (Request encoding > BOM > HTTP headers > HTML meta tag)
-        declared_encoding = response._declared_encoding()
-
-        # The response text is already decoded with the correct encoding
-        text = response.text
-
-        # Process the decoded content
-        # ...
-```
-
-Encoding detection priority:
-1. Encoding specified on the Request
-2. BOM (byte order mark)
-3. HTTP Content-Type header
-4. HTML meta tag declaration
-5. Automatic detection from the content
-6. Default encoding (utf-8)
-
-### Improved Selector Methods
-
-The Crawlo framework streamlines the Response object's selector methods for more convenient data extraction, with more intuitive and consistent naming:
-
-```python
-class MySpider(Spider):
-    def parse(self, response):
-        # 1. Extract the text of a single element (CSS and XPath supported)
-        title = response.extract_text('title')  # CSS selector
-        title = response.extract_text('//title')  # XPath selector
-
-        # 2. Extract the text of multiple elements
-        paragraphs = response.extract_texts('.content p')  # CSS selector
-        paragraphs = response.extract_texts('//div[@class="content"]//p')  # XPath selector
-
-        # 3. Extract an attribute from a single element
-        link_href = response.extract_attr('a', 'href')  # CSS selector
-        link_href = response.extract_attr('//a[@class="link"]', 'href')  # XPath selector
-
-        # 4. Extract an attribute from multiple elements
-        all_links = response.extract_attrs('a', 'href')  # CSS selector
-        all_links = response.extract_attrs('//a[@class="link"]', 'href')  # XPath selector
-
-        yield {
-            'title': title,
-            'paragraphs': paragraphs,
-            'links': all_links
-        }
-```
-
-All selector methods use concise, intuitive names that are easy to remember and use.
-
-### Utility Modules
-
-The Crawlo framework ships a rich set of utility modules for common tasks. The selector-related helper functions now live in the `crawlo.utils.selector_helper` module:
-
-```python
-from crawlo.utils import (
-    extract_text,
-    extract_texts,
-    extract_attr,
-    extract_attrs,
-    is_xpath
-)
-
-# Use these helpers in your own code
-title_elements = response.css('title')
-title_text = extract_text(title_elements)
-
-li_elements = response.css('.list li')
-li_texts = extract_texts(li_elements)
-
-link_elements = response.css('.link')
-link_href = extract_attr(link_elements, 'href')
-
-all_links = response.css('a')
-all_hrefs = extract_attrs(all_links, 'href')
-```
-
-## Logging System
-
-Crawlo has a powerful logging system with many configuration options:
-
-### Basic Configuration
-
-```python
-from crawlo.logging import configure_logging, get_logger
-
-# Configure the logging system
-configure_logging(
-    LOG_LEVEL='INFO',
-    LOG_FILE='logs/app.log',
-    LOG_MAX_BYTES=10*1024*1024,  # 10MB
-    LOG_BACKUP_COUNT=5
-)
-
-# Get a logger
-logger = get_logger('my_module')
-logger.info('This is a log message')
-```
-
-### Advanced Configuration
-
-```python
-# Configure console and file log levels separately
-configure_logging(
-    LOG_LEVEL='INFO',
-    LOG_CONSOLE_LEVEL='WARNING',  # console shows WARNING and above only
-    LOG_FILE_LEVEL='DEBUG',  # file records DEBUG and above
-    LOG_FILE='logs/app.log',
-    LOG_INCLUDE_THREAD_ID=True,  # include the thread ID
-    LOG_INCLUDE_PROCESS_ID=True  # include the process ID
-)
-
-# Module-specific log levels
-configure_logging(
-    LOG_LEVEL='WARNING',
-    LOG_LEVELS={
-        'my_module.debug': 'DEBUG',
-        'my_module.info': 'INFO'
-    }
-)
-```
-
-### Performance Monitoring
-
-```python
-from crawlo.logging import get_monitor
-
-# Enable logging performance monitoring
-monitor = get_monitor()
-monitor.enable_monitoring()
-
-# Get a performance report
-report = monitor.get_performance_report()
-print(report)
-```
-
-### Log Sampling
-
-```python
-from crawlo.logging import get_sampler
-
-# Set a sampling rate (record only 30% of log entries)
-sampler = get_sampler()
-sampler.set_sample_rate('my_module', 0.3)
-
-# Set a rate limit (at most 100 log entries per second)
-sampler.set_rate_limit('my_module', 100)
-```
-
-## Windows Compatibility Notes
-
-When using log rotation on Windows, you may run into file-locking problems. To avoid this, install the `concurrent-log-handler` library:
-
-```bash
-pip install concurrent-log-handler
-```
-
-The Crawlo framework automatically detects and uses this library to provide better Windows compatibility.
-
-If `concurrent-log-handler` is not installed, running on Windows may produce the following error:
-```
-PermissionError: [WinError 32] The process cannot access the file because it is being used by another process.
-```
-
-## Spider Auto-Discovery
-
-The Crawlo framework can automatically discover and load spiders via the `SPIDER_MODULES` setting, similar to Scrapy's mechanism:
-
-```python
-# settings.py
-SPIDER_MODULES = [
-    'myproject.spiders',
-    'myproject.more_spiders',
-]
-
-SPIDER_LOADER_WARN_ONLY = True  # only warn (do not raise) on load errors
-```
-
-The framework scans the configured module directories, discovering and registering the spider classes found there.
-
-## Documentation
-
-See the [documentation](https://your-docs-url.com) for more information.
-
-## License
-
-MIT
tests/env_config_example.py
DELETED
@@ -1,134 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Example usage of the environment-variable configuration helpers
-Shows how to use the environment-variable configuration helpers correctly in a Crawlo project
-"""
-import os
-import sys
-
-# Add the project root directory to the Python path
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-
-from crawlo.utils.env_config import get_env_var, get_redis_config, get_runtime_config
-from crawlo.settings.setting_manager import SettingManager
-from crawlo.settings import default_settings
-
-
-def example_basic_usage():
-    """Basic usage example"""
-    print("=== Basic environment variable example ===")
-
-    # Read a string environment variable
-    project_name = get_env_var('PROJECT_NAME', 'my_crawler', str)
-    print(f"Project name: {project_name}")
-
-    # Read an integer environment variable
-    concurrency = get_env_var('CONCURRENCY', 8, int)
-    print(f"Concurrency: {concurrency}")
-
-    # Read a boolean environment variable
-    debug_mode = get_env_var('DEBUG_MODE', False, bool)
-    print(f"Debug mode: {debug_mode}")
-
-
-def example_redis_config():
-    """Redis configuration example"""
-    print("\n=== Redis configuration example ===")
-
-    # Read the Redis configuration
-    redis_config = get_redis_config()
-    print(f"Redis host: {redis_config['REDIS_HOST']}")
-    print(f"Redis port: {redis_config['REDIS_PORT']}")
-    print(f"Redis password: {'*' * len(redis_config['REDIS_PASSWORD']) if redis_config['REDIS_PASSWORD'] else 'none'}")
-    print(f"Redis database: {redis_config['REDIS_DB']}")
-
-    # Build the Redis URL
-    if redis_config['REDIS_PASSWORD']:
-        redis_url = f"redis://:{redis_config['REDIS_PASSWORD']}@{redis_config['REDIS_HOST']}:{redis_config['REDIS_PORT']}/{redis_config['REDIS_DB']}"
-    else:
-        redis_url = f"redis://{redis_config['REDIS_HOST']}:{redis_config['REDIS_PORT']}/{redis_config['REDIS_DB']}"
-
-    print(f"Redis URL: {redis_url}")
-
-
-def example_runtime_config():
-    """Runtime configuration example"""
-    print("\n=== Runtime configuration example ===")
-
-    # Read the runtime configuration
-    runtime_config = get_runtime_config()
-    print(f"Run mode: {runtime_config['CRAWLO_MODE']}")
-    print(f"Project name: {runtime_config['PROJECT_NAME']}")
-    print(f"Concurrency: {runtime_config['CONCURRENCY']}")
-
-
-def example_settings_integration():
-    """Integration with Settings example"""
-    print("\n=== Integration with Settings example ===")
-
-    # Create the settings manager
-    settings = SettingManager()
-
-    # Update Redis-related settings
-    redis_config = get_redis_config()
-    settings.set('REDIS_HOST', redis_config['REDIS_HOST'])
-    settings.set('REDIS_PORT', redis_config['REDIS_PORT'])
-    settings.set('REDIS_PASSWORD', redis_config['REDIS_PASSWORD'])
-    settings.set('REDIS_DB', redis_config['REDIS_DB'])
-
-    # Update runtime settings
-    runtime_config = get_runtime_config()
-    settings.set('PROJECT_NAME', runtime_config['PROJECT_NAME'])
-    settings.set('RUN_MODE', runtime_config['CRAWLO_MODE'])
-    settings.set('CONCURRENCY', runtime_config['CONCURRENCY'])
-
-    # Show a few key settings
-    print(f"Project name: {settings.get('PROJECT_NAME')}")
-    print(f"Run mode: {settings.get('RUN_MODE')}")
-    print(f"Concurrency: {settings.get_int('CONCURRENCY')}")
-    print(f"Redis host: {settings.get('REDIS_HOST')}")
-    print(f"Redis port: {settings.get_int('REDIS_PORT')}")
-
-
-def example_env_setup():
-    """Environment variable setup example"""
-    print("\n=== Environment variable setup example ===")
-    print("Examples of setting environment variables on the command line:")
-    print(" Windows (PowerShell):")
-    print(" $env:PROJECT_NAME = \"my_distributed_crawler\"")
-    print(" $env:REDIS_HOST = \"redis.example.com\"")
-    print(" $env:REDIS_PORT = \"6380\"")
-    print(" $env:CONCURRENCY = \"16\"")
-    print(" $env:CRAWLO_MODE = \"distributed\"")
-    print()
-    print(" Linux/macOS:")
-    print(" export PROJECT_NAME=\"my_distributed_crawler\"")
-    print(" export REDIS_HOST=\"redis.example.com\"")
-    print(" export REDIS_PORT=\"6380\"")
-    print(" export CONCURRENCY=\"16\"")
-    print(" export CRAWLO_MODE=\"distributed\"")
-
-
-if __name__ == '__main__':
-    # Set some test environment variables
-    os.environ['PROJECT_NAME'] = 'test_crawler'
-    os.environ['CONCURRENCY'] = '12'
-    os.environ['DEBUG_MODE'] = 'true'
-    os.environ['REDIS_HOST'] = 'redis.test.com'
-    os.environ['REDIS_PORT'] = '6380'
-    os.environ['REDIS_PASSWORD'] = 'test_password'
-    os.environ['CRAWLO_MODE'] = 'distributed'
-
-    # Run the examples
-    example_basic_usage()
-    example_redis_config()
-    example_runtime_config()
-    example_settings_integration()
-    example_env_setup()
-
-    # Clean up the test environment variables
-    for var in ['PROJECT_NAME', 'CONCURRENCY', 'DEBUG_MODE', 'REDIS_HOST',
-                'REDIS_PORT', 'REDIS_PASSWORD', 'CRAWLO_MODE']:
-        if var in os.environ:
-            del os.environ[var]