crawlo-1.4.3-py3-none-any.whl → crawlo-1.4.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo has been flagged as possibly problematic.
- crawlo/__init__.py +11 -15
- crawlo/__version__.py +1 -1
- crawlo/commands/genspider.py +52 -17
- crawlo/commands/startproject.py +24 -0
- crawlo/core/engine.py +2 -2
- crawlo/core/scheduler.py +4 -4
- crawlo/crawler.py +13 -6
- crawlo/downloader/__init__.py +5 -2
- crawlo/extension/__init__.py +2 -2
- crawlo/filters/aioredis_filter.py +8 -1
- crawlo/filters/memory_filter.py +8 -1
- crawlo/initialization/built_in.py +13 -4
- crawlo/initialization/core.py +5 -4
- crawlo/interfaces.py +24 -0
- crawlo/middleware/__init__.py +7 -4
- crawlo/middleware/middleware_manager.py +15 -8
- crawlo/mode_manager.py +45 -11
- crawlo/network/response.py +374 -69
- crawlo/pipelines/mysql_pipeline.py +6 -6
- crawlo/pipelines/pipeline_manager.py +2 -2
- crawlo/project.py +2 -4
- crawlo/queue/pqueue.py +2 -6
- crawlo/queue/queue_manager.py +1 -2
- crawlo/settings/default_settings.py +15 -30
- crawlo/task_manager.py +2 -2
- crawlo/templates/project/items.py.tmpl +2 -2
- crawlo/templates/project/middlewares.py.tmpl +9 -89
- crawlo/templates/project/pipelines.py.tmpl +8 -68
- crawlo/templates/project/settings.py.tmpl +51 -65
- crawlo/templates/project/settings_distributed.py.tmpl +59 -67
- crawlo/templates/project/settings_gentle.py.tmpl +45 -40
- crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
- crawlo/templates/project/settings_minimal.py.tmpl +37 -26
- crawlo/templates/project/settings_simple.py.tmpl +45 -40
- crawlo/templates/run.py.tmpl +3 -7
- crawlo/tools/__init__.py +0 -11
- crawlo/utils/__init__.py +17 -1
- crawlo/utils/db_helper.py +220 -319
- crawlo/utils/error_handler.py +313 -67
- crawlo/utils/fingerprint.py +3 -4
- crawlo/utils/misc.py +82 -0
- crawlo/utils/request.py +55 -66
- crawlo/utils/selector_helper.py +138 -0
- crawlo/utils/spider_loader.py +185 -45
- crawlo/utils/text_helper.py +95 -0
- crawlo-1.4.5.dist-info/METADATA +329 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/RECORD +89 -68
- tests/bug_check_test.py +251 -0
- tests/direct_selector_helper_test.py +97 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
- tests/ofweek_scrapy/scrapy.cfg +11 -0
- tests/performance_comparison.py +4 -5
- tests/simple_crawlo_test.py +1 -2
- tests/simple_follow_test.py +39 -0
- tests/simple_response_selector_test.py +95 -0
- tests/simple_selector_helper_test.py +155 -0
- tests/simple_selector_test.py +208 -0
- tests/simple_url_test.py +74 -0
- tests/test_crawler_process_import.py +39 -0
- tests/test_crawler_process_spider_modules.py +48 -0
- tests/test_edge_cases.py +7 -5
- tests/test_encoding_core.py +57 -0
- tests/test_encoding_detection.py +127 -0
- tests/test_factory_compatibility.py +197 -0
- tests/test_multi_directory.py +68 -0
- tests/test_multiple_spider_modules.py +81 -0
- tests/test_optimized_selector_naming.py +101 -0
- tests/test_priority_behavior.py +18 -18
- tests/test_response_follow.py +105 -0
- tests/test_response_selector_methods.py +93 -0
- tests/test_response_url_methods.py +71 -0
- tests/test_response_urljoin.py +87 -0
- tests/test_scrapy_style_encoding.py +113 -0
- tests/test_selector_helper.py +101 -0
- tests/test_selector_optimizations.py +147 -0
- tests/test_spider_loader.py +50 -0
- tests/test_spider_loader_comprehensive.py +70 -0
- tests/test_spider_modules.py +85 -0
- tests/test_spiders/__init__.py +1 -0
- tests/test_spiders/test_spider.py +10 -0
- crawlo/tools/anti_crawler.py +0 -269
- crawlo/utils/class_loader.py +0 -26
- crawlo/utils/enhanced_error_handler.py +0 -357
- crawlo-1.4.3.dist-info/METADATA +0 -190
- examples/test_project/__init__.py +0 -7
- examples/test_project/run.py +0 -35
- examples/test_project/test_project/__init__.py +0 -4
- examples/test_project/test_project/items.py +0 -18
- examples/test_project/test_project/middlewares.py +0 -119
- examples/test_project/test_project/pipelines.py +0 -97
- examples/test_project/test_project/settings.py +0 -170
- examples/test_project/test_project/spiders/__init__.py +0 -10
- examples/test_project/test_project/spiders/of_week_dis.py +0 -144
- tests/simple_log_test.py +0 -58
- tests/simple_test.py +0 -48
- tests/test_framework_logger.py +0 -67
- tests/test_framework_startup.py +0 -65
- tests/test_mode_change.py +0 -73
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
- /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
@@ -6,75 +6,79 @@
 Suitable for getting started quickly and for simple projects.
 """

-#
+# =================================== Basic settings ===================================
+
+# Basic project information
 PROJECT_NAME = '{{project_name}}'

-#
+# Run mode
 RUN_MODE = 'standalone'

-#
+# Concurrency settings
 CONCURRENCY = 4
 MAX_RUNNING_SPIDERS = 1
 DOWNLOAD_DELAY = 1.0

-#
-
-#
-# DOWNLOADER = 'crawlo.downloader.httpx_downloader.HttpXDownloader'
-# DOWNLOADER = 'crawlo.downloader.cffi_downloader.CurlCffiDownloader'
+# =================================== Core component settings ===================================
+
+# Downloader configuration
 DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'

-#
+# Queue configuration
 QUEUE_TYPE = 'memory'

-#
+# Deduplication filter
 FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'

-#
+# Default deduplication pipeline
 DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'

-#
-SPIDER_MODULES = ['{{project_name}}.spiders']
+# =================================== Spider settings ===================================

-#
-
-# 'crawlo.middleware.simple_proxy.SimpleProxyMiddleware',
-# ]
+# Spider module configuration
+SPIDER_MODULES = ['{{project_name}}.spiders']

-#
+# Default request header configuration
 # Default request headers used by DefaultHeaderMiddleware
-DEFAULT_REQUEST_HEADERS = {
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
-    'Accept-Encoding': 'gzip, deflate, br',
-}
+# DEFAULT_REQUEST_HEADERS = {}

-#
+# Allowed domains
 # Allowed domains used by OffsiteMiddleware
-# ALLOWED_DOMAINS = [
+# ALLOWED_DOMAINS = []

-#
+# Data pipelines
+# Uncomment and extend to add custom pipelines
 # PIPELINES = [
-#     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
+#     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage (asyncmy async driver)
+#     # '{{project_name}}.pipelines.CustomPipeline',  # example user-defined pipeline
 # ]

-#
+# =================================== System settings ===================================
+
+# Extensions
+# Uncomment and extend to add custom extensions
 # EXTENSIONS = [
-# '
-#
-
+#     # '{{project_name}}.extensions.CustomExtension',  # example user-defined extension
+# ]
+
+# Middlewares
+# Uncomment and extend to add custom middlewares
+# MIDDLEWARES = [
+#     # '{{project_name}}.middlewares.CustomMiddleware',  # example user-defined middleware
 # ]

-#
+# Logging configuration
 LOG_LEVEL = 'INFO'
 LOG_FILE = 'logs/{{project_name}}.log'
 LOG_ENCODING = 'utf-8'  # explicit log file encoding
 STATS_DUMP = True

-#
+# Output configuration
 OUTPUT_DIR = 'output'

-#
+# =================================== Database settings ===================================
+
+# Redis configuration
 REDIS_HOST = '127.0.0.1'
 REDIS_PORT = 6379
 REDIS_PASSWORD = ''
@@ -86,7 +90,7 @@ if REDIS_PASSWORD:
 else:
     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'

-#
+# MySQL configuration
 MYSQL_HOST = '127.0.0.1'
 MYSQL_PORT = 3306
 MYSQL_USER = 'root'
@@ -96,7 +100,7 @@ MYSQL_TABLE = '{{project_name}}_data'
 MYSQL_BATCH_SIZE = 100
 MYSQL_USE_BATCH = False  # whether to enable batch inserts

-#
+# MongoDB configuration
 MONGO_URI = 'mongodb://localhost:27017'
 MONGO_DATABASE = '{{project_name}}_db'
 MONGO_COLLECTION = '{{project_name}}_items'
@@ -105,7 +109,9 @@ MONGO_MIN_POOL_SIZE = 20
 MONGO_BATCH_SIZE = 100  # batch insert size
 MONGO_USE_BATCH = False  # whether to enable batch inserts

-#
+# =================================== Network settings ===================================
+
+# Proxy configuration
 # Proxying is disabled by default; enable and configure it in the project settings if needed
 PROXY_ENABLED = False  # whether to enable the proxy

@@ -124,7 +130,6 @@ PROXY_EXTRACTOR = "proxy"
 PROXY_REFRESH_INTERVAL = 60  # proxy refresh interval (seconds)
 PROXY_API_TIMEOUT = 10  # proxy API request timeout

-# ============================== Curl-Cffi specific settings ==============================
 # Browser fingerprint emulation (only effective with the CurlCffi downloader)
 CURL_BROWSER_TYPE = "chrome"  # options: chrome, edge, safari, firefox, or a specific version such as chrome136

@@ -136,7 +141,7 @@ CURL_BROWSER_VERSION_MAP = {
     "firefox": "firefox135",
 }

-#
+# Downloader tuning
 # Downloader health check
 DOWNLOADER_HEALTH_CHECK = True  # whether to enable downloader health checks
 HEALTH_CHECK_INTERVAL = 60  # health check interval (seconds)
@@ -157,7 +162,7 @@ AIOHTTP_FORCE_CLOSE = False  # whether to force-close connections
 CONNECTION_TTL_DNS_CACHE = 300  # DNS cache TTL (seconds)
 CONNECTION_KEEPALIVE_TIMEOUT = 15  # Keep-Alive timeout (seconds)

-#
+# Memory monitoring configuration
 # The memory monitor extension is disabled by default; enable it in the project settings if needed
 MEMORY_MONITOR_ENABLED = False  # whether to enable memory monitoring
 MEMORY_MONITOR_INTERVAL = 60  # memory monitor check interval (seconds)
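For context, the REDIS_URL logic referenced by the `@@ -86,7 +90,7 @@ if REDIS_PASSWORD:` hunk derives the connection string from the individual Redis settings. A minimal sketch of that derivation follows; only the `else:` branch appears in the diff, so the password branch and the REDIS_DB default are assumptions based on the conventional redis:// URL form.

# Sketch of how the generated settings module derives REDIS_URL.
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PASSWORD = ''
REDIS_DB = 0  # assumed default; the actual value is defined elsewhere in the template

if REDIS_PASSWORD:
    # Assumed form: credentials embedded in the URL
    REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
else:
    # This branch is shown verbatim in the diff
    REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'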
crawlo/templates/run.py.tmpl
CHANGED

@@ -6,7 +6,7 @@
 A simplified spider launcher based on the Crawlo framework.

 The framework automatically imports and registers spider modules; no manual imports are required.
-
+The SPIDER_MODULES setting is read automatically from settings.py.
 """
 import sys
 import asyncio
@@ -17,12 +17,8 @@ from crawlo.crawler import CrawlerProcess
 def main():
     """Main entry point: run the spider."""
     try:
-        #
-
-        process = CrawlerProcess(spider_modules=spider_modules)
-
-        # TODO: run the specified spider
-        asyncio.run(process.crawl('spider_name'))
+        # TODO: replace 'spider_name' with the name of the spider to run
+        asyncio.run(CrawlerProcess().crawl('spider_name'))

     except Exception as e:
         print(f"❌ Run failed: {e}")
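Rendered into a project, the updated template yields a launcher roughly like the sketch below. The `__main__` guard and the `sys.exit` call are assumed boilerplate that falls outside the hunks shown above; the rest mirrors the added lines.

"""A simplified spider launcher based on the Crawlo framework."""
import sys
import asyncio

from crawlo.crawler import CrawlerProcess


def main():
    """Main entry point: run the spider."""
    try:
        # TODO: replace 'spider_name' with the name of the spider to run
        asyncio.run(CrawlerProcess().crawl('spider_name'))
    except Exception as e:
        print(f"❌ Run failed: {e}")
        sys.exit(1)  # assumed; not visible in the diff hunks


if __name__ == '__main__':
    main()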
crawlo/tools/__init__.py
CHANGED

@@ -79,17 +79,6 @@ from .retry_mechanism import (
     exponential_backoff
 )

-# Anti-crawler helpers
-from .anti_crawler import (
-    ProxyPoolManager,
-    CaptchaHandler,
-    AntiCrawler,
-    get_random_user_agent,
-    rotate_proxy,
-    handle_captcha,
-    detect_rate_limiting
-)
-
 # Authenticated proxy helpers
 from .authenticated_proxy import (
     AuthenticatedProxy,
crawlo/utils/__init__.py
CHANGED

@@ -4,6 +4,9 @@
 # @Time    : 2025-02-05 13:57
 # @Author  : oscar
 # @Desc    : Collection of utility modules
+
+Provides helper functions for working with parsel selectors, e.g. extracting text and attributes.
+All helpers use concise, intuitive names that are easy to remember and use.
 """

 from ..tools.date_tools import (
@@ -20,6 +23,14 @@ from ..tools.date_tools import (
     from_timestamp_with_tz
 )

+from .selector_helper import (
+    extract_text,
+    extract_texts,
+    extract_attr,
+    extract_attrs,
+    is_xpath
+)
+
 __all__ = [
     "TimeUtils",
     "parse_time",
@@ -31,5 +42,10 @@ __all__ = [
     "to_timezone",
     "to_utc",
     "to_local",
-    "from_timestamp_with_tz"
+    "from_timestamp_with_tz",
+    "extract_text",
+    "extract_texts",
+    "extract_attr",
+    "extract_attrs",
+    "is_xpath"
 ]
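The newly re-exported helpers come from the added crawlo/utils/selector_helper.py (+138 lines). Only the names are visible in this diff, so the following is a hypothetical usage sketch: the argument order (selector, query, attribute name) and the return values are assumptions, not confirmed signatures.

# Hypothetical usage; signatures are assumed, only the exported names come from the diff.
from parsel import Selector
from crawlo.utils import extract_text, extract_texts, extract_attr, is_xpath

html = '<div><a href="/about" class="nav">About</a><a href="/news" class="nav">News</a></div>'
sel = Selector(text=html)

print(is_xpath('//a/@href'))               # assumed: True for XPath-style queries
print(extract_text(sel, 'a.nav'))          # assumed: text of first match -> 'About'
print(extract_texts(sel, 'a.nav'))         # assumed: texts of all matches -> ['About', 'News']
print(extract_attr(sel, 'a.nav', 'href'))  # assumed: attribute of first match -> '/about'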