crawlo-1.1.1-py3-none-any.whl → crawlo-1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/genspider.py +68 -42
  4. crawlo/commands/list.py +102 -93
  5. crawlo/commands/startproject.py +89 -4
  6. crawlo/commands/utils.py +187 -0
  7. crawlo/config.py +280 -0
  8. crawlo/core/engine.py +16 -3
  9. crawlo/core/enhanced_engine.py +190 -0
  10. crawlo/core/scheduler.py +113 -8
  11. crawlo/crawler.py +840 -307
  12. crawlo/downloader/__init__.py +181 -17
  13. crawlo/downloader/aiohttp_downloader.py +15 -2
  14. crawlo/downloader/cffi_downloader.py +11 -1
  15. crawlo/downloader/httpx_downloader.py +14 -3
  16. crawlo/filters/__init__.py +122 -5
  17. crawlo/filters/aioredis_filter.py +128 -36
  18. crawlo/filters/memory_filter.py +99 -32
  19. crawlo/middleware/proxy.py +11 -8
  20. crawlo/middleware/retry.py +40 -5
  21. crawlo/mode_manager.py +201 -0
  22. crawlo/network/__init__.py +17 -3
  23. crawlo/network/request.py +118 -10
  24. crawlo/network/response.py +131 -28
  25. crawlo/pipelines/__init__.py +1 -1
  26. crawlo/pipelines/csv_pipeline.py +317 -0
  27. crawlo/pipelines/json_pipeline.py +219 -0
  28. crawlo/queue/__init__.py +0 -0
  29. crawlo/queue/pqueue.py +37 -0
  30. crawlo/queue/queue_manager.py +304 -0
  31. crawlo/queue/redis_priority_queue.py +192 -0
  32. crawlo/settings/default_settings.py +68 -9
  33. crawlo/spider/__init__.py +576 -66
  34. crawlo/task_manager.py +4 -1
  35. crawlo/templates/project/middlewares.py.tmpl +56 -45
  36. crawlo/templates/project/pipelines.py.tmpl +308 -36
  37. crawlo/templates/project/run.py.tmpl +239 -0
  38. crawlo/templates/project/settings.py.tmpl +211 -17
  39. crawlo/templates/spider/spider.py.tmpl +153 -7
  40. crawlo/utils/controlled_spider_mixin.py +336 -0
  41. crawlo/utils/large_scale_config.py +287 -0
  42. crawlo/utils/large_scale_helper.py +344 -0
  43. crawlo/utils/queue_helper.py +176 -0
  44. crawlo/utils/request_serializer.py +220 -0
  45. crawlo-1.1.2.dist-info/METADATA +567 -0
  46. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/RECORD +54 -46
  47. tests/test_final_validation.py +154 -0
  48. tests/test_redis_config.py +29 -0
  49. tests/test_redis_queue.py +225 -0
  50. tests/test_request_serialization.py +71 -0
  51. tests/test_scheduler.py +242 -0
  52. crawlo/pipelines/mysql_batch_pipline.py +0 -273
  53. crawlo/utils/pqueue.py +0 -174
  54. crawlo-1.1.1.dist-info/METADATA +0 -220
  55. examples/baidu_spider/__init__.py +0 -7
  56. examples/baidu_spider/demo.py +0 -94
  57. examples/baidu_spider/items.py +0 -46
  58. examples/baidu_spider/middleware.py +0 -49
  59. examples/baidu_spider/pipeline.py +0 -55
  60. examples/baidu_spider/run.py +0 -27
  61. examples/baidu_spider/settings.py +0 -121
  62. examples/baidu_spider/spiders/__init__.py +0 -7
  63. examples/baidu_spider/spiders/bai_du.py +0 -61
  64. examples/baidu_spider/spiders/miit.py +0 -159
  65. examples/baidu_spider/spiders/sina.py +0 -79
  66. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/WHEEL +0 -0
  67. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/entry_points.txt +0 -0
  68. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/top_level.txt +0 -0
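
The new modules in this release (crawlo/config.py, crawlo/mode_manager.py, the crawlo/queue/ package) back the configuration factory that the regenerated project templates below rely on. As a minimal sketch of how the templates wire it together, reusing only calls that appear in this diff (CrawloConfig.standalone / CrawloConfig.distributed, config.to_dict(), CrawlerProcess, process.crawl); the spider name is a placeholder and the exact method signatures are not shown here:

import asyncio

from crawlo.config import CrawloConfig
from crawlo.crawler import CrawlerProcess


async def main():
    # Standalone mode: in-memory queue, no external services (per the settings template below).
    config = CrawloConfig.standalone(concurrency=8, download_delay=1.0)

    # Distributed mode would swap in a Redis-backed queue instead, e.g.:
    # config = CrawloConfig.distributed(redis_host='127.0.0.1', project_name='my_project')

    # The templates hand the configuration to CrawlerProcess as a plain dict.
    process = CrawlerProcess(settings=config.to_dict())
    await process.crawl(['example_spider'])  # hypothetical spider name


if __name__ == '__main__':
    asyncio.run(main())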
crawlo/templates/project/run.py.tmpl
@@ -0,0 +1,239 @@
+ #!/usr/bin/env python
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project run script
+ ============================
+ Smart spider launcher built on the Crawlo framework.
+ Supports standalone/distributed modes, flexible configuration, works out of the box.
+
+ 🎯 Quick usage:
+     python run.py spider_name                    # standalone mode
+     python run.py spider_name --distributed      # distributed mode
+     python run.py spider_name --env production   # preset configuration
+
+ 🔧 Advanced options:
+     python run.py spider_name --dry-run          # dry run (no actual crawling)
+     python run.py spider_name --concurrency 16   # custom concurrency
+     python run.py spider_name --mode gentle      # gentle mode (low load)
+     python run.py spider1 spider2 --distributed  # multiple spiders, distributed
+
+ 📦 Configuration modes:
+     --standalone   standalone mode (default) - in-memory queue, no external dependencies
+     --distributed  distributed mode - Redis queue, supports multiple nodes
+     --auto         auto mode - detects Redis availability automatically
+
+ 🎛️ Preset configurations:
+     --env development   development environment (debug-friendly)
+     --env production    production environment (high performance)
+     --env large-scale   large-scale crawling (memory-optimized)
+     --env gentle        gentle mode (low load)
+ """
+
+ import os
+ import sys
+ import asyncio
+ import argparse
+ from pathlib import Path
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.config import CrawloConfig
+ from crawlo.mode_manager import standalone_mode, distributed_mode, auto_mode
+
+
+ def create_parser():
+     """Create the command-line argument parser."""
+     parser = argparse.ArgumentParser(
+         description='{{project_name}} spider launcher - built on the Crawlo framework',
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+     python run.py my_spider                  # default standalone mode
+     python run.py my_spider --distributed    # distributed mode
+     python run.py my_spider --env production # production configuration
+     python run.py spider1 spider2            # run multiple spiders
+     python run.py my_spider --dry-run        # test mode
+         """
+     )
+
+     # Spider names (positional arguments)
+     parser.add_argument(
+         'spiders',
+         nargs='*',
+         help='name(s) of the spider(s) to run (several may be given)'
+     )
+
+     # Run mode selection
+     mode_group = parser.add_mutually_exclusive_group()
+     mode_group.add_argument(
+         '--standalone',
+         action='store_true',
+         help='standalone mode (default) - in-memory queue, no external dependencies'
+     )
+     mode_group.add_argument(
+         '--distributed',
+         action='store_true',
+         help='distributed mode - Redis queue, supports multi-node crawling'
+     )
+     mode_group.add_argument(
+         '--auto',
+         action='store_true',
+         help='auto mode - detect Redis availability to choose the queue type'
+     )
+
+     # Preset environment configuration
+     parser.add_argument(
+         '--env',
+         choices=['development', 'production', 'large-scale', 'gentle'],
+         help='preset environment configuration (takes precedence over mode flags)'
+     )
+
+     # Performance tuning options
+     parser.add_argument(
+         '--concurrency',
+         type=int,
+         help='number of concurrent requests (overrides the default)'
+     )
+
+     parser.add_argument(
+         '--delay',
+         type=float,
+         help='delay between requests (seconds)'
+     )
+
+     # Feature options
+     parser.add_argument(
+         '--dry-run',
+         action='store_true',
+         help='dry-run mode - parse pages without performing actual crawling'
+     )
+
+     parser.add_argument(
+         '--debug',
+         action='store_true',
+         help='enable debug mode - verbose log output'
+     )
+
+     parser.add_argument(
+         '--config-file',
+         type=str,
+         help='path to a custom configuration file'
+     )
+
+     # Environment variable support
+     parser.add_argument(
+         '--from-env',
+         action='store_true',
+         help='load configuration from environment variables (CRAWLO_*)'
+     )
+
+     return parser
+
+
+ def build_config(args):
+     """Build the configuration from the command-line arguments."""
+     config = None
+
+     # 1. Environment variable configuration takes priority
+     if args.from_env:
+         config = CrawloConfig.from_env()
+         print("📋 Using configuration from environment variables")
+
+     # 2. Preset environment configuration
+     elif args.env:
+         presets = {
+             'development': CrawloConfig.presets().development(),
+             'production': CrawloConfig.presets().production(),
+             'large-scale': CrawloConfig.presets().large_scale(),
+             'gentle': CrawloConfig.presets().gentle()
+         }
+         config = presets[args.env]
+         print(f"🎛️ Using preset configuration: {args.env}")
+
+     # 3. Mode-based configuration
+     elif args.distributed:
+         config = CrawloConfig.distributed()
+         print("🌐 Distributed mode enabled")
+     elif args.auto:
+         config = CrawloConfig.auto()
+         print("🤖 Auto-detection mode enabled")
+     else:
+         # Standalone mode by default
+         config = CrawloConfig.standalone()
+         print("💻 Using standalone mode (default)")
+
+     # 4. Apply command-line overrides
+     if args.concurrency:
+         config.set('CONCURRENCY', args.concurrency)
+         print(f"⚡ Concurrency set to: {args.concurrency}")
+
+     if args.delay:
+         config.set('DOWNLOAD_DELAY', args.delay)
+         print(f"⏱️ Download delay set to: {args.delay}s")
+
+     if args.debug:
+         config.set('LOG_LEVEL', 'DEBUG')
+         print("🐛 Debug mode enabled")
+
+     if args.dry_run:
+         # Dry-run settings (adjust as needed)
+         config.set('DOWNLOAD_DELAY', 0.1)  # speed things up
+         config.set('CONCURRENCY', 1)  # keep concurrency low
+         print("🧪 Dry-run mode enabled")
+
+     return config
+
+
+ async def main():
+     """Entry point: parse arguments, build the configuration, start the spiders."""
+
+     # Parse command-line arguments
+     parser = create_parser()
+     args = parser.parse_args()
+
+     # Make sure at least one spider was specified
+     if not args.spiders:
+         print("❌ Please specify the name of the spider to run")
+         print("\nAvailable spiders:")
+         print("  # TODO: list your spiders here")
+         print("  # from {{project_name}}.spiders import MySpider")
+         print("\nUsage: python run.py <spider_name>")
+         parser.print_help()
+         return
+
+     # Build the configuration
+     config = build_config(args)
+
+     # Create the crawler process
+     print(f"\n🚀 Starting spider(s): {', '.join(args.spiders)}")
+
+     if args.dry_run:
+         print("   🧪 [dry-run mode] pages will be parsed but not actually crawled")
+
+     try:
+         # Apply the configuration and start
+         process = CrawlerProcess(settings=config.to_dict())
+
+         # TODO: add your spider imports here
+         # from {{project_name}}.spiders.example_spider import ExampleSpider
+         # spider_classes = {'example_spider': ExampleSpider}
+
+         # Run the requested spiders
+         await process.crawl(args.spiders)
+
+         print("\n✅ All spiders finished")
+
+     except ImportError as e:
+         print(f"❌ Failed to import spider: {e}")
+         print("   Check that the spider files exist and update the imports in run.py")
+     except Exception as e:
+         print(f"❌ Runtime error: {e}")
+         raise
+
+
+ if __name__ == '__main__':
+     try:
+         asyncio.run(main())
+     except KeyboardInterrupt:
+         print("\n⏹️ Spider execution interrupted by user")
+     except Exception as e:
+         print(f"❌ Runtime error: {e}")
+         sys.exit(1)
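
One point worth noting in build_config(): --from-env takes precedence over --env, which in turn takes precedence over the --standalone/--distributed/--auto flags, and the tuning flags are then applied on top of whichever base configuration was selected. A rough equivalent of `python run.py my_spider --env production --concurrency 16 --debug`, using only calls that appear in the template above (their signatures are assumed from that code):

from crawlo.config import CrawloConfig
from crawlo.crawler import CrawlerProcess

config = CrawloConfig.presets().production()  # base chosen by --env production
config.set('CONCURRENCY', 16)                 # --concurrency 16 override
config.set('LOG_LEVEL', 'DEBUG')              # --debug override
process = CrawlerProcess(settings=config.to_dict())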
crawlo/templates/project/settings.py.tmpl
@@ -1,54 +1,248 @@
  # -*- coding: UTF-8 -*-
- """Auto-generated settings.py file"""
+ """
+ {{project_name}} project configuration file
+ =============================
+ Spider project configuration based on the Crawlo framework.

+ 🎯 Quick start:
+
+     # Option 1: default standalone mode (recommended)
+     from crawlo.crawler import CrawlerProcess
+     process = CrawlerProcess()  # no configuration needed
+
+     # Option 2: configuration factory
+     from crawlo.config import CrawloConfig
+     config = CrawloConfig.standalone()  # standalone mode
+     config = CrawloConfig.distributed(redis_host='192.168.1.100')  # distributed mode
+     process = CrawlerProcess(settings=config.to_dict())
+
+     # Option 3: environment variables
+     from crawlo.config import CrawloConfig
+     config = CrawloConfig.from_env()  # read from environment variables
+ """
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project info ==============================
  PROJECT_NAME = '{{project_name}}'
- VERSION = '1.0'
+ VERSION = '1.0.0'
+
+ # ============================== Run mode selection ==============================
+
+ # 🎯 Pick one configuration style:
+
+ # Option 1: configuration factory (recommended)
+ # Standalone mode (default)
+ CONFIG = CrawloConfig.standalone(
+     concurrency=8,
+     download_delay=1.0
+ )
+
+ # Distributed mode (uncomment and adjust the Redis address)
+ # CONFIG = CrawloConfig.distributed(
+ #     redis_host='127.0.0.1',
+ #     redis_password='your_password',  # if a password is set
+ #     project_name='{{project_name}}',
+ #     concurrency=16,
+ #     download_delay=1.0
+ # )
+
+ # Auto-detection mode
+ # CONFIG = CrawloConfig.auto(concurrency=12)
+
+ # Option 2: read from environment variables (good for deployments)
+ # CONFIG = CrawloConfig.from_env()
+
+ # Option 3: preset configurations
+ # from crawlo.config import Presets
+ # CONFIG = Presets.development()  # development environment
+ # CONFIG = Presets.production()   # production environment
+
+ # Materialize the final configuration
+ locals().update(CONFIG.to_dict())

  # ============================== Network request settings ==============================
- DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"
- DOWNLOAD_TIMEOUT = 60
+
+ # Downloader selection (CurlCffi is recommended; it supports browser fingerprint emulation)
+ DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"  # browser fingerprints
+ # DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"  # lightweight option
+ # DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"  # HTTP/2 support
+
+ # Request timeout and security
+ DOWNLOAD_TIMEOUT = 30
  VERIFY_SSL = True
  USE_SESSION = True

+ # Request delay control (anti-bot friendliness)
  DOWNLOAD_DELAY = 1.0
+ RANDOM_RANGE = (0.8, 1.2)
  RANDOMNESS = True

+ # Retry policy
  MAX_RETRY_TIMES = 3
+ RETRY_PRIORITY = -1
  RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
  IGNORE_HTTP_CODES = [403, 404]
+ ALLOWED_CODES = []

- CONNECTION_POOL_LIMIT = 100
+ # Connection pool settings
+ CONNECTION_POOL_LIMIT = 50
+ DOWNLOAD_MAXSIZE = 10 * 1024 * 1024  # 10MB
+ DOWNLOAD_WARN_SIZE = 1024 * 1024  # 1MB

- # ============================== Concurrency & scheduling ==============================
+ # ============================== Concurrency & scheduling settings ==============================
  CONCURRENCY = 8
+ INTERVAL = 5
+ DEPTH_PRIORITY = 1
  MAX_RUNNING_SPIDERS = 3

- # ============================== Data storage ==============================
- MYSQL_HOST = '127.0.0.1'
- MYSQL_PORT = 3306
- MYSQL_USER = 'root'
- MYSQL_PASSWORD = '123456'
- MYSQL_DB = '{{project_name}}'
- MYSQL_TABLE = 'crawled_data'
+ # ============================== Queue settings (distributed-ready) ==============================
+
+ # Queue type: 'auto' (auto-select), 'memory' (in-memory queue), 'redis' (distributed queue)
+ QUEUE_TYPE = 'auto'
+ SCHEDULER_MAX_QUEUE_SIZE = 2000
+ SCHEDULER_QUEUE_NAME = f'{{project_name}}:requests'
+ QUEUE_MAX_RETRIES = 3
+ QUEUE_TIMEOUT = 300
+
+ # Large-scale crawling optimizations
+ LARGE_SCALE_BATCH_SIZE = 1000
+ LARGE_SCALE_CHECKPOINT_INTERVAL = 5000
+ LARGE_SCALE_MAX_MEMORY_USAGE = 500
+
+ # ============================== Data storage settings ==============================
+
+ # --- MySQL ---
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+ MYSQL_BATCH_SIZE = 100
+
+ # MySQL connection pool
+ MYSQL_FLUSH_INTERVAL = 5
+ MYSQL_POOL_MIN = 5
+ MYSQL_POOL_MAX = 20
+ MYSQL_ECHO = False
+
+ # --- MongoDB ---
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = f'{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+ MONGO_MAX_POOL_SIZE = 200
+ MONGO_MIN_POOL_SIZE = 20
+
+ # ============================== Deduplication filter settings ==============================

- # ============================== Deduplication filter ==============================
+ REQUEST_DIR = '.'
+
+ # Deduplication filter (use the Redis filter for distributed projects)
  FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+ # FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'  # distributed deduplication
+
+ # --- Redis (used for distributed deduplication and queues) ---
+ REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
+ REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
+ REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '')
+
+ # Build the URL depending on whether a password is set
+ if REDIS_PASSWORD:
+     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/0'
+ else:
+     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/0'
+
+ REDIS_KEY = f'{{project_name}}:fingerprint'
+ REDIS_TTL = 0
+ CLEANUP_FP = 0
+ FILTER_DEBUG = True
+ DECODE_RESPONSES = True
+
+ # ============================== Middleware settings ==============================

- # ============================== Middleware & pipelines ==============================
  MIDDLEWARES = [
+     # === Request pre-processing stage ===
      'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
      'crawlo.middleware.download_delay.DownloadDelayMiddleware',
      'crawlo.middleware.default_header.DefaultHeaderMiddleware',
      'crawlo.middleware.proxy.ProxyMiddleware',
+
+     # === Response processing stage ===
      'crawlo.middleware.retry.RetryMiddleware',
      'crawlo.middleware.response_code.ResponseCodeMiddleware',
      'crawlo.middleware.response_filter.ResponseFilterMiddleware',
  ]

+ # ============================== Data pipeline settings ==============================
+
  PIPELINES = [
      'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',  # custom database pipeline
+     # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage
+     # 'crawlo.pipelines.mongo_pipeline.MongoPipeline',  # MongoDB storage
  ]

- # ============================== Logging ==============================
+ # ============================== Extensions ==============================
+
+ EXTENSIONS = [
+     'crawlo.extension.log_interval.LogIntervalExtension',
+     'crawlo.extension.log_stats.LogStats',
+     'crawlo.extension.logging_extension.CustomLoggerExtension',
+ ]
+
+ # ============================== Logging settings ==============================
+
  LOG_LEVEL = 'INFO'
- LOG_FILE = f'logs/{{{project_name}}}.log'
+ STATS_DUMP = True
+ LOG_FILE = f'logs/{{project_name}}.log'
+ LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
+ LOG_ENCODING = 'utf-8'
+
+ # ============================== Proxy settings ==============================
+
+ PROXY_ENABLED = False
+ PROXY_API_URL = ""  # fill in a real proxy API address
+ PROXY_EXTRACTOR = "proxy"
+ PROXY_REFRESH_INTERVAL = 60
+ PROXY_API_TIMEOUT = 10
+
+ # ============================== Browser fingerprint settings ==============================
+
+ # Settings specific to the CurlCffi downloader
+ CURL_BROWSER_TYPE = "chrome"
+ CURL_BROWSER_VERSION_MAP = {
+     "chrome": "chrome136",
+     "edge": "edge101",
+     "safari": "safari184",
+     "firefox": "firefox135",
+ }
+
+ # Default request headers
+ DEFAULT_REQUEST_HEADERS = {
+     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                   '(KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
+     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+     'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+     'Accept-Encoding': 'gzip, deflate, br',
+     'Connection': 'keep-alive',
+     'Upgrade-Insecure-Requests': '1',
+ }
+
+ # ============================== Development & debugging ==============================
+
+ # Development-mode flags
+ DEBUG = False
+ TESTING = False
+
+ # Performance monitoring
+ ENABLE_PERFORMANCE_MONITORING = True
+ MEMORY_USAGE_WARNING_THRESHOLD = 500  # MB
+
+ # ============================== Custom configuration area ==============================
+ # Add project-specific settings here
+
+ # Example: target-site-specific settings
+ # TARGET_DOMAIN = '{{domain}}'
+ # MAX_PAGES_PER_DOMAIN = 10000
+ # CUSTOM_RATE_LIMIT = 1.5
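
Because the template reads its MySQL and Redis credentials through os.getenv, a deployment can repoint the generated project at a remote Redis without editing settings.py. The sketch below repeats the template's own URL-building branch with placeholder values to show what REDIS_URL resolves to (host and password are hypothetical):

import os

# Placeholder values a deployment might export before starting the crawler.
os.environ.setdefault('REDIS_HOST', '10.0.0.5')
os.environ.setdefault('REDIS_PASSWORD', 's3cret')

REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '')

# Same branch as the template: embed the password only when one is set.
if REDIS_PASSWORD:
    REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/0'
else:
    REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/0'

print(REDIS_URL)  # -> redis://:s3cret@10.0.0.5:6379/0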
crawlo/templates/spider/spider.py.tmpl
@@ -3,30 +3,176 @@
  {{project_name}}.spiders.{{spider_name}}
  =======================================
  Spider generated by the `crawlo genspider` command.
+ Built on the Crawlo framework; supports async concurrency, distributed crawling, and more.
+
+ Usage example:
+     crawlo crawl {{spider_name}}
  """

  from crawlo.spider import Spider
+ from crawlo import Request
+ # from {{project_name}}.items import {{item_class}}  # optional: import the item class


  class {{class_name}}(Spider):
      """
      Spider: {{spider_name}}
+
+     Features:
+     - concurrent crawling
+     - automatic request deduplication
+     - retry on errors
+     - data pipeline processing
      """
      name = '{{spider_name}}'
      allowed_domains = ['{{domain}}']
      start_urls = ['https://{{domain}}/']
+
+     # Advanced settings (optional)
+     # custom_settings = {
+     #     'DOWNLOAD_DELAY': 2.0,
+     #     'CONCURRENCY': 4,
+     #     'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
+     # }
+
+     def start_requests(self):
+         """
+         Generate the initial requests.
+
+         Custom headers, proxies, priorities, etc. are supported.
+         """
+         headers = {
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+             'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
+         }
+
+         for url in self.start_urls:
+             yield Request(
+                 url=url,
+                 callback=self.parse,
+                 headers=headers,
+                 # meta={'proxy': 'http://proxy.example.com:8080'},  # custom proxy
+                 # priority=10,  # request priority (higher number = higher priority)
+             )

      def parse(self, response):
          """
          Main response parsing method.
+
+         Args:
+             response: the response object, containing page content and metadata
+
+         Yields:
+             Request: new request objects (for deeper crawling)
+             Item: item objects (for data storage)
          """
-         # TODO: write your parsing logic here
-
-         # Example: extract data
+         self.logger.info(f'Parsing page: {response.url}')
+
+         # ================== Data extraction examples ==================
+
+         # Extract data and build an Item
          # item = {{item_class}}()
-         # item['title'] = response.xpath('//title/text()').get()
+         # item['title'] = response.xpath('//title/text()').get(default='')
+         # item['url'] = response.url
+         # item['content'] = response.xpath('//div[@class="content"]//text()').getall()
          # yield item
+
+         # Yield a plain dict (for simple data)
+         yield {
+             'title': response.xpath('//title/text()').get(default=''),
+             'url': response.url,
+             'status_code': response.status_code,
+             # 'description': response.xpath('//meta[@name="description"]/@content').get(),
+             # 'keywords': response.xpath('//meta[@name="keywords"]/@content').get(),
+         }
+
+         # ================== Link extraction examples ==================
+
+         # Extract and follow links
+         # links = response.xpath('//a/@href').getall()
+         # for link in links:
+         #     # keep only usable links
+         #     if link and not link.startswith(('javascript:', 'mailto:', '#')):
+         #         yield response.follow(
+         #             link,
+         #             callback=self.parse_detail,  # or self.parse to keep recursing
+         #             meta={'parent_url': response.url}  # pass the parent page along
+         #         )
+
+         # Extract links with CSS selectors
+         # for link in response.css('a.item-link::attr(href)').getall():
+         #     yield response.follow(link, callback=self.parse_detail)
+
+         # ================== Pagination examples ==================
+
+         # Follow a "next" link
+         # next_page = response.xpath('//a[@class="next"]/@href').get()
+         # if next_page:
+         #     yield response.follow(next_page, callback=self.parse)
+
+         # Numeric pagination
+         # current_page = int(response.meta.get('page', 1))
+         # max_pages = 100  # maximum number of pages
+         # if current_page < max_pages:
+         #     next_url = f'https://{{domain}}/page/{current_page + 1}'
+         #     yield Request(
+         #         url=next_url,
+         #         callback=self.parse,
+         #         meta={'page': current_page + 1}
+         #     )
+
+     def parse_detail(self, response):
+         """
+         Parse a detail page (optional).
+
+         Handles detail pages reached from list pages.
+         """
+         self.logger.info(f'Parsing detail page: {response.url}')
+
+         # parent_url = response.meta.get('parent_url', '')
+         #
+         # yield {
+         #     'title': response.xpath('//h1/text()').get(default=''),
+         #     'content': '\n'.join(response.xpath('//div[@class="content"]//text()').getall()),
+         #     'url': response.url,
+         #     'parent_url': parent_url,
+         #     'publish_time': response.xpath('//time/@datetime').get(),
+         # }
+
+         pass

-         # Example: extract links and follow them
-         # for href in response.xpath('//a/@href').getall():
-         #     yield response.follow(href, callback=self.parse)
+     def parse_error(self, failure):
+         """
+         Handle failed requests (optional).
+
+         Called when a request fails.
+         """
+         self.logger.error(f'Request failed: {failure.request.url} - {failure.value}')
+
+         # Optionally retry or record the failure
+         # yield {
+         #     'error_url': failure.request.url,
+         #     'error_message': str(failure.value),
+         #     'error_type': failure.type.__name__,
+         # }
+
+     def spider_opened(self, spider):
+         """
+         Callback invoked when the spider starts (optional).
+         """
+         self.logger.info(f'Spider {spider.name} opened')
+
+         # Initialization, e.g. connecting to a database or loading configuration
+         # self.database = self.connect_database()
+         # self.cookies = self.load_cookies()
+
+     def spider_closed(self, spider, reason):
+         """
+         Callback invoked when the spider closes (optional).
+         """
+         self.logger.info(f'Spider {spider.name} closed, reason: {reason}')
+
+         # Cleanup, e.g. closing database connections or saving state
+         # if hasattr(self, 'database'):
+         #     self.database.close()
+         # self.save_cookies()
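
Filled in for a concrete site, the generated spider reduces to something like the sketch below. The domain and selectors are hypothetical; only the Spider base class, the response.xpath/response.follow calls, and the class attributes shown in the template above are assumed:

from crawlo.spider import Spider


class QuotesSpider(Spider):
    """Hypothetical spider derived from the generated template."""
    name = 'quotes'
    allowed_domains = ['quotes.example.com']
    start_urls = ['https://quotes.example.com/']

    def parse(self, response):
        # Emit one plain-dict item per page, as the template's default parse() does.
        yield {
            'title': response.xpath('//title/text()').get(default=''),
            'url': response.url,
            'status_code': response.status_code,
        }

        # Follow pagination the same way the template's commented example does.
        next_page = response.xpath('//a[@class="next"]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)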