crawlo 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (128)
  1. crawlo/__init__.py +34 -33
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +152 -126
  7. crawlo/commands/list.py +156 -147
  8. crawlo/commands/run.py +285 -285
  9. crawlo/commands/startproject.py +196 -111
  10. crawlo/commands/stats.py +188 -188
  11. crawlo/commands/utils.py +187 -0
  12. crawlo/config.py +280 -0
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -158
  15. crawlo/core/enhanced_engine.py +190 -0
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +166 -57
  18. crawlo/crawler.py +1028 -495
  19. crawlo/downloader/__init__.py +242 -78
  20. crawlo/downloader/aiohttp_downloader.py +212 -199
  21. crawlo/downloader/cffi_downloader.py +251 -241
  22. crawlo/downloader/httpx_downloader.py +259 -246
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +82 -78
  25. crawlo/extension/__init__.py +31 -31
  26. crawlo/extension/log_interval.py +49 -49
  27. crawlo/extension/log_stats.py +44 -44
  28. crawlo/extension/logging_extension.py +34 -34
  29. crawlo/filters/__init__.py +154 -37
  30. crawlo/filters/aioredis_filter.py +242 -150
  31. crawlo/filters/memory_filter.py +269 -202
  32. crawlo/items/__init__.py +23 -23
  33. crawlo/items/base.py +21 -21
  34. crawlo/items/fields.py +53 -53
  35. crawlo/items/items.py +104 -104
  36. crawlo/middleware/__init__.py +21 -21
  37. crawlo/middleware/default_header.py +32 -32
  38. crawlo/middleware/download_delay.py +28 -28
  39. crawlo/middleware/middleware_manager.py +135 -135
  40. crawlo/middleware/proxy.py +248 -245
  41. crawlo/middleware/request_ignore.py +30 -30
  42. crawlo/middleware/response_code.py +18 -18
  43. crawlo/middleware/response_filter.py +26 -26
  44. crawlo/middleware/retry.py +125 -90
  45. crawlo/mode_manager.py +201 -0
  46. crawlo/network/__init__.py +21 -7
  47. crawlo/network/request.py +311 -203
  48. crawlo/network/response.py +271 -166
  49. crawlo/pipelines/__init__.py +22 -13
  50. crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
  51. crawlo/pipelines/console_pipeline.py +39 -39
  52. crawlo/pipelines/csv_pipeline.py +317 -0
  53. crawlo/pipelines/database_dedup_pipeline.py +225 -0
  54. crawlo/pipelines/json_pipeline.py +219 -0
  55. crawlo/pipelines/memory_dedup_pipeline.py +116 -0
  56. crawlo/pipelines/mongo_pipeline.py +116 -116
  57. crawlo/pipelines/mysql_pipeline.py +195 -195
  58. crawlo/pipelines/pipeline_manager.py +56 -56
  59. crawlo/pipelines/redis_dedup_pipeline.py +163 -0
  60. crawlo/project.py +153 -153
  61. crawlo/queue/__init__.py +0 -0
  62. crawlo/queue/pqueue.py +37 -0
  63. crawlo/queue/queue_manager.py +308 -0
  64. crawlo/queue/redis_priority_queue.py +209 -0
  65. crawlo/settings/__init__.py +7 -7
  66. crawlo/settings/default_settings.py +245 -167
  67. crawlo/settings/setting_manager.py +99 -99
  68. crawlo/spider/__init__.py +639 -129
  69. crawlo/stats_collector.py +59 -59
  70. crawlo/subscriber.py +106 -106
  71. crawlo/task_manager.py +30 -27
  72. crawlo/templates/crawlo.cfg.tmpl +10 -10
  73. crawlo/templates/project/__init__.py.tmpl +3 -3
  74. crawlo/templates/project/items.py.tmpl +17 -17
  75. crawlo/templates/project/middlewares.py.tmpl +87 -76
  76. crawlo/templates/project/pipelines.py.tmpl +342 -64
  77. crawlo/templates/project/run.py.tmpl +252 -0
  78. crawlo/templates/project/settings.py.tmpl +251 -54
  79. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  80. crawlo/templates/spider/spider.py.tmpl +178 -32
  81. crawlo/utils/__init__.py +7 -7
  82. crawlo/utils/controlled_spider_mixin.py +440 -0
  83. crawlo/utils/date_tools.py +233 -233
  84. crawlo/utils/db_helper.py +343 -343
  85. crawlo/utils/func_tools.py +82 -82
  86. crawlo/utils/large_scale_config.py +287 -0
  87. crawlo/utils/large_scale_helper.py +344 -0
  88. crawlo/utils/log.py +128 -128
  89. crawlo/utils/queue_helper.py +176 -0
  90. crawlo/utils/request.py +267 -267
  91. crawlo/utils/request_serializer.py +220 -0
  92. crawlo/utils/spider_loader.py +62 -62
  93. crawlo/utils/system.py +11 -11
  94. crawlo/utils/tools.py +4 -4
  95. crawlo/utils/url.py +39 -39
  96. crawlo-1.1.3.dist-info/METADATA +635 -0
  97. crawlo-1.1.3.dist-info/RECORD +113 -0
  98. examples/__init__.py +7 -7
  99. examples/controlled_spider_example.py +205 -0
  100. tests/__init__.py +7 -7
  101. tests/test_final_validation.py +154 -0
  102. tests/test_proxy_health_check.py +32 -32
  103. tests/test_proxy_middleware_integration.py +136 -136
  104. tests/test_proxy_providers.py +56 -56
  105. tests/test_proxy_stats.py +19 -19
  106. tests/test_proxy_strategies.py +59 -59
  107. tests/test_redis_config.py +29 -0
  108. tests/test_redis_queue.py +225 -0
  109. tests/test_request_serialization.py +71 -0
  110. tests/test_scheduler.py +242 -0
  111. crawlo/pipelines/mysql_batch_pipline.py +0 -273
  112. crawlo/utils/pqueue.py +0 -174
  113. crawlo-1.1.1.dist-info/METADATA +0 -220
  114. crawlo-1.1.1.dist-info/RECORD +0 -100
  115. examples/baidu_spider/__init__.py +0 -7
  116. examples/baidu_spider/demo.py +0 -94
  117. examples/baidu_spider/items.py +0 -46
  118. examples/baidu_spider/middleware.py +0 -49
  119. examples/baidu_spider/pipeline.py +0 -55
  120. examples/baidu_spider/run.py +0 -27
  121. examples/baidu_spider/settings.py +0 -121
  122. examples/baidu_spider/spiders/__init__.py +0 -7
  123. examples/baidu_spider/spiders/bai_du.py +0 -61
  124. examples/baidu_spider/spiders/miit.py +0 -159
  125. examples/baidu_spider/spiders/sina.py +0 -79
  126. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/WHEEL +0 -0
  127. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/entry_points.txt +0 -0
  128. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/top_level.txt +0 -0
crawlo/templates/project/settings.py.tmpl CHANGED
@@ -1,54 +1,251 @@
- # -*- coding: UTF-8 -*-
- """Auto-generated settings.py file"""
-
- PROJECT_NAME = '{{project_name}}'
- VERSION = '1.0'
-
- # ============================== Network request configuration ==============================
- DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"
- DOWNLOAD_TIMEOUT = 60
- VERIFY_SSL = True
- USE_SESSION = True
-
- DOWNLOAD_DELAY = 1.0
- RANDOMNESS = True
-
- MAX_RETRY_TIMES = 3
- RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
- IGNORE_HTTP_CODES = [403, 404]
-
- CONNECTION_POOL_LIMIT = 100
-
- # ============================== Concurrency & scheduling ==============================
- CONCURRENCY = 8
- MAX_RUNNING_SPIDERS = 3
-
- # ============================== Data storage ==============================
- MYSQL_HOST = '127.0.0.1'
- MYSQL_PORT = 3306
- MYSQL_USER = 'root'
- MYSQL_PASSWORD = '123456'
- MYSQL_DB = '{{project_name}}'
- MYSQL_TABLE = 'crawled_data'
-
- # ============================== Deduplication ==============================
- FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
-
- # ============================== Middleware & pipelines ==============================
- MIDDLEWARES = [
-     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
-     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
-     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
-     'crawlo.middleware.proxy.ProxyMiddleware',
-     'crawlo.middleware.retry.RetryMiddleware',
-     'crawlo.middleware.response_code.ResponseCodeMiddleware',
-     'crawlo.middleware.response_filter.ResponseFilterMiddleware',
- ]
-
- PIPELINES = [
-     'crawlo.pipelines.console_pipeline.ConsolePipeline',
- ]
-
- # ============================== Logging ==============================
- LOG_LEVEL = 'INFO'
- LOG_FILE = f'logs/{{{project_name}}}.log'
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project configuration file
+ =============================
+ Spider project configuration based on the Crawlo framework.
+
+ 🎯 Quick start:
+
+ # Option 1: use the default standalone mode (recommended)
+ from crawlo.crawler import CrawlerProcess
+ process = CrawlerProcess()  # no configuration needed
+
+ # Option 2: use the configuration factory
+ from crawlo.config import CrawloConfig
+ config = CrawloConfig.standalone()  # standalone mode
+ config = CrawloConfig.distributed(redis_host='192.168.1.100')  # distributed mode
+ process = CrawlerProcess(settings=config.to_dict())
+
+ # Option 3: use environment variables
+ from crawlo.config import CrawloConfig
+ config = CrawloConfig.from_env()  # read from environment variables
+ """
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project info ==============================
+ PROJECT_NAME = '{{project_name}}'
+ VERSION = '1.0.0'
+
+ # ============================== Run mode selection ==============================
+
+ # 🎯 Pick one configuration style:
+
+ # Option 1: configuration factory (recommended)
+ # Standalone mode (default)
+ CONFIG = CrawloConfig.standalone(
+     concurrency=8,
+     download_delay=1.0
+ )
+
+ # Distributed mode (uncomment and set the Redis address)
+ # CONFIG = CrawloConfig.distributed(
+ #     redis_host='127.0.0.1',
+ #     redis_password='your_password',  # if a password is set
+ #     project_name='{{project_name}}',
+ #     concurrency=16,
+ #     download_delay=1.0
+ # )
+
+ # Auto-detect mode
+ # CONFIG = CrawloConfig.auto(concurrency=12)
+
+ # Option 2: read from environment variables (good for deployment)
+ # CONFIG = CrawloConfig.from_env()
+
+ # Option 3: use preset configurations
+ # from crawlo.config import Presets
+ # CONFIG = Presets.development()  # development environment
+ # CONFIG = Presets.production()   # production environment
+
+ # Apply the final configuration
+ locals().update(CONFIG.to_dict())
+
+ # ============================== Network request configuration ==============================
+
+ # Downloader selection (CurlCffi recommended; supports browser fingerprint emulation)
+ DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"  # browser fingerprinting
+ # DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"  # lightweight option
+ # DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"  # HTTP/2 support
+
+ # Request timeout and security
+ DOWNLOAD_TIMEOUT = 30
+ VERIFY_SSL = True
+ USE_SESSION = True
+
+ # Request delay control (anti-scraping countermeasures)
+ DOWNLOAD_DELAY = 1.0
+ RANDOM_RANGE = (0.8, 1.2)
+ RANDOMNESS = True
+
+ # Retry policy
+ MAX_RETRY_TIMES = 3
+ RETRY_PRIORITY = -1
+ RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
+ IGNORE_HTTP_CODES = [403, 404]
+ ALLOWED_CODES = []
+
+ # Connection pool configuration
+ CONNECTION_POOL_LIMIT = 50
+ DOWNLOAD_MAXSIZE = 10 * 1024 * 1024  # 10MB
+ DOWNLOAD_WARN_SIZE = 1024 * 1024  # 1MB
+
+ # ============================== Concurrency & scheduling ==============================
+ CONCURRENCY = 8
+ INTERVAL = 5
+ DEPTH_PRIORITY = 1
+ MAX_RUNNING_SPIDERS = 3
+
+ # ============================== Queue configuration (distributed-capable) ==============================
+
+ # Queue type: 'auto' (auto-select), 'memory' (in-memory queue), 'redis' (distributed queue)
+ QUEUE_TYPE = 'auto'
+ SCHEDULER_MAX_QUEUE_SIZE = 2000
+ SCHEDULER_QUEUE_NAME = f'{{project_name}}:requests'
+ QUEUE_MAX_RETRIES = 3
+ QUEUE_TIMEOUT = 300
+
+ # Large-scale crawling tuning
+ LARGE_SCALE_BATCH_SIZE = 1000
+ LARGE_SCALE_CHECKPOINT_INTERVAL = 5000
+ LARGE_SCALE_MAX_MEMORY_USAGE = 500
+
+ # ============================== Data storage configuration ==============================
+
+ # --- MySQL ---
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+ MYSQL_BATCH_SIZE = 100
+
+ # MySQL connection pool
+ MYSQL_FLUSH_INTERVAL = 5
+ MYSQL_POOL_MIN = 5
+ MYSQL_POOL_MAX = 20
+ MYSQL_ECHO = False
+
+ # --- MongoDB ---
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = f'{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+ MONGO_MAX_POOL_SIZE = 200
+ MONGO_MIN_POOL_SIZE = 20
+
+ # ============================== Deduplication filter configuration ==============================
+
+ REQUEST_DIR = '.'
+
+ # Dedup filter (the Redis filter is recommended for distributed projects)
+ FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+ # FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'  # distributed dedup
+
+ # --- Redis (used for distributed dedup and queues) ---
+ REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
+ REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
+ REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '')
+
+ # Build the URL depending on whether a password is set
+ if REDIS_PASSWORD:
+     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/0'
+ else:
+     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/0'
+
+ REDIS_KEY = f'{{project_name}}:fingerprint'
+ REDIS_TTL = 0
+ CLEANUP_FP = 0
+ FILTER_DEBUG = True
+ DECODE_RESPONSES = True
+
+ # ============================== Middleware configuration ==============================
+
+ MIDDLEWARES = [
+     # === Request pre-processing stage ===
+     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+     'crawlo.middleware.proxy.ProxyMiddleware',
+
+     # === Response handling stage ===
+     'crawlo.middleware.retry.RetryMiddleware',
+     'crawlo.middleware.response_code.ResponseCodeMiddleware',
+     'crawlo.middleware.response_filter.ResponseFilterMiddleware',
+ ]
+
+ # ============================== Item pipeline configuration ==============================
+
+ PIPELINES = [
+     # The default dedup pipeline is selected automatically by run mode:
+     #   standalone:  crawlo.pipelines.MemoryDedupPipeline
+     #   distributed: crawlo.pipelines.RedisDedupPipeline
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',  # custom database pipeline
+     # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage
+     # 'crawlo.pipelines.mongo_pipeline.MongoPipeline',  # MongoDB storage
+ ]
+
+ # ============================== Extensions ==============================
+
+ EXTENSIONS = [
+     'crawlo.extension.log_interval.LogIntervalExtension',
+     'crawlo.extension.log_stats.LogStats',
+     'crawlo.extension.logging_extension.CustomLoggerExtension',
+ ]
+
+ # ============================== Logging configuration ==============================
+
+ LOG_LEVEL = 'INFO'
+ STATS_DUMP = True
+ LOG_FILE = f'logs/{{project_name}}.log'
+ LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
+ LOG_ENCODING = 'utf-8'
+
+ # ============================== Proxy configuration ==============================
+
+ PROXY_ENABLED = False
+ PROXY_API_URL = ""  # fill in a real proxy API endpoint
+ PROXY_EXTRACTOR = "proxy"
+ PROXY_REFRESH_INTERVAL = 60
+ PROXY_API_TIMEOUT = 10
+
+ # ============================== Browser fingerprint configuration ==============================
+
+ # Settings specific to the CurlCffi downloader
+ CURL_BROWSER_TYPE = "chrome"
+ CURL_BROWSER_VERSION_MAP = {
+     "chrome": "chrome136",
+     "edge": "edge101",
+     "safari": "safari184",
+     "firefox": "firefox135",
+ }
+
+ # Default request headers
+ DEFAULT_REQUEST_HEADERS = {
+     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                   '(KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
+     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+     'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+     'Accept-Encoding': 'gzip, deflate, br',
+     'Connection': 'keep-alive',
+     'Upgrade-Insecure-Requests': '1',
+ }
+
+ # ============================== Development & debugging ==============================
+
+ # Development mode flags
+ DEBUG = False
+ TESTING = False
+
+ # Performance monitoring
+ ENABLE_PERFORMANCE_MONITORING = True
+ MEMORY_USAGE_WARNING_THRESHOLD = 500  # MB
+
+ # ============================== Custom configuration area ==============================
+ # Add project-specific settings here
+
+ # Example: target-site-specific settings
+ # TARGET_DOMAIN = '{{domain}}'
+ # MAX_PAGES_PER_DOMAIN = 10000
+ # CUSTOM_RATE_LIMIT = 1.5
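The new settings template is built around the CrawloConfig factory instead of a flat list of constants. As a rough illustration of how a rendered project might switch modes at deploy time, the sketch below sticks to the calls shown in the template docstring above (CrawloConfig.standalone, CrawloConfig.distributed, config.to_dict(), CrawlerProcess(settings=...)); the CRAWLO_MODE environment variable and the "myproject" name are assumptions for the example, not part of crawlo's API.

```python
# Minimal sketch for a hypothetical rendered project called "myproject".
import os

from crawlo.config import CrawloConfig
from crawlo.crawler import CrawlerProcess


def build_process() -> CrawlerProcess:
    """Pick a run mode from an (assumed) CRAWLO_MODE environment variable."""
    mode = os.getenv("CRAWLO_MODE", "standalone")

    if mode == "distributed":
        # Distributed mode: requests and fingerprints are shared through Redis.
        config = CrawloConfig.distributed(
            redis_host=os.getenv("REDIS_HOST", "127.0.0.1"),
            redis_password=os.getenv("REDIS_PASSWORD", ""),
            project_name="myproject",
            concurrency=16,
            download_delay=1.0,
        )
    else:
        # Standalone mode: in-memory queue and filter, no external services.
        config = CrawloConfig.standalone(concurrency=8, download_delay=1.0)

    # CrawlerProcess accepts the factory output as a plain settings dict.
    return CrawlerProcess(settings=config.to_dict())


process = build_process()
```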
crawlo/templates/project/spiders/__init__.py.tmpl CHANGED
@@ -1,6 +1,6 @@
- # -*- coding: UTF-8 -*-
- """
- {{project_name}}.spiders
- ========================
- All spiders are stored here.
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}}.spiders
+ ========================
+ All spiders are stored here.
  """
crawlo/templates/spider/spider.py.tmpl CHANGED
@@ -1,32 +1,178 @@
- # -*- coding: UTF-8 -*-
- """
- {{project_name}}.spiders.{{spider_name}}
- =======================================
- Spider generated by the `crawlo genspider` command.
- """
-
- from crawlo.spider import Spider
-
-
- class {{class_name}}(Spider):
-     """
-     Spider: {{spider_name}}
-     """
-     name = '{{spider_name}}'
-     allowed_domains = ['{{domain}}']
-     start_urls = ['https://{{domain}}/']
-
-     def parse(self, response):
-         """
-         Main method for parsing responses.
-         """
-         # TODO: write your parsing logic here
-
-         # Example: extract data
-         # item = {{item_class}}()
-         # item['title'] = response.xpath('//title/text()').get()
-         # yield item
-
-         # Example: extract links and follow them
-         # for href in response.xpath('//a/@href').getall():
-         #     yield response.follow(href, callback=self.parse)
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}}.spiders.{{spider_name}}
+ =======================================
+ Spider generated by the `crawlo genspider` command.
+ Built on the Crawlo framework; supports async concurrency, distributed crawling, and more.
+
+ Usage:
+     crawlo crawl {{spider_name}}
+ """
+
+ from crawlo.spider import Spider
+ from crawlo import Request
+ # from {{project_name}}.items import {{item_class}}  # optional: import the item class
+
+
+ class {{class_name}}(Spider):
+     """
+     Spider: {{spider_name}}
+
+     Features:
+     - concurrent crawling
+     - automatic deduplication
+     - error retry mechanism
+     - item pipeline processing
+     """
+     name = '{{spider_name}}'
+     allowed_domains = ['{{domain}}']
+     start_urls = ['https://{{domain}}/']
+
+     # Advanced settings (optional)
+     # custom_settings = {
+     #     'DOWNLOAD_DELAY': 2.0,
+     #     'CONCURRENCY': 4,
+     #     'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
+     # }
+
+     def start_requests(self):
+         """
+         Generate the initial requests.
+
+         Custom headers, proxies, priorities, and more are supported.
+         """
+         headers = {
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+             'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
+         }
+
+         for url in self.start_urls:
+             yield Request(
+                 url=url,
+                 callback=self.parse,
+                 headers=headers,
+                 # meta={'proxy': 'http://proxy.example.com:8080'},  # custom proxy
+                 # priority=10,  # request priority (larger numbers run first)
+             )
+
+     def parse(self, response):
+         """
+         Main method for parsing responses.
+
+         Args:
+             response: response object containing the page content and metadata
+
+         Yields:
+             Request: new request objects (for deeper crawling)
+             Item: item objects (for data storage)
+         """
+         self.logger.info(f'Parsing page: {response.url}')
+
+         # ================== Data extraction examples ==================
+
+         # Extract data and create an Item
+         # item = {{item_class}}()
+         # item['title'] = response.xpath('//title/text()').get(default='')
+         # item['url'] = response.url
+         # item['content'] = response.xpath('//div[@class="content"]//text()').getall()
+         # yield item
+
+         # Yield a plain dict (simple data)
+         yield {
+             'title': response.xpath('//title/text()').get(default=''),
+             'url': response.url,
+             'status_code': response.status_code,
+             # 'description': response.xpath('//meta[@name="description"]/@content').get(),
+             # 'keywords': response.xpath('//meta[@name="keywords"]/@content').get(),
+         }
+
+         # ================== Link extraction examples ==================
+
+         # Extract and follow links
+         # links = response.xpath('//a/@href').getall()
+         # for link in links:
+         #     # keep only valid links
+         #     if link and not link.startswith(('javascript:', 'mailto:', '#')):
+         #         yield response.follow(
+         #             link,
+         #             callback=self.parse_detail,  # or self.parse to keep recursing
+         #             meta={'parent_url': response.url}  # pass along the parent page
+         #         )
+
+         # Extract links with CSS selectors
+         # for link in response.css('a.item-link::attr(href)').getall():
+         #     yield response.follow(link, callback=self.parse_detail)
+
+         # ================== Pagination examples ==================
+
+         # Follow a "next page" link
+         # next_page = response.xpath('//a[@class="next"]/@href').get()
+         # if next_page:
+         #     yield response.follow(next_page, callback=self.parse)
+
+         # Numeric pagination
+         # current_page = int(response.meta.get('page', 1))
+         # max_pages = 100  # maximum number of pages
+         # if current_page < max_pages:
+         #     next_url = f'https://{{domain}}/page/{current_page + 1}'
+         #     yield Request(
+         #         url=next_url,
+         #         callback=self.parse,
+         #         meta={'page': current_page + 1}
+         #     )
+
+     def parse_detail(self, response):
+         """
+         Parse a detail page (optional).
+
+         Handles detail pages reached from a listing page.
+         """
+         self.logger.info(f'Parsing detail page: {response.url}')
+
+         # parent_url = response.meta.get('parent_url', '')
+         #
+         # yield {
+         #     'title': response.xpath('//h1/text()').get(default=''),
+         #     'content': '\n'.join(response.xpath('//div[@class="content"]//text()').getall()),
+         #     'url': response.url,
+         #     'parent_url': parent_url,
+         #     'publish_time': response.xpath('//time/@datetime').get(),
+         # }
+
+         pass
+
+     def parse_error(self, failure):
+         """
+         Handle a failed request (optional).
+
+         Called when a request fails.
+         """
+         self.logger.error(f'Request failed: {failure.request.url} - {failure.value}')
+
+         # Optionally retry or record the failure
+         # yield {
+         #     'error_url': failure.request.url,
+         #     'error_message': str(failure.value),
+         #     'error_type': failure.type.__name__,
+         # }
+
+     def spider_opened(self, spider):
+         """
+         Callback invoked when the spider starts (optional).
+         """
+         self.logger.info(f'Spider {spider.name} started')
+
+         # Initialisation, e.g. connect to a database, load configuration
+         # self.database = self.connect_database()
+         # self.cookies = self.load_cookies()
+
+     def spider_closed(self, spider, reason):
+         """
+         Callback invoked when the spider closes (optional).
+         """
+         self.logger.info(f'Spider {spider.name} closed, reason: {reason}')
+
+         # Cleanup, e.g. close database connections, persist state
+         # if hasattr(self, 'database'):
+         #     self.database.close()
+         # self.save_cookies()
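For concreteness, this is roughly what a generated file looks like once `crawlo genspider` fills in the placeholders. The project, spider, and domain names below are made up for illustration, and the sketch keeps to the calls that appear in the template itself (Spider, Request, response.xpath, self.logger).

```python
# Hypothetical rendering of spider.py.tmpl with spider_name="quotes"
# and domain="quotes.toscrape.com".
from crawlo.spider import Spider
from crawlo import Request


class QuotesSpider(Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['https://quotes.toscrape.com/']

    def start_requests(self):
        # Same shape as the template: explicit Request objects with headers.
        headers = {'Accept-Language': 'en-US,en;q=0.5'}
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse, headers=headers)

    def parse(self, response):
        self.logger.info(f'Parsing page: {response.url}')
        # Minimal item: the same dict the template yields by default.
        yield {
            'title': response.xpath('//title/text()').get(default=''),
            'url': response.url,
            'status_code': response.status_code,
        }
```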
crawlo/utils/__init__.py CHANGED
@@ -1,7 +1,7 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time : 2025-02-05 13:57
- # @Author : oscar
- # @Desc : None
- """
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-02-05 13:57
+ # @Author : oscar
+ # @Desc : None
+ """