crawlo 1.2.3-py3-none-any.whl → 1.2.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (222)
  1. crawlo/__init__.py +61 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +60 -60
  4. crawlo/cleaners/data_formatter.py +225 -225
  5. crawlo/cleaners/encoding_converter.py +125 -125
  6. crawlo/cleaners/text_cleaner.py +232 -232
  7. crawlo/cli.py +88 -81
  8. crawlo/commands/__init__.py +14 -14
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/help.py +144 -142
  12. crawlo/commands/list.py +155 -155
  13. crawlo/commands/run.py +323 -292
  14. crawlo/commands/startproject.py +436 -417
  15. crawlo/commands/stats.py +187 -187
  16. crawlo/commands/utils.py +186 -186
  17. crawlo/config.py +312 -312
  18. crawlo/config_validator.py +251 -251
  19. crawlo/core/__init__.py +2 -2
  20. crawlo/core/engine.py +354 -354
  21. crawlo/core/processor.py +40 -40
  22. crawlo/core/scheduler.py +143 -143
  23. crawlo/crawler.py +1110 -1027
  24. crawlo/data/__init__.py +5 -5
  25. crawlo/data/user_agents.py +107 -107
  26. crawlo/downloader/__init__.py +266 -266
  27. crawlo/downloader/aiohttp_downloader.py +220 -220
  28. crawlo/downloader/cffi_downloader.py +256 -256
  29. crawlo/downloader/httpx_downloader.py +259 -259
  30. crawlo/downloader/hybrid_downloader.py +212 -212
  31. crawlo/downloader/playwright_downloader.py +402 -402
  32. crawlo/downloader/selenium_downloader.py +472 -472
  33. crawlo/event.py +11 -11
  34. crawlo/exceptions.py +81 -81
  35. crawlo/extension/__init__.py +37 -37
  36. crawlo/extension/health_check.py +141 -141
  37. crawlo/extension/log_interval.py +57 -57
  38. crawlo/extension/log_stats.py +81 -81
  39. crawlo/extension/logging_extension.py +43 -43
  40. crawlo/extension/memory_monitor.py +104 -104
  41. crawlo/extension/performance_profiler.py +133 -133
  42. crawlo/extension/request_recorder.py +107 -107
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +280 -280
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/items/__init__.py +23 -23
  47. crawlo/items/base.py +21 -21
  48. crawlo/items/fields.py +52 -52
  49. crawlo/items/items.py +104 -104
  50. crawlo/middleware/__init__.py +21 -21
  51. crawlo/middleware/default_header.py +131 -131
  52. crawlo/middleware/download_delay.py +104 -104
  53. crawlo/middleware/middleware_manager.py +135 -135
  54. crawlo/middleware/offsite.py +114 -114
  55. crawlo/middleware/proxy.py +367 -367
  56. crawlo/middleware/request_ignore.py +86 -86
  57. crawlo/middleware/response_code.py +163 -163
  58. crawlo/middleware/response_filter.py +136 -136
  59. crawlo/middleware/retry.py +124 -124
  60. crawlo/mode_manager.py +211 -211
  61. crawlo/network/__init__.py +21 -21
  62. crawlo/network/request.py +338 -338
  63. crawlo/network/response.py +359 -359
  64. crawlo/pipelines/__init__.py +21 -21
  65. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  66. crawlo/pipelines/console_pipeline.py +39 -39
  67. crawlo/pipelines/csv_pipeline.py +316 -316
  68. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  69. crawlo/pipelines/json_pipeline.py +218 -218
  70. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  71. crawlo/pipelines/mongo_pipeline.py +131 -131
  72. crawlo/pipelines/mysql_pipeline.py +317 -317
  73. crawlo/pipelines/pipeline_manager.py +61 -61
  74. crawlo/pipelines/redis_dedup_pipeline.py +165 -165
  75. crawlo/project.py +279 -187
  76. crawlo/queue/pqueue.py +37 -37
  77. crawlo/queue/queue_manager.py +337 -337
  78. crawlo/queue/redis_priority_queue.py +298 -298
  79. crawlo/settings/__init__.py +7 -7
  80. crawlo/settings/default_settings.py +217 -226
  81. crawlo/settings/setting_manager.py +122 -122
  82. crawlo/spider/__init__.py +639 -639
  83. crawlo/stats_collector.py +59 -59
  84. crawlo/subscriber.py +129 -129
  85. crawlo/task_manager.py +30 -30
  86. crawlo/templates/crawlo.cfg.tmpl +10 -10
  87. crawlo/templates/project/__init__.py.tmpl +3 -3
  88. crawlo/templates/project/items.py.tmpl +17 -17
  89. crawlo/templates/project/middlewares.py.tmpl +118 -118
  90. crawlo/templates/project/pipelines.py.tmpl +96 -96
  91. crawlo/templates/project/settings.py.tmpl +324 -325
  92. crawlo/templates/project/settings_distributed.py.tmpl +154 -121
  93. crawlo/templates/project/settings_gentle.py.tmpl +127 -94
  94. crawlo/templates/project/settings_high_performance.py.tmpl +149 -151
  95. crawlo/templates/project/settings_simple.py.tmpl +102 -68
  96. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  97. crawlo/templates/{project/run.py.tmpl → run.py.tmpl} +47 -45
  98. crawlo/templates/spider/spider.py.tmpl +143 -143
  99. crawlo/tools/__init__.py +182 -182
  100. crawlo/tools/anti_crawler.py +268 -268
  101. crawlo/tools/authenticated_proxy.py +240 -240
  102. crawlo/tools/data_validator.py +180 -180
  103. crawlo/tools/date_tools.py +35 -35
  104. crawlo/tools/distributed_coordinator.py +386 -386
  105. crawlo/tools/retry_mechanism.py +220 -220
  106. crawlo/tools/scenario_adapter.py +262 -262
  107. crawlo/utils/__init__.py +35 -35
  108. crawlo/utils/batch_processor.py +259 -259
  109. crawlo/utils/controlled_spider_mixin.py +439 -439
  110. crawlo/utils/date_tools.py +290 -290
  111. crawlo/utils/db_helper.py +343 -343
  112. crawlo/utils/enhanced_error_handler.py +356 -356
  113. crawlo/utils/env_config.py +105 -105
  114. crawlo/utils/error_handler.py +123 -123
  115. crawlo/utils/func_tools.py +82 -82
  116. crawlo/utils/large_scale_config.py +286 -286
  117. crawlo/utils/large_scale_helper.py +344 -344
  118. crawlo/utils/log.py +128 -128
  119. crawlo/utils/performance_monitor.py +285 -285
  120. crawlo/utils/queue_helper.py +175 -175
  121. crawlo/utils/redis_connection_pool.py +334 -334
  122. crawlo/utils/redis_key_validator.py +198 -198
  123. crawlo/utils/request.py +267 -267
  124. crawlo/utils/request_serializer.py +218 -218
  125. crawlo/utils/spider_loader.py +61 -61
  126. crawlo/utils/system.py +11 -11
  127. crawlo/utils/tools.py +4 -4
  128. crawlo/utils/url.py +39 -39
  129. {crawlo-1.2.3.dist-info → crawlo-1.2.5.dist-info}/METADATA +764 -692
  130. crawlo-1.2.5.dist-info/RECORD +206 -0
  131. examples/__init__.py +7 -7
  132. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  133. tests/__init__.py +7 -7
  134. tests/advanced_tools_example.py +275 -275
  135. tests/authenticated_proxy_example.py +236 -236
  136. tests/cleaners_example.py +160 -160
  137. tests/config_validation_demo.py +102 -102
  138. tests/controlled_spider_example.py +205 -205
  139. tests/date_tools_example.py +180 -180
  140. tests/dynamic_loading_example.py +523 -523
  141. tests/dynamic_loading_test.py +104 -104
  142. tests/env_config_example.py +133 -133
  143. tests/error_handling_example.py +171 -171
  144. tests/redis_key_validation_demo.py +130 -130
  145. tests/response_improvements_example.py +144 -144
  146. tests/test_advanced_tools.py +148 -148
  147. tests/test_all_redis_key_configs.py +145 -145
  148. tests/test_authenticated_proxy.py +141 -141
  149. tests/test_cleaners.py +54 -54
  150. tests/test_comprehensive.py +146 -146
  151. tests/test_config_validator.py +193 -193
  152. tests/test_crawlo_proxy_integration.py +172 -172
  153. tests/test_date_tools.py +123 -123
  154. tests/test_default_header_middleware.py +158 -158
  155. tests/test_double_crawlo_fix.py +207 -207
  156. tests/test_double_crawlo_fix_simple.py +124 -124
  157. tests/test_download_delay_middleware.py +221 -221
  158. tests/test_downloader_proxy_compatibility.py +268 -268
  159. tests/test_dynamic_downloaders_proxy.py +124 -124
  160. tests/test_dynamic_proxy.py +92 -92
  161. tests/test_dynamic_proxy_config.py +146 -146
  162. tests/test_dynamic_proxy_real.py +109 -109
  163. tests/test_edge_cases.py +303 -303
  164. tests/test_enhanced_error_handler.py +270 -270
  165. tests/test_env_config.py +121 -121
  166. tests/test_error_handler_compatibility.py +112 -112
  167. tests/test_final_validation.py +153 -153
  168. tests/test_framework_env_usage.py +103 -103
  169. tests/test_integration.py +356 -356
  170. tests/test_item_dedup_redis_key.py +122 -122
  171. tests/test_offsite_middleware.py +221 -221
  172. tests/test_parsel.py +29 -29
  173. tests/test_performance.py +327 -327
  174. tests/test_proxy_api.py +264 -264
  175. tests/test_proxy_health_check.py +32 -32
  176. tests/test_proxy_middleware.py +121 -121
  177. tests/test_proxy_middleware_enhanced.py +216 -216
  178. tests/test_proxy_middleware_integration.py +136 -136
  179. tests/test_proxy_providers.py +56 -56
  180. tests/test_proxy_stats.py +19 -19
  181. tests/test_proxy_strategies.py +59 -59
  182. tests/test_queue_manager_double_crawlo.py +173 -173
  183. tests/test_queue_manager_redis_key.py +176 -176
  184. tests/test_real_scenario_proxy.py +195 -195
  185. tests/test_redis_config.py +28 -28
  186. tests/test_redis_connection_pool.py +294 -294
  187. tests/test_redis_key_naming.py +181 -181
  188. tests/test_redis_key_validator.py +123 -123
  189. tests/test_redis_queue.py +224 -224
  190. tests/test_request_ignore_middleware.py +182 -182
  191. tests/test_request_serialization.py +70 -70
  192. tests/test_response_code_middleware.py +349 -349
  193. tests/test_response_filter_middleware.py +427 -427
  194. tests/test_response_improvements.py +152 -152
  195. tests/test_retry_middleware.py +241 -241
  196. tests/test_scheduler.py +241 -241
  197. tests/test_simple_response.py +61 -61
  198. tests/test_telecom_spider_redis_key.py +205 -205
  199. tests/test_template_content.py +87 -87
  200. tests/test_template_redis_key.py +134 -134
  201. tests/test_tools.py +153 -153
  202. tests/tools_example.py +257 -257
  203. crawlo-1.2.3.dist-info/RECORD +0 -222
  204. examples/aiohttp_settings.py +0 -42
  205. examples/curl_cffi_settings.py +0 -41
  206. examples/default_header_middleware_example.py +0 -107
  207. examples/default_header_spider_example.py +0 -129
  208. examples/download_delay_middleware_example.py +0 -160
  209. examples/httpx_settings.py +0 -42
  210. examples/multi_downloader_proxy_example.py +0 -81
  211. examples/offsite_middleware_example.py +0 -55
  212. examples/offsite_spider_example.py +0 -107
  213. examples/proxy_spider_example.py +0 -166
  214. examples/request_ignore_middleware_example.py +0 -51
  215. examples/request_ignore_spider_example.py +0 -99
  216. examples/response_code_middleware_example.py +0 -52
  217. examples/response_filter_middleware_example.py +0 -67
  218. examples/tong_hua_shun_settings.py +0 -62
  219. examples/tong_hua_shun_spider.py +0 -170
  220. {crawlo-1.2.3.dist-info → crawlo-1.2.5.dist-info}/WHEEL +0 -0
  221. {crawlo-1.2.3.dist-info → crawlo-1.2.5.dist-info}/entry_points.txt +0 -0
  222. {crawlo-1.2.3.dist-info → crawlo-1.2.5.dist-info}/top_level.txt +0 -0

examples/download_delay_middleware_example.py
@@ -1,160 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- DownloadDelayMiddleware usage example
- Shows how to configure and use the download delay middleware
- """
-
- import asyncio
- from crawlo.settings.setting_manager import SettingManager
- from crawlo.middleware.download_delay import DownloadDelayMiddleware
-
-
- def example_with_fixed_delay():
-     """Fixed-delay example"""
-     print("=== Fixed-delay example ===")
-
-     # Create the settings manager
-     settings = SettingManager()
-
-     # Configure a fixed delay
-     settings.set('DOWNLOAD_DELAY', 2.0)  # fixed 2-second delay
-     settings.set('RANDOMNESS', False)  # disable random delay
-     settings.set('LOG_LEVEL', 'INFO')  # set the log level
-
-     # Create a mock crawler object
-     class MockCrawler:
-         def __init__(self, settings):
-             self.settings = settings
-             self.stats = None
-
-     crawler = MockCrawler(settings)
-
-     # Create the middleware instance
-     middleware = DownloadDelayMiddleware.create_instance(crawler)
-
-     print(f"Delay setting: {middleware.delay}s")
-     print(f"Random delay enabled: {middleware.randomness}")
-     print("Middleware created successfully!")
-
-
- def example_with_random_delay():
-     """Random-delay example"""
-     print("\n=== Random-delay example ===")
-
-     # Create the settings manager
-     settings = SettingManager()
-
-     # Configure a random delay
-     settings.set('DOWNLOAD_DELAY', 1.0)  # 1-second base delay
-     settings.set('RANDOMNESS', True)  # enable random delay
-     settings.set('RANDOM_RANGE', [0.5, 2.0])  # random range factors
-     settings.set('LOG_LEVEL', 'INFO')  # set the log level
-
-     # Create a mock crawler object
-     class MockCrawler:
-         def __init__(self, settings):
-             self.settings = settings
-             self.stats = None
-
-     crawler = MockCrawler(settings)
-
-     # Create the middleware instance
-     middleware = DownloadDelayMiddleware.create_instance(crawler)
-
-     print(f"Base delay setting: {middleware.delay}s")
-     print(f"Random delay enabled: {middleware.randomness}")
-     print(f"Random range: {middleware.floor} - {middleware.upper}")
-     print(f"Effective delay range: {middleware.delay * middleware.floor} - {middleware.delay * middleware.upper}s")
-     print("Middleware created successfully!")
-
-
- def example_with_invalid_config():
-     """Invalid-configuration example"""
-     print("\n=== Invalid-configuration example ===")
-
-     # Create the settings manager
-     settings = SettingManager()
-
-     # Configure an invalid delay (zero value)
-     settings.set('DOWNLOAD_DELAY', 0)  # invalid delay
-     settings.set('LOG_LEVEL', 'INFO')  # set the log level
-
-     # Create a mock crawler object
-     class MockCrawler:
-         def __init__(self, settings):
-             self.settings = settings
-             self.stats = None
-
-     crawler = MockCrawler(settings)
-
-     try:
-         # Try to create the middleware instance
-         middleware = DownloadDelayMiddleware.create_instance(crawler)
-         print("Middleware created successfully!")
-     except Exception as e:
-         print(f"Middleware creation failed: {e}")
-
-
- def example_with_stats():
-     """Example with stats collection"""
-     print("\n=== Example with stats collection ===")
-
-     # Create the settings manager
-     settings = SettingManager()
-
-     # Configure a fixed delay
-     settings.set('DOWNLOAD_DELAY', 1.0)  # fixed 1-second delay
-     settings.set('RANDOMNESS', False)  # disable random delay
-     settings.set('LOG_LEVEL', 'INFO')  # set the log level
-
-     # Create a mock stats collector object
-     class MockStats:
-         def __init__(self):
-             self.stats = {}
-
-         def inc_value(self, key, value=1):
-             if key in self.stats:
-                 self.stats[key] += value
-             else:
-                 self.stats[key] = value
-
-         def __str__(self):
-             return str(self.stats)
-
-     # Create a mock crawler object
-     class MockCrawler:
-         def __init__(self, settings):
-             self.settings = settings
-             self.stats = MockStats()
-
-     crawler = MockCrawler(settings)
-
-     # Create the middleware instance
-     middleware = DownloadDelayMiddleware.create_instance(crawler)
-
-     print(f"Delay setting: {middleware.delay}s")
-     print("Middleware created successfully!")
-
-     # Simulate handling a request
-     class MockRequest:
-         pass
-
-     class MockSpider:
-         pass
-
-     request = MockRequest()
-     spider = MockSpider()
-
-     # Run the request processing
-     asyncio.run(middleware.process_request(request, spider))
-
-     print(f"Stats: {crawler.stats}")
-
-
- if __name__ == '__main__':
-     # Run all examples
-     example_with_fixed_delay()
-     example_with_random_delay()
-     example_with_invalid_config()
-     example_with_stats()
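
The removed example above drives DownloadDelayMiddleware through mock objects. For quick reference, a minimal sketch of the same delay-related keys as they would appear in a project settings module; the module-level constant style is assumed from the settings templates listed above, and this block is illustrative, not taken from the package:

# Delay configuration sketch (assumed settings-module style, not from the wheel)
DOWNLOAD_DELAY = 2.0        # base delay in seconds between requests
RANDOMNESS = True           # enable randomized delays
RANDOM_RANGE = [0.5, 2.0]   # per the removed example, the effective delay is
                            # DOWNLOAD_DELAY * factor, with factor drawn from this range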

examples/httpx_settings.py
@@ -1,42 +0,0 @@
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- httpx downloader configuration example
- """
-
- # Base configuration
- SETTINGS = {
-     # Downloader configuration
-     'DOWNLOADER': 'crawlo.downloader.httpx_downloader.HttpXDownloader',
-     'DOWNLOADER_TYPE': 'httpx',
-
-     # httpx-specific configuration
-     'HTTPX_HTTP2': True,
-     'HTTPX_FOLLOW_REDIRECTS': True,
-
-     # Proxy configuration
-     'PROXY_ENABLED': True,
-     'PROXY_API_URL': 'http://test.proxy.api:8080/proxy/getitem/',
-     'PROXY_EXTRACTOR': 'proxy',
-     'PROXY_REFRESH_INTERVAL': 60,
-     'PROXY_POOL_SIZE': 5,
-
-     # Common download configuration
-     'DOWNLOAD_TIMEOUT': 30,
-     'CONNECTION_POOL_LIMIT': 100,
-     'CONNECTION_POOL_LIMIT_PER_HOST': 20,
-     'DOWNLOAD_MAXSIZE': 10 * 1024 * 1024,  # 10MB
-     'VERIFY_SSL': True,
-
-     # Logging configuration
-     'LOG_LEVEL': 'INFO',
- }
-
- def get_settings():
-     """Return the settings"""
-     return SETTINGS
-
- if __name__ == "__main__":
-     print("httpx downloader configuration:")
-     for key, value in SETTINGS.items():
-         print(f"  {key}: {value}")

examples/multi_downloader_proxy_example.py
@@ -1,81 +0,0 @@
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- Multi-downloader proxy configuration example for the Crawlo framework
- Shows how to configure different downloaders in Crawlo and use the proxy feature
- """
-
- # Downloader-specific configurations
- DOWNLOADER_CONFIGS = {
-     "aiohttp": {
-         'DOWNLOADER': 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader',
-         'DOWNLOADER_TYPE': 'aiohttp',
-         # aiohttp-specific configuration
-         'AIOHTTP_AUTO_DECOMPRESS': True,
-         'AIOHTTP_FORCE_CLOSE': False,
-     },
-
-     "httpx": {
-         'DOWNLOADER': 'crawlo.downloader.httpx_downloader.HttpXDownloader',
-         'DOWNLOADER_TYPE': 'httpx',
-         # httpx-specific configuration
-         'HTTPX_HTTP2': True,
-         'HTTPX_FOLLOW_REDIRECTS': True,
-     },
-
-     "curl_cffi": {
-         'DOWNLOADER': 'crawlo.downloader.cffi_downloader.CurlCffiDownloader',
-         'DOWNLOADER_TYPE': 'curl_cffi',
-         # curl-cffi-specific configuration
-         'CURL_BROWSER_TYPE': 'chrome',
-     }
- }
-
- # Common settings (apply to all downloaders)
- COMMON_SETTINGS = {
-     # Proxy configuration
-     'PROXY_ENABLED': True,
-     'PROXY_API_URL': 'http://test.proxy.api:8080/proxy/getitem/',
-     'PROXY_EXTRACTOR': 'proxy',
-     'PROXY_REFRESH_INTERVAL': 60,
-     'PROXY_POOL_SIZE': 5,
-
-     # Common downloader configuration
-     'DOWNLOAD_TIMEOUT': 30,
-     'CONNECTION_POOL_LIMIT': 100,
-     'CONNECTION_POOL_LIMIT_PER_HOST': 20,
-     'DOWNLOAD_MAXSIZE': 10 * 1024 * 1024,  # 10MB
-     'VERIFY_SSL': True,
-
-     # Logging configuration
-     'LOG_LEVEL': 'INFO',
- }
-
- def get_downloader_settings(downloader_type):
-     """
-     Return the full configuration for the given downloader
-     """
-     if downloader_type not in DOWNLOADER_CONFIGS:
-         raise ValueError(f"Unsupported downloader type: {downloader_type}")
-
-     # Merge the common settings with the downloader-specific settings
-     settings = COMMON_SETTINGS.copy()
-     settings.update(DOWNLOADER_CONFIGS[downloader_type])
-     return settings
-
- # Usage example
- if __name__ == "__main__":
-     print("Crawlo multi-downloader proxy configuration example")
-     print("=" * 50)
-
-     for downloader_type in DOWNLOADER_CONFIGS.keys():
-         print(f"\n{downloader_type.upper()} downloader configuration:")
-         settings = get_downloader_settings(downloader_type)
-         for key, value in settings.items():
-             print(f"  {key}: {value}")
-
-     print("\n" + "=" * 50)
-     print("All downloaders work with the proxy middleware:")
-     print("✓ aiohttp: proxy auth info passed via meta")
-     print("✓ httpx: uses the proxy URL directly")
-     print("✓ curl-cffi: supports str and dict proxy formats")

examples/offsite_middleware_example.py
@@ -1,55 +0,0 @@
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- OffsiteMiddleware usage example
- Shows how to configure and use OffsiteMiddleware to restrict crawling to the specified domains
- """
-
- # Base configuration
- SETTINGS = {
-     # Allowed domain list (used by OffsiteMiddleware)
-     'ALLOWED_DOMAINS': [
-         'example.com',
-         'www.example.com',
-         'subdomain.example.com'
-     ],
-
-     # Middleware configuration (OffsiteMiddleware is enabled by default)
-     'MIDDLEWARES': [
-         # === Request pre-processing stage ===
-         'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',    # 1. Ignore invalid requests
-         'crawlo.middleware.download_delay.DownloadDelayMiddleware',    # 2. Throttle request frequency
-         'crawlo.middleware.default_header.DefaultHeaderMiddleware',    # 3. Add default request headers
-         'crawlo.middleware.proxy.ProxyMiddleware',                     # 4. Set the proxy
-         'crawlo.middleware.offsite.OffsiteMiddleware',                 # 5. Filter offsite requests
-
-         # === Response processing stage ===
-         'crawlo.middleware.retry.RetryMiddleware',                     # 6. Retry failed requests
-         'crawlo.middleware.response_code.ResponseCodeMiddleware',      # 7. Handle special status codes
-         'crawlo.middleware.response_filter.ResponseFilterMiddleware',  # 8. Filter response content
-     ],
-
-     # Other common settings
-     'DOWNLOAD_DELAY': 1,
-     'CONCURRENCY': 8,
-     'LOG_LEVEL': 'INFO',
- }
-
- def get_settings():
-     """Return the settings"""
-     return SETTINGS
-
- if __name__ == "__main__":
-     print("OffsiteMiddleware configuration example:")
-     print("=" * 40)
-     print(f"Allowed domains: {SETTINGS['ALLOWED_DOMAINS']}")
-     print("\nMiddleware list:")
-     for i, middleware in enumerate(SETTINGS['MIDDLEWARES'], 1):
-         print(f"  {i}. {middleware}")
-
-     print("\n" + "=" * 40)
-     print("What OffsiteMiddleware does:")
-     print("✓ Automatically filters requests whose domain is not in ALLOWED_DOMAINS")
-     print("✓ Supports subdomain matching")
-     print("✓ Records statistics about filtered requests")
-     print("✓ Can be disabled by setting ALLOWED_DOMAINS=[]")

examples/offsite_spider_example.py
@@ -1,107 +0,0 @@
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- Spider example using OffsiteMiddleware
- Shows how to use OffsiteMiddleware in a real spider to limit the crawl scope
- """
-
- from crawlo.spider import Spider
- from crawlo.network.request import Request
-
-
- class ExampleSpider(Spider):
-     """
-     Example spider demonstrating the use of OffsiteMiddleware
-     """
-
-     # Spider name
-     name = "example_offsite_spider"
-
-     # Custom settings
-     custom_settings = {
-         # Allowed domain list
-         'ALLOWED_DOMAINS': [
-             'httpbin.org',
-             'example.com',
-             'www.example.com'
-         ],
-
-         # Download delay (seconds)
-         'DOWNLOAD_DELAY': 1,
-
-         # Concurrency
-         'CONCURRENCY': 4,
-
-         # Log level
-         'LOG_LEVEL': 'INFO',
-     }
-
-     def start_requests(self):
-         """
-         Generate the initial requests
-         """
-         # These URLs are allowed
-         allowed_urls = [
-             'https://httpbin.org/ip',
-             'https://httpbin.org/user-agent',
-             'https://example.com/page1',
-             'https://www.example.com/page2'
-         ]
-
-         # These URLs will be filtered out (offsite requests)
-         offsite_urls = [
-             'https://google.com',
-             'https://github.com',
-             'https://stackoverflow.com'
-         ]
-
-         # Yield the allowed requests
-         for url in allowed_urls:
-             yield Request(url=url, callback=self.parse_allowed)
-
-         # Yield the offsite requests (filtered by OffsiteMiddleware)
-         for url in offsite_urls:
-             yield Request(url=url, callback=self.parse_offsite)
-
-     async def parse_allowed(self, response):
-         """
-         Handle responses for allowed requests
-         """
-         self.logger.info(f"Successfully handled an allowed request: {response.url}")
-         self.logger.info(f"Status code: {response.status_code}")
-         # Parsing logic would go here
-
-     async def parse_offsite(self, response):
-         """
-         This method is never actually called because offsite requests are filtered out
-         """
-         self.logger.info(f"This message should never appear: {response.url}")
-
-
- # Example code for running the spider
- if __name__ == "__main__":
-     """
-     How to run:
-
-     1. Make sure crawlo is installed from the project root:
-        pip install -e .
-
-     2. Run the spider:
-        crawlo run example_offsite_spider
-
-     3. Watch the log output:
-        - requests to allowed domains are handled normally
-        - offsite requests are filtered by OffsiteMiddleware and the filtering is logged
-        - the stats record how many requests were filtered
-
-     Advantages of OffsiteMiddleware:
-     ✓ Prevents the spider from accidentally crawling unrelated sites
-     ✓ Saves bandwidth and server resources
-     ✓ Improves crawl efficiency by focusing on the target sites
-     ✓ Configurable domain whitelist for flexible control of the crawl scope
-     """
-     print("OffsiteSpider example")
-     print("=" * 30)
-     print("This spider demonstrates how to use OffsiteMiddleware")
-     print("Run it with:")
-     print("  crawlo run example_offsite_spider")

examples/proxy_spider_example.py
@@ -1,166 +0,0 @@
- #!/usr/bin/python
- # -*- coding: UTF-8 -*-
- """
- Proxy spider example
- ==============
- Shows how to crawl websites through a proxy API with the Crawlo framework
- """
-
- import asyncio
- import sys
- import os
-
- # Add the project root directory to the Python path
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-
- from crawlo import Spider, Request
- from crawlo.network.response import Response
-
-
- class ProxyExampleSpider(Spider):
-     """Example spider that uses a proxy"""
-     name = 'proxy_example_spider'
-
-     def __init__(self):
-         super().__init__()
-         # URLs to crawl
-         self.urls = [
-             'https://httpbin.org/ip',  # check the current IP
-             'https://httpbin.org/headers',  # check the request headers
-             'https://stock.10jqka.com.cn/20240315/c655957791.shtml',  # test target link
-         ]
-
-     def start_requests(self):
-         """Generate the initial requests"""
-         for i, url in enumerate(self.urls):
-             # Attach some metadata to each request
-             request = Request(
-                 url=url,
-                 callback=self.parse,
-                 meta={'request_id': i}
-             )
-             yield request
-
-     def parse(self, response: Response):
-         """Parse the response"""
-         request_id = response.request.meta.get('request_id', 'unknown')
-
-         print(f"\n{'='*50}")
-         print(f"Request #{request_id}: {response.url}")
-         print(f"Status code: {response.status_code}")
-         print(f"{'='*50}")
-
-         # Special handling for httpbin.org responses
-         if 'httpbin.org/ip' in response.url:
-             print("Current IP info:")
-             print(response.text[:500])
-
-         elif 'httpbin.org/headers' in response.url:
-             print("Request header info:")
-             print(response.text[:500])
-
-         else:
-             # Handle the target site
-             print("Page title:")
-             title = response.css('title::text').get()
-             if title:
-                 print(f"  {title}")
-             else:
-                 print("  No title found")
-
-             print("\nPage content preview:")
-             # Strip HTML tags and show only the text content
-             text_content = response.css('*::text').getall()
-             if text_content:
-                 # Join the first few text fragments
-                 content = ''.join(text_content[:10])
-                 print(f"  {content[:200]}{'...' if len(content) > 200 else ''}")
-             else:
-                 print("  No text content")
-
-         # Return the result
-         return {
-             'request_id': request_id,
-             'url': response.url,
-             'status_code': response.status_code,
-             'title': response.css('title::text').get(),
-         }
-
-
- # Configuration reference
- SETTINGS = {
-     # Base configuration
-     'LOG_LEVEL': 'INFO',
-     'CONCURRENCY': 2,
-
-     # Proxy configuration
-     'PROXY_ENABLED': True,
-     'PROXY_API_URL': 'http://test.proxy.api:8080/proxy/getitem/',
-     'PROXY_EXTRACTOR': 'proxy',
-     'PROXY_REFRESH_INTERVAL': 60,  # refresh once per minute
-     'PROXY_API_TIMEOUT': 10,
-     'PROXY_POOL_SIZE': 5,
-     'PROXY_HEALTH_CHECK_THRESHOLD': 0.5,
-
-     # Download delay
-     'DOWNLOAD_DELAY': 1,
-     'RANDOMNESS': True,
-
-     # Middlewares
-     'MIDDLEWARES': [
-         'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
-         'crawlo.middleware.download_delay.DownloadDelayMiddleware',
-         'crawlo.middleware.default_header.DefaultHeaderMiddleware',
-         'crawlo.middleware.proxy.ProxyMiddleware',
-         'crawlo.middleware.retry.RetryMiddleware',
-         'crawlo.middleware.response_code.ResponseCodeMiddleware',
-         'crawlo.middleware.response_filter.ResponseFilterMiddleware',
-     ],
-
-     # Pipelines
-     'PIPELINES': [
-         'crawlo.pipelines.console_pipeline.ConsolePipeline',
-     ],
-
-     # Default request headers
-     'DEFAULT_REQUEST_HEADERS': {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
-         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-         'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
-         'Accept-Encoding': 'gzip, deflate, br',
-         'Connection': 'keep-alive',
-         'Upgrade-Insecure-Requests': '1',
-     }
- }
-
-
- def main():
-     """Main entry point"""
-     print("Proxy spider example")
-     print("=" * 50)
-     print("This example shows how to use a proxy API with the Crawlo framework")
-     print("Proxy API: http://test.proxy.api:8080/proxy/getitem/")
-     print("Target site: https://stock.10jqka.com.cn/20240315/c655957791.shtml")
-     print("=" * 50)
-
-     print("\nUsage:")
-     print("1. Make sure the proxy parameters are configured in settings.py")
-     print("2. Run the spider: crawlo run proxy_example_spider")
-     print("3. The spider automatically fetches proxies from the proxy API and applies them to requests")
-
-     print("\nConfiguration example:")
-     for key, value in SETTINGS.items():
-         if key in ['MIDDLEWARES', 'PIPELINES', 'DEFAULT_REQUEST_HEADERS']:
-             print(f"{key}:")
-             if isinstance(value, list):
-                 for item in value:
-                     print(f"  - {item}")
-             elif isinstance(value, dict):
-                 for k, v in value.items():
-                     print(f"  {k}: {v}")
-         else:
-             print(f"{key}: {value}")
-
-
- if __name__ == '__main__':
-     main()

examples/request_ignore_middleware_example.py
@@ -1,51 +0,0 @@
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- RequestIgnoreMiddleware usage example
- Shows how to use RequestIgnoreMiddleware to handle and record ignored requests
- """
-
- # RequestIgnoreMiddleware is enabled by default and needs no special configuration
- # It automatically handles IgnoreRequestError exceptions and records the related statistics
-
- # Middleware configuration (RequestIgnoreMiddleware is enabled by default)
- SETTINGS = {
-     'MIDDLEWARES': [
-         # === Request pre-processing stage ===
-         'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',    # 1. Ignore invalid requests
-         'crawlo.middleware.download_delay.DownloadDelayMiddleware',    # 2. Throttle request frequency
-         'crawlo.middleware.default_header.DefaultHeaderMiddleware',    # 3. Add default request headers
-         'crawlo.middleware.proxy.ProxyMiddleware',                     # 4. Set the proxy
-         'crawlo.middleware.offsite.OffsiteMiddleware',                 # 5. Filter offsite requests
-
-         # === Response processing stage ===
-         'crawlo.middleware.retry.RetryMiddleware',                     # 6. Retry failed requests
-         'crawlo.middleware.response_code.ResponseCodeMiddleware',      # 7. Handle special status codes
-         'crawlo.middleware.response_filter.ResponseFilterMiddleware',  # 8. Filter response content
-     ],
-
-     # Other common settings
-     'DOWNLOAD_DELAY': 1,
-     'CONCURRENCY': 8,
-     'LOG_LEVEL': 'INFO',
- }
-
- def get_settings():
-     """Return the settings"""
-     return SETTINGS
-
- if __name__ == "__main__":
-     print("RequestIgnoreMiddleware configuration example:")
-     print("=" * 40)
-     print("Middleware list:")
-     for i, middleware in enumerate(SETTINGS['MIDDLEWARES'], 1):
-         print(f"  {i}. {middleware}")
-
-     print("\n" + "=" * 40)
-     print("What RequestIgnoreMiddleware does:")
-     print("✓ Automatically handles IgnoreRequestError exceptions")
-     print("✓ Records detailed statistics for ignored requests")
-     print("✓ Breaks down statistics by ignore reason")
-     print("✓ Breaks down statistics by domain")
-     print("✓ Provides detailed log messages")
-     print("✓ Enabled by default, no special configuration needed")