crawlo 1.4.5__py3-none-any.whl → 1.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (375)
  1. crawlo/__init__.py +90 -89
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -341
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -438
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -291
  19. crawlo/crawler.py +698 -657
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -276
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -245
  25. crawlo/downloader/httpx_downloader.py +265 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -402
  28. crawlo/downloader/selenium_downloader.py +486 -472
  29. crawlo/event.py +45 -11
  30. crawlo/exceptions.py +215 -82
  31. crawlo/extension/__init__.py +65 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +135 -0
  44. crawlo/filters/__init__.py +170 -153
  45. crawlo/filters/aioredis_filter.py +348 -264
  46. crawlo/filters/memory_filter.py +261 -276
  47. crawlo/framework.py +306 -292
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -434
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -194
  52. crawlo/initialization/phases.py +230 -149
  53. crawlo/initialization/registry.py +143 -145
  54. crawlo/initialization/utils.py +49 -0
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -46
  61. crawlo/logging/config.py +277 -197
  62. crawlo/logging/factory.py +175 -171
  63. crawlo/logging/manager.py +104 -112
  64. crawlo/middleware/__init__.py +87 -24
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -386
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -253
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +375 -379
  77. crawlo/network/response.py +569 -664
  78. crawlo/pipelines/__init__.py +53 -22
  79. crawlo/pipelines/base_pipeline.py +452 -0
  80. crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +197 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -132
  87. crawlo/pipelines/mysql_pipeline.py +470 -326
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -156
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +10 -0
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -525
  94. crawlo/queue/redis_priority_queue.py +519 -370
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +285 -270
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +657 -657
  99. crawlo/stats_collector.py +82 -73
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +2 -4
  104. crawlo/templates/project/items.py.tmpl +13 -17
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -36
  107. crawlo/templates/project/settings.py.tmpl +110 -157
  108. crawlo/templates/project/settings_distributed.py.tmpl +156 -161
  109. crawlo/templates/project/settings_gentle.py.tmpl +170 -171
  110. crawlo/templates/project/settings_high_performance.py.tmpl +171 -172
  111. crawlo/templates/project/settings_minimal.py.tmpl +99 -77
  112. crawlo/templates/project/settings_simple.py.tmpl +168 -169
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -30
  115. crawlo/templates/spider/spider.py.tmpl +33 -144
  116. crawlo/templates/spiders_init.py.tmpl +5 -10
  117. crawlo/tools/__init__.py +86 -189
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +50 -50
  123. crawlo/utils/batch_processor.py +276 -259
  124. crawlo/utils/config_manager.py +442 -0
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -244
  127. crawlo/utils/error_handler.py +410 -410
  128. crawlo/utils/fingerprint.py +121 -121
  129. crawlo/utils/func_tools.py +82 -82
  130. crawlo/utils/large_scale_helper.py +344 -344
  131. crawlo/utils/leak_detector.py +335 -0
  132. crawlo/utils/log.py +79 -79
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -0
  135. crawlo/utils/mysql_connection_pool.py +197 -0
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +91 -0
  139. crawlo/utils/redis_connection_pool.py +578 -388
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -256
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -0
  144. crawlo/utils/selector_helper.py +137 -137
  145. crawlo/utils/singleton.py +70 -0
  146. crawlo/utils/spider_loader.py +201 -201
  147. crawlo/utils/text_helper.py +94 -94
  148. crawlo/utils/{url.py → url_utils.py} +39 -39
  149. crawlo-1.4.7.dist-info/METADATA +689 -0
  150. crawlo-1.4.7.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -275
  154. tests/authenticated_proxy_example.py +110 -106
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -0
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +77 -0
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -0
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/scrapy.cfg +11 -11
  192. tests/optimized_performance_test.py +211 -211
  193. tests/performance_comparison.py +244 -244
  194. tests/queue_blocking_test.py +113 -113
  195. tests/queue_test.py +89 -89
  196. tests/redis_key_validation_demo.py +130 -130
  197. tests/request_params_example.py +150 -150
  198. tests/response_improvements_example.py +144 -144
  199. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  200. tests/scrapy_comparison/scrapy_test.py +133 -133
  201. tests/simple_cli_test.py +55 -0
  202. tests/simple_command_test.py +119 -119
  203. tests/simple_crawlo_test.py +126 -126
  204. tests/simple_follow_test.py +38 -38
  205. tests/simple_log_test2.py +137 -137
  206. tests/simple_optimization_test.py +128 -128
  207. tests/simple_queue_type_test.py +41 -41
  208. tests/simple_response_selector_test.py +94 -94
  209. tests/simple_selector_helper_test.py +154 -154
  210. tests/simple_selector_test.py +207 -207
  211. tests/simple_spider_test.py +49 -49
  212. tests/simple_url_test.py +73 -73
  213. tests/simulate_mysql_update_test.py +140 -0
  214. tests/spider_log_timing_test.py +177 -177
  215. tests/test_advanced_tools.py +148 -148
  216. tests/test_all_commands.py +230 -230
  217. tests/test_all_pipeline_fingerprints.py +133 -133
  218. tests/test_all_redis_key_configs.py +145 -145
  219. tests/test_asyncmy_usage.py +57 -0
  220. tests/test_batch_processor.py +178 -178
  221. tests/test_cleaners.py +54 -54
  222. tests/test_cli_arguments.py +119 -0
  223. tests/test_component_factory.py +174 -174
  224. tests/test_config_consistency.py +80 -80
  225. tests/test_config_merge.py +152 -152
  226. tests/test_config_validator.py +182 -182
  227. tests/test_controlled_spider_mixin.py +79 -79
  228. tests/test_crawler_process_import.py +38 -38
  229. tests/test_crawler_process_spider_modules.py +47 -47
  230. tests/test_crawlo_proxy_integration.py +114 -108
  231. tests/test_date_tools.py +123 -123
  232. tests/test_dedup_fix.py +220 -220
  233. tests/test_dedup_pipeline_consistency.py +124 -124
  234. tests/test_default_header_middleware.py +313 -313
  235. tests/test_distributed.py +65 -65
  236. tests/test_double_crawlo_fix.py +204 -204
  237. tests/test_double_crawlo_fix_simple.py +124 -124
  238. tests/test_download_delay_middleware.py +221 -221
  239. tests/test_downloader_proxy_compatibility.py +272 -268
  240. tests/test_edge_cases.py +305 -305
  241. tests/test_encoding_core.py +56 -56
  242. tests/test_encoding_detection.py +126 -126
  243. tests/test_enhanced_error_handler.py +270 -270
  244. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  245. tests/test_error_handler_compatibility.py +112 -112
  246. tests/test_factories.py +252 -252
  247. tests/test_factory_compatibility.py +196 -196
  248. tests/test_final_validation.py +153 -153
  249. tests/test_fingerprint_consistency.py +135 -135
  250. tests/test_fingerprint_simple.py +51 -51
  251. tests/test_get_component_logger.py +83 -83
  252. tests/test_hash_performance.py +99 -99
  253. tests/test_integration.py +169 -169
  254. tests/test_item_dedup_redis_key.py +122 -122
  255. tests/test_large_scale_helper.py +235 -235
  256. tests/test_logging_enhancements.py +374 -374
  257. tests/test_logging_final.py +184 -184
  258. tests/test_logging_integration.py +312 -312
  259. tests/test_logging_system.py +282 -282
  260. tests/test_middleware_debug.py +141 -141
  261. tests/test_mode_consistency.py +51 -51
  262. tests/test_multi_directory.py +67 -67
  263. tests/test_multiple_spider_modules.py +80 -80
  264. tests/test_mysql_pipeline_config.py +165 -0
  265. tests/test_mysql_pipeline_error.py +99 -0
  266. tests/test_mysql_pipeline_init_log.py +83 -0
  267. tests/test_mysql_pipeline_integration.py +133 -0
  268. tests/test_mysql_pipeline_refactor.py +144 -0
  269. tests/test_mysql_pipeline_refactor_simple.py +86 -0
  270. tests/test_mysql_pipeline_robustness.py +196 -0
  271. tests/test_mysql_pipeline_types.py +89 -0
  272. tests/test_mysql_update_columns.py +94 -0
  273. tests/test_offsite_middleware.py +244 -244
  274. tests/test_offsite_middleware_simple.py +203 -203
  275. tests/test_optimized_selector_naming.py +100 -100
  276. tests/test_parsel.py +29 -29
  277. tests/test_performance.py +327 -327
  278. tests/test_performance_monitor.py +115 -115
  279. tests/test_pipeline_fingerprint_consistency.py +86 -86
  280. tests/test_priority_behavior.py +211 -211
  281. tests/test_priority_consistency.py +151 -151
  282. tests/test_priority_consistency_fixed.py +249 -249
  283. tests/test_proxy_health_check.py +32 -32
  284. tests/test_proxy_middleware.py +217 -121
  285. tests/test_proxy_middleware_enhanced.py +212 -216
  286. tests/test_proxy_middleware_integration.py +142 -137
  287. tests/test_proxy_middleware_refactored.py +207 -184
  288. tests/test_proxy_only.py +84 -0
  289. tests/test_proxy_providers.py +56 -56
  290. tests/test_proxy_stats.py +19 -19
  291. tests/test_proxy_strategies.py +59 -59
  292. tests/test_proxy_with_downloader.py +153 -0
  293. tests/test_queue_empty_check.py +41 -41
  294. tests/test_queue_manager_double_crawlo.py +173 -173
  295. tests/test_queue_manager_redis_key.py +179 -179
  296. tests/test_queue_naming.py +154 -154
  297. tests/test_queue_type.py +106 -106
  298. tests/test_queue_type_redis_config_consistency.py +130 -130
  299. tests/test_random_headers_default.py +322 -322
  300. tests/test_random_headers_necessity.py +308 -308
  301. tests/test_random_user_agent.py +72 -72
  302. tests/test_redis_config.py +28 -28
  303. tests/test_redis_connection_pool.py +294 -294
  304. tests/test_redis_key_naming.py +181 -181
  305. tests/test_redis_key_validator.py +123 -123
  306. tests/test_redis_queue.py +224 -224
  307. tests/test_redis_queue_name_fix.py +175 -175
  308. tests/test_redis_queue_type_fallback.py +129 -129
  309. tests/test_request_ignore_middleware.py +182 -182
  310. tests/test_request_params.py +111 -111
  311. tests/test_request_serialization.py +70 -70
  312. tests/test_response_code_middleware.py +349 -349
  313. tests/test_response_filter_middleware.py +427 -427
  314. tests/test_response_follow.py +104 -104
  315. tests/test_response_improvements.py +152 -152
  316. tests/test_response_selector_methods.py +92 -92
  317. tests/test_response_url_methods.py +70 -70
  318. tests/test_response_urljoin.py +86 -86
  319. tests/test_retry_middleware.py +333 -333
  320. tests/test_retry_middleware_realistic.py +273 -273
  321. tests/test_scheduler.py +252 -252
  322. tests/test_scheduler_config_update.py +133 -133
  323. tests/test_scrapy_style_encoding.py +112 -112
  324. tests/test_selector_helper.py +100 -100
  325. tests/test_selector_optimizations.py +146 -146
  326. tests/test_simple_response.py +61 -61
  327. tests/test_spider_loader.py +49 -49
  328. tests/test_spider_loader_comprehensive.py +69 -69
  329. tests/test_spider_modules.py +84 -84
  330. tests/test_spiders/test_spider.py +9 -9
  331. tests/test_telecom_spider_redis_key.py +205 -205
  332. tests/test_template_content.py +87 -87
  333. tests/test_template_redis_key.py +134 -134
  334. tests/test_tools.py +159 -159
  335. tests/test_user_agent_randomness.py +176 -176
  336. tests/test_user_agents.py +96 -96
  337. tests/untested_features_report.md +138 -138
  338. tests/verify_debug.py +51 -51
  339. tests/verify_distributed.py +117 -117
  340. tests/verify_log_fix.py +111 -111
  341. tests/verify_mysql_warnings.py +110 -0
  342. crawlo/logging/async_handler.py +0 -181
  343. crawlo/logging/monitor.py +0 -153
  344. crawlo/logging/sampler.py +0 -167
  345. crawlo/middleware/simple_proxy.py +0 -65
  346. crawlo/tools/authenticated_proxy.py +0 -241
  347. crawlo/tools/data_formatter.py +0 -226
  348. crawlo/tools/data_validator.py +0 -181
  349. crawlo/tools/encoding_converter.py +0 -127
  350. crawlo/tools/network_diagnostic.py +0 -365
  351. crawlo/tools/request_tools.py +0 -83
  352. crawlo/tools/retry_mechanism.py +0 -224
  353. crawlo/utils/env_config.py +0 -143
  354. crawlo/utils/large_scale_config.py +0 -287
  355. crawlo/utils/system.py +0 -11
  356. crawlo/utils/tools.py +0 -5
  357. crawlo-1.4.5.dist-info/METADATA +0 -329
  358. crawlo-1.4.5.dist-info/RECORD +0 -347
  359. tests/env_config_example.py +0 -134
  360. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  361. tests/test_authenticated_proxy.py +0 -142
  362. tests/test_comprehensive.py +0 -147
  363. tests/test_dynamic_downloaders_proxy.py +0 -125
  364. tests/test_dynamic_proxy.py +0 -93
  365. tests/test_dynamic_proxy_config.py +0 -147
  366. tests/test_dynamic_proxy_real.py +0 -110
  367. tests/test_env_config.py +0 -122
  368. tests/test_framework_env_usage.py +0 -104
  369. tests/test_large_scale_config.py +0 -113
  370. tests/test_proxy_api.py +0 -265
  371. tests/test_real_scenario_proxy.py +0 -196
  372. tests/tools_example.py +0 -261
  373. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
  374. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
  375. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,689 @@
+ Metadata-Version: 2.4
+ Name: crawlo
+ Version: 1.4.7
+ Summary: Crawlo: A high-performance asynchronous Python web crawling framework with distributed support.
+ Home-page: https://github.com/crawl-coder/Crawlo.git
+ Author: crawl-coder
+ Author-email: crawlo@qq.com
+ License: MIT
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ Requires-Dist: aiohttp>=3.12.14
+ Requires-Dist: aiomysql>=0.2.0
+ Requires-Dist: aioredis>=2.0.1
+ Requires-Dist: asyncmy>=0.2.10
+ Requires-Dist: cssselect>=1.2.0
+ Requires-Dist: dateparser>=1.2.2
+ Requires-Dist: httpx[http2]>=0.27.0
+ Requires-Dist: curl-cffi>=0.13.0
+ Requires-Dist: lxml>=5.2.1
+ Requires-Dist: motor>=3.7.0
+ Requires-Dist: parsel>=1.9.1
+ Requires-Dist: pydantic>=2.11.7
+ Requires-Dist: pymongo>=4.11
+ Requires-Dist: PyMySQL>=1.1.1
+ Requires-Dist: python-dateutil>=2.9.0.post0
+ Requires-Dist: redis>=6.2.0
+ Requires-Dist: requests>=2.32.4
+ Requires-Dist: six>=1.17.0
+ Requires-Dist: ujson>=5.9.0
+ Requires-Dist: urllib3>=2.5.0
+ Requires-Dist: w3lib>=2.1.2
+ Requires-Dist: rich>=14.1.0
+ Requires-Dist: astor>=0.8.1
+ Requires-Dist: watchdog>=6.0.0
+ Provides-Extra: render
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
+ Requires-Dist: playwright; extra == "render"
+ Requires-Dist: selenium>=3.141.0; extra == "render"
+ Provides-Extra: all
+ Requires-Dist: bitarray>=1.5.3; extra == "all"
+ Requires-Dist: PyExecJS>=1.5.1; extra == "all"
+ Requires-Dist: pymongo>=3.10.1; extra == "all"
+ Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
+ Requires-Dist: playwright; extra == "all"
+ Requires-Dist: selenium>=3.141.0; extra == "all"
+
+ # Crawlo
+
+ A modern, high-performance asynchronous Python crawling framework built on asyncio.
+
+ ## Core Features
+
+ - 🚀 **High-performance async architecture**: built on asyncio and aiohttp, using asynchronous I/O to maximize crawl throughput
+ - 🎯 **Intelligent scheduling**: priority queues, concurrency control, automatic retries, and adaptive rate limiting
+ - 🔄 **Flexible configuration modes**:
+   - **Standalone mode**: single-machine development and testing with an in-memory queue
+   - **Distributed mode**: multi-node deployment; Redis is strictly required (no fallback)
+   - **Auto mode**: detects Redis availability and picks the best configuration automatically (recommended)
+ - 📦 **Rich component ecosystem**:
+   - Built-in Redis and MongoDB support
+   - Asynchronous MySQL connection pool (based on asyncmy)
+   - Multiple filters and deduplication pipelines (Memory/Redis)
+   - Proxy middleware (static proxy lists and dynamic proxy APIs)
+   - Multiple downloaders (aiohttp, httpx, curl-cffi)
+ - 🛠 **Developer friendly**:
+   - Scrapy-like project structure and API design
+   - Configuration factory pattern (`CrawloConfig.auto()`)
+   - Automatic spider discovery
+   - Comprehensive logging system
+
+ ## Architecture
+
+ Crawlo uses a modular design built around these core components:
+
+ ![Crawlo architecture diagram](images/Crawlo%20框架架构图.png)
+
+ - **Engine**: the core engine that coordinates all components
+ - **Scheduler**: manages the request queue and deduplication
+ - **Downloader**: fetches pages through one of several HTTP clients
+ - **Spider**: the spider base class where extraction logic is defined
+ - **Pipeline**: processes and stores the extracted data
+ - **Middleware**: hooks into request and response processing
+
+ ![Crawlo data flow diagram](images/Crawlo%20数据流图.png)
+
+ ## Example Projects
+
+ See the complete example projects in the [`examples/`](examples/) directory:
+
+ - **ofweek_standalone** - Auto mode example (automatic detection)
+ - **ofweek_spider** - Auto mode example
+ - **ofweek_distributed** - Distributed mode example (strict distributed)
+
+ ## Installation
+
+ ```bash
+ # Basic installation
+ pip install crawlo
+ ```
+
+ ## Configuration Modes in Detail
+
+ > ⚠️ **Important**: the configuration mode directly affects how the crawler runs, how it performs, and how reliable it is. Please read this section carefully.
+
+ Crawlo provides three configuration modes to cover different scenarios:
+
+ ### Comparison of the Three Modes
+
+ | Setting | Standalone | Distributed | Auto |
+ |--------|-----------|-------------|------|
+ | **RUN_MODE** | `standalone` | `distributed` | `auto` |
+ | **Queue type** | Memory queue | Redis queue | Auto-detected |
+ | **Redis requirement** | Not needed | **Required** | Optional |
+ | **If Redis is unavailable** | N/A | 🚫 **Fails and exits** | ✅ Falls back to memory |
+ | **Automatic config updates** | ❌ No | ❌ No | ✅ Yes |
+ | **Filter** | Memory | Redis | Redis/Memory |
+ | **Dedup pipeline** | Memory | Redis | Redis/Memory |
+ | **Typical use** | Development and testing | Multi-node deployment | Production |
+ | **Default concurrency** | 8 | 16 | 12 |
+ | **Recommendation** | ⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
+
+ ### 1. Auto Mode (Recommended)
+
+ **Detects the environment and adapts automatically; recommended for production.**
+
+ ```python
+ from crawlo.config import CrawloConfig
+
+ config = CrawloConfig.auto(
+     project_name='myproject',
+     concurrency=12,
+     download_delay=1.0
+ )
+ locals().update(config.to_dict())
+ ```
+
+ **How it works**:
+ - The configuration phase does not depend on Redis
+ - Redis availability is only checked at runtime
+ - Redis available → uses `RedisPriorityQueue` + `AioRedisFilter`
+ - Redis unavailable → falls back to `MemoryQueue` + `MemoryFilter`
+ - Settings are updated automatically (`QUEUE_TYPE`, `FILTER_CLASS`, `DEFAULT_DEDUP_PIPELINE`)
+
+ **Advantages**:
+ - ✅ No Redis needed in development; start right away
+ - ✅ Automatic fallback keeps the system available if Redis fails in production
+ - ✅ The same code runs in different environments without configuration changes
+ - ✅ The best balance of flexibility and reliability
+
+ **Suitable for**:
+ - Production deployments (first choice)
+ - Projects that must run in several environments
+ - Systems that need fault tolerance
+
+ ### 2. Standalone Mode
+
+ **Single-machine mode for development, testing, and small to medium crawls.**
+
+ ```python
+ config = CrawloConfig.standalone(
+     project_name='myproject',
+     concurrency=8
+ )
+ locals().update(config.to_dict())
+ ```
+
+ **How it works**:
+ - Always uses `MemoryQueue` (in-memory queue)
+ - Always uses `MemoryFilter` (in-memory filter)
+ - Always uses `MemoryDedupPipeline` (in-memory deduplication)
+ - No Redis detection is performed
+ - Settings are never updated automatically
+
+ **Advantages**:
+ - ✅ No external dependencies at all
+ - ✅ Fast startup
+ - ✅ Ideal for quick development and debugging
+
+ **Limitations**:
+ - ❌ No distributed deployment
+ - ❌ Queue data is lost on restart
+ - ❌ Not suited to large-scale collection
+
+ **Suitable for**:
+ - Local development and debugging
+ - Learning the framework
+ - Small to medium crawls (< 100k items)
+ - Simple single-machine spiders
+
+ ### 3. Distributed Mode
+
+ **Distributed mode; Redis must be available. Designed for multi-node cooperation.**
+
+ ```python
+ config = CrawloConfig.distributed(
+     project_name='myproject',
+     redis_host='redis.example.com',
+     redis_port=6379,
+     redis_password='your_password',
+     concurrency=16
+ )
+ locals().update(config.to_dict())
+ ```
+
+ **How it works**:
+ - Always uses `RedisPriorityQueue`
+ - Always uses `AioRedisFilter`
+ - Always uses `RedisDedupPipeline`
+ - The Redis connection is checked at startup
+ - **If Redis is unavailable, a `RuntimeError` is raised and the process exits (no fallback)**
+
+ **Why is Redis strictly required?**
+
+ 1. **Data consistency**: prevents different nodes from using different queue types
+ 2. **Effective deduplication**: guarantees deduplication works across nodes
+ 3. **Task distribution**: prevents tasks from being executed more than once
+ 4. **Fail fast**: a startup failure is easier to spot and fix than a runtime failure
+ 5. **Explicit intent**: distributed mode should be distributed; it must not degrade silently
+
+ **Error message when Redis is unavailable**:
+
+ ```bash
+ $ crawlo run my_spider
+
+ 2025-10-25 22:00:00 - [queue_manager] - ERROR:
+ Distributed mode requires Redis, but the Redis server could not be reached.
+ Error: Connection refused
+ Redis URL: redis://127.0.0.1:6379/0
+ Please check:
+ 1. that the Redis service is running
+ 2. that the Redis connection settings are correct
+ 3. that the network connection is working
+
+ RuntimeError: Distributed mode requires Redis, but the Redis server could not be reached.
+ ```
+
+ **Advantages**:
+ - ✅ Multi-node cooperative crawling
+ - ✅ Persistent queues; crawls can resume after a restart
+ - ✅ Strict distributed-consistency guarantees
+ - ✅ Suitable for large-scale collection
+
+ **Suitable for**:
+ - Collection across multiple servers
+ - Large-scale crawls (> millions of items)
+ - Workloads that require strict distributed consistency
+ - Multi-node production deployments
+
+ ### Choosing a Mode
+
+ | Scenario | Recommended mode | Reason |
+ |------|---------|------|
+ | Production (single or multi-node) | **Auto** | Adapts automatically, fault tolerant |
+ | Development | **Standalone** or **Auto** | No Redis setup needed |
+ | Strict multi-node distributed deployment | **Distributed** | Guarantees distributed consistency |
+ | Learning and testing | **Standalone** | Simplest, no dependencies |
+ | Small to medium crawls | **Standalone** or **Auto** | Simple and efficient |
+ | Large-scale crawls | **Auto** or **Distributed** | Performance and reliability |
+
+ > 📖 **Full documentation**: see the [Configuration Modes Guide](docs/tutorials/configuration_modes.md) for more detail
+
+ ## Quick Start
+
+ ### 1. Create a Project
+
+ ```bash
+ # Create a new project
+ crawlo startproject myproject
+ cd myproject
+
+ # Create a spider
+ crawlo genspider example example.com
+ ```
+
+ ### 2. Configure the Project (Auto Mode Recommended)
+
+ ```python
+ # myproject/settings.py
+ from crawlo.config import CrawloConfig
+
+ # Auto mode: detect Redis and choose the best configuration automatically
+ config = CrawloConfig.auto(
+     project_name='myproject',
+     concurrency=12,       # Concurrency
+     download_delay=1.0    # Download delay (seconds)
+ )
+
+ # Apply the configuration to this module
+ locals().update(config.to_dict())
+
+ # Spider modules
+ SPIDER_MODULES = ['myproject.spiders']
+
+ # Logging
+ LOG_LEVEL = 'INFO'
+ LOG_FILE = 'logs/myproject.log'
+
+ # Optional: data pipelines
+ # PIPELINES = [
+ #     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
+ # ]
+
+ # Optional: Redis settings (Auto mode detects them automatically)
+ # REDIS_HOST = '127.0.0.1'
+ # REDIS_PORT = 6379
+ ```
+
+ **Other configuration modes:**
+
+ ```python
+ # Standalone mode: single-machine development and testing
+ config = CrawloConfig.standalone(
+     project_name='myproject',
+     concurrency=8
+ )
+
+ # Distributed mode: multi-node distributed (Redis must be configured)
+ config = CrawloConfig.distributed(
+     project_name='myproject',
+     redis_host='redis.example.com',
+     redis_port=6379,
+     redis_password='your_password',
+     concurrency=16
+ )
+ ```
+
+ ### 3. Write a Spider
+
+ ```python
+ # myproject/spiders/example.py
+ from crawlo import Spider
+ from crawlo.http import Request
+
+ class ExampleSpider(Spider):
+     name = 'example'
+     start_urls = ['https://example.com']
+
+     async def parse(self, response):
+         # Extract data
+         title = response.css('h1::text').get()
+
+         # Yield an item
+         yield {
+             'title': title,
+             'url': response.url
+         }
+
+         # Follow links
+         for href in response.css('a::attr(href)').getall():
+             yield Request(
+                 url=response.urljoin(href),
+                 callback=self.parse
+             )
+ ```
+
+ ### 4. Run the Spider
+
+ ```bash
+ # Run a specific spider
+ crawlo run example
+
+ # Set the log level
+ crawlo run example --log-level DEBUG
+ ```
+
+ ## Core Functionality
+
+ ### The Response Object
+
+ Crawlo's [`Response`](crawlo/http/response.py) object provides powerful page-processing helpers:
+
+ **1. Smart encoding detection**
+
+ ```python
+ # Detects the encoding and decodes page content automatically
+ # Priority: Content-Type → HTML meta → chardet → utf-8
+ response.text      # Correctly decoded text
+ response.encoding  # Detected encoding
+ ```
+
+ **2. CSS/XPath selectors**
+
+ ```python
+ # CSS selectors (recommended)
+ title = response.css('h1::text').get()
+ links = response.css('a::attr(href)').getall()
+
+ # XPath selectors
+ title = response.xpath('//title/text()').get()
+ links = response.xpath('//a/@href').getall()
+
+ # Default values are supported
+ title = response.css('h1::text').get(default='No title')
+ ```
+
+ **3. URL handling**
+
+ ```python
+ response.url           # Normalized automatically (fragment removed)
+ response.original_url  # The original URL is preserved
+
+ # Smart URL joining
+ response.urljoin('/path')          # Absolute path
+ response.urljoin('../path')        # Relative path
+ response.urljoin('//cdn.com/img')  # Protocol-relative path
+ ```
+
+ **4. Convenience extraction methods**
+
+ ```python
+ # Extract text from a single element / multiple elements
+ title = response.extract_text('h1')
+ paragraphs = response.extract_texts('.content p')
+
+ # Extract an attribute from a single element / multiple elements
+ link = response.extract_attr('a', 'href')
+ all_links = response.extract_attrs('a', 'href')
+ ```
+
+ ### Configuration Factory
+
+ Crawlo provides configuration factory methods so you do not have to set every parameter by hand:
+
+ ```python
+ from crawlo.config import CrawloConfig
+
+ # Auto mode (recommended): detect the environment and adapt
+ config = CrawloConfig.auto(
+     project_name='myproject',
+     concurrency=12,
+     download_delay=1.0
+ )
+
+ # Standalone mode: single-machine development
+ config = CrawloConfig.standalone(
+     project_name='myproject',
+     concurrency=8
+ )
+
+ # Distributed mode: strict distributed
+ config = CrawloConfig.distributed(
+     project_name='myproject',
+     redis_host='localhost',
+     redis_port=6379,
+     concurrency=16
+ )
+
+ # Apply to settings.py
+ locals().update(config.to_dict())
+ ```
+
+ **Key differences between the three modes**:
+
+ - **Auto**: detects Redis and picks the best configuration automatically; **recommended for production**
+ - **Standalone**: always uses the in-memory queue; good for development and testing, no external dependencies
+ - **Distributed**: strictly requires Redis, never falls back, and guarantees distributed consistency
+
+ > 💡 See the [Configuration Modes in Detail](#configuration-modes-in-detail) section above for the full explanation
+
+ ### Logging
+
+ Crawlo ships with a complete logging system that can write to the console and a file at the same time:
+
+ ```python
+ from crawlo.logging import get_logger
+
+ logger = get_logger(__name__)
+
+ logger.debug('Debug message')
+ logger.info('Info message')
+ logger.warning('Warning message')
+ logger.error('Error message')
+ ```
+
+ **Logging configuration:**
+
+ ```python
+ # settings.py
+ LOG_LEVEL = 'INFO'            # DEBUG, INFO, WARNING, ERROR, CRITICAL
+ LOG_FILE = 'logs/spider.log'
+ LOG_ENCODING = 'utf-8'        # Explicit log file encoding
+ STATS_DUMP = True             # Whether to dump statistics
+ ```
+
+ **Advanced usage:**
+
+ ```python
+ from crawlo.logging import configure_logging
+
+ # Configure console and file log levels separately
+ configure_logging(
+     LOG_LEVEL='INFO',
+     LOG_CONSOLE_LEVEL='WARNING',   # Console shows WARNING and above only
+     LOG_FILE_LEVEL='DEBUG',        # File records DEBUG and above
+     LOG_FILE='logs/app.log',
+     LOG_MAX_BYTES=10*1024*1024,    # 10 MB
+     LOG_BACKUP_COUNT=5
+ )
+ ```
+
+ ### Spider Auto-Discovery
+
+ Crawlo can discover spiders automatically; there is no need to import them by hand:
+
+ ```bash
+ # Discover and run automatically (recommended)
+ crawlo run spider_name
+
+ # Run a spider from a specific file
+ crawlo run -f path/to/spider.py -s SpiderClassName
+ ```
+
+ The framework looks for spiders in the modules listed in `SPIDER_MODULES`.
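+
+ `SPIDER_MODULES` is a plain list of module paths, so more than one package can be scanned; a small sketch (the second entry is a hypothetical name used only for illustration):
+
+ ```python
+ # settings.py
+ # Every module listed here is searched for Spider subclasses.
+ # 'myproject.more_spiders' is a made-up example module name.
+ SPIDER_MODULES = [
+     'myproject.spiders',
+     'myproject.more_spiders',
+ ]
+ ```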
+
+ ### Cross-Platform Support
+
+ Crawlo runs seamlessly on Windows, macOS, and Linux:
+
+ - **Windows**: automatically uses the ProactorEventLoop and handles console encoding correctly
+ - **macOS/Linux**: uses the default SelectorEventLoop
+ - Path formats are handled consistently across platforms
+
+ > 💡 **Tip for Windows users**: if you need log rotation, install `concurrent-log-handler`:
+ > ```bash
+ > pip install concurrent-log-handler
+ > ```
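+
+ The Windows behaviour described above is the standard asyncio event-loop-policy switch. Crawlo applies it for you; a minimal, framework-independent sketch of the same pattern looks like this:
+
+ ```python
+ # Illustrative sketch of the generic asyncio pattern, not Crawlo's internal code.
+ import asyncio
+ import sys
+
+ if sys.platform == "win32":
+     # The proactor loop handles subprocesses and pipes correctly on Windows.
+     asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+
+ async def main():
+     await asyncio.sleep(0)  # placeholder for the actual crawl
+
+ asyncio.run(main())
+ ```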
+
+ ![Crawlo core architecture diagram](images/Crawlo%20核心架构图.png)
+
+ ## Documentation
+
+ The full documentation lives in the [`docs/`](docs/) directory:
+
+ ### 📚 Core Tutorials
+
+ - [Configuration Modes Guide](docs/tutorials/configuration_modes.md) - **strongly recommended**
+ - [Architecture Overview](docs/modules/architecture/index.md)
+ - [Run Modes](docs/modules/architecture/modes.md)
+ - [Configuration System](docs/modules/configuration/index.md)
+
+ ### 🔧 Core Modules
+
+ - [Engine](docs/modules/core/engine.md)
+ - [Scheduler](docs/modules/core/scheduler.md)
+ - [Processor](docs/modules/core/processor.md)
+ - [Spider base class](docs/modules/core/spider.md)
+
+ ### 📦 Feature Modules
+
+ - [Downloader](docs/modules/downloader/index.md)
+ - [Queue](docs/modules/queue/index.md)
+ - [Filter](docs/modules/filter/index.md)
+ - [Middleware](docs/modules/middleware/index.md)
+ - [Pipeline](docs/modules/pipeline/index.md)
+ - [Extension](docs/modules/extension/index.md)
+
+ ### 🛠 Command-Line Tools
+
+ - [CLI Overview](docs/modules/cli/index.md)
+ - [startproject](docs/modules/cli/startproject.md) - project scaffolding
+ - [genspider](docs/modules/cli/genspider.md) - spider generation
+ - [run](docs/modules/cli/run.md) - running spiders
+ - [list](docs/modules/cli/list.md) - listing spiders
+ - [check](docs/modules/cli/check.md) - configuration checks
+ - [stats](docs/modules/cli/stats.md) - statistics
+
+ ### 🚀 Advanced Topics
+
+ - [Distributed Deployment](docs/modules/advanced/distributed.md)
+ - [Performance Tuning](docs/modules/advanced/performance.md)
+ - [Troubleshooting](docs/modules/advanced/troubleshooting.md)
+ - [Best Practices](docs/modules/advanced/best_practices.md)
+
+ ### 📝 Performance Optimization Reports
+
+ - [Initialization optimization report](docs/initialization_optimization_report.md)
+ - [MySQL connection pool optimization](docs/mysql_connection_pool_optimization.md)
+ - [MongoDB connection pool optimization](docs/mongo_connection_pool_optimization.md)
+
+ ### 📖 API Reference
+
+ - [Full API documentation](docs/api/)
+
+ ---
+
+ **Online documentation**:
+ - [Chinese documentation](https://crawlo.readthedocs.io/en/latest/README_zh/)
+ - [English documentation](https://crawlo.readthedocs.io/en/latest/)
+
+ **Building the documentation locally**:
+ ```bash
+ mkdocs serve
+ # Then open http://localhost:8000 in a browser
+ ```
+
+ ## FAQ
+
+ ### 1. Which configuration mode should I choose?
+
+ - **Development and testing**: use `CrawloConfig.standalone()`
+ - **Production**: use `CrawloConfig.auto()` (recommended)
+ - **Multi-node deployment**: use `CrawloConfig.distributed()`
+
+ ### 2. What happens in Distributed mode when Redis is unavailable?
+
+ Distributed mode **strictly requires Redis**; if it cannot be reached, a `RuntimeError` is raised and the process exits. This protects distributed consistency and data safety.
+
+ If you want an automatic fallback when Redis is unavailable, use **Auto mode** instead.
+
+ ### 3. How does Auto mode work?
+
+ Auto mode detects the environment at runtime:
+ - Redis available → uses RedisPriorityQueue + AioRedisFilter
+ - Redis unavailable → falls back to MemoryQueue + MemoryFilter
+
+ See the [Configuration Modes Guide](docs/tutorials/configuration_modes.md) for details.
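+
+ The detection itself amounts to a quick connection probe at startup; a minimal sketch of that idea with the `redis` package (illustrative only, not Crawlo's internal implementation):
+
+ ```python
+ # Illustrative sketch - Crawlo performs an equivalent check internally.
+ import redis.asyncio as redis
+
+ async def redis_available(url: str = "redis://127.0.0.1:6379/0") -> bool:
+     """Return True if a Redis server answers PING at the given URL."""
+     client = redis.from_url(url)
+     try:
+         return bool(await client.ping())
+     except Exception:
+         return False
+     finally:
+         await client.aclose()
+ ```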
+
+ ### 4. How do I enable MySQL or MongoDB support?
+
+ ```python
+ # settings.py
+ PIPELINES = [
+     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL
+     # or
+     'crawlo.pipelines.mongo_pipeline.MongoDBPipeline',       # MongoDB
+ ]
+
+ # MySQL settings
+ MYSQL_HOST = '127.0.0.1'
+ MYSQL_USER = 'root'
+ MYSQL_PASSWORD = 'password'
+ MYSQL_DB = 'mydb'
+ MYSQL_TABLE = 'items'
+
+ # MongoDB settings
+ MONGO_URI = 'mongodb://localhost:27017'
+ MONGO_DATABASE = 'mydb'
+ MONGO_COLLECTION = 'items'
+ ```
+
+ ### 5. How do I use proxies?
+
+ ```python
+ # settings.py
+
+ # A simple proxy list
+ PROXY_LIST = [
+     "http://proxy1:8080",
+     "http://proxy2:8080"
+ ]
+
+ # Or a dynamic proxy API
+ PROXY_API_URL = "http://your-proxy-api.com/get-proxy"
+ ```
+
+ ## Learning Path
+
+ If you are new to Crawlo, we suggest learning in the following order:
+
+ 1. **Getting started** - read the quick start guide and run the first example
+ 2. **Configuration modes** - learn the three modes and pick the right one ([Configuration Modes Guide](docs/tutorials/configuration_modes.md))
+ 3. **Core concepts** - understand the framework architecture and basic concepts
+ 4. **Core modules** - dive into core components such as the engine, scheduler, and processor
+ 5. **Feature modules** - study the downloader, queue, filter, and other modules as needed
+ 6. **Advanced topics** - master distributed deployment, performance tuning, and other advanced features
+
+ ## Contributing
+
+ Contributions are welcome! To contribute to Crawlo:
+
+ 1. Fork the repository
+ 2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
+ 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+ 4. Push the branch (`git push origin feature/AmazingFeature`)
+ 5. Open a Pull Request
+
+ ## License
+
+ MIT License - see the [LICENSE](LICENSE) file
+
+ ---
+
+ <p align="center">
+ <i>If you have questions or suggestions, feel free to open an <a href="https://github.com/crawl-coder/Crawlo/issues">Issue</a></i>
+ </p>