crawlo 1.1.3__tar.gz → 1.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. crawlo-1.1.5/PKG-INFO +401 -0
  2. crawlo-1.1.5/README.md +351 -0
  3. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/__init__.py +28 -1
  4. crawlo-1.1.5/crawlo/__version__.py +1 -0
  5. crawlo-1.1.5/crawlo/cleaners/__init__.py +61 -0
  6. crawlo-1.1.5/crawlo/cleaners/data_formatter.py +226 -0
  7. crawlo-1.1.5/crawlo/cleaners/encoding_converter.py +126 -0
  8. crawlo-1.1.5/crawlo/cleaners/text_cleaner.py +233 -0
  9. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/commands/startproject.py +117 -13
  10. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/config.py +30 -0
  11. crawlo-1.1.5/crawlo/config_validator.py +253 -0
  12. crawlo-1.1.5/crawlo/core/engine.py +346 -0
  13. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/core/scheduler.py +49 -78
  14. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/crawler.py +6 -6
  15. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/downloader/__init__.py +24 -0
  16. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/downloader/aiohttp_downloader.py +8 -0
  17. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/downloader/cffi_downloader.py +5 -0
  18. crawlo-1.1.5/crawlo/downloader/hybrid_downloader.py +214 -0
  19. crawlo-1.1.5/crawlo/downloader/playwright_downloader.py +403 -0
  20. crawlo-1.1.5/crawlo/downloader/selenium_downloader.py +473 -0
  21. crawlo-1.1.5/crawlo/extension/__init__.py +38 -0
  22. crawlo-1.1.5/crawlo/extension/health_check.py +142 -0
  23. crawlo-1.1.5/crawlo/extension/log_interval.py +58 -0
  24. crawlo-1.1.5/crawlo/extension/log_stats.py +82 -0
  25. crawlo-1.1.5/crawlo/extension/logging_extension.py +44 -0
  26. crawlo-1.1.5/crawlo/extension/memory_monitor.py +105 -0
  27. crawlo-1.1.5/crawlo/extension/performance_profiler.py +134 -0
  28. crawlo-1.1.5/crawlo/extension/request_recorder.py +108 -0
  29. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/filters/aioredis_filter.py +50 -12
  30. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/middleware/proxy.py +26 -2
  31. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/mode_manager.py +24 -19
  32. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/network/request.py +30 -3
  33. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/network/response.py +114 -25
  34. crawlo-1.1.5/crawlo/pipelines/mongo_pipeline.py +132 -0
  35. crawlo-1.1.5/crawlo/pipelines/mysql_pipeline.py +317 -0
  36. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/pipelines/redis_dedup_pipeline.py +7 -3
  37. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/queue/queue_manager.py +15 -2
  38. crawlo-1.1.5/crawlo/queue/redis_priority_queue.py +277 -0
  39. crawlo-1.1.5/crawlo/settings/default_settings.py +217 -0
  40. crawlo-1.1.5/crawlo/subscriber.py +131 -0
  41. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/templates/project/items.py.tmpl +1 -1
  42. crawlo-1.1.5/crawlo/templates/project/middlewares.py.tmpl +111 -0
  43. crawlo-1.1.5/crawlo/templates/project/pipelines.py.tmpl +98 -0
  44. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/templates/project/settings.py.tmpl +93 -17
  45. crawlo-1.1.5/crawlo/templates/project/settings_distributed.py.tmpl +120 -0
  46. crawlo-1.1.5/crawlo/templates/project/settings_gentle.py.tmpl +95 -0
  47. crawlo-1.1.5/crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
  48. crawlo-1.1.5/crawlo/templates/project/settings_simple.py.tmpl +69 -0
  49. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/templates/spider/spider.py.tmpl +2 -38
  50. crawlo-1.1.5/crawlo/tools/__init__.py +183 -0
  51. crawlo-1.1.5/crawlo/tools/anti_crawler.py +269 -0
  52. crawlo-1.1.5/crawlo/tools/authenticated_proxy.py +241 -0
  53. crawlo-1.1.5/crawlo/tools/data_validator.py +181 -0
  54. crawlo-1.1.5/crawlo/tools/date_tools.py +36 -0
  55. crawlo-1.1.5/crawlo/tools/distributed_coordinator.py +387 -0
  56. crawlo-1.1.5/crawlo/tools/retry_mechanism.py +221 -0
  57. crawlo-1.1.5/crawlo/tools/scenario_adapter.py +263 -0
  58. crawlo-1.1.5/crawlo/utils/__init__.py +35 -0
  59. crawlo-1.1.5/crawlo/utils/batch_processor.py +261 -0
  60. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/date_tools.py +58 -1
  61. crawlo-1.1.5/crawlo/utils/enhanced_error_handler.py +360 -0
  62. crawlo-1.1.5/crawlo/utils/env_config.py +106 -0
  63. crawlo-1.1.5/crawlo/utils/error_handler.py +126 -0
  64. crawlo-1.1.5/crawlo/utils/performance_monitor.py +285 -0
  65. crawlo-1.1.5/crawlo/utils/redis_connection_pool.py +335 -0
  66. crawlo-1.1.5/crawlo/utils/redis_key_validator.py +200 -0
  67. crawlo-1.1.5/crawlo.egg-info/PKG-INFO +401 -0
  68. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo.egg-info/SOURCES.txt +75 -3
  69. {crawlo-1.1.3 → crawlo-1.1.5}/requirements.txt +3 -2
  70. crawlo-1.1.5/tests/advanced_tools_example.py +276 -0
  71. crawlo-1.1.5/tests/authenticated_proxy_example.py +237 -0
  72. crawlo-1.1.5/tests/cleaners_example.py +161 -0
  73. crawlo-1.1.5/tests/config_validation_demo.py +103 -0
  74. crawlo-1.1.5/tests/date_tools_example.py +181 -0
  75. crawlo-1.1.5/tests/dynamic_loading_example.py +524 -0
  76. crawlo-1.1.5/tests/dynamic_loading_test.py +105 -0
  77. crawlo-1.1.5/tests/env_config_example.py +134 -0
  78. crawlo-1.1.5/tests/error_handling_example.py +172 -0
  79. crawlo-1.1.5/tests/redis_key_validation_demo.py +131 -0
  80. crawlo-1.1.5/tests/response_improvements_example.py +145 -0
  81. crawlo-1.1.5/tests/test_advanced_tools.py +149 -0
  82. crawlo-1.1.5/tests/test_all_redis_key_configs.py +146 -0
  83. crawlo-1.1.5/tests/test_authenticated_proxy.py +142 -0
  84. crawlo-1.1.5/tests/test_cleaners.py +55 -0
  85. crawlo-1.1.5/tests/test_comprehensive.py +147 -0
  86. crawlo-1.1.5/tests/test_config_validator.py +194 -0
  87. crawlo-1.1.5/tests/test_date_tools.py +124 -0
  88. crawlo-1.1.5/tests/test_dynamic_downloaders_proxy.py +125 -0
  89. crawlo-1.1.5/tests/test_dynamic_proxy.py +93 -0
  90. crawlo-1.1.5/tests/test_dynamic_proxy_config.py +147 -0
  91. crawlo-1.1.5/tests/test_dynamic_proxy_real.py +110 -0
  92. crawlo-1.1.5/tests/test_edge_cases.py +304 -0
  93. crawlo-1.1.5/tests/test_enhanced_error_handler.py +271 -0
  94. crawlo-1.1.5/tests/test_env_config.py +122 -0
  95. crawlo-1.1.5/tests/test_error_handler_compatibility.py +113 -0
  96. crawlo-1.1.5/tests/test_framework_env_usage.py +104 -0
  97. crawlo-1.1.5/tests/test_integration.py +357 -0
  98. crawlo-1.1.5/tests/test_item_dedup_redis_key.py +123 -0
  99. crawlo-1.1.5/tests/test_parsel.py +30 -0
  100. crawlo-1.1.5/tests/test_performance.py +328 -0
  101. crawlo-1.1.5/tests/test_queue_manager_redis_key.py +177 -0
  102. crawlo-1.1.5/tests/test_redis_connection_pool.py +295 -0
  103. crawlo-1.1.5/tests/test_redis_key_naming.py +182 -0
  104. crawlo-1.1.5/tests/test_redis_key_validator.py +124 -0
  105. crawlo-1.1.5/tests/test_response_improvements.py +153 -0
  106. crawlo-1.1.5/tests/test_simple_response.py +62 -0
  107. crawlo-1.1.5/tests/test_telecom_spider_redis_key.py +206 -0
  108. crawlo-1.1.5/tests/test_template_content.py +88 -0
  109. crawlo-1.1.5/tests/test_template_redis_key.py +135 -0
  110. crawlo-1.1.5/tests/test_tools.py +154 -0
  111. crawlo-1.1.5/tests/tools_example.py +258 -0
  112. crawlo-1.1.3/PKG-INFO +0 -635
  113. crawlo-1.1.3/README.md +0 -585
  114. crawlo-1.1.3/crawlo/__version__.py +0 -1
  115. crawlo-1.1.3/crawlo/core/engine.py +0 -172
  116. crawlo-1.1.3/crawlo/core/enhanced_engine.py +0 -190
  117. crawlo-1.1.3/crawlo/extension/__init__.py +0 -31
  118. crawlo-1.1.3/crawlo/extension/log_interval.py +0 -49
  119. crawlo-1.1.3/crawlo/extension/log_stats.py +0 -44
  120. crawlo-1.1.3/crawlo/extension/logging_extension.py +0 -35
  121. crawlo-1.1.3/crawlo/pipelines/mongo_pipeline.py +0 -117
  122. crawlo-1.1.3/crawlo/pipelines/mysql_pipeline.py +0 -195
  123. crawlo-1.1.3/crawlo/queue/redis_priority_queue.py +0 -209
  124. crawlo-1.1.3/crawlo/settings/default_settings.py +0 -245
  125. crawlo-1.1.3/crawlo/subscriber.py +0 -106
  126. crawlo-1.1.3/crawlo/templates/project/middlewares.py.tmpl +0 -87
  127. crawlo-1.1.3/crawlo/templates/project/pipelines.py.tmpl +0 -342
  128. crawlo-1.1.3/crawlo/utils/__init__.py +0 -7
  129. crawlo-1.1.3/crawlo.egg-info/PKG-INFO +0 -635
  130. {crawlo-1.1.3 → crawlo-1.1.5}/LICENSE +0 -0
  131. {crawlo-1.1.3 → crawlo-1.1.5}/MANIFEST.in +0 -0
  132. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/cli.py +0 -0
  133. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/commands/__init__.py +0 -0
  134. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/commands/check.py +0 -0
  135. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/commands/genspider.py +0 -0
  136. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/commands/list.py +0 -0
  137. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/commands/run.py +0 -0
  138. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/commands/stats.py +0 -0
  139. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/commands/utils.py +0 -0
  140. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/core/__init__.py +0 -0
  141. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/core/processor.py +0 -0
  142. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/downloader/httpx_downloader.py +0 -0
  143. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/event.py +0 -0
  144. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/exceptions.py +0 -0
  145. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/filters/__init__.py +0 -0
  146. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/filters/memory_filter.py +0 -0
  147. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/items/__init__.py +0 -0
  148. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/items/base.py +0 -0
  149. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/items/fields.py +0 -0
  150. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/items/items.py +0 -0
  151. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/middleware/__init__.py +0 -0
  152. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/middleware/default_header.py +0 -0
  153. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/middleware/download_delay.py +0 -0
  154. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/middleware/middleware_manager.py +0 -0
  155. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/middleware/request_ignore.py +0 -0
  156. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/middleware/response_code.py +0 -0
  157. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/middleware/response_filter.py +0 -0
  158. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/middleware/retry.py +0 -0
  159. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/network/__init__.py +0 -0
  160. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/pipelines/__init__.py +0 -0
  161. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
  162. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/pipelines/console_pipeline.py +0 -0
  163. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/pipelines/csv_pipeline.py +0 -0
  164. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
  165. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/pipelines/json_pipeline.py +0 -0
  166. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
  167. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/pipelines/pipeline_manager.py +0 -0
  168. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/project.py +0 -0
  169. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/queue/__init__.py +0 -0
  170. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/queue/pqueue.py +0 -0
  171. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/settings/__init__.py +0 -0
  172. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/settings/setting_manager.py +0 -0
  173. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/spider/__init__.py +0 -0
  174. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/stats_collector.py +0 -0
  175. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/task_manager.py +0 -0
  176. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  177. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/templates/project/__init__.py.tmpl +0 -0
  178. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/templates/project/run.py.tmpl +0 -0
  179. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  180. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/controlled_spider_mixin.py +0 -0
  181. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/db_helper.py +0 -0
  182. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/func_tools.py +0 -0
  183. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/large_scale_config.py +0 -0
  184. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/large_scale_helper.py +0 -0
  185. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/log.py +0 -0
  186. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/queue_helper.py +0 -0
  187. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/request.py +0 -0
  188. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/request_serializer.py +0 -0
  189. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/spider_loader.py +0 -0
  190. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/system.py +0 -0
  191. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/tools.py +0 -0
  192. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo/utils/url.py +0 -0
  193. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo.egg-info/dependency_links.txt +0 -0
  194. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo.egg-info/entry_points.txt +0 -0
  195. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo.egg-info/requires.txt +0 -0
  196. {crawlo-1.1.3 → crawlo-1.1.5}/crawlo.egg-info/top_level.txt +0 -0
  197. {crawlo-1.1.3 → crawlo-1.1.5}/examples/__init__.py +0 -0
  198. {crawlo-1.1.3 → crawlo-1.1.5}/pyproject.toml +0 -0
  199. {crawlo-1.1.3 → crawlo-1.1.5}/setup.cfg +0 -0
  200. {crawlo-1.1.3 → crawlo-1.1.5}/tests/__init__.py +0 -0
  201. {crawlo-1.1.3/examples → crawlo-1.1.5/tests}/controlled_spider_example.py +0 -0
  202. {crawlo-1.1.3 → crawlo-1.1.5}/tests/test_final_validation.py +0 -0
  203. {crawlo-1.1.3 → crawlo-1.1.5}/tests/test_proxy_health_check.py +0 -0
  204. {crawlo-1.1.3 → crawlo-1.1.5}/tests/test_proxy_middleware_integration.py +0 -0
  205. {crawlo-1.1.3 → crawlo-1.1.5}/tests/test_proxy_providers.py +0 -0
  206. {crawlo-1.1.3 → crawlo-1.1.5}/tests/test_proxy_stats.py +0 -0
  207. {crawlo-1.1.3 → crawlo-1.1.5}/tests/test_proxy_strategies.py +0 -0
  208. {crawlo-1.1.3 → crawlo-1.1.5}/tests/test_redis_config.py +0 -0
  209. {crawlo-1.1.3 → crawlo-1.1.5}/tests/test_redis_queue.py +0 -0
  210. {crawlo-1.1.3 → crawlo-1.1.5}/tests/test_request_serialization.py +0 -0
  211. {crawlo-1.1.3 → crawlo-1.1.5}/tests/test_scheduler.py +0 -0
crawlo-1.1.5/PKG-INFO ADDED
@@ -0,0 +1,401 @@
+ Metadata-Version: 2.4
+ Name: crawlo
+ Version: 1.1.5
+ Summary: Crawlo is a high-performance Python crawler framework built on asynchronous I/O, with support for distributed crawling.
+ Home-page: https://github.com/crawl-coder/Crawlo.git
+ Author: crawl-coder
+ Author-email: crawlo@qq.com
+ License: MIT
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ Requires-Dist: aiohttp>=3.12.14
+ Requires-Dist: aiomysql>=0.2.0
+ Requires-Dist: aioredis>=2.0.1
+ Requires-Dist: asyncmy>=0.2.10
+ Requires-Dist: cssselect>=1.2.0
+ Requires-Dist: dateparser>=1.2.2
+ Requires-Dist: httpx[http2]>=0.27.0
+ Requires-Dist: curl-cffi>=0.13.0
+ Requires-Dist: lxml>=5.2.1
+ Requires-Dist: motor>=3.7.0
+ Requires-Dist: parsel>=1.9.1
+ Requires-Dist: pydantic>=2.11.7
+ Requires-Dist: pymongo>=4.11
+ Requires-Dist: PyMySQL>=1.1.1
+ Requires-Dist: python-dateutil>=2.9.0.post0
+ Requires-Dist: redis>=6.2.0
+ Requires-Dist: requests>=2.32.4
+ Requires-Dist: six>=1.17.0
+ Requires-Dist: ujson>=5.9.0
+ Requires-Dist: urllib3>=2.5.0
+ Requires-Dist: w3lib>=2.1.2
+ Requires-Dist: rich>=14.1.0
+ Requires-Dist: astor>=0.8.1
+ Requires-Dist: watchdog>=6.0.0
+ Provides-Extra: render
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
+ Requires-Dist: playwright; extra == "render"
+ Requires-Dist: selenium>=3.141.0; extra == "render"
+ Provides-Extra: all
+ Requires-Dist: bitarray>=1.5.3; extra == "all"
+ Requires-Dist: PyExecJS>=1.5.1; extra == "all"
+ Requires-Dist: pymongo>=3.10.1; extra == "all"
+ Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
+ Requires-Dist: playwright; extra == "all"
+ Requires-Dist: selenium>=3.141.0; extra == "all"
+
+ # Crawlo - Asynchronous Distributed Crawler Framework
+
+ <div align="center">
+
+ [![Python Version](https://img.shields.io/badge/python-3.8%2B-blue)](https://www.python.org/downloads/)
+ [![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
+ [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen)](https://crawlo.readthedocs.io/)
+
+ A high-performance asynchronous distributed crawler framework built on asyncio, supporting both standalone and distributed deployment.
+
+ </div>
+
+ ## 🌟 Features
+
+ - **Async performance**: built on asyncio for high-concurrency, non-blocking I/O
+ - **Distributed support**: built-in Redis queue for easy distributed deployment
+ - **Modular design**: middleware, pipeline, and extension systems that are easy to customize and extend
+ - **Smart deduplication**: multiple dedup strategies (memory, Redis, Bloom Filter)
+ - **Flexible configuration**: multiple configuration styles to fit different scenarios
+ - **Rich documentation**: complete bilingual (Chinese/English) docs and example projects
+
+ ## 🚀 Quick Start
+
+ ### Installation
+
+ ```bash
+ pip install crawlo
+ ```
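
The metadata block above also declares optional extras (`render` and `all`) for browser rendering via Playwright/Selenium. As a hedged note (not part of the package README itself), those optional dependencies would normally be pulled in with pip's standard extras syntax:

```bash
# Install the optional browser-rendering dependencies declared under "Provides-Extra: render"
pip install "crawlo[render]"

# Or install everything declared under "Provides-Extra: all"
pip install "crawlo[all]"
```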
+
+ ### Create a project
+
+ ```bash
+ crawlo startproject myproject
+ cd myproject
+ ```
+
+ ### Write a spider
+
+ ```python
+ from crawlo import Spider, Request, Item
+
+ class MyItem(Item):
+     title = ''
+     url = ''
+
+ class MySpider(Spider):
+     name = 'myspider'
+
+     async def start_requests(self):
+         yield Request('https://httpbin.org/get', callback=self.parse)
+
+     async def parse(self, response):
+         yield MyItem(
+             title='Example Title',
+             url=response.url
+         )
+ ```
+
+ ### Run the spider
+
+ ```bash
+ crawlo crawl myspider
+ ```
+
+ ## 🏗️ Architecture
+
+ ### Component interaction diagram
+
+ ```
+ ┌─────────────────────────────────────────────────────────────────────┐
+ │                               Crawler                               │
+ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────────┐      │
+ │ │ Spider       │ │ Engine       │ │ ExtensionManager         │      │
+ │ │              │ │              │ │                          │      │
+ │ │ start_urls   │ │ Scheduler  ◄─┼──┼──► StatsCollector       │      │
+ │ │ parse()      │ │              │ │                          │      │
+ │ │              │ │ Downloader ◄─┼──┼──► MiddlewareManager    │      │
+ │ │              │ │              │ │                          │      │
+ │ │              │ │ Processor  ◄─┼──┼──► PipelineManager      │      │
+ │ └──────────────┘ └──────┬───────┘ └──────────────────────────┘      │
+ └──────────────────────────┼─────────────────────────────────────────┘
+
+ ┌──────────────────▼──────────────────┐
+ │              Scheduler              │
+ │ ┌──────────────────────────────┐    │
+ │ │        QueueManager          │    │
+ │ │ ┌─────────┐ ┌────────────┐   │    │
+ │ │ │ Memory  │ │ Redis      │   │    │
+ │ │ │ Queue   │ │ Queue      │   │    │
+ │ │ └─────────┘ └────────────┘   │    │
+ │ └──────────────────────────────┘    │
+ │ ┌──────────────────────────────┐    │
+ │ │           Filter             │    │
+ │ │ ┌─────────┐ ┌────────────┐   │    │
+ │ │ │ Memory  │ │ Redis      │   │    │
+ │ │ │ Filter  │ │ Filter     │   │    │
+ │ │ └─────────┘ └────────────┘   │    │
+ │ └──────────────────────────────┘    │
+ └─────────────────────────────────────┘
+
+ ┌──────────────────▼──────────────────┐
+ │             Downloader              │
+ │ ┌──────────────────────────────┐    │
+ │ │      MiddlewareManager       │    │
+ │ │                              │    │
+ │ │ RequestMiddleware ◄────────┐ │    │
+ │ │ ResponseMiddleware         │ │    │
+ │ │ ExceptionMiddleware        │ │    │
+ │ │                          ╱ │ │    │
+ │ └─────────────────────────╱───┘    │
+ │                          ╱          │
+ │ ┌───────────────────────▼──┐        │
+ │ │ Download Implementations │        │
+ │ │ - AioHttpDownloader      │        │
+ │ │ - HttpXDownloader        │        │
+ │ │ - CurlCffiDownloader     │        │
+ │ └──────────────────────────┘        │
+ └─────────────────────────────────────┘
+
+ ┌──────────────────▼──────────────────┐
+ │              Processor              │
+ │ ┌──────────────────────────────┐    │
+ │ │       PipelineManager        │    │
+ │ │ ┌─────────────────────────┐  │    │
+ │ │ │ Pipeline Stages         │  │    │
+ │ │ │ - ValidationPipeline    │  │    │
+ │ │ │ - ProcessingPipeline    │  │    │
+ │ │ │ - StoragePipeline       │  │    │
+ │ │ └─────────────────────────┘  │    │
+ │ └──────────────────────────────┘    │
+ └─────────────────────────────────────┘
+ ```
+
+ ### Run-mode switching diagram
+
+ ```
+ ┌─────────────────────┐
+ │     ModeManager     │
+ │ (run-mode manager)  │
+ └─────────┬───────────┘
+
+ ┌─────────────────────┼─────────────────────┐
+ │                     │                     │
+ ▼                     ▼                     ▼
+ ┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
+ │  Standalone   │ │  Distributed    │ │      Auto       │
+ │ (single node) │ │ (multi-node)    │ │ (auto-detect)   │
+ └───────┬───────┘ └─────────┬───────┘ └─────────┬───────┘
+ │                     │                     │
+ ▼                     ▼                     ▼
+ ┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
+ │ Memory Queue  │ │ Redis Queue     │ │ Auto Select     │
+ │ Memory Filter │ │ Redis Filter    │ │ Memory/Redis    │
+ └───────────────┘ └─────────────────┘ └─────────────────┘
+ ```
+
+ ### Data flow diagram
+
+ ```
+ ┌─────────────┐ 1. generate initial requests ┌──────────────┐
+ │ Spider      ├─────────────────────────────►│ Scheduler    │
+ └─────────────┘                              └──────┬───────┘
+ │ 2. dedup check
+
+ ┌─────────────────┐
+ │ Filter          │
+ └─────────┬───────┘
+ │ 3. enqueue
+
+ ┌─────────────────┐
+ │ Queue           │
+ └─────────┬───────┘
+ │ 4. fetch request
+
+ ┌─────────────────┐ 5. download request
+ │ Downloader      ├──────────────────┐
+ └─────────────────┘                  │
+ │ 6. parse response                  │
+ ▼                                    ▼
+ ┌─────────────────┐ 7. yield items  ┌─────────────┐
+ │ Processor       ├────────────────►│ Pipeline    │
+ └─────────────────┘                 └─────────────┘
+ │ 8. store items
+
+ ┌─────────────────┐
+ │ Items           │
+ └─────────────────┘
+ ```
+
+ ### Module hierarchy
+
+ ```
+ crawlo/
+ ├── cli.py                    # command-line interface
+ ├── crawler.py                # crawler run instance
+ ├── project.py                # project management
+ ├── config.py                 # configuration management
+ ├── mode_manager.py           # run-mode manager
+ ├── stats_collector.py        # stats collector
+ ├── subscriber.py             # event subscriber
+ ├── task_manager.py           # task manager
+ ├── event.py                  # event definitions
+ ├── exceptions.py             # exception definitions
+ ├──
+ ├── core/                     # core components
+ │ ├── engine.py               # engine
+ │ ├── scheduler.py            # scheduler
+ │ ├── processor.py            # processor
+
+ ├── spider/                   # spider base classes
+ │ └── __init__.py             # spider metaclass and base class
+
+ ├── network/                  # networking
+ │ ├── request.py              # request object
+ │ └── response.py             # response object
+
+ ├── downloader/               # downloaders
+ │ ├── __init__.py             # downloader base class
+ │ ├── aiohttp_downloader.py   # aiohttp implementation
+ │ ├── httpx_downloader.py     # httpx implementation
+ │ └── cffi_downloader.py      # curl-cffi implementation
+
+ ├── queue/                    # queue management
+ │ ├── __init__.py
+ │ ├── queue_manager.py        # queue manager
+ │ ├── pqueue.py               # in-memory priority queue
+ │ └── redis_priority_queue.py # Redis priority queue
+
+ ├── filters/                  # request filters
+ │ ├── __init__.py
+ │ ├── base_filter.py          # filter base class
+ │ ├── memory_filter.py        # in-memory filter
+ │ └── aioredis_filter.py      # Redis filter
+
+ ├── middleware/               # middleware
+ │ ├── __init__.py
+ │ ├── middleware_manager.py   # middleware manager
+ │ ├── default_header.py       # default request headers
+ │ ├── download_delay.py       # download delay
+ │ ├── proxy.py                # proxy support
+ │ ├── request_ignore.py       # request ignoring
+ │ ├── response_code.py        # response-code handling
+ │ ├── response_filter.py      # response filtering
+ │ └── retry.py                # retry mechanism
+
+ ├── pipelines/                # data pipelines
+ │ ├── __init__.py
+ │ ├── pipeline_manager.py     # pipeline manager
+ │ ├── base_pipeline.py        # pipeline base class
+ │ ├── console_pipeline.py     # console output pipeline
+ │ └── mysql_pipeline.py       # MySQL storage pipeline
+
+ ├── extension/                # extensions
+ │ ├── __init__.py
+ │ ├── log_interval.py         # interval logging
+ │ ├── log_stats.py            # stats logging
+ │ ├── logging_extension.py    # logging extension
+ │ ├── memory_monitor.py       # memory monitoring
+ │ └── performance_profiler.py # performance profiling
+
+ ├── settings/                 # settings system
+ │ ├── __init__.py
+ │ ├── default_settings.py     # default settings
+ │ └── setting_manager.py      # settings manager
+
+ ├── utils/                    # utilities
+ │ ├── __init__.py
+ │ ├── log.py                  # logging helpers
+ │ ├── request.py              # request helpers
+ │ ├── request_serializer.py   # request serialization
+ │ └── func_tools.py           # function helpers
+
+ └── templates/                # template files
+     ├── project/
+     └── spider/
+ ```
+
+ ### Component overview
+
+ - **Crawler**: the crawl run instance; manages the lifecycle of the Spider and the engine
+ - **Engine**: the engine; coordinates the Scheduler, Downloader, and Processor
+ - **Scheduler**: the scheduler; manages the request queue and deduplication filtering
+ - **Downloader**: the downloader; performs network requests with multiple implementations (aiohttp, httpx, curl-cffi)
+ - **Processor**: the processor; handles response data and drives the pipelines
+ - **QueueManager**: unified queue manager; switches automatically between the in-memory queue and the Redis queue
+ - **Filter**: request deduplication filter, with both in-memory and Redis implementations
+ - **Middleware**: middleware system for request/response pre- and post-processing
+ - **Pipeline**: data-processing pipeline with multiple storage backends (console, databases, etc.)
+ - **Spider**: the spider base class that defines the crawling logic
+
+ ### Run modes
+
+ Crawlo supports three run modes (a configuration sketch follows this list):
+ - **standalone**: single-machine mode, using the in-memory queue and in-memory filter
+ - **distributed**: distributed mode, using the Redis queue and Redis filter
+ - **auto**: auto-detection mode, choosing the best setup based on the environment
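
How a project selects one of these modes is only partly visible in this README: the settings example below documents `QUEUE_TYPE`, while the `RUN_MODE` key in this sketch is an assumed name rather than a documented setting. A minimal, hedged sketch of mode selection:

```python
# settings.py - hedged sketch; RUN_MODE is a hypothetical key, QUEUE_TYPE comes from the README example below
RUN_MODE = 'auto'        # assumed values: 'standalone' | 'distributed' | 'auto'

QUEUE_TYPE = 'memory'    # standalone: in-memory queue + in-memory filter
# QUEUE_TYPE = 'redis'   # distributed: Redis queue + Redis filter shared by all workers
```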
+
+ ## 🎛️ Configuration System
+
+ ### Traditional configuration
+
+ ```python
+ # settings.py
+ PROJECT_NAME = 'myproject'
+ CONCURRENCY = 16
+ DOWNLOAD_DELAY = 1.0
+ QUEUE_TYPE = 'memory'   # standalone mode
+ # QUEUE_TYPE = 'redis'  # distributed mode
+ ```
+
+ ### Command-line configuration
+
+ ```bash
+ crawlo crawl myspider --concurrency=32 --delay=0.5
+ ```
+
+ ## 🧩 Core Components
+
+ ### Middleware system
+ A flexible middleware system supporting request pre-processing, response processing, and exception handling.
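
A hedged sketch of what a custom middleware could look like, assuming a Scrapy-style `process_request`/`process_response`/`process_exception` interface; crawlo's actual hook names, signatures, and registration setting may differ:

```python
# Hypothetical custom middleware - the hook names below are assumptions, not crawlo's documented API.
class CustomHeaderMiddleware:
    def process_request(self, request, spider):
        # Pre-process the outgoing request, e.g. tag it with the spider name.
        request.headers.setdefault('X-Crawled-By', spider.name)
        return None  # None: continue down the middleware chain

    def process_response(self, request, response, spider):
        # Post-process the response before it reaches the spider's parse callback.
        return response

    def process_exception(self, request, exception, spider):
        # Decide how to handle a download error (re-raise, retry, or swallow).
        return None
```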
+
+ ### Pipeline system
+ An extensible data-processing pipeline supporting multiple storage backends (console, databases, etc.).
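
A hedged sketch of a custom pipeline, assuming a `process_item(item, spider)` hook similar in spirit to the bundled console/MySQL pipelines; the exact signature and registration mechanism are assumptions:

```python
# Hypothetical item pipeline - normalises fields before storage; the interface is an assumption.
class StripWhitespacePipeline:
    async def process_item(self, item, spider):
        title = getattr(item, 'title', None)
        if isinstance(title, str):
            item.title = title.strip()
        return item  # returning the item passes it on to the next pipeline
```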
+
+ ### Extensions
+ Feature-enhancing extensions, including logging, monitoring, and performance profiling.
+
+ ### Deduplication filters
+ Smart request deduplication with multiple strategies (memory, Redis, Bloom Filter).
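
All of these strategies share the same idea: compute a stable fingerprint for each request and remember it in a set-like store (process memory, Redis, or a Bloom filter). A generic illustration of that idea, not crawlo's internal implementation:

```python
# Generic request-fingerprint dedup sketch (illustrative only).
import hashlib

def request_fingerprint(method: str, url: str, body: bytes = b"") -> str:
    digest = hashlib.sha1()
    digest.update(method.upper().encode())
    digest.update(url.encode())
    digest.update(body)
    return digest.hexdigest()

seen = set()  # in-memory filter; a Redis set or Bloom filter scales this across workers
fp = request_fingerprint("GET", "https://httpbin.org/get")
if fp not in seen:
    seen.add(fp)   # first time seen: schedule the request
else:
    pass           # duplicate: drop it
```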
+
+ ## 📦 Example Projects
+
+ - [API data collection](examples/api_data_collection/) - a simple API data-collection example
+ - [Telecom equipment licences](examples/telecom_licenses_distributed/) - a distributed crawling example
+
+ ## 📚 Documentation
+
+ For the full documentation, see [Crawlo Documentation](https://crawlo.readthedocs.io/)
+
+ - [Quick start guide](docs/modules/index.md)
+ - [Module documentation](docs/modules/index.md)
+ - [API reference](docs/api_reference.md)
+ - [Configuration best practices](docs/configuration_best_practices.md)
+
+ ## 🤝 Contributing
+
+ Issues and pull requests to help improve Crawlo are welcome!
+
+ ## 📄 License
+
+ This project is licensed under the MIT License; see the [LICENSE](LICENSE) file for details.