crawlo 1.1.5__tar.gz → 1.5.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (483) hide show
  1. crawlo-1.5.4/PKG-INFO +997 -0
  2. crawlo-1.5.4/README.md +947 -0
  3. crawlo-1.5.4/crawlo/__init__.py +89 -0
  4. crawlo-1.5.4/crawlo/__version__.py +1 -0
  5. crawlo-1.5.4/crawlo/cli.py +76 -0
  6. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/commands/__init__.py +2 -1
  7. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/commands/check.py +75 -75
  8. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/commands/genspider.py +71 -36
  9. crawlo-1.5.4/crawlo/commands/help.py +141 -0
  10. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/commands/list.py +24 -24
  11. crawlo-1.5.4/crawlo/commands/run.py +379 -0
  12. crawlo-1.5.4/crawlo/commands/startproject.py +461 -0
  13. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/commands/stats.py +23 -23
  14. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/commands/utils.py +20 -10
  15. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/config.py +173 -32
  16. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/config_validator.py +30 -5
  17. crawlo-1.5.4/crawlo/core/__init__.py +52 -0
  18. crawlo-1.5.4/crawlo/core/engine.py +515 -0
  19. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/core/processor.py +9 -2
  20. crawlo-1.5.4/crawlo/core/scheduler.py +441 -0
  21. crawlo-1.5.4/crawlo/crawler.py +941 -0
  22. crawlo-1.5.4/crawlo/data/__init__.py +6 -0
  23. crawlo-1.5.4/crawlo/data/user_agents.py +195 -0
  24. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/downloader/__init__.py +47 -9
  25. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/downloader/aiohttp_downloader.py +119 -45
  26. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/downloader/cffi_downloader.py +25 -30
  27. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/downloader/httpx_downloader.py +79 -25
  28. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/downloader/hybrid_downloader.py +19 -15
  29. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/downloader/playwright_downloader.py +44 -17
  30. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/downloader/selenium_downloader.py +23 -8
  31. crawlo-1.5.4/crawlo/event.py +45 -0
  32. crawlo-1.5.4/crawlo/exceptions.py +215 -0
  33. crawlo-1.5.4/crawlo/extension/__init__.py +65 -0
  34. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/extension/health_check.py +11 -9
  35. crawlo-1.5.4/crawlo/extension/log_interval.py +95 -0
  36. crawlo-1.5.4/crawlo/extension/log_stats.py +73 -0
  37. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/extension/logging_extension.py +16 -6
  38. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/extension/memory_monitor.py +5 -5
  39. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/extension/performance_profiler.py +5 -5
  40. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/extension/request_recorder.py +6 -6
  41. crawlo-1.5.4/crawlo/factories/__init__.py +28 -0
  42. crawlo-1.5.4/crawlo/factories/base.py +69 -0
  43. crawlo-1.5.4/crawlo/factories/crawler.py +105 -0
  44. crawlo-1.5.4/crawlo/factories/registry.py +85 -0
  45. crawlo-1.5.4/crawlo/factories/utils.py +135 -0
  46. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/filters/__init__.py +23 -6
  47. crawlo-1.5.4/crawlo/filters/aioredis_filter.py +504 -0
  48. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/filters/memory_filter.py +22 -20
  49. crawlo-1.5.4/crawlo/framework.py +307 -0
  50. crawlo-1.5.4/crawlo/initialization/__init__.py +44 -0
  51. crawlo-1.5.4/crawlo/initialization/built_in.py +392 -0
  52. crawlo-1.5.4/crawlo/initialization/context.py +142 -0
  53. crawlo-1.5.4/crawlo/initialization/core.py +241 -0
  54. crawlo-1.5.4/crawlo/initialization/phases.py +230 -0
  55. crawlo-1.5.4/crawlo/initialization/registry.py +144 -0
  56. crawlo-1.5.4/crawlo/initialization/utils.py +49 -0
  57. crawlo-1.5.4/crawlo/interfaces.py +46 -0
  58. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/items/base.py +2 -1
  59. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/items/fields.py +0 -1
  60. crawlo-1.5.4/crawlo/logging/__init__.py +42 -0
  61. crawlo-1.5.4/crawlo/logging/config.py +281 -0
  62. crawlo-1.5.4/crawlo/logging/factory.py +176 -0
  63. crawlo-1.5.4/crawlo/logging/manager.py +104 -0
  64. crawlo-1.5.4/crawlo/middleware/__init__.py +101 -0
  65. crawlo-1.5.4/crawlo/middleware/default_header.py +131 -0
  66. crawlo-1.5.4/crawlo/middleware/download_attachment_middleware.py +280 -0
  67. crawlo-1.5.4/crawlo/middleware/download_delay.py +109 -0
  68. crawlo-1.5.4/crawlo/middleware/middleware_manager.py +197 -0
  69. crawlo-1.5.4/crawlo/middleware/offsite.py +123 -0
  70. crawlo-1.5.4/crawlo/middleware/proxy.py +168 -0
  71. crawlo-1.5.4/crawlo/middleware/request_ignore.py +86 -0
  72. crawlo-1.5.4/crawlo/middleware/response_code.py +150 -0
  73. crawlo-1.5.4/crawlo/middleware/response_filter.py +135 -0
  74. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/middleware/retry.py +4 -2
  75. crawlo-1.5.4/crawlo/mode_manager.py +308 -0
  76. crawlo-1.5.4/crawlo/network/request.py +532 -0
  77. crawlo-1.5.4/crawlo/network/response.py +798 -0
  78. crawlo-1.5.4/crawlo/pipelines/__init__.py +53 -0
  79. crawlo-1.5.4/crawlo/pipelines/base_pipeline.py +692 -0
  80. crawlo-1.5.4/crawlo/pipelines/bloom_dedup_pipeline.py +137 -0
  81. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/pipelines/console_pipeline.py +2 -2
  82. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/pipelines/csv_pipeline.py +4 -4
  83. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/pipelines/database_dedup_pipeline.py +42 -89
  84. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/pipelines/json_pipeline.py +4 -4
  85. crawlo-1.5.4/crawlo/pipelines/memory_dedup_pipeline.py +93 -0
  86. crawlo-1.5.4/crawlo/pipelines/mongo_pipeline.py +186 -0
  87. crawlo-1.5.4/crawlo/pipelines/mysql_pipeline.py +799 -0
  88. crawlo-1.5.4/crawlo/pipelines/pipeline_manager.py +99 -0
  89. crawlo-1.5.4/crawlo/pipelines/redis_dedup_pipeline.py +210 -0
  90. crawlo-1.5.4/crawlo/project.py +347 -0
  91. crawlo-1.5.4/crawlo/queue/__init__.py +10 -0
  92. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/queue/pqueue.py +16 -10
  93. crawlo-1.5.4/crawlo/queue/queue_manager.py +671 -0
  94. crawlo-1.5.4/crawlo/queue/redis_priority_queue.py +487 -0
  95. crawlo-1.5.4/crawlo/settings/default_settings.py +273 -0
  96. crawlo-1.5.4/crawlo/settings/setting_manager.py +259 -0
  97. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/spider/__init__.py +183 -108
  98. crawlo-1.5.4/crawlo/stats_collector.py +85 -0
  99. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/subscriber.py +1 -2
  100. crawlo-1.5.4/crawlo/task_manager.py +142 -0
  101. crawlo-1.5.4/crawlo/templates/crawlo.cfg.tmpl +11 -0
  102. crawlo-1.5.4/crawlo/templates/project/__init__.py.tmpl +2 -0
  103. crawlo-1.5.4/crawlo/templates/project/items.py.tmpl +14 -0
  104. crawlo-1.5.4/crawlo/templates/project/middlewares.py.tmpl +39 -0
  105. crawlo-1.5.4/crawlo/templates/project/pipelines.py.tmpl +36 -0
  106. crawlo-1.5.4/crawlo/templates/project/settings.py.tmpl +109 -0
  107. crawlo-1.5.4/crawlo/templates/project/settings_distributed.py.tmpl +152 -0
  108. crawlo-1.5.4/crawlo/templates/project/settings_gentle.py.tmpl +176 -0
  109. crawlo-1.5.4/crawlo/templates/project/settings_high_performance.py.tmpl +177 -0
  110. crawlo-1.5.4/crawlo/templates/project/settings_minimal.py.tmpl +103 -0
  111. crawlo-1.5.4/crawlo/templates/project/settings_simple.py.tmpl +174 -0
  112. crawlo-1.5.4/crawlo/templates/project/spiders/__init__.py.tmpl +10 -0
  113. crawlo-1.5.4/crawlo/templates/run.py.tmpl +24 -0
  114. crawlo-1.5.4/crawlo/templates/spider/spider.py.tmpl +33 -0
  115. crawlo-1.5.4/crawlo/templates/spiders_init.py.tmpl +5 -0
  116. crawlo-1.5.4/crawlo/tools/__init__.py +95 -0
  117. crawlo-1.5.4/crawlo/tools/attachment_downloader.py +335 -0
  118. {crawlo-1.1.5/crawlo/utils → crawlo-1.5.4/crawlo/tools}/date_tools.py +1 -1
  119. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/tools/distributed_coordinator.py +31 -34
  120. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/tools/scenario_adapter.py +1 -1
  121. {crawlo-1.1.5/crawlo/cleaners → crawlo-1.5.4/crawlo/tools}/text_cleaner.py +2 -2
  122. crawlo-1.5.4/crawlo/utils/__init__.py +75 -0
  123. crawlo-1.1.5/crawlo/utils/batch_processor.py → crawlo-1.5.4/crawlo/utils/batch_manager.py +68 -26
  124. crawlo-1.5.4/crawlo/utils/batch_processor.py +165 -0
  125. crawlo-1.5.4/crawlo/utils/config_manager.py +442 -0
  126. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/utils/controlled_spider_mixin.py +8 -8
  127. crawlo-1.5.4/crawlo/utils/database_connection_pool.py +109 -0
  128. crawlo-1.5.4/crawlo/utils/db_helper.py +231 -0
  129. crawlo-1.5.4/crawlo/utils/encoding_helper.py +190 -0
  130. crawlo-1.1.5/crawlo/utils/enhanced_error_handler.py → crawlo-1.5.4/crawlo/utils/error_handler.py +59 -8
  131. crawlo-1.5.4/crawlo/utils/fingerprint.py +122 -0
  132. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/utils/large_scale_helper.py +7 -6
  133. crawlo-1.5.4/crawlo/utils/misc.py +129 -0
  134. crawlo-1.5.4/crawlo/utils/mongo_connection_pool.py +221 -0
  135. crawlo-1.5.4/crawlo/utils/mysql_connection_pool.py +451 -0
  136. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/utils/performance_monitor.py +10 -9
  137. crawlo-1.5.4/crawlo/utils/redis_key_manager.py +200 -0
  138. crawlo-1.5.4/crawlo/utils/redis_manager.py +809 -0
  139. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/utils/request.py +79 -68
  140. crawlo-1.5.4/crawlo/utils/request_serializer.py +426 -0
  141. crawlo-1.5.4/crawlo/utils/resource_manager.py +339 -0
  142. crawlo-1.5.4/crawlo/utils/response_helper.py +113 -0
  143. crawlo-1.5.4/crawlo/utils/selector_helper.py +139 -0
  144. crawlo-1.5.4/crawlo/utils/singleton.py +70 -0
  145. crawlo-1.5.4/crawlo/utils/spider_loader.py +202 -0
  146. crawlo-1.5.4/crawlo/utils/text_helper.py +95 -0
  147. crawlo-1.5.4/crawlo.egg-info/PKG-INFO +997 -0
  148. crawlo-1.5.4/crawlo.egg-info/SOURCES.txt +390 -0
  149. crawlo-1.5.4/examples/attachment_download_example.py +200 -0
  150. {crawlo-1.1.5 → crawlo-1.5.4}/requirements.txt +15 -6
  151. {crawlo-1.1.5 → crawlo-1.5.4}/setup.cfg +1 -1
  152. crawlo-1.5.4/tests/RESOURCE_LEAK_TEST_REPORT.md +128 -0
  153. {crawlo-1.1.5 → crawlo-1.5.4}/tests/advanced_tools_example.py +10 -68
  154. crawlo-1.5.4/tests/authenticated_proxy_example.py +111 -0
  155. crawlo-1.5.4/tests/baidu_performance_test.py +109 -0
  156. crawlo-1.5.4/tests/baidu_test.py +60 -0
  157. crawlo-1.5.4/tests/bug_check_test.py +251 -0
  158. {crawlo-1.1.5 → crawlo-1.5.4}/tests/cleaners_example.py +2 -2
  159. crawlo-1.5.4/tests/comprehensive_framework_test.py +213 -0
  160. crawlo-1.5.4/tests/comprehensive_test.py +82 -0
  161. crawlo-1.5.4/tests/comprehensive_testing_summary.md +187 -0
  162. crawlo-1.5.4/tests/config_validation_demo.py +143 -0
  163. crawlo-1.5.4/tests/debug_configure.py +70 -0
  164. crawlo-1.5.4/tests/debug_framework_logger.py +85 -0
  165. crawlo-1.5.4/tests/debug_log_config.py +127 -0
  166. crawlo-1.5.4/tests/debug_log_levels.py +64 -0
  167. crawlo-1.5.4/tests/debug_pipelines.py +67 -0
  168. crawlo-1.5.4/tests/detailed_log_test.py +234 -0
  169. crawlo-1.5.4/tests/direct_selector_helper_test.py +97 -0
  170. crawlo-1.5.4/tests/distributed_dedup_test.py +467 -0
  171. crawlo-1.5.4/tests/distributed_test.py +67 -0
  172. crawlo-1.5.4/tests/distributed_test_debug.py +77 -0
  173. {crawlo-1.1.5 → crawlo-1.5.4}/tests/error_handling_example.py +9 -9
  174. crawlo-1.5.4/tests/explain_mysql_update_behavior.py +77 -0
  175. crawlo-1.5.4/tests/final_comprehensive_test.py +152 -0
  176. crawlo-1.5.4/tests/final_log_test.py +261 -0
  177. crawlo-1.5.4/tests/final_validation_test.py +183 -0
  178. crawlo-1.5.4/tests/final_verification.py +383 -0
  179. crawlo-1.5.4/tests/fix_log_test.py +143 -0
  180. crawlo-1.5.4/tests/framework_performance_test.py +203 -0
  181. crawlo-1.5.4/tests/log_buffering_test.py +112 -0
  182. crawlo-1.5.4/tests/log_generation_timing_test.py +154 -0
  183. crawlo-1.5.4/tests/monitor_redis_dedup.sh +72 -0
  184. crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  185. crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  186. crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  187. crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  188. crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  189. crawlo-1.5.4/tests/ofweek_scrapy/scrapy.cfg +11 -0
  190. crawlo-1.5.4/tests/optimized_performance_test.py +212 -0
  191. crawlo-1.5.4/tests/performance_comparison.py +245 -0
  192. crawlo-1.5.4/tests/queue_blocking_test.py +114 -0
  193. crawlo-1.5.4/tests/queue_test.py +90 -0
  194. {crawlo-1.1.5 → crawlo-1.5.4}/tests/redis_key_validation_demo.py +3 -3
  195. crawlo-1.5.4/tests/request_params_example.py +151 -0
  196. crawlo-1.5.4/tests/run_all_leak_tests.py +156 -0
  197. crawlo-1.5.4/tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  198. crawlo-1.5.4/tests/scrapy_comparison/scrapy_test.py +134 -0
  199. crawlo-1.5.4/tests/simple_cli_test.py +55 -0
  200. crawlo-1.5.4/tests/simple_command_test.py +120 -0
  201. crawlo-1.5.4/tests/simple_crawlo_test.py +127 -0
  202. crawlo-1.5.4/tests/simple_follow_test.py +39 -0
  203. crawlo-1.5.4/tests/simple_log_test2.py +138 -0
  204. crawlo-1.5.4/tests/simple_optimization_test.py +129 -0
  205. crawlo-1.5.4/tests/simple_queue_type_test.py +42 -0
  206. crawlo-1.5.4/tests/simple_response_selector_test.py +95 -0
  207. crawlo-1.5.4/tests/simple_selector_helper_test.py +155 -0
  208. crawlo-1.5.4/tests/simple_selector_test.py +208 -0
  209. crawlo-1.5.4/tests/simple_spider_test.py +50 -0
  210. crawlo-1.5.4/tests/simple_url_test.py +74 -0
  211. crawlo-1.5.4/tests/simulate_mysql_update_test.py +140 -0
  212. crawlo-1.5.4/tests/spider_log_timing_test.py +178 -0
  213. crawlo-1.5.4/tests/test_ack_call_analysis.py +91 -0
  214. crawlo-1.5.4/tests/test_ack_call_fix.py +237 -0
  215. crawlo-1.5.4/tests/test_ack_method_debug.py +177 -0
  216. crawlo-1.5.4/tests/test_ack_method_fix.py +131 -0
  217. crawlo-1.5.4/tests/test_ack_method_verification.py +211 -0
  218. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_advanced_tools.py +5 -0
  219. crawlo-1.5.4/tests/test_all_commands.py +236 -0
  220. crawlo-1.5.4/tests/test_all_pipeline_fingerprints.py +139 -0
  221. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_all_redis_key_configs.py +28 -23
  222. crawlo-1.5.4/tests/test_asyncmy_usage.py +62 -0
  223. crawlo-1.5.4/tests/test_batch_processor.py +184 -0
  224. crawlo-1.5.4/tests/test_browser_leak.py +88 -0
  225. crawlo-1.5.4/tests/test_cache_leak.py +79 -0
  226. crawlo-1.5.4/tests/test_circular_reference_leak.py +82 -0
  227. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_cleaners.py +6 -1
  228. crawlo-1.5.4/tests/test_cli_arguments.py +124 -0
  229. crawlo-1.5.4/tests/test_complete_ack_solution.py +144 -0
  230. crawlo-1.5.4/tests/test_component_factory.py +180 -0
  231. crawlo-1.5.4/tests/test_config_consistency.py +86 -0
  232. crawlo-1.5.4/tests/test_config_merge.py +158 -0
  233. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_config_validator.py +67 -73
  234. crawlo-1.5.4/tests/test_controlled_spider_mixin.py +85 -0
  235. crawlo-1.5.4/tests/test_coroutine_leak.py +78 -0
  236. crawlo-1.5.4/tests/test_crawler_process_import.py +44 -0
  237. crawlo-1.5.4/tests/test_crawler_process_spider_modules.py +53 -0
  238. crawlo-1.5.4/tests/test_crawlo_proxy_integration.py +120 -0
  239. crawlo-1.5.4/tests/test_database_connection_leak.py +92 -0
  240. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_date_tools.py +5 -0
  241. crawlo-1.5.4/tests/test_dedup_fix.py +226 -0
  242. crawlo-1.5.4/tests/test_dedup_pipeline_consistency.py +130 -0
  243. crawlo-1.5.4/tests/test_default_header_middleware.py +319 -0
  244. crawlo-1.5.4/tests/test_distributed.py +70 -0
  245. crawlo-1.5.4/tests/test_double_crawlo_fix.py +210 -0
  246. crawlo-1.5.4/tests/test_double_crawlo_fix_simple.py +130 -0
  247. crawlo-1.5.4/tests/test_download_delay_middleware.py +227 -0
  248. crawlo-1.5.4/tests/test_downloader_proxy_compatibility.py +278 -0
  249. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_edge_cases.py +43 -36
  250. crawlo-1.5.4/tests/test_encoding_core.py +62 -0
  251. crawlo-1.5.4/tests/test_encoding_detection.py +132 -0
  252. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_enhanced_error_handler.py +34 -29
  253. crawlo-1.5.4/tests/test_enhanced_error_handler_comprehensive.py +251 -0
  254. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_error_handler_compatibility.py +5 -0
  255. crawlo-1.5.4/tests/test_extract_spider_name.py +60 -0
  256. crawlo-1.5.4/tests/test_factories.py +258 -0
  257. crawlo-1.5.4/tests/test_factory_compatibility.py +202 -0
  258. crawlo-1.5.4/tests/test_file_handle_leak.py +74 -0
  259. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_final_validation.py +29 -24
  260. crawlo-1.5.4/tests/test_fingerprint_consistency.py +141 -0
  261. crawlo-1.5.4/tests/test_fingerprint_simple.py +57 -0
  262. crawlo-1.5.4/tests/test_get_component_logger.py +89 -0
  263. crawlo-1.5.4/tests/test_hash_performance.py +105 -0
  264. crawlo-1.5.4/tests/test_http_connection_leak.py +68 -0
  265. crawlo-1.5.4/tests/test_integration.py +174 -0
  266. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_item_dedup_redis_key.py +13 -8
  267. crawlo-1.5.4/tests/test_key_format_fix.py +165 -0
  268. crawlo-1.5.4/tests/test_key_format_fix_verification.py +179 -0
  269. crawlo-1.5.4/tests/test_large_scale_helper.py +241 -0
  270. crawlo-1.5.4/tests/test_logging_enhancements.py +380 -0
  271. crawlo-1.5.4/tests/test_logging_final.py +190 -0
  272. crawlo-1.5.4/tests/test_logging_integration.py +318 -0
  273. crawlo-1.5.4/tests/test_logging_system.py +288 -0
  274. crawlo-1.5.4/tests/test_middleware_debug.py +147 -0
  275. crawlo-1.5.4/tests/test_mode_consistency.py +57 -0
  276. crawlo-1.5.4/tests/test_multi_directory.py +73 -0
  277. crawlo-1.5.4/tests/test_multiple_spider_modules.py +86 -0
  278. crawlo-1.5.4/tests/test_mysql_optimizations.py +305 -0
  279. crawlo-1.5.4/tests/test_mysql_pipeline.py +257 -0
  280. crawlo-1.5.4/tests/test_mysql_pipeline_config.py +170 -0
  281. crawlo-1.5.4/tests/test_mysql_pipeline_error.py +104 -0
  282. crawlo-1.5.4/tests/test_mysql_pipeline_init_log.py +88 -0
  283. crawlo-1.5.4/tests/test_mysql_pipeline_integration.py +138 -0
  284. crawlo-1.5.4/tests/test_mysql_pipeline_refactor.py +149 -0
  285. crawlo-1.5.4/tests/test_mysql_pipeline_refactor_simple.py +91 -0
  286. crawlo-1.5.4/tests/test_mysql_pipeline_robustness.py +201 -0
  287. crawlo-1.5.4/tests/test_mysql_pipeline_types.py +94 -0
  288. crawlo-1.5.4/tests/test_mysql_update_columns.py +99 -0
  289. crawlo-1.5.4/tests/test_offsite_middleware.py +250 -0
  290. crawlo-1.5.4/tests/test_offsite_middleware_simple.py +209 -0
  291. crawlo-1.5.4/tests/test_optimized_selector_naming.py +106 -0
  292. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_parsel.py +5 -0
  293. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_performance.py +39 -35
  294. crawlo-1.5.4/tests/test_performance_monitor.py +121 -0
  295. crawlo-1.5.4/tests/test_pipeline_fingerprint_consistency.py +92 -0
  296. crawlo-1.5.4/tests/test_priority_behavior.py +217 -0
  297. crawlo-1.5.4/tests/test_priority_consistency.py +157 -0
  298. crawlo-1.5.4/tests/test_priority_consistency_fixed.py +255 -0
  299. crawlo-1.5.4/tests/test_processing_queue_debug.py +161 -0
  300. crawlo-1.5.4/tests/test_processing_queue_simple.py +209 -0
  301. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_proxy_health_check.py +5 -0
  302. crawlo-1.5.4/tests/test_proxy_middleware.py +223 -0
  303. crawlo-1.5.4/tests/test_proxy_middleware_enhanced.py +218 -0
  304. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_proxy_middleware_integration.py +12 -2
  305. crawlo-1.5.4/tests/test_proxy_middleware_refactored.py +213 -0
  306. crawlo-1.5.4/tests/test_proxy_only.py +89 -0
  307. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_proxy_providers.py +5 -0
  308. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_proxy_stats.py +5 -0
  309. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_proxy_strategies.py +5 -0
  310. crawlo-1.5.4/tests/test_proxy_with_downloader.py +158 -0
  311. crawlo-1.5.4/tests/test_queue_empty_check.py +47 -0
  312. crawlo-1.5.4/tests/test_queue_leak.py +99 -0
  313. crawlo-1.5.4/tests/test_queue_manager_double_crawlo.py +179 -0
  314. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_queue_manager_redis_key.py +55 -47
  315. crawlo-1.5.4/tests/test_queue_naming.py +160 -0
  316. crawlo-1.5.4/tests/test_queue_scores.py +128 -0
  317. crawlo-1.5.4/tests/test_queue_type.py +112 -0
  318. crawlo-1.5.4/tests/test_queue_type_redis_config_consistency.py +137 -0
  319. crawlo-1.5.4/tests/test_random_headers_default.py +328 -0
  320. crawlo-1.5.4/tests/test_random_headers_necessity.py +314 -0
  321. crawlo-1.5.4/tests/test_random_user_agent.py +78 -0
  322. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_redis_config.py +9 -4
  323. crawlo-1.5.4/tests/test_redis_connection_leak.py +79 -0
  324. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_redis_connection_pool.py +40 -35
  325. crawlo-1.5.4/tests/test_redis_key_consistency.py +103 -0
  326. crawlo-1.5.4/tests/test_redis_key_integration.py +107 -0
  327. crawlo-1.5.4/tests/test_redis_key_manager.py +126 -0
  328. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_redis_key_naming.py +16 -11
  329. crawlo-1.5.4/tests/test_redis_key_structure.py +56 -0
  330. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_redis_key_validator.py +8 -3
  331. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_redis_queue.py +26 -21
  332. crawlo-1.5.4/tests/test_redis_queue_name_fix.py +181 -0
  333. crawlo-1.5.4/tests/test_redis_queue_type_fallback.py +135 -0
  334. crawlo-1.5.4/tests/test_request_ignore_middleware.py +188 -0
  335. crawlo-1.5.4/tests/test_request_params.py +117 -0
  336. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_request_serialization.py +11 -6
  337. crawlo-1.5.4/tests/test_resource_leak_detection.py +150 -0
  338. crawlo-1.5.4/tests/test_response_code_middleware.py +355 -0
  339. crawlo-1.5.4/tests/test_response_filter_middleware.py +433 -0
  340. crawlo-1.5.4/tests/test_response_follow.py +110 -0
  341. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_response_improvements.py +5 -0
  342. crawlo-1.5.4/tests/test_response_selector_methods.py +98 -0
  343. crawlo-1.5.4/tests/test_response_url_methods.py +76 -0
  344. crawlo-1.5.4/tests/test_response_urljoin.py +92 -0
  345. crawlo-1.5.4/tests/test_retry_middleware.py +339 -0
  346. crawlo-1.5.4/tests/test_retry_middleware_realistic.py +279 -0
  347. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_scheduler.py +37 -21
  348. crawlo-1.5.4/tests/test_scheduler_config_update.py +139 -0
  349. crawlo-1.5.4/tests/test_scrapy_style_encoding.py +118 -0
  350. crawlo-1.5.4/tests/test_selector_helper.py +106 -0
  351. crawlo-1.5.4/tests/test_selector_optimizations.py +152 -0
  352. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_simple_response.py +5 -0
  353. crawlo-1.5.4/tests/test_spider_loader.py +55 -0
  354. crawlo-1.5.4/tests/test_spider_loader_comprehensive.py +75 -0
  355. crawlo-1.5.4/tests/test_spider_modules.py +90 -0
  356. crawlo-1.5.4/tests/test_spider_name_in_redis_keys.py +83 -0
  357. crawlo-1.5.4/tests/test_spiders/__init__.py +1 -0
  358. crawlo-1.5.4/tests/test_spiders/test_spider.py +15 -0
  359. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_telecom_spider_redis_key.py +17 -12
  360. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_template_content.py +20 -15
  361. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_template_redis_key.py +22 -17
  362. crawlo-1.5.4/tests/test_thread_leak.py +78 -0
  363. {crawlo-1.1.5 → crawlo-1.5.4}/tests/test_tools.py +14 -3
  364. crawlo-1.5.4/tests/test_user_agent_randomness.py +182 -0
  365. crawlo-1.5.4/tests/test_user_agents.py +102 -0
  366. crawlo-1.5.4/tests/untested_features_report.md +139 -0
  367. crawlo-1.5.4/tests/verify_debug.py +52 -0
  368. crawlo-1.5.4/tests/verify_distributed.py +117 -0
  369. crawlo-1.5.4/tests/verify_log_fix.py +112 -0
  370. crawlo-1.5.4/tests/verify_mysql_warnings.py +110 -0
  371. crawlo-1.1.5/PKG-INFO +0 -401
  372. crawlo-1.1.5/README.md +0 -351
  373. crawlo-1.1.5/crawlo/__init__.py +0 -62
  374. crawlo-1.1.5/crawlo/__version__.py +0 -1
  375. crawlo-1.1.5/crawlo/cleaners/__init__.py +0 -61
  376. crawlo-1.1.5/crawlo/cleaners/data_formatter.py +0 -226
  377. crawlo-1.1.5/crawlo/cleaners/encoding_converter.py +0 -126
  378. crawlo-1.1.5/crawlo/cli.py +0 -41
  379. crawlo-1.1.5/crawlo/commands/run.py +0 -286
  380. crawlo-1.1.5/crawlo/commands/startproject.py +0 -300
  381. crawlo-1.1.5/crawlo/core/__init__.py +0 -2
  382. crawlo-1.1.5/crawlo/core/engine.py +0 -346
  383. crawlo-1.1.5/crawlo/core/scheduler.py +0 -137
  384. crawlo-1.1.5/crawlo/crawler.py +0 -1028
  385. crawlo-1.1.5/crawlo/event.py +0 -11
  386. crawlo-1.1.5/crawlo/exceptions.py +0 -82
  387. crawlo-1.1.5/crawlo/extension/__init__.py +0 -38
  388. crawlo-1.1.5/crawlo/extension/log_interval.py +0 -58
  389. crawlo-1.1.5/crawlo/extension/log_stats.py +0 -82
  390. crawlo-1.1.5/crawlo/filters/aioredis_filter.py +0 -280
  391. crawlo-1.1.5/crawlo/middleware/__init__.py +0 -21
  392. crawlo-1.1.5/crawlo/middleware/default_header.py +0 -32
  393. crawlo-1.1.5/crawlo/middleware/download_delay.py +0 -28
  394. crawlo-1.1.5/crawlo/middleware/middleware_manager.py +0 -135
  395. crawlo-1.1.5/crawlo/middleware/proxy.py +0 -273
  396. crawlo-1.1.5/crawlo/middleware/request_ignore.py +0 -30
  397. crawlo-1.1.5/crawlo/middleware/response_code.py +0 -19
  398. crawlo-1.1.5/crawlo/middleware/response_filter.py +0 -26
  399. crawlo-1.1.5/crawlo/mode_manager.py +0 -206
  400. crawlo-1.1.5/crawlo/network/request.py +0 -339
  401. crawlo-1.1.5/crawlo/network/response.py +0 -360
  402. crawlo-1.1.5/crawlo/pipelines/__init__.py +0 -22
  403. crawlo-1.1.5/crawlo/pipelines/bloom_dedup_pipeline.py +0 -157
  404. crawlo-1.1.5/crawlo/pipelines/memory_dedup_pipeline.py +0 -116
  405. crawlo-1.1.5/crawlo/pipelines/mongo_pipeline.py +0 -132
  406. crawlo-1.1.5/crawlo/pipelines/mysql_pipeline.py +0 -317
  407. crawlo-1.1.5/crawlo/pipelines/pipeline_manager.py +0 -56
  408. crawlo-1.1.5/crawlo/pipelines/redis_dedup_pipeline.py +0 -167
  409. crawlo-1.1.5/crawlo/project.py +0 -153
  410. crawlo-1.1.5/crawlo/queue/queue_manager.py +0 -321
  411. crawlo-1.1.5/crawlo/queue/redis_priority_queue.py +0 -277
  412. crawlo-1.1.5/crawlo/settings/default_settings.py +0 -217
  413. crawlo-1.1.5/crawlo/settings/setting_manager.py +0 -100
  414. crawlo-1.1.5/crawlo/stats_collector.py +0 -59
  415. crawlo-1.1.5/crawlo/task_manager.py +0 -30
  416. crawlo-1.1.5/crawlo/templates/crawlo.cfg.tmpl +0 -11
  417. crawlo-1.1.5/crawlo/templates/project/__init__.py.tmpl +0 -4
  418. crawlo-1.1.5/crawlo/templates/project/items.py.tmpl +0 -18
  419. crawlo-1.1.5/crawlo/templates/project/middlewares.py.tmpl +0 -111
  420. crawlo-1.1.5/crawlo/templates/project/pipelines.py.tmpl +0 -98
  421. crawlo-1.1.5/crawlo/templates/project/run.py.tmpl +0 -252
  422. crawlo-1.1.5/crawlo/templates/project/settings.py.tmpl +0 -327
  423. crawlo-1.1.5/crawlo/templates/project/settings_distributed.py.tmpl +0 -120
  424. crawlo-1.1.5/crawlo/templates/project/settings_gentle.py.tmpl +0 -95
  425. crawlo-1.1.5/crawlo/templates/project/settings_high_performance.py.tmpl +0 -152
  426. crawlo-1.1.5/crawlo/templates/project/settings_simple.py.tmpl +0 -69
  427. crawlo-1.1.5/crawlo/templates/project/spiders/__init__.py.tmpl +0 -6
  428. crawlo-1.1.5/crawlo/templates/spider/spider.py.tmpl +0 -142
  429. crawlo-1.1.5/crawlo/tools/__init__.py +0 -183
  430. crawlo-1.1.5/crawlo/tools/anti_crawler.py +0 -269
  431. crawlo-1.1.5/crawlo/tools/authenticated_proxy.py +0 -241
  432. crawlo-1.1.5/crawlo/tools/data_validator.py +0 -181
  433. crawlo-1.1.5/crawlo/tools/date_tools.py +0 -36
  434. crawlo-1.1.5/crawlo/tools/retry_mechanism.py +0 -221
  435. crawlo-1.1.5/crawlo/utils/__init__.py +0 -35
  436. crawlo-1.1.5/crawlo/utils/db_helper.py +0 -344
  437. crawlo-1.1.5/crawlo/utils/env_config.py +0 -106
  438. crawlo-1.1.5/crawlo/utils/error_handler.py +0 -126
  439. crawlo-1.1.5/crawlo/utils/large_scale_config.py +0 -287
  440. crawlo-1.1.5/crawlo/utils/log.py +0 -129
  441. crawlo-1.1.5/crawlo/utils/redis_connection_pool.py +0 -335
  442. crawlo-1.1.5/crawlo/utils/redis_key_validator.py +0 -200
  443. crawlo-1.1.5/crawlo/utils/request_serializer.py +0 -220
  444. crawlo-1.1.5/crawlo/utils/spider_loader.py +0 -63
  445. crawlo-1.1.5/crawlo/utils/system.py +0 -11
  446. crawlo-1.1.5/crawlo/utils/tools.py +0 -5
  447. crawlo-1.1.5/crawlo/utils/url.py +0 -40
  448. crawlo-1.1.5/crawlo.egg-info/PKG-INFO +0 -401
  449. crawlo-1.1.5/crawlo.egg-info/SOURCES.txt +0 -192
  450. crawlo-1.1.5/tests/authenticated_proxy_example.py +0 -237
  451. crawlo-1.1.5/tests/config_validation_demo.py +0 -103
  452. crawlo-1.1.5/tests/env_config_example.py +0 -134
  453. crawlo-1.1.5/tests/test_authenticated_proxy.py +0 -142
  454. crawlo-1.1.5/tests/test_comprehensive.py +0 -147
  455. crawlo-1.1.5/tests/test_dynamic_downloaders_proxy.py +0 -125
  456. crawlo-1.1.5/tests/test_dynamic_proxy.py +0 -93
  457. crawlo-1.1.5/tests/test_dynamic_proxy_config.py +0 -147
  458. crawlo-1.1.5/tests/test_dynamic_proxy_real.py +0 -110
  459. crawlo-1.1.5/tests/test_env_config.py +0 -122
  460. crawlo-1.1.5/tests/test_framework_env_usage.py +0 -104
  461. crawlo-1.1.5/tests/test_integration.py +0 -357
  462. crawlo-1.1.5/tests/tools_example.py +0 -258
  463. {crawlo-1.1.5 → crawlo-1.5.4}/LICENSE +0 -0
  464. {crawlo-1.1.5 → crawlo-1.5.4}/MANIFEST.in +0 -0
  465. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/items/__init__.py +0 -0
  466. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/items/items.py +0 -0
  467. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/network/__init__.py +0 -0
  468. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/settings/__init__.py +0 -0
  469. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/utils/func_tools.py +0 -0
  470. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo/utils/queue_helper.py +0 -0
  471. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo.egg-info/dependency_links.txt +0 -0
  472. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo.egg-info/entry_points.txt +0 -0
  473. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo.egg-info/requires.txt +0 -0
  474. {crawlo-1.1.5 → crawlo-1.5.4}/crawlo.egg-info/top_level.txt +0 -0
  475. {crawlo-1.1.5 → crawlo-1.5.4}/examples/__init__.py +0 -0
  476. {crawlo-1.1.5 → crawlo-1.5.4}/pyproject.toml +0 -0
  477. {crawlo-1.1.5 → crawlo-1.5.4}/tests/__init__.py +0 -0
  478. {crawlo-1.1.5 → crawlo-1.5.4}/tests/controlled_spider_example.py +0 -0
  479. {crawlo-1.1.5 → crawlo-1.5.4}/tests/date_tools_example.py +0 -0
  480. {crawlo-1.1.5 → crawlo-1.5.4}/tests/dynamic_loading_example.py +0 -0
  481. {crawlo-1.1.5 → crawlo-1.5.4}/tests/dynamic_loading_test.py +0 -0
  482. {crawlo-1.1.5/crawlo/queue → crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy}/__init__.py +0 -0
  483. {crawlo-1.1.5 → crawlo-1.5.4}/tests/response_improvements_example.py +0 -0
crawlo-1.5.4/PKG-INFO ADDED
@@ -0,0 +1,997 @@
1
+ Metadata-Version: 2.4
2
+ Name: crawlo
3
+ Version: 1.5.4
4
+ Summary: Crawlo: A high-performance asynchronous Python web crawling framework with distributed support.
5
+ Home-page: https://github.com/crawl-coder/Crawlo.git
6
+ Author: crawl-coder
7
+ Author-email: crawlo@qq.com
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.6
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: aiohttp>=3.12.14
15
+ Requires-Dist: aiomysql>=0.2.0
16
+ Requires-Dist: aioredis>=2.0.1
17
+ Requires-Dist: asyncmy>=0.2.10
18
+ Requires-Dist: cssselect>=1.2.0
19
+ Requires-Dist: dateparser>=1.2.2
20
+ Requires-Dist: httpx[http2]>=0.27.0
21
+ Requires-Dist: curl-cffi>=0.13.0
22
+ Requires-Dist: lxml>=5.2.1
23
+ Requires-Dist: motor>=3.7.0
24
+ Requires-Dist: parsel>=1.9.1
25
+ Requires-Dist: pydantic>=2.11.7
26
+ Requires-Dist: pymongo>=4.11
27
+ Requires-Dist: PyMySQL>=1.1.1
28
+ Requires-Dist: python-dateutil>=2.9.0.post0
29
+ Requires-Dist: redis>=6.2.0
30
+ Requires-Dist: requests>=2.32.4
31
+ Requires-Dist: six>=1.17.0
32
+ Requires-Dist: ujson>=5.9.0
33
+ Requires-Dist: urllib3>=2.5.0
34
+ Requires-Dist: w3lib>=2.1.2
35
+ Requires-Dist: rich>=14.1.0
36
+ Requires-Dist: astor>=0.8.1
37
+ Requires-Dist: watchdog>=6.0.0
38
+ Provides-Extra: render
39
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
40
+ Requires-Dist: playwright; extra == "render"
41
+ Requires-Dist: selenium>=3.141.0; extra == "render"
42
+ Provides-Extra: all
43
+ Requires-Dist: bitarray>=1.5.3; extra == "all"
44
+ Requires-Dist: PyExecJS>=1.5.1; extra == "all"
45
+ Requires-Dist: pymongo>=3.10.1; extra == "all"
46
+ Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
47
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
48
+ Requires-Dist: playwright; extra == "all"
49
+ Requires-Dist: selenium>=3.141.0; extra == "all"
50
+
51
+ <p align="center">
52
+ <img src="assets/logo.svg" alt="Crawlo Logo" width="150"/>
53
+ </p>
54
+
55
+ <h1 align="center">Crawlo</h1>
56
+
57
+ <p align="center">
58
+ <strong>一个基于 asyncio 的现代化、高性能 Python 异步爬虫框架。</strong>
59
+ </p>
60
+
61
+ <p align="center">
62
+ <a href="#核心特性">核心特性</a> •
63
+ <a href="#项目架构">架构</a> •
64
+ <a href="#安装">安装</a> •
65
+ <a href="#配置模式详解">配置模式</a> •
66
+ <a href="https://github.com/crawl-coder/Crawlo">文档</a>
67
+ </p>
68
+
69
+ ## 核心特性
70
+
71
+ - 🚀 **高性能异步架构**:基于 asyncio 和 aiohttp,充分利用异步 I/O 提升爬取效率
72
+ - 🎯 **智能调度系统**:优先级队列、并发控制、自动重试、智能限速
73
+ - 🔄 **灵活的配置模式**:
74
+ - **Standalone 模式**:单机开发测试,使用内存队列
75
+ - **Distributed 模式**:多节点分布式部署,严格要求 Redis(不允许降级)
76
+ - **Auto 模式**:智能检测 Redis 可用性,自动选择最佳配置(推荐)
77
+ - 📦 **丰富的组件生态**:
78
+ - 内置 Redis 和 MongoDB 支持
79
+ - MySQL 异步连接池(基于 asyncmy 和 aiomysql 分别实现)
80
+ - 多种过滤器和去重管道(Memory/Redis)
81
+ - 代理中间件支持(简单代理/动态代理)
82
+ - 多种下载器(aiohttp、httpx、curl-cffi)
83
+ - 🛠 **开发友好**:
84
+ - 类 Scrapy 的项目结构和 API 设计
85
+ - 配置工厂模式(`CrawloConfig.auto()`)
86
+ - 自动爬虫发现机制
87
+ - 完善的日志系统
88
+
89
+ ## 项目架构
90
+
91
+ Crawlo 框架采用模块化设计,核心组件包括:
92
+
93
+ ![Crawlo 框架架构图](assets/Crawlo%20框架架构图.png)
94
+
95
+ - **Engine**:核心引擎,协调各个组件工作
96
+ - **Scheduler**:调度器,管理请求队列和去重
97
+ - **Downloader**:下载器,支持多种 HTTP 客户端
98
+ - **Spider**:爬虫基类,定义数据提取逻辑
99
+ - **Pipeline**:数据管道,处理和存储数据
100
+ - **Middleware**:中间件,处理请求和响应
101
+
102
+ ![Crawlo 数据流图](assets/Crawlo%20数据流图.png)
103
+
104
+ ## 示例项目
105
+
106
+ 查看 [`examples/`](examples/) 目录下的完整示例项目:
107
+
108
+ - **ofweek_standalone** - Auto 模式示例(智能检测)
109
+ - **ofweek_spider** - Auto 模式示例
110
+ - **ofweek_distributed** - Distributed 模式示例(严格分布式)
111
+
112
+ ## 安装
113
+
114
+ ```
115
+ # 基础安装
116
+ pip install crawlo
117
+ ```
118
+
119
+ ## 配置模式详解
120
+
121
+ > ⚠️ **重要**:配置模式的选择直接影响爬虫的运行方式、性能和可靠性,请仔细阅读本节内容。
122
+
123
+ Crawlo 提供三种配置模式,满足不同场景需求:
124
+
125
+ ### 三种模式对比
126
+
127
+ | 配置项 | Standalone | Distributed | Auto |
128
+ |--------|-----------|-------------|------|
129
+ | **RUN_MODE** | `standalone` | `distributed` | `auto` |
130
+ | **队列类型** | 内存队列 | Redis 队列 | 自动检测 |
131
+ | **Redis 要求** | 不需要 | **必需** | 可选 |
132
+ | **Redis 不可用时** | N/A | 🚫 **报错退出** | ✅ 降级到内存 |
133
+ | **配置自动更新** | ❌ 否 | ❌ 否 | ✅ 是 |
134
+ | **过滤器** | Memory | Redis | Redis/Memory |
135
+ | **去重管道** | Memory | Redis | Redis/Memory |
136
+ | **适用场景** | 开发测试 | 多节点部署 | 生产环境 |
137
+ | **并发数默认值** | 8 | 16 | 12 |
138
+ | **推荐指数** | ⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
139
+
140
+ ### 1. Auto 模式(推荐)
141
+
142
+ **智能检测,自动适配,推荐用于生产环境。**
143
+
144
+ ```python
145
+ from crawlo.config import CrawloConfig
146
+
147
+ config = CrawloConfig.auto(
148
+ project_name='myproject',
149
+ concurrency=12,
150
+ download_delay=1.0
151
+ )
152
+ locals().update(config.to_dict())
153
+ ```
154
+
155
+ **运行机制**:
156
+ - 配置阶段不依赖 Redis
157
+ - 运行时才检测 Redis 可用性
158
+ - Redis 可用 → 使用 `RedisPriorityQueue` + `AioRedisFilter`
159
+ - Redis 不可用 → 降级到 `MemoryQueue` + `MemoryFilter`
160
+ - 自动更新配置(`QUEUE_TYPE`、`FILTER_CLASS`、`DEFAULT_DEDUP_PIPELINE`)
161
+
162
+ **优势**:
163
+ - ✅ 开发环境无需配置 Redis,直接启动
164
+ - ✅ 生产环境 Redis 故障时自动降级,保证系统可用性
165
+ - ✅ 同一份代码可在不同环境运行,无需修改配置
166
+ - ✅ 最佳的灵活性和可靠性
167
+
168
+ **适用场景**:
169
+ - 生产环境部署(首选)
170
+ - 需要在多种环境运行的项目
171
+ - 希望系统具备容错能力
172
+
173
+ ### 2. Standalone 模式
174
+
175
+ **单机模式,适合开发测试和中小规模爬取。**
176
+
177
+ ```python
178
+ config = CrawloConfig.standalone(
179
+ project_name='myproject',
180
+ concurrency=8
181
+ )
182
+ locals().update(config.to_dict())
183
+ ```
184
+
185
+ **运行机制**:
186
+ - 固定使用 `MemoryQueue`(内存队列)
187
+ - 固定使用 `MemoryFilter`(内存过滤器)
188
+ - 固定使用 `MemoryDedupPipeline`(内存去重)
189
+ - 不进行 Redis 检测
190
+ - 配置不会自动更新
191
+
192
+ **优势**:
193
+ - ✅ 无需任何外部依赖
194
+ - ✅ 启动速度快
195
+ - ✅ 适合快速开发调试
196
+
197
+ **限制**:
198
+ - ❌ 不支持分布式部署
199
+ - ❌ 重启后队列数据丢失
200
+ - ❌ 不适合大规模数据采集
201
+
202
+ **适用场景**:
203
+ - 本地开发调试
204
+ - 学习框架特性
205
+ - 中小规模数据采集(< 10万条)
206
+ - 单机运行的简单爬虫
207
+
208
+ ### 3. Distributed 模式
209
+
210
+ **分布式模式,严格要求 Redis 可用,适合多节点协同工作。**
211
+
212
+ ```python
213
+ config = CrawloConfig.distributed(
214
+ project_name='myproject',
215
+ redis_host='redis.example.com',
216
+ redis_port=6379,
217
+ redis_password='your_password',
218
+ concurrency=16
219
+ )
220
+ locals().update(config.to_dict())
221
+ ```
222
+
223
+ **运行机制**:
224
+ - 必须使用 `RedisPriorityQueue`
225
+ - 必须使用 `AioRedisFilter`
226
+ - 必须使用 `RedisDedupPipeline`
227
+ - 启动时强制检查 Redis 连接
228
+ - **Redis 不可用时抛出 `RuntimeError` 并退出(不允许降级)**
229
+
230
+ **为什么要严格要求 Redis?**
231
+
232
+ 1. **数据一致性**:防止不同节点使用不同的队列类型
233
+ 2. **去重有效性**:确保多节点间的去重功能正常工作
234
+ 3. **任务分配**:防止任务被重复执行
235
+ 4. **问题早发现**:启动失败比运行时失败更容易发现和修复
236
+ 5. **明确的意图**:分布式模式就应该是分布式的,不应该静默降级
237
+
238
+ **Redis 不可用时的错误信息**:
239
+
240
+ ```
241
+ $ crawlo run my_spider
242
+
243
+ 2025-10-25 22:00:00 - [queue_manager] - ERROR:
244
+ Distributed 模式要求 Redis 可用,但无法连接到 Redis 服务器。
245
+ 错误信息: Connection refused
246
+ Redis URL: redis://127.0.0.1:6379/0
247
+ 请检查:
248
+ 1. Redis 服务是否正在运行
249
+ 2. Redis 连接配置是否正确
250
+ 3. 网络连接是否正常
251
+
252
+ RuntimeError: Distributed 模式要求 Redis 可用,但无法连接到 Redis 服务器。
253
+ ```
254
+
255
+ **优势**:
256
+ - ✅ 支持多节点协同爬取
257
+ - ✅ 数据持久化,重启后可继续
258
+ - ✅ 严格的分布式一致性保证
259
+ - ✅ 适合大规模数据采集
260
+
261
+ **适用场景**:
262
+ - 多服务器协同采集
263
+ - 大规模数据采集(> 百万条)
264
+ - 需要严格保证分布式一致性
265
+ - 生产环境多节点部署
266
+
267
+ ### 模式选择建议
268
+
269
+ | 场景 | 推荐模式 | 原因 |
270
+ |------|---------|------|
271
+ | 生产环境(单节点或多节点) | **Auto** | 自动适配,容错能力强 |
272
+ | 开发环境 | **Standalone** 或 **Auto** | 无需配置 Redis |
273
+ | 严格的多节点分布式部署 | **Distributed** | 保证分布式一致性 |
274
+ | 学习和测试 | **Standalone** | 最简单,无依赖 |
275
+ | 中小规模爬取 | **Standalone** 或 **Auto** | 简单高效 |
276
+ | 大规模爬取 | **Auto** 或 **Distributed** | 性能和可靠性 |
277
+
278
+ > 📖 **完整文档**:更多详细信息请参考 [配置模式完全指南](docs/tutorials/configuration_modes.md)
279
+
280
+ ## Redis 数据结构说明
281
+
282
+ 在使用 Distributed 模式或 Auto 模式且 Redis 可用时,Crawlo 框架会在 Redis 中创建以下数据结构用于管理和跟踪爬虫状态:
283
+
284
+ ### 核心 Redis Keys
285
+
286
+ 1. **`{project_name}:filter:fingerprint`** - 请求去重过滤器
287
+ - 类型:Redis Set
288
+ - 用途:存储已处理请求的指纹,避免重复抓取相同URL
289
+ - 示例:`crawlo:ofweek_standalone:filter:fingerprint`
290
+
291
+ 2. **`{project_name}:item:fingerprint`** - 数据项去重集合
292
+ - 类型:Redis Set
293
+ - 用途:存储已处理数据项的指纹,避免重复处理相同的数据
294
+ - 示例:`crawlo:ofweek_standalone:item:fingerprint`
295
+
296
+ 3. **`{project_name}:queue:requests`** - 主请求队列
297
+ - 类型:Redis Sorted Set
298
+ - 用途:存储待处理的爬虫请求,按优先级排序
299
+ - 示例:`crawlo:ofweek_standalone:queue:requests`
300
+
301
+ 4. **`{project_name}:queue:requests:data`** - 主请求队列数据
302
+ - 类型:Redis Hash
303
+ - 用途:保存请求队列中每个请求的详细序列化数据
304
+ - 示例:`crawlo:ofweek_standalone:queue:requests:data`
305
+
306
+ ### 数据核验方法
307
+
308
+ 在爬虫采集完成后,您可以使用这些 Redis key 来核验数据和监控爬虫状态:
309
+
310
+ ```bash
311
+ # 连接到 Redis
312
+ redis-cli
313
+
314
+ # 查看请求去重数量(已处理的唯一URL数)
315
+ SCARD crawlo:ofweek_standalone:filter:fingerprint
316
+
317
+ # 查看数据项去重数量(已处理的唯一数据项数)
318
+ SCARD crawlo:ofweek_standalone:item:fingerprint
319
+
320
+ # 查看待处理队列长度
321
+ ZCARD crawlo:ofweek_standalone:queue:requests
322
+
323
+ # 获取部分指纹数据进行检查
324
+ SRANDMEMBER crawlo:ofweek_standalone:filter:fingerprint 10
325
+
326
+ # 获取队列中的请求信息
327
+ ZRANGE crawlo:ofweek_standalone:queue:requests 0 9 WITHSCORES
328
+ ```
329
+
330
+ ### 注意事项
331
+
332
+ 1. **数据清理**:爬虫任务完成后,建议清理这些 Redis keys 以释放内存:
333
+ ```bash
334
+ DEL crawlo:ofweek_standalone:filter:fingerprint
335
+ DEL crawlo:ofweek_standalone:item:fingerprint
336
+ DEL crawlo:ofweek_standalone:queue:requests
337
+ DEL crawlo:ofweek_standalone:queue:requests:data
338
+ ```
339
+
340
+ 2. **命名空间隔离**:不同项目使用不同的 `{project_name}` 前缀,确保数据隔离。对于同一项目下的不同爬虫,还可以通过 `{spider_name}` 进一步区分,确保更细粒度的数据隔离。
341
+
342
+ 3. **持久化考虑**:如果需要持久化这些数据,确保 Redis 配置了合适的持久化策略
343
+
344
+ ## 配置优先级
345
+
346
+ Crawlo 框架支持多层级的配置系统,了解配置优先级对于正确使用框架至关重要。
347
+
348
+ ### 配置来源与优先级
349
+
350
+ 从**低到高**的优先级顺序:
351
+
352
+ ```
353
+ 1. default_settings.py (框架默认配置) ⭐
354
+
355
+ 2. 环境变量 (CRAWLO_*) ⭐⭐
356
+ (在 default_settings.py 中通过 EnvConfigManager 读取)
357
+
358
+ 3. 用户 settings.py (项目配置文件) ⭐⭐⭐
359
+
360
+ 4. Spider.custom_settings (Spider 自定义配置) ⭐⭐⭐⭐
361
+
362
+ 5. 运行时 settings 参数 (crawl() 传入的配置) ⭐⭐⭐⭐⭐
363
+ ```
364
+
365
+ ### 环境变量配置
366
+
367
+ 所有环境变量都使用 `CRAWLO_` 前缀:
368
+
369
+ ```bash
370
+ # 基础配置
371
+ export CRAWLO_MODE=auto # 运行模式
372
+ export CRAWLO_PROJECT_NAME=myproject # 项目名称
373
+ export CRAWLO_CONCURRENCY=16 # 并发数
374
+
375
+ # Redis 配置
376
+ export CRAWLO_REDIS_HOST=127.0.0.1 # Redis 主机
377
+ export CRAWLO_REDIS_PORT=6379 # Redis 端口
378
+ export CRAWLO_REDIS_PASSWORD=your_password # Redis 密码
379
+ export CRAWLO_REDIS_DB=0 # Redis 数据库
380
+ ```
381
+
382
+ ### 配置合并策略
383
+
384
+ **普通配置**(如 `CONCURRENCY`):采用**覆盖策略**
385
+ ```python
386
+ # 假设各处都有定义
387
+ default_settings.py: 8 →
388
+ 环境变量: 12 →
389
+ settings.py: 16 →
390
+ Spider.custom_settings: 24 →
391
+ crawl(settings={...}): 32 ✅ 最终值 = 32
392
+ ```
393
+
394
+ **列表配置**(如 `MIDDLEWARES`、`PIPELINES`、`EXTENSIONS`):采用**合并策略**
395
+ ```python
396
+ # default_settings.py
397
+ PIPELINES = ['crawlo.pipelines.console_pipeline.ConsolePipeline']
398
+
399
+ # settings.py
400
+ PIPELINES = ['myproject.pipelines.MySQLPipeline']
401
+
402
+ # 最终结果(合并)
403
+ PIPELINES = [
404
+ 'crawlo.pipelines.console_pipeline.ConsolePipeline', # 保留默认
405
+ 'myproject.pipelines.MySQLPipeline', # 追加用户
406
+ ]
407
+ ```
408
+
409
+ ### Spider 级别配置
410
+
411
+ 在 Spider 类中可以覆盖项目配置:
412
+
413
+ ```python
414
+ class MySpider(Spider):
415
+ name = 'myspider'
416
+
417
+ custom_settings = {
418
+ 'CONCURRENCY': 32, # 覆盖项目配置
419
+ 'DOWNLOAD_DELAY': 2.0, # 覆盖项目配置
420
+ 'PIPELINES': [ # 会与默认管道合并
421
+ 'myproject.pipelines.SpecialPipeline',
422
+ ]
423
+ }
424
+ ```
425
+
426
+ ### 运行时动态配置
427
+
428
+ ```
429
+ from crawlo import CrawlerProcess
430
+
431
+ process = CrawlerProcess()
432
+ await process.crawl(
433
+ MySpider,
434
+ settings={
435
+ 'CONCURRENCY': 64, # 最高优先级
436
+ 'DOWNLOAD_DELAY': 0.1,
437
+ }
438
+ )
439
+ ```
440
+
441
+ ### ⚠️ 常见陷阱
442
+
443
+ **陷阱1:环境变量被项目配置覆盖**
444
+ ```python
445
+ # 环境变量
446
+ export CRAWLO_REDIS_HOST=192.168.1.100
447
+
448
+ # settings.py(这会覆盖环境变量!)
449
+ REDIS_HOST = 'localhost' # ❌ 会覆盖环境变量
450
+
451
+ # 解决方案:不在 settings.py 中重复设置,或使用 CrawloConfig.auto()
452
+ ```
453
+
454
+ **陷阱2:误以为列表配置会被清空**
455
+ ```python
456
+ # settings.py
457
+ PIPELINES = ['myproject.pipelines.MySQLPipeline']
458
+
459
+ # 实际结果(默认管道会被保留并合并)
460
+ PIPELINES = [
461
+ 'crawlo.pipelines.console_pipeline.ConsolePipeline', # 默认保留
462
+ 'myproject.pipelines.MySQLPipeline', # 用户追加
463
+ ]
464
+
465
+ # 如果想完全替换,需要先清空
466
+ PIPELINES = [] # 清空
467
+ PIPELINES.append('myproject.pipelines.MySQLPipeline')
468
+ ```
469
+
470
+ > 📖 **详细文档**:完整的配置优先级说明请参考 [配置优先级详解](docs/配置优先级详解.md)
471
+
472
+ ## 中间件优先级策略
473
+
474
+ 在 crawlo 框架中,中间件的执行顺序由优先级数值决定,数值越大执行越早。以下是推荐的中间件优先级分配策略:
475
+
476
+ ### 1. 优先级数值范围和含义
477
+
478
+ - **高优先级 (80-100)**:请求预处理阶段,如过滤、验证等
479
+ - **中高优先级 (60-79)**:请求处理阶段,如添加请求头、代理设置等
480
+ - **中等优先级 (40-59)**:响应处理阶段,如重试、状态码处理等
481
+ - **低优先级 (0-39)**:响应后处理阶段,如过滤、记录等
482
+
483
+ ### 2. 默认中间件优先级分配
484
+
485
+ ```python
486
+ # === 请求预处理阶段 ===
487
+ 'crawlo.middleware.request_ignore.RequestIgnoreMiddleware': 100 # 1. 忽略无效请求(最高优先级)
488
+ 'crawlo.middleware.download_delay.DownloadDelayMiddleware': 90 # 2. 控制请求频率
489
+ 'crawlo.middleware.default_header.DefaultHeaderMiddleware': 80 # 3. 添加默认请求头
490
+ 'crawlo.middleware.offsite.OffsiteMiddleware': 60 # 5. 站外请求过滤
491
+
492
+ # === 响应处理阶段 ===
493
+ 'crawlo.middleware.retry.RetryMiddleware': 50 # 6. 失败请求重试
494
+ 'crawlo.middleware.response_code.ResponseCodeMiddleware': 40 # 7. 处理特殊状态码
495
+ 'crawlo.middleware.response_filter.ResponseFilterMiddleware': 30 # 8. 响应内容过滤(最低优先级)
496
+ ```
497
+
498
+ ### 3. 用户自定义中间件优先级建议
499
+
500
+ - **请求处理类中间件**:
501
+ - 添加请求头/代理:优先级 75-85
502
+ - 请求过滤/验证:优先级 85-95
503
+ - 请求修改/增强:优先级 60-75
504
+
505
+ - **响应处理类中间件**:
506
+ - 响应重试/恢复:优先级 45-55
507
+ - 响应验证/解析:优先级 30-40
508
+ - 响应后处理:优先级 10-25
509
+
510
+ - **特殊处理类中间件**:
511
+ - 安全/认证中间件:优先级 90+
512
+ - 日志/监控中间件:优先级 20-40
513
+
514
+ ### 4. 优先级设置原则
515
+
516
+ 1. **请求处理优先于响应处理**:请求相关中间件优先级通常高于响应处理中间件
517
+ 2. **过滤器通常优先级较高**:过滤无效请求的中间件应具有较高优先级
518
+ 3. **依赖关系**:如果中间件A的输出是中间件B的输入,A的优先级应高于B
519
+ 4. **性能考虑**:可能快速过滤请求的中间件应具有较高优先级
520
+
521
+ > 💡 **提示**:`OffsiteMiddleware` 只有在配置了 `ALLOWED_DOMAINS` 时才会启用,否则会因 `NotConfiguredError` 而被禁用
522
+
523
+ ## 快速开始
524
+
525
+ ### 1. 创建项目
526
+
527
+ ```
528
+ # 创建新项目
529
+ crawlo startproject myproject
530
+ cd myproject
531
+
532
+ # 创建爬虫
533
+ crawlo genspider example example.com
534
+ ```
535
+
536
+ ### 2. 配置项目(推荐使用 Auto 模式)
537
+
538
+ ```
539
+ # myproject/settings.py
540
+ from crawlo.config import CrawloConfig
541
+
542
+ # 使用 Auto 模式:智能检测 Redis,自动选择最佳配置
543
+ config = CrawloConfig.auto(
544
+ project_name='myproject',
545
+ concurrency=12, # 并发数
546
+ download_delay=1.0 # 下载延迟(秒)
547
+ )
548
+
549
+ # 将配置应用到当前模块
550
+ locals().update(config.to_dict())
551
+
552
+ # 爬虫模块配置
553
+ SPIDER_MODULES = ['myproject.spiders']
554
+
555
+ # 日志配置
556
+ LOG_LEVEL = 'INFO'
557
+ LOG_FILE = 'logs/myproject.log'
558
+
559
+ # 可选:添加数据管道
560
+ # PIPELINES = [
561
+ # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
562
+ # ]
563
+
564
+ # 可选:Redis 配置(Auto 模式会自动检测)
565
+ # REDIS_HOST = '127.0.0.1'
566
+ # REDIS_PORT = 6379
567
+ ```
568
+
569
+ **其他配置模式:**
570
+
571
+ ```python
572
+ # Standalone 模式:单机开发测试
573
+ config = CrawloConfig.standalone(
574
+ project_name='myproject',
575
+ concurrency=8
576
+ )
577
+
578
+ # Distributed 模式:多节点分布式(必须配置 Redis)
579
+ config = CrawloConfig.distributed(
580
+ project_name='myproject',
581
+ redis_host='redis.example.com',
582
+ redis_port=6379,
583
+ redis_password='your_password',
584
+ concurrency=16
585
+ )
586
+ ```
587
+
588
+ ### 3. 编写爬虫
589
+
590
+ ```
591
+ # myproject/spiders/example.py
592
+ from crawlo import Spider
593
+ from crawlo.http import Request
594
+
595
+ class ExampleSpider(Spider):
596
+ name = 'example'
597
+ start_urls = ['https://example.com']
598
+
599
+ async def parse(self, response):
600
+ # 提取数据
601
+ title = response.css('h1::text').get()
602
+
603
+ # 返回数据
604
+ yield {
605
+ 'title': title,
606
+ 'url': response.url
607
+ }
608
+
609
+ # 跟进链接
610
+ for href in response.css('a::attr(href)').getall():
611
+ yield Request(
612
+ url=response.urljoin(href),
613
+ callback=self.parse
614
+ )
615
+ ```
616
+
617
+ ### 4. 运行爬虫
618
+
619
+ ```
620
+ # 运行指定爬虫
621
+ crawlo run example
622
+
623
+ # 指定日志级别
624
+ crawlo run example --log-level DEBUG
625
+ ```
626
+
627
+ ## 核心功能
628
+
629
+ ### Response 对象
630
+
631
+ Crawlo 的 [`Response`](crawlo/http/response.py) 对象提供了强大的网页处理能力:
632
+
633
+ **1. 智能编码检测**
634
+
635
+ ```
636
+ # 自动检测并正确解码页面内容
637
+ # 优先级:Content-Type → HTML meta → chardet → utf-8
638
+ response.text # 已正确解码的文本
639
+ response.encoding # 检测到的编码
640
+ ```
641
+
642
+ **2. CSS/XPath 选择器**
643
+
644
+ ```
645
+ # CSS 选择器(推荐)
646
+ title = response.css('h1::text').get()
647
+ links = response.css('a::attr(href)').getall()
648
+
649
+ # XPath 选择器
650
+ title = response.xpath('//title/text()').get()
651
+ links = response.xpath('//a/@href').getall()
652
+
653
+ # 支持默认值
654
+ title = response.css('h1::text').get(default='无标题')
655
+ ```
656
+
657
+ **3. URL 处理**
658
+
659
+ ```
660
+ response.url # 自动规范化(移除 fragment)
661
+ response.original_url # 保留原始 URL
662
+
663
+ # 智能 URL 拼接
664
+ response.urljoin('/path') # 绝对路径
665
+ response.urljoin('../path') # 相对路径
666
+ response.urljoin('//cdn.com/img') # 协议相对路径
667
+ ```
668
+
669
+ **4. 便捷提取方法**
670
+
671
+ ```
672
+ # 提取单个/多个元素文本
673
+ title = response.extract_text('h1')
674
+ paragraphs = response.extract_texts('.content p')
675
+
676
+ # 提取单个/多个元素属性
677
+ link = response.extract_attr('a', 'href')
678
+ all_links = response.extract_attrs('a', 'href')
679
+ ```
680
+
681
+ ### 配置工厂模式
682
+
683
+ Crawlo 提供了便捷的配置工厂方法,无需手动配置繁琐的参数:
684
+
685
+ ```
686
+ from crawlo.config import CrawloConfig
687
+
688
+ # Auto 模式(推荐):智能检测,自动适配
689
+ config = CrawloConfig.auto(
690
+ project_name='myproject',
691
+ concurrency=12,
692
+ download_delay=1.0
693
+ )
694
+
695
+ # Standalone 模式:单机开发
696
+ config = CrawloConfig.standalone(
697
+ project_name='myproject',
698
+ concurrency=8
699
+ )
700
+
701
+ # Distributed 模式:严格分布式
702
+ config = CrawloConfig.distributed(
703
+ project_name='myproject',
704
+ redis_host='localhost',
705
+ redis_port=6379,
706
+ concurrency=16
707
+ )
708
+
709
+ # 应用到 settings.py
710
+ locals().update(config.to_dict())
711
+ ```
712
+
713
+ **三种模式的核心区别**:
714
+
715
+ - **Auto**:智能检测 Redis,自动选择最佳配置,**推荐用于生产环境**
716
+ - **Standalone**:固定使用内存队列,适合开发测试,无外部依赖
717
+ - **Distributed**:严格要求 Redis,不允许降级,保证分布式一致性
718
+
719
+ > 💡 详细配置说明请查看前面的 [配置模式详解](#配置模式详解) 章节
720
+
721
+ ### 日志系统
722
+
723
+ Crawlo 提供了完善的日志系统,支持控制台和文件双输出:
724
+
725
+ ```
726
+ from crawlo.logging import get_logger
727
+
728
+ logger = get_logger(__name__)
729
+
730
+ logger.debug('调试信息')
731
+ logger.info('普通信息')
732
+ logger.warning('警告信息')
733
+ logger.error('错误信息')
734
+ ```
735
+
736
+ **日志配置:**
737
+
738
+ ```
739
+ # settings.py
740
+ LOG_LEVEL = 'INFO' # DEBUG, INFO, WARNING, ERROR, CRITICAL
741
+ LOG_FILE = 'logs/spider.log'
742
+ LOG_ENCODING = 'utf-8' # 明确指定日志文件编码
743
+ STATS_DUMP = True # 是否输出统计信息
744
+ ```
745
+
746
+ **高级功能:**
747
+
748
+ ```
749
+ from crawlo.logging import configure_logging
750
+
751
+ # 分别配置控制台和文件日志级别
752
+ configure_logging(
753
+ LOG_LEVEL='INFO',
754
+ LOG_CONSOLE_LEVEL='WARNING', # 控制台只显示 WARNING 及以上
755
+ LOG_FILE_LEVEL='DEBUG', # 文件记录 DEBUG 及以上
756
+ LOG_FILE='logs/app.log',
757
+ LOG_MAX_BYTES=10*1024*1024, # 10MB
758
+ LOG_BACKUP_COUNT=5
759
+ )
760
+ ```
761
+
762
+ ### 爬虫自动发现
763
+
764
+ Crawlo 支持自动发现爬虫,无需手动导入:
765
+
766
+ ```
767
+ # 自动发现并运行(推荐)
768
+ crawlo run spider_name
769
+
770
+ # 指定文件路径运行
771
+ crawlo run -f path/to/spider.py -s SpiderClassName
772
+ ```
773
+
774
+ 框架会自动在 `SPIDER_MODULES` 配置的模块中查找爬虫。
775
+
776
+ ### 跨平台支持
777
+
778
+ Crawlo 在 Windows、macOS、Linux 上均可无缝运行:
779
+
780
+ - **Windows**:自动使用 ProactorEventLoop,正确处理控制台编码
781
+ - **macOS/Linux**:使用默认的 SelectorEventLoop
782
+ - 兼容不同平台的路径格式
783
+
784
+ > 💡 **Windows 用户提示**:框架默认已禁用日志轮转功能以避免文件锁定问题。如需启用日志轮转,建议安装 `concurrent-log-handler`:
785
+ > ```bash
786
+ > pip install concurrent-log-handler
787
+ > ```
788
+ > 然后在 settings.py 中设置:
789
+ > ```python
790
+ > LOG_MAX_BYTES = 10 * 1024 * 1024 # 10MB
791
+ > LOG_BACKUP_COUNT = 5
792
+ > ```
793
+
794
+ ![Crawlo 核心架构图](assets/Crawlo%20核心架构图.png)
795
+
796
+ ## 文档
797
+
798
+ 完整文档请查看 [`docs/`](docs/) 目录:
799
+
800
+ ### 📚 核心教程
801
+
802
+ - [配置模式完全指南](docs/tutorials/configuration_modes.md) - **强烈推荐阅读**
803
+ - [架构概述](docs/modules/architecture/index.md)
804
+ - [运行模式](docs/modules/architecture/modes.md)
805
+ - [配置系统](docs/modules/configuration/index.md)
806
+
807
+ ### 🔧 核心模块
808
+
809
+ - [引擎 (Engine)](docs/modules/core/engine.md)
810
+ - [调度器 (Scheduler)](docs/modules/core/scheduler.md)
811
+ - [处理器 (Processor)](docs/modules/core/processor.md)
812
+ - [爬虫基类 (Spider)](docs/modules/core/spider.md)
813
+
814
+ ### 📦 功能模块
815
+
816
+ - [下载器 (Downloader)](docs/modules/downloader/index.md)
817
+ - [队列 (Queue)](docs/modules/queue/index.md)
818
+ - [过滤器 (Filter)](docs/modules/filter/index.md)
819
+ - [中间件 (Middleware)](docs/modules/middleware/index.md)
820
+ - [中间件优先级策略](docs/middleware_priority_guide.md)
821
+ - [管道 (Pipeline)](docs/modules/pipeline/index.md)
822
+ - [扩展 (Extension)](docs/modules/extension/index.md)
823
+
824
+ ### 🛠 命令行工具
825
+
826
+ - [CLI 概述](docs/modules/cli/index.md)
827
+ - [startproject](docs/modules/cli/startproject.md) - 项目初始化
828
+ - [genspider](docs/modules/cli/genspider.md) - 爬虫生成
829
+ - [run](docs/modules/cli/run.md) - 爬虫运行
830
+ - [list](docs/modules/cli/list.md) - 查看爬虫列表
831
+ - [check](docs/modules/cli/check.md) - 配置检查
832
+ - [stats](docs/modules/cli/stats.md) - 统计信息
833
+
834
+ ### 🚀 高级主题
835
+
836
+ - [分布式部署](docs/modules/advanced/distributed.md)
837
+ - [性能优化](docs/modules/advanced/performance.md)
838
+ - [故障排除](docs/modules/advanced/troubleshooting.md)
839
+ - [最佳实践](docs/modules/advanced/best_practices.md)
840
+
841
+ ### 📝 性能优化报告
842
+
843
+ - [初始化优化报告](docs/initialization_optimization_report.md)
844
+ - [MySQL 连接池优化](docs/mysql_connection_pool_optimization.md)
845
+ - [MongoDB 连接池优化](docs/mongo_connection_pool_optimization.md)
846
+
847
+ ### 🎯 中间件指南
848
+
849
+ - [中间件优先级策略](docs/middleware_priority_guide.md)
850
+
851
+ ### 📖 API 参考
852
+
853
+ - [完整 API 文档](docs/api/)
854
+
855
+ ---
856
+
857
+ **在线文档**:
858
+ - [中文文档](https://crawlo.readthedocs.io/en/latest/README_zh/)
859
+ - [English Documentation](https://crawlo.readthedocs.io/en/latest/)
860
+
861
+ **本地构建文档**:
862
+ ```
863
+ mkdocs serve
864
+ # 浏览器访问 http://localhost:8000
865
+ ```
866
+
867
+ ## 常见问题
868
+
869
+ ### 1. 如何选择配置模式?
870
+
871
+ - **开发测试**:使用 `CrawloConfig.standalone()`
872
+ - **生产环境**:使用 `CrawloConfig.auto()`(推荐)
873
+ - **多节点部署**:使用 `CrawloConfig.distributed()`
874
+
875
+ ### 2. Distributed 模式 Redis 不可用怎么办?
876
+
877
+ Distributed 模式**严格要求 Redis**,不可用时会抛出 `RuntimeError` 并退出。这是为了保证分布式一致性和数据安全。
878
+
879
+ 如果希望 Redis 不可用时自动降级,请使用 **Auto 模式**。
880
+
881
+ ### 3. Auto 模式如何工作?
882
+
883
+ Auto 模式在运行时智能检测:
884
+ - Redis 可用 → 使用 RedisPriorityQueue + AioRedisFilter
885
+ - Redis 不可用 → 降级到 MemoryQueue + MemoryFilter
886
+
887
+ 详见 [配置模式完全指南](docs/tutorials/configuration_modes.md)。
888
+
889
+ ### 4. 如何启用 MySQL 或 MongoDB 支持?
890
+
891
+ ```
892
+ # settings.py
893
+
894
+ PIPELINES = [
895
+ 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline', # MySQL
896
+ # 或
897
+ 'crawlo.pipelines.mongo_pipeline.MongoDBPipeline', # MongoDB
898
+ ]
899
+
900
+ # MySQL 配置
901
+ MYSQL_HOST = '127.0.0.1'
902
+ MYSQL_USER = 'root'
903
+ MYSQL_PASSWORD = 'password'
904
+ MYSQL_DB = 'mydb'
905
+ MYSQL_TABLE = 'items'
906
+
907
+ # MySQL 冲突处理策略(三者互斥,按优先级生效)
908
+ MYSQL_UPDATE_COLUMNS = ('updated',) # 优先级最高:主键冲突时更新指定列,使用 ON DUPLICATE KEY UPDATE
909
+ MYSQL_AUTO_UPDATE = False # 优先级中等:是否使用 REPLACE INTO(完全覆盖已存在记录)
910
+ MYSQL_INSERT_IGNORE = False # 优先级最低:是否使用 INSERT IGNORE(忽略重复数据)
911
+
912
+ # 批量插入配置
913
+ MYSQL_USE_BATCH = True # 是否使用批量插入提高性能
914
+ MYSQL_BATCH_SIZE = 100 # 批量插入大小
915
+
916
+ # MongoDB 配置
917
+ MONGO_URI = 'mongodb://localhost:27017'
918
+ MONGO_DATABASE = 'mydb'
919
+ MONGO_COLLECTION = 'items'
920
+ ```
921
+
922
+ **MySQL 冲突处理策略说明:**
923
+
924
+ Crawlo 的 MySQL 管道支持三种冲突处理策略,它们按照以下优先级顺序生效,**高优先级会覆盖低优先级**:
925
+
926
+ 1. **`MYSQL_UPDATE_COLUMNS`(最高优先级)**:
927
+ - 设置此项时,使用 `INSERT ... ON DUPLICATE KEY UPDATE` 语句
928
+ - 当主键或唯一索引冲突时,仅更新指定的列
929
+ - 示例:`MYSQL_UPDATE_COLUMNS = ('updated', 'modified')`
930
+
931
+ 2. **`MYSQL_AUTO_UPDATE`(中等优先级)**:
932
+ - 当 `MYSQL_UPDATE_COLUMNS` 未设置时生效
933
+ - 使用 `REPLACE INTO` 语句,完全替换已存在的记录
934
+ - 设置为 `True` 时启用
935
+
936
+ 3. **`MYSQL_INSERT_IGNORE`(最低优先级)**:
937
+ - 当前两个选项都未设置时生效
938
+ - 使用 `INSERT IGNORE` 语句,遇到冲突时忽略重复数据
939
+ - 设置为 `True` 时启用
940
+
941
+ **注意**:这三个参数是互斥的,只会应用优先级最高的那个设置。
942
+
943
+ ### 5. 如何使用代理?
944
+
945
+ ```
946
+ # settings.py
947
+
948
+ # 简单代理列表
949
+ PROXY_LIST = [
950
+ "http://proxy1:8080",
951
+ "http://proxy2:8080"
952
+ ]
953
+
954
+ # 或使用动态代理 API
955
+ PROXY_API_URL = "http://your-proxy-api.com/get-proxy"
956
+ ```
957
+
958
+ ## 学习路径
959
+
960
+ 如果您是 Crawlo 的新用户,建议按以下顺序学习:
961
+
962
+ 1. **入门** - 阅读快速开始指南,运行第一个示例
963
+ 2. **配置模式** - 学习三种配置模式,选择适合的模式([配置模式指南](docs/tutorials/configuration_modes.md))
964
+ 3. **核心概念** - 了解框架架构和基本概念
965
+ 4. **核心模块** - 深入学习引擎、调度器、处理器等核心组件
966
+ 5. **功能模块** - 根据需求学习下载器、队列、过滤器等模块
967
+ 6. **高级主题** - 掌握分布式部署、性能优化等高级功能
968
+
969
+ ## 贡献
970
+
971
+ 欢迎贡献!如果您想为 Crawlo 做出贡献,请访问我们的 [GitHub 仓库](https://github.com/crawl-coder/Crawlo):
972
+
973
+ 1. Fork [Crawlo 仓库](https://github.com/crawl-coder/Crawlo)
974
+ 2. 创建功能分支 (`git checkout -b feature/AmazingFeature`)
975
+ 3. 提交您的更改 (`git commit -m 'Add some AmazingFeature'`)
976
+ 4. 推送到分支 (`git push origin feature/AmazingFeature`)
977
+ 5. 发起 Pull Request
978
+
979
+ ## 许可证
980
+
981
+ MIT License - 详见 [LICENSE](LICENSE) 文件
982
+
983
+ ## 变更日志
984
+
985
+ ### v1.2.0
986
+
987
+ - **Redis Key 重构**:引入 `RedisKeyManager` 统一管理 Redis Key 的生成和验证
988
+ - 支持项目级别和爬虫级别的 Key 命名规范
989
+ - 支持在同一个项目下区分不同的爬虫
990
+ - 集成 `RedisKeyValidator` 确保 Key 命名规范一致性
991
+ - 详细文档请参见 [Redis Key 重构说明](docs/redis_key_refactor.md)
992
+
993
+ ---
994
+
995
+ <p align="center">
996
+ <i>如有问题或建议,欢迎提交 <a href="https://github.com/crawl-coder/Crawlo/issues">Issue</a></i>
997
+ </p>