crawlo 1.2.5.tar.gz → 1.2.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crawlo might be problematic.

Files changed (219)
  1. {crawlo-1.2.5/crawlo.egg-info → crawlo-1.2.6}/PKG-INFO +1 -1
  2. crawlo-1.2.6/crawlo/__version__.py +1 -0
  3. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/core/engine.py +3 -1
  4. crawlo-1.2.6/crawlo/core/scheduler.py +240 -0
  5. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/filters/aioredis_filter.py +44 -91
  6. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/queue/queue_manager.py +47 -8
  7. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/queue/redis_priority_queue.py +9 -2
  8. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/settings/default_settings.py +5 -7
  9. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/templates/project/settings.py.tmpl +3 -39
  10. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/templates/project/settings_distributed.py.tmpl +4 -1
  11. crawlo-1.2.6/crawlo/templates/project/settings_gentle.py.tmpl +101 -0
  12. crawlo-1.2.6/crawlo/templates/project/settings_high_performance.py.tmpl +135 -0
  13. crawlo-1.2.6/crawlo/templates/project/settings_simple.py.tmpl +99 -0
  14. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/templates/run.py.tmpl +1 -3
  15. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/redis_connection_pool.py +19 -2
  16. {crawlo-1.2.5 → crawlo-1.2.6/crawlo.egg-info}/PKG-INFO +1 -1
  17. crawlo-1.2.5/crawlo/__version__.py +0 -1
  18. crawlo-1.2.5/crawlo/core/scheduler.py +0 -144
  19. crawlo-1.2.5/crawlo/templates/project/settings_gentle.py.tmpl +0 -128
  20. crawlo-1.2.5/crawlo/templates/project/settings_high_performance.py.tmpl +0 -150
  21. crawlo-1.2.5/crawlo/templates/project/settings_simple.py.tmpl +0 -103
  22. {crawlo-1.2.5 → crawlo-1.2.6}/LICENSE +0 -0
  23. {crawlo-1.2.5 → crawlo-1.2.6}/MANIFEST.in +0 -0
  24. {crawlo-1.2.5 → crawlo-1.2.6}/README.md +0 -0
  25. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/__init__.py +0 -0
  26. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/cleaners/__init__.py +0 -0
  27. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/cleaners/data_formatter.py +0 -0
  28. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/cleaners/encoding_converter.py +0 -0
  29. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/cleaners/text_cleaner.py +0 -0
  30. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/cli.py +0 -0
  31. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/commands/__init__.py +0 -0
  32. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/commands/check.py +0 -0
  33. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/commands/genspider.py +0 -0
  34. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/commands/help.py +0 -0
  35. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/commands/list.py +0 -0
  36. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/commands/run.py +0 -0
  37. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/commands/startproject.py +0 -0
  38. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/commands/stats.py +0 -0
  39. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/commands/utils.py +0 -0
  40. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/config.py +0 -0
  41. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/config_validator.py +0 -0
  42. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/core/__init__.py +0 -0
  43. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/core/processor.py +0 -0
  44. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/crawler.py +0 -0
  45. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/data/__init__.py +0 -0
  46. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/data/user_agents.py +0 -0
  47. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/downloader/__init__.py +0 -0
  48. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/downloader/aiohttp_downloader.py +0 -0
  49. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/downloader/cffi_downloader.py +0 -0
  50. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/downloader/httpx_downloader.py +0 -0
  51. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/downloader/hybrid_downloader.py +0 -0
  52. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/downloader/playwright_downloader.py +0 -0
  53. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/downloader/selenium_downloader.py +0 -0
  54. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/event.py +0 -0
  55. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/exceptions.py +0 -0
  56. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/extension/__init__.py +0 -0
  57. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/extension/health_check.py +0 -0
  58. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/extension/log_interval.py +0 -0
  59. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/extension/log_stats.py +0 -0
  60. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/extension/logging_extension.py +0 -0
  61. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/extension/memory_monitor.py +0 -0
  62. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/extension/performance_profiler.py +0 -0
  63. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/extension/request_recorder.py +0 -0
  64. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/filters/__init__.py +0 -0
  65. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/filters/memory_filter.py +0 -0
  66. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/items/__init__.py +0 -0
  67. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/items/base.py +0 -0
  68. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/items/fields.py +0 -0
  69. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/items/items.py +0 -0
  70. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/middleware/__init__.py +0 -0
  71. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/middleware/default_header.py +0 -0
  72. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/middleware/download_delay.py +0 -0
  73. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/middleware/middleware_manager.py +0 -0
  74. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/middleware/offsite.py +0 -0
  75. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/middleware/proxy.py +0 -0
  76. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/middleware/request_ignore.py +0 -0
  77. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/middleware/response_code.py +0 -0
  78. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/middleware/response_filter.py +0 -0
  79. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/middleware/retry.py +0 -0
  80. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/mode_manager.py +0 -0
  81. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/network/__init__.py +0 -0
  82. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/network/request.py +0 -0
  83. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/network/response.py +0 -0
  84. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/pipelines/__init__.py +0 -0
  85. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
  86. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/pipelines/console_pipeline.py +0 -0
  87. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/pipelines/csv_pipeline.py +0 -0
  88. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
  89. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/pipelines/json_pipeline.py +0 -0
  90. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
  91. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/pipelines/mongo_pipeline.py +0 -0
  92. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/pipelines/mysql_pipeline.py +0 -0
  93. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/pipelines/pipeline_manager.py +0 -0
  94. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
  95. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/project.py +0 -0
  96. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/queue/__init__.py +0 -0
  97. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/queue/pqueue.py +0 -0
  98. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/settings/__init__.py +0 -0
  99. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/settings/setting_manager.py +0 -0
  100. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/spider/__init__.py +0 -0
  101. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/stats_collector.py +0 -0
  102. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/subscriber.py +0 -0
  103. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/task_manager.py +0 -0
  104. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  105. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/templates/project/__init__.py.tmpl +0 -0
  106. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/templates/project/items.py.tmpl +0 -0
  107. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/templates/project/middlewares.py.tmpl +0 -0
  108. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/templates/project/pipelines.py.tmpl +0 -0
  109. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  110. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/templates/spider/spider.py.tmpl +0 -0
  111. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/tools/__init__.py +0 -0
  112. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/tools/anti_crawler.py +0 -0
  113. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/tools/authenticated_proxy.py +0 -0
  114. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/tools/data_validator.py +0 -0
  115. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/tools/date_tools.py +0 -0
  116. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/tools/distributed_coordinator.py +0 -0
  117. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/tools/retry_mechanism.py +0 -0
  118. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/tools/scenario_adapter.py +0 -0
  119. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/__init__.py +0 -0
  120. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/batch_processor.py +0 -0
  121. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/controlled_spider_mixin.py +0 -0
  122. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/date_tools.py +0 -0
  123. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/db_helper.py +0 -0
  124. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/enhanced_error_handler.py +0 -0
  125. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/env_config.py +0 -0
  126. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/error_handler.py +0 -0
  127. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/func_tools.py +0 -0
  128. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/large_scale_config.py +0 -0
  129. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/large_scale_helper.py +0 -0
  130. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/log.py +0 -0
  131. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/performance_monitor.py +0 -0
  132. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/queue_helper.py +0 -0
  133. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/redis_key_validator.py +0 -0
  134. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/request.py +0 -0
  135. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/request_serializer.py +0 -0
  136. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/spider_loader.py +0 -0
  137. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/system.py +0 -0
  138. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/tools.py +0 -0
  139. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo/utils/url.py +0 -0
  140. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo.egg-info/SOURCES.txt +0 -0
  141. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo.egg-info/dependency_links.txt +0 -0
  142. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo.egg-info/entry_points.txt +0 -0
  143. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo.egg-info/requires.txt +0 -0
  144. {crawlo-1.2.5 → crawlo-1.2.6}/crawlo.egg-info/top_level.txt +0 -0
  145. {crawlo-1.2.5 → crawlo-1.2.6}/examples/__init__.py +0 -0
  146. {crawlo-1.2.5 → crawlo-1.2.6}/pyproject.toml +0 -0
  147. {crawlo-1.2.5 → crawlo-1.2.6}/requirements.txt +0 -0
  148. {crawlo-1.2.5 → crawlo-1.2.6}/setup.cfg +0 -0
  149. {crawlo-1.2.5 → crawlo-1.2.6}/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -0
  150. {crawlo-1.2.5 → crawlo-1.2.6}/tests/__init__.py +0 -0
  151. {crawlo-1.2.5 → crawlo-1.2.6}/tests/advanced_tools_example.py +0 -0
  152. {crawlo-1.2.5 → crawlo-1.2.6}/tests/authenticated_proxy_example.py +0 -0
  153. {crawlo-1.2.5 → crawlo-1.2.6}/tests/cleaners_example.py +0 -0
  154. {crawlo-1.2.5 → crawlo-1.2.6}/tests/config_validation_demo.py +0 -0
  155. {crawlo-1.2.5 → crawlo-1.2.6}/tests/controlled_spider_example.py +0 -0
  156. {crawlo-1.2.5 → crawlo-1.2.6}/tests/date_tools_example.py +0 -0
  157. {crawlo-1.2.5 → crawlo-1.2.6}/tests/dynamic_loading_example.py +0 -0
  158. {crawlo-1.2.5 → crawlo-1.2.6}/tests/dynamic_loading_test.py +0 -0
  159. {crawlo-1.2.5 → crawlo-1.2.6}/tests/env_config_example.py +0 -0
  160. {crawlo-1.2.5 → crawlo-1.2.6}/tests/error_handling_example.py +0 -0
  161. {crawlo-1.2.5 → crawlo-1.2.6}/tests/redis_key_validation_demo.py +0 -0
  162. {crawlo-1.2.5 → crawlo-1.2.6}/tests/response_improvements_example.py +0 -0
  163. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_advanced_tools.py +0 -0
  164. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_all_redis_key_configs.py +0 -0
  165. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_authenticated_proxy.py +0 -0
  166. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_cleaners.py +0 -0
  167. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_comprehensive.py +0 -0
  168. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_config_validator.py +0 -0
  169. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_crawlo_proxy_integration.py +0 -0
  170. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_date_tools.py +0 -0
  171. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_default_header_middleware.py +0 -0
  172. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_double_crawlo_fix.py +0 -0
  173. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_double_crawlo_fix_simple.py +0 -0
  174. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_download_delay_middleware.py +0 -0
  175. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_downloader_proxy_compatibility.py +0 -0
  176. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_dynamic_downloaders_proxy.py +0 -0
  177. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_dynamic_proxy.py +0 -0
  178. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_dynamic_proxy_config.py +0 -0
  179. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_dynamic_proxy_real.py +0 -0
  180. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_edge_cases.py +0 -0
  181. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_enhanced_error_handler.py +0 -0
  182. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_env_config.py +0 -0
  183. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_error_handler_compatibility.py +0 -0
  184. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_final_validation.py +0 -0
  185. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_framework_env_usage.py +0 -0
  186. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_integration.py +0 -0
  187. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_item_dedup_redis_key.py +0 -0
  188. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_offsite_middleware.py +0 -0
  189. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_parsel.py +0 -0
  190. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_performance.py +0 -0
  191. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_proxy_api.py +0 -0
  192. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_proxy_health_check.py +0 -0
  193. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_proxy_middleware.py +0 -0
  194. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_proxy_middleware_enhanced.py +0 -0
  195. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_proxy_middleware_integration.py +0 -0
  196. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_proxy_providers.py +0 -0
  197. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_proxy_stats.py +0 -0
  198. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_proxy_strategies.py +0 -0
  199. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_queue_manager_double_crawlo.py +0 -0
  200. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_queue_manager_redis_key.py +0 -0
  201. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_real_scenario_proxy.py +0 -0
  202. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_redis_config.py +0 -0
  203. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_redis_connection_pool.py +0 -0
  204. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_redis_key_naming.py +0 -0
  205. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_redis_key_validator.py +0 -0
  206. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_redis_queue.py +0 -0
  207. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_request_ignore_middleware.py +0 -0
  208. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_request_serialization.py +0 -0
  209. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_response_code_middleware.py +0 -0
  210. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_response_filter_middleware.py +0 -0
  211. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_response_improvements.py +0 -0
  212. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_retry_middleware.py +0 -0
  213. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_scheduler.py +0 -0
  214. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_simple_response.py +0 -0
  215. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_telecom_spider_redis_key.py +0 -0
  216. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_template_content.py +0 -0
  217. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_template_redis_key.py +0 -0
  218. {crawlo-1.2.5 → crawlo-1.2.6}/tests/test_tools.py +0 -0
  219. {crawlo-1.2.5 → crawlo-1.2.6}/tests/tools_example.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.2.5
+Version: 1.2.6
 Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder
crawlo/__version__.py
@@ -0,0 +1 @@
+__version__ = "1.2.6"
crawlo/core/engine.py
@@ -88,8 +88,9 @@ class Engine(object):
         self.downloader = downloader_cls(self.crawler)
         if hasattr(self.downloader, 'open'):
             if asyncio.iscoroutinefunction(self.downloader.open):
-                await self.downloader.open()
+                self.downloader.open()
             else:
+                # DownloaderBase.open() 是同步方法,直接调用而不是await
                 self.downloader.open()

         self.processor = Processor(self.crawler)
@@ -97,6 +98,7 @@ class Engine(object):
             if asyncio.iscoroutinefunction(self.processor.open):
                 await self.processor.open()
             else:
+                # Processor.open() 是同步方法
                 self.processor.open()

         self.start_requests = iter(spider.start_requests())
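For context, the usual way to drive an open() hook that may or may not be a coroutine looks roughly like the sketch below. This is a generic illustration rather than the framework's exact code; the component parameter is a hypothetical stand-in for the downloader or processor.

import asyncio

async def open_component(component):
    # Call an optional open() hook, awaiting it only when it is a coroutine function.
    hook = getattr(component, "open", None)
    if hook is None:
        return
    if asyncio.iscoroutinefunction(hook):
        await hook()  # asynchronous hook
    else:
        hook()        # plain synchronous hook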
crawlo/core/scheduler.py
@@ -0,0 +1,240 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+from typing import Optional, Callable
+import traceback
+
+from crawlo.utils.log import get_logger
+from crawlo.utils.request import set_request
+from crawlo.utils.request_serializer import RequestSerializer
+from crawlo.utils.error_handler import ErrorHandler
+from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
+from crawlo.project import load_class, common_call
+
+
+class Scheduler:
+    def __init__(self, crawler, dupe_filter, stats, log_level, priority):
+        self.crawler = crawler
+        self.queue_manager: Optional[QueueManager] = None
+        self.request_serializer = RequestSerializer()  # 专门处理序列化
+
+        self.logger = get_logger(name=self.__class__.__name__, level=log_level)
+        self.error_handler = ErrorHandler(self.__class__.__name__, log_level)
+        self.stats = stats
+        self.dupe_filter = dupe_filter
+        self.priority = priority
+
+    @classmethod
+    def create_instance(cls, crawler):
+        filter_cls = load_class(crawler.settings.get('FILTER_CLASS'))
+        o = cls(
+            crawler=crawler,
+            dupe_filter=filter_cls.create_instance(crawler),
+            stats=crawler.stats,
+            log_level=crawler.settings.get('LOG_LEVEL'),
+            priority=crawler.settings.get('DEPTH_PRIORITY')
+        )
+        return o
+
+    async def open(self):
+        """初始化调度器和队列"""
+        self.logger.info("开始初始化调度器...")
+        try:
+            # 创建队列配置
+            queue_config = QueueConfig.from_settings(self.crawler.settings)
+
+            # 创建队列管理器
+            self.queue_manager = QueueManager(queue_config)
+
+            # 初始化队列
+            self.logger.info("开始初始化队列管理器...")
+            needs_config_update = await self.queue_manager.initialize()
+
+            self.logger.info(f"队列初始化完成,needs_config_update: {needs_config_update}")
+            self.logger.info(f"当前队列类型: {self.queue_manager._queue_type}")
+
+            # 检查是否需要更新过滤器配置
+            if needs_config_update:
+                # 如果返回True,说明队列类型发生了变化,需要检查当前队列类型来决定更新方向
+                self.logger.info("需要更新配置...")
+                if self.queue_manager._queue_type == QueueType.REDIS:
+                    self.logger.info("更新为Redis配置...")
+                    self._update_filter_config_for_redis()
+                else:
+                    self.logger.info("更新为内存配置...")
+                    self._update_filter_config_if_needed()
+            else:
+                # 检查是否需要更新配置(即使队列管理器没有要求更新)
+                self.logger.debug("检查是否需要更新配置...")
+                if self.queue_manager._queue_type == QueueType.REDIS:
+                    # 检查当前过滤器是否为内存过滤器
+                    current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+                    if 'memory_filter' in current_filter_class:
+                        self.logger.info("检测到需要更新为Redis配置...")
+                        self._update_filter_config_for_redis()
+                elif self.queue_manager._queue_type == QueueType.MEMORY:
+                    # 检查当前过滤器是否为Redis过滤器
+                    current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+                    if 'aioredis_filter' in current_filter_class or 'redis_filter' in current_filter_class:
+                        self.logger.info("检测到需要更新为内存配置...")
+                        self._update_filter_config_if_needed()
+
+            # 只有在确实需要更新配置时才重新创建过滤器实例
+            # 检查是否真的进行了配置更新
+            filter_updated = (
+                (self.queue_manager._queue_type == QueueType.REDIS and 'memory_filter' in self.crawler.settings.get('FILTER_CLASS', '')) or
+                (self.queue_manager._queue_type == QueueType.MEMORY and ('aioredis_filter' in self.crawler.settings.get('FILTER_CLASS', '') or 'redis_filter' in self.crawler.settings.get('FILTER_CLASS', '')))
+            )
+
+            if needs_config_update or filter_updated:
+                # 重新创建过滤器实例,确保使用更新后的配置
+                self.logger.debug("重新创建过滤器实例...")
+                filter_cls = load_class(self.crawler.settings.get('FILTER_CLASS'))
+                self.dupe_filter = filter_cls.create_instance(self.crawler)
+                self.logger.info(f"✅ 过滤器实例已更新为: {type(self.dupe_filter).__name__}")
+            else:
+                self.logger.debug("过滤器配置无需更新,跳过重新创建")
+
+            # 输出队列状态和配置信息
+            status = self.queue_manager.get_status()
+            current_filter = self.crawler.settings.get('FILTER_CLASS')
+            current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE')
+
+            self.logger.info(f'队列类型: {status["type"]}, 状态: {status["health"]}')
+            self.logger.info(f'当前过滤器: {type(self.dupe_filter).__name__} ({current_filter})')
+            self.logger.info(f'当前去重管道: {current_dedup_pipeline}')
+            self.logger.info("调度器初始化完成")
+        except Exception as e:
+            self.logger.error(f"❌ 调度器初始化失败: {e}")
+            self.logger.debug(f"详细错误信息:\n{traceback.format_exc()}")
+            raise
+
+    def _update_filter_config_if_needed(self):
+        """如果队列类型切换到内存模式,则更新过滤器配置"""
+        if self.queue_manager and self.queue_manager._queue_type == QueueType.MEMORY:
+            # 检查当前过滤器是否为Redis过滤器
+            current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+            if 'aioredis_filter' in current_filter_class or 'redis_filter' in current_filter_class:
+                # 更新为内存过滤器
+                self.crawler.settings.set('FILTER_CLASS', 'crawlo.filters.memory_filter.MemoryFilter')
+                self.logger.info("✅ 已更新过滤器配置为内存模式")
+
+            # 检查当前去重管道是否为Redis去重管道
+            current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE', '')
+            if 'redis_dedup_pipeline' in current_dedup_pipeline:
+                # 更新为内存去重管道
+                self.crawler.settings.set('DEFAULT_DEDUP_PIPELINE', 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline')
+                # 同时更新PIPELINES列表中的去重管道
+                pipelines = self.crawler.settings.get('PIPELINES', [])
+                if current_dedup_pipeline in pipelines:
+                    # 找到并替换Redis去重管道为内存去重管道
+                    index = pipelines.index(current_dedup_pipeline)
+                    pipelines[index] = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
+                    self.crawler.settings.set('PIPELINES', pipelines)
+                self.logger.info("✅ 已更新去重管道配置为内存模式")
+
+    def _update_filter_config_for_redis(self):
+        """如果队列类型是Redis,则更新过滤器配置为Redis实现"""
+        if self.queue_manager and self.queue_manager._queue_type == QueueType.REDIS:
+            # 检查当前过滤器是否为内存过滤器
+            current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+            if 'memory_filter' in current_filter_class:
+                # 更新为Redis过滤器
+                self.crawler.settings.set('FILTER_CLASS', 'crawlo.filters.aioredis_filter.AioRedisFilter')
+                self.logger.info("✅ 已更新过滤器配置为Redis模式")
+
+            # 检查当前去重管道是否为内存去重管道
+            current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE', '')
+            if 'memory_dedup_pipeline' in current_dedup_pipeline:
+                # 更新为Redis去重管道
+                self.crawler.settings.set('DEFAULT_DEDUP_PIPELINE', 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline')
+                # 同时更新PIPELINES列表中的去重管道
+                pipelines = self.crawler.settings.get('PIPELINES', [])
+                if current_dedup_pipeline in pipelines:
+                    # 找到并替换内存去重管道为Redis去重管道
+                    index = pipelines.index(current_dedup_pipeline)
+                    pipelines[index] = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
+                    self.crawler.settings.set('PIPELINES', pipelines)
+                self.logger.info("✅ 已更新去重管道配置为Redis模式")
+
+    async def next_request(self):
+        """获取下一个请求"""
+        if not self.queue_manager:
+            return None
+
+        try:
+            request = await self.queue_manager.get()
+
+            # 恢复 callback(从 Redis 队列取出时)
+            if request:
+                spider = getattr(self.crawler, 'spider', None)
+                request = self.request_serializer.restore_after_deserialization(request, spider)
+
+            return request
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="获取下一个请求失败",
+                raise_error=False
+            )
+            return None
+
+    async def enqueue_request(self, request):
+        """将请求加入队列"""
+        if not request.dont_filter and await common_call(self.dupe_filter.requested, request):
+            self.dupe_filter.log_stats(request)
+            return False
+
+        if not self.queue_manager:
+            self.logger.error("队列管理器未初始化")
+            return False
+
+        set_request(request, self.priority)
+
+        try:
+            # 使用统一的队列接口
+            success = await self.queue_manager.put(request, priority=getattr(request, 'priority', 0))
+
+            if success:
+                self.logger.debug(f"✅ 请求入队成功: {request.url}")
+
+            return success
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="请求入队失败",
+                raise_error=False
+            )
+            return False
+
+    def idle(self) -> bool:
+        """检查队列是否为空"""
+        return len(self) == 0
+
+    async def async_idle(self) -> bool:
+        """异步检查队列是否为空(更精确)"""
+        if not self.queue_manager:
+            return True
+        # 使用队列管理器的异步empty方法
+        return await self.queue_manager.async_empty()
+
+    async def close(self):
+        """关闭调度器"""
+        try:
+            if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
+                await closed()
+
+            if self.queue_manager:
+                await self.queue_manager.close()
+        except Exception as e:
+            self.error_handler.handle_error(
+                e,
+                context="关闭调度器失败",
+                raise_error=False
+            )
+
+    def __len__(self):
+        """获取队列大小"""
+        if not self.queue_manager:
+            return 0
+        # 返回同步的近似值,实际大小需要异步获取
+        return 0 if self.queue_manager.empty() else 1
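The core of the new scheduler is the settings swap done by _update_filter_config_for_redis / _update_filter_config_if_needed. A minimal sketch of the Redis-direction swap is shown below; it assumes only a settings object exposing get()/set() like crawler.settings in the diff, the function name is ours, and the class paths are the ones used above.

def switch_dedup_to_redis(settings):
    # Swap the memory filter for the Redis filter when the queue ends up on Redis.
    if 'memory_filter' in settings.get('FILTER_CLASS', ''):
        settings.set('FILTER_CLASS', 'crawlo.filters.aioredis_filter.AioRedisFilter')

    # Swap the dedup pipeline and keep the PIPELINES list consistent with it.
    current = settings.get('DEFAULT_DEDUP_PIPELINE', '')
    if 'memory_dedup_pipeline' in current:
        redis_pipeline = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
        settings.set('DEFAULT_DEDUP_PIPELINE', redis_pipeline)
        pipelines = settings.get('PIPELINES', [])
        if current in pipelines:
            pipelines[pipelines.index(current)] = redis_pipeline
            settings.set('PIPELINES', pipelines)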
crawlo/filters/aioredis_filter.py
@@ -1,18 +1,6 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-Redis 过滤器实现
-=================
-提供基于 Redis 的分布式请求去重功能。
-
-特点:
-- 分布式支持: 多节点共享去重数据
-- TTL 支持: 自动过期清理
-- 高性能: 使用 Redis pipeline 优化
-- 容错设计: 网络异常自动重试
-"""
-import redis.asyncio as aioredis
 from typing import Optional
+import redis.asyncio as aioredis
+
 from crawlo.filters import BaseFilter
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import request_fingerprint
@@ -70,6 +58,9 @@ class AioRedisFilter(BaseFilter):
         # 性能计数器
         self._redis_operations = 0
         self._pipeline_operations = 0
+
+        # 连接状态标记,避免重复尝试连接失败的Redis
+        self._connection_failed = False

     @classmethod
     def create_instance(cls, crawler) -> 'BaseFilter':
@@ -123,8 +114,17 @@ class AioRedisFilter(BaseFilter):

     async def _get_redis_client(self):
         """获取Redis客户端实例(延迟初始化)"""
+        # 如果之前连接失败,直接返回None
+        if self._connection_failed:
+            return None
+
         if self.redis is None and self._redis_pool is not None:
-            self.redis = await self._redis_pool.get_connection()
+            try:
+                self.redis = await self._redis_pool.get_connection()
+            except Exception as e:
+                self._connection_failed = True
+                self.logger.error(f"Redis连接失败,将使用本地去重: {e}")
+                return None
         return self.redis

     async def requested(self, request) -> bool:
@@ -136,13 +136,17 @@ class AioRedisFilter(BaseFilter):
         """
         try:
             # 确保Redis客户端已初始化
-            await self._get_redis_client()
+            redis_client = await self._get_redis_client()
+
+            # 如果Redis不可用,返回False表示不重复(避免丢失请求)
+            if redis_client is None:
+                return False

             fp = str(request_fingerprint(request))
             self._redis_operations += 1

             # 使用 pipeline 优化性能
-            pipe = self.redis.pipeline()
+            pipe = redis_client.pipeline()
             pipe.sismember(self.redis_key, fp)

             results = await pipe.execute()
@@ -173,12 +177,16 @@ class AioRedisFilter(BaseFilter):
         """
         try:
             # 确保Redis客户端已初始化
-            await self._get_redis_client()
+            redis_client = await self._get_redis_client()
+
+            # 如果Redis不可用,返回False表示添加失败
+            if redis_client is None:
+                return False

             fp = str(fp)

             # 使用 pipeline 优化性能
-            pipe = self.redis.pipeline()
+            pipe = redis_client.pipeline()
             pipe.sadd(self.redis_key, fp)

             if self.ttl and self.ttl > 0:
@@ -197,85 +205,30 @@ class AioRedisFilter(BaseFilter):
         except Exception as e:
             self.logger.error(f"添加指纹失败: {fp[:20]}... - {e}")
             return False
-
-    def __contains__(self, item: str) -> bool:
+
+    async def __contains__(self, fp: str) -> bool:
         """
-        同步版本的包含检查(不推荐在异步环境中使用)
+        检查指纹是否存在于Redis集合中

-        :param item: 要检查的指纹
-        :return: 是否已存在
+        :param fp: 请求指纹字符串
+        :return: 是否存在
         """
-        # 这是一个同步方法,不能直接调用异步Redis操作
-        # 建议使用 requested() 方法替代
-        raise NotImplementedError("请使用 requested() 方法进行异步检查")
-
-    async def get_stats(self) -> dict:
-        """获取过滤器详细统计信息"""
         try:
             # 确保Redis客户端已初始化
-            await self._get_redis_client()
-
-            count = await self.redis.scard(self.redis_key)
+            redis_client = await self._get_redis_client()

-            # 获取TTL信息
-            ttl_info = "TTL未设置"
-            if self.ttl:
-                remaining_ttl = await self.redis.ttl(self.redis_key)
-                if remaining_ttl > 0:
-                    ttl_info = f"剩余 {remaining_ttl} 秒"
-            else:
-                ttl_info = f"配置 {self.ttl} 秒"
-
-            stats = {
-                'filter_type': 'AioRedisFilter',
-                '指纹总数': count,
-                'Redis键名': self.redis_key,
-                'TTL配置': ttl_info,
-                'Redis操作数': self._redis_operations,
-                'Pipeline操作数': self._pipeline_operations,
-                '性能优化率': f"{self._pipeline_operations / max(1, self._redis_operations) * 100:.1f}%"
-            }
-
-            # 合并基类统计
-            base_stats = super().get_stats()
-            stats.update(base_stats)
-
-            return stats
+            # 如果Redis不可用,返回False表示不存在
+            if redis_client is None:
+                return False

+            # 检查指纹是否存在
+            exists = await redis_client.sismember(self.redis_key, str(fp))
+            return exists
         except Exception as e:
-            self.logger.error(f"获取统计信息失败: {e}")
-            return super().get_stats()
-
-    async def clear_all(self) -> int:
-        """清空所有指纹数据"""
-        try:
-            # 确保Redis客户端已初始化
-            await self._get_redis_client()
-
-            deleted = await self.redis.delete(self.redis_key)
-            self.logger.info(f"已清除指纹数: {deleted}")
-            return deleted
-        except Exception as e:
-            self.logger.error("清空指纹失败")
-            raise
+            self.logger.error(f"检查指纹存在性失败: {fp[:20]}... - {e}")
+            # 在网络异常时返回False,避免丢失请求
+            return False

-    async def closed(self, reason: Optional[str] = None) -> None:
-        """爬虫关闭时的清理操作"""
-        try:
-            # 确保Redis客户端已初始化
-            await self._get_redis_client()
-
-            if self.cleanup_fp:
-                deleted = await self.redis.delete(self.redis_key)
-                self.logger.info(f"爬虫关闭清理: 已删除{deleted}个指纹")
-            else:
-                count = await self.redis.scard(self.redis_key)
-                ttl_info = f"{self.ttl}秒" if self.ttl else "持久化"
-                self.logger.info(f"保留指纹数: {count} (TTL: {ttl_info})")
-        finally:
-            await self._close_redis()

-    async def _close_redis(self) -> None:
-        """安全关闭Redis连接"""
-        # 连接池会自动管理连接,这里不需要显式关闭
-        self.logger.debug("Redis连接已释放")
+# 为了兼容性,确保导出类
+__all__ = ['AioRedisFilter']
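Taken together, these changes amount to a fail-open dedup policy: after the first failed connection the filter stops touching Redis and reports every request as unseen, so no requests are dropped. A stripped-down sketch of that pattern follows; get_connection is a hypothetical coroutine standing in for the framework's pool call, and the key name is an example.

class FailOpenDedup:
    def __init__(self, get_connection):
        self._get_connection = get_connection
        self._client = None
        self._connection_failed = False  # remember the first failure, never retry

    async def _client_or_none(self):
        if self._connection_failed:
            return None
        if self._client is None:
            try:
                self._client = await self._get_connection()
            except Exception:
                self._connection_failed = True
                return None
        return self._client

    async def seen(self, fingerprint: str) -> bool:
        client = await self._client_or_none()
        if client is None:
            return False  # Redis unavailable: fail open, never report a duplicate
        return bool(await client.sismember("dedup:fingerprints", fingerprint))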
crawlo/queue/queue_manager.py
@@ -4,11 +4,11 @@
 统一的队列管理器
 提供简洁、一致的队列接口,自动处理不同队列类型的差异
 """
-from typing import Optional, Dict, Any, Union
-from enum import Enum
+import os
 import asyncio
 import traceback
-import os
+from typing import Optional, Dict, Any, Union
+from enum import Enum

 from crawlo.utils.log import get_logger
 from crawlo.utils.request_serializer import RequestSerializer
@@ -103,11 +103,24 @@ class QueueManager:
             self._queue_type = queue_type

             # 测试队列健康状态
-            await self._health_check()
+            health_check_result = await self._health_check()

             self.logger.info(f"✅ 队列初始化成功: {queue_type.value}")
-            self.logger.info(f"📊 队列配置: {self._get_queue_info()}")
-            return True
+            # 只在调试模式下输出详细配置信息
+            self.logger.debug(f"📊 队列配置: {self._get_queue_info()}")
+
+            # 如果健康检查返回True,表示队列类型发生了切换,需要更新配置
+            if health_check_result:
+                return True
+
+            # 如果队列类型是Redis,检查是否需要更新配置
+            if queue_type == QueueType.REDIS:
+                # 这个检查需要在调度器中进行,因为队列管理器无法访问crawler.settings
+                # 但我们不需要总是返回True,只有在确实需要更新时才返回True
+                # 调度器会进行更详细的检查
+                pass
+
+            return False  # 默认不需要更新配置

         except Exception as e:
             # 记录详细的错误信息和堆栈跟踪
@@ -265,7 +278,15 @@
                 raise RuntimeError("Redis 队列不可用:未安装 redis 依赖")
             if not self.config.redis_url:
                 raise RuntimeError("Redis 队列不可用:未配置 REDIS_URL")
-            return QueueType.REDIS
+            # 测试 Redis 连接
+            try:
+                test_queue = RedisPriorityQueue(self.config.redis_url)
+                await test_queue.connect()
+                await test_queue.close()
+                return QueueType.REDIS
+            except Exception as e:
+                # 如果强制使用Redis但连接失败,则抛出异常
+                raise RuntimeError(f"Redis 队列不可用:无法连接到 Redis ({e})")

         elif self.config.queue_type == QueueType.MEMORY:
             return QueueType.MEMORY
@@ -307,7 +328,7 @@
         else:
             raise ValueError(f"不支持的队列类型: {queue_type}")

-    async def _health_check(self) -> None:
+    async def _health_check(self) -> bool:
         """健康检查"""
         try:
             if self._queue_type == QueueType.REDIS:
@@ -317,9 +338,27 @@
             else:
                 # 内存队列总是健康的
                 self._health_status = "healthy"
+                return False  # 内存队列不需要更新配置
         except Exception as e:
             self.logger.warning(f"队列健康检查失败: {e}")
             self._health_status = "unhealthy"
+            # 如果是Redis队列且健康检查失败,尝试切换到内存队列
+            if self._queue_type == QueueType.REDIS and self.config.queue_type == QueueType.AUTO:
+                self.logger.info("Redis队列不可用,尝试切换到内存队列...")
+                try:
+                    await self._queue.close()
+                except:
+                    pass
+                self._queue = None
+                # 重新创建内存队列
+                self._queue = await self._create_queue(QueueType.MEMORY)
+                self._queue_type = QueueType.MEMORY
+                self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
+                self._health_status = "healthy"
+                self.logger.info("✅ 已切换到内存队列")
+                # 返回一个信号,表示需要更新过滤器和去重管道配置
+                return True
+            return False

     def _get_queue_info(self) -> Dict[str, Any]:
         """获取队列配置信息"""
crawlo/queue/redis_priority_queue.py
@@ -77,7 +77,13 @@
         """异步连接 Redis,支持重试"""
         async with self._lock:
             if self._redis is not None:
-                return self._redis
+                # 如果已经连接,测试连接是否仍然有效
+                try:
+                    await self._redis.ping()
+                    return self._redis
+                except Exception:
+                    # 连接失效,重新连接
+                    self._redis = None

             for attempt in range(max_retries):
                 try:
@@ -97,7 +103,8 @@

                     # 测试连接
                     await self._redis.ping()
-                    logger.info(f"✅ Redis 连接成功 (Module: {self.module_name})")
+                    # 只在调试模式下输出详细连接信息
+                    logger.debug(f"✅ Redis 连接成功 (Module: {self.module_name})")
                     return self._redis
                 except Exception as e:
                     error_msg = f"⚠️ Redis 连接失败 (尝试 {attempt + 1}/{max_retries}, Module: {self.module_name}): {e}"
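The connect() change is the classic validate-then-reuse pattern: an existing client is reused only after a successful PING, otherwise it is discarded and rebuilt. A minimal sketch, with make_client as a hypothetical factory coroutine:

async def get_live_client(current, make_client):
    # Reuse the current client only if it still answers PING; otherwise rebuild it.
    if current is not None:
        try:
            await current.ping()
            return current
        except Exception:
            current = None  # stale connection, fall through and reconnect
    return await make_client()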
crawlo/settings/default_settings.py
@@ -3,7 +3,6 @@
 默认配置文件
 包含 Crawlo 框架的所有默认设置项
 """
-import os

 # 添加环境变量配置工具导入
 from crawlo.utils.env_config import get_redis_config, get_runtime_config
@@ -13,9 +12,6 @@ from crawlo.utils.env_config import get_redis_config, get_runtime_config
 # 项目名称(用于日志、Redis Key 等标识)
 PROJECT_NAME = get_runtime_config()['PROJECT_NAME']

-# 框架版本
-VERSION = 1.0
-
 # 运行模式:standalone/distributed/auto
 RUN_MODE = get_runtime_config()['CRAWLO_MODE']

@@ -46,9 +42,11 @@ SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
 # 队列类型:memory/redis/auto
 QUEUE_TYPE = 'auto'

-# 明确配置默认去重管道和过滤器,避免冗余的if-else判断
-DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
-FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+
+# 默认使用内存过滤器和去重管道,确保在无Redis环境下也能正常运行
+# 在auto模式下,如果Redis可用,框架会自动更新为Redis实现以提供更好的去重能力
+DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
+FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'

 # --- Redis 过滤器配置 ---
 # 使用环境变量配置工具获取 Redis 配置
crawlo/templates/project/settings.py.tmpl
@@ -75,43 +75,6 @@ INTERVAL = 5
 DEPTH_PRIORITY = 1
 MAX_RUNNING_SPIDERS = 3

-# ============================== 运行模式选择 ==============================
-# 运行模式:'standalone'(单机), 'distributed'(分布式), 'auto'(自动检测)
-#
-# 三种运行模式的最佳使用场景:
-#
-# 1. standalone(单机模式):
-#    - 适用场景:开发调试、小规模数据采集、个人项目
-#    - 特点:简单易用,资源占用少,无需额外依赖
-#    - 配置建议:
-#      * QUEUE_TYPE = 'auto'(自动选择队列类型)
-#      * FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'(内存过滤器)
-#      * DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'(内存去重)
-#    - 混合配置(推荐):
-#      * QUEUE_TYPE = 'auto'(自动选择)
-#      * FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'(Redis过滤器)
-#      * DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'(Redis去重)
-#      * 优势:享受Redis去重的持久性,同时保持部署简单
-#
-# 2. distributed(分布式模式):
-#    - 适用场景:大规模数据采集、多节点协同工作、高并发需求
-#    - 特点:支持多节点扩展,高并发处理能力,需要Redis支持
-#    - 配置建议:
-#      * QUEUE_TYPE = 'redis'(Redis队列)
-#      * FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'(Redis过滤器)
-#      * DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'(Redis去重)
-#    - 部署要求:需要配置Redis服务器连接参数
-#
-# 3. auto(自动检测模式):
-#    - 适用场景:希望根据环境自动选择最佳运行方式
-#    - 特点:智能检测环境配置,自动选择运行模式
-#    - 配置建议:
-#      * 框架会根据Redis可用性自动选择队列类型
-#      * 默认使用内存过滤器和去重管道
-#    - 适用情况:需要在不同环境中使用同一套配置
-
-RUN_MODE = 'standalone'  # 默认单机模式,简单易用
-
 # ============================== 队列配置(支持分布式) ==============================

 # 队列类型:'auto'(自动选择), 'memory'(内存队列), 'redis'(分布式队列)
@@ -157,8 +120,9 @@ MONGO_USE_BATCH = False # 是否启用批量插入
 REQUEST_DIR = '.'

 # 明确配置默认去重管道和过滤器,避免冗余的if-else判断
-DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
-FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+# 在单机模式下,如果Redis可用则使用Redis去重,否则使用内存去重
+DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
+FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'

 # --- Redis 配置(用于分布式去重和队列) ---
 REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
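For illustration, the net effect of this template change is that a newly generated project's settings.py pairs an auto-selected queue with Redis-backed deduplication and falls back to the memory implementations when Redis is unreachable. The setting names and class paths below are the ones shown in the diff; the values are examples only.

QUEUE_TYPE = 'auto'  # 'memory', 'redis', or 'auto'
FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
REDIS_HOST = '127.0.0.1'  # e.g. taken from the REDIS_HOST environment variable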