crawlo 1.2.4.tar.gz → 1.2.6.tar.gz

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in their public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (219)
  1. {crawlo-1.2.4/crawlo.egg-info → crawlo-1.2.6}/PKG-INFO +1 -1
  2. crawlo-1.2.6/crawlo/__version__.py +1 -0
  3. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/cli.py +12 -5
  4. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/startproject.py +22 -6
  5. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/core/engine.py +3 -1
  6. crawlo-1.2.6/crawlo/core/scheduler.py +240 -0
  7. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/filters/aioredis_filter.py +44 -91
  8. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/queue/queue_manager.py +47 -8
  9. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/queue/redis_priority_queue.py +9 -2
  10. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/settings/default_settings.py +5 -7
  11. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/settings.py.tmpl +3 -65
  12. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/settings_distributed.py.tmpl +4 -7
  13. crawlo-1.2.6/crawlo/templates/project/settings_gentle.py.tmpl +101 -0
  14. crawlo-1.2.6/crawlo/templates/project/settings_high_performance.py.tmpl +135 -0
  15. crawlo-1.2.6/crawlo/templates/project/settings_simple.py.tmpl +99 -0
  16. {crawlo-1.2.4/crawlo/templates/project → crawlo-1.2.6/crawlo/templates}/run.py.tmpl +1 -3
  17. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/redis_connection_pool.py +19 -2
  18. {crawlo-1.2.4 → crawlo-1.2.6/crawlo.egg-info}/PKG-INFO +1 -1
  19. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo.egg-info/SOURCES.txt +1 -1
  20. crawlo-1.2.4/crawlo/__version__.py +0 -1
  21. crawlo-1.2.4/crawlo/core/scheduler.py +0 -144
  22. crawlo-1.2.4/crawlo/templates/project/settings_gentle.py.tmpl +0 -134
  23. crawlo-1.2.4/crawlo/templates/project/settings_high_performance.py.tmpl +0 -156
  24. crawlo-1.2.4/crawlo/templates/project/settings_simple.py.tmpl +0 -109
  25. {crawlo-1.2.4 → crawlo-1.2.6}/LICENSE +0 -0
  26. {crawlo-1.2.4 → crawlo-1.2.6}/MANIFEST.in +0 -0
  27. {crawlo-1.2.4 → crawlo-1.2.6}/README.md +0 -0
  28. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/__init__.py +0 -0
  29. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/cleaners/__init__.py +0 -0
  30. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/cleaners/data_formatter.py +0 -0
  31. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/cleaners/encoding_converter.py +0 -0
  32. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/cleaners/text_cleaner.py +0 -0
  33. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/__init__.py +0 -0
  34. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/check.py +0 -0
  35. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/genspider.py +0 -0
  36. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/help.py +0 -0
  37. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/list.py +0 -0
  38. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/run.py +0 -0
  39. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/stats.py +0 -0
  40. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/commands/utils.py +0 -0
  41. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/config.py +0 -0
  42. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/config_validator.py +0 -0
  43. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/core/__init__.py +0 -0
  44. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/core/processor.py +0 -0
  45. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/crawler.py +0 -0
  46. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/data/__init__.py +0 -0
  47. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/data/user_agents.py +0 -0
  48. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/__init__.py +0 -0
  49. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/aiohttp_downloader.py +0 -0
  50. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/cffi_downloader.py +0 -0
  51. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/httpx_downloader.py +0 -0
  52. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/hybrid_downloader.py +0 -0
  53. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/playwright_downloader.py +0 -0
  54. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/downloader/selenium_downloader.py +0 -0
  55. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/event.py +0 -0
  56. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/exceptions.py +0 -0
  57. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/__init__.py +0 -0
  58. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/health_check.py +0 -0
  59. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/log_interval.py +0 -0
  60. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/log_stats.py +0 -0
  61. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/logging_extension.py +0 -0
  62. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/memory_monitor.py +0 -0
  63. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/performance_profiler.py +0 -0
  64. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/extension/request_recorder.py +0 -0
  65. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/filters/__init__.py +0 -0
  66. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/filters/memory_filter.py +0 -0
  67. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/items/__init__.py +0 -0
  68. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/items/base.py +0 -0
  69. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/items/fields.py +0 -0
  70. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/items/items.py +0 -0
  71. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/__init__.py +0 -0
  72. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/default_header.py +0 -0
  73. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/download_delay.py +0 -0
  74. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/middleware_manager.py +0 -0
  75. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/offsite.py +0 -0
  76. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/proxy.py +0 -0
  77. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/request_ignore.py +0 -0
  78. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/response_code.py +0 -0
  79. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/response_filter.py +0 -0
  80. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/middleware/retry.py +0 -0
  81. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/mode_manager.py +0 -0
  82. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/network/__init__.py +0 -0
  83. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/network/request.py +0 -0
  84. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/network/response.py +0 -0
  85. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/__init__.py +0 -0
  86. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
  87. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/console_pipeline.py +0 -0
  88. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/csv_pipeline.py +0 -0
  89. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
  90. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/json_pipeline.py +0 -0
  91. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
  92. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/mongo_pipeline.py +0 -0
  93. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/mysql_pipeline.py +0 -0
  94. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/pipeline_manager.py +0 -0
  95. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
  96. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/project.py +0 -0
  97. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/queue/__init__.py +0 -0
  98. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/queue/pqueue.py +0 -0
  99. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/settings/__init__.py +0 -0
  100. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/settings/setting_manager.py +0 -0
  101. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/spider/__init__.py +0 -0
  102. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/stats_collector.py +0 -0
  103. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/subscriber.py +0 -0
  104. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/task_manager.py +0 -0
  105. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  106. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/__init__.py.tmpl +0 -0
  107. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/items.py.tmpl +0 -0
  108. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/middlewares.py.tmpl +0 -0
  109. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/pipelines.py.tmpl +0 -0
  110. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  111. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/templates/spider/spider.py.tmpl +0 -0
  112. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/__init__.py +0 -0
  113. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/anti_crawler.py +0 -0
  114. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/authenticated_proxy.py +0 -0
  115. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/data_validator.py +0 -0
  116. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/date_tools.py +0 -0
  117. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/distributed_coordinator.py +0 -0
  118. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/retry_mechanism.py +0 -0
  119. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/tools/scenario_adapter.py +0 -0
  120. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/__init__.py +0 -0
  121. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/batch_processor.py +0 -0
  122. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/controlled_spider_mixin.py +0 -0
  123. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/date_tools.py +0 -0
  124. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/db_helper.py +0 -0
  125. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/enhanced_error_handler.py +0 -0
  126. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/env_config.py +0 -0
  127. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/error_handler.py +0 -0
  128. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/func_tools.py +0 -0
  129. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/large_scale_config.py +0 -0
  130. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/large_scale_helper.py +0 -0
  131. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/log.py +0 -0
  132. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/performance_monitor.py +0 -0
  133. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/queue_helper.py +0 -0
  134. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/redis_key_validator.py +0 -0
  135. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/request.py +0 -0
  136. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/request_serializer.py +0 -0
  137. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/spider_loader.py +0 -0
  138. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/system.py +0 -0
  139. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/tools.py +0 -0
  140. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo/utils/url.py +0 -0
  141. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo.egg-info/dependency_links.txt +0 -0
  142. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo.egg-info/entry_points.txt +0 -0
  143. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo.egg-info/requires.txt +0 -0
  144. {crawlo-1.2.4 → crawlo-1.2.6}/crawlo.egg-info/top_level.txt +0 -0
  145. {crawlo-1.2.4 → crawlo-1.2.6}/examples/__init__.py +0 -0
  146. {crawlo-1.2.4 → crawlo-1.2.6}/pyproject.toml +0 -0
  147. {crawlo-1.2.4 → crawlo-1.2.6}/requirements.txt +0 -0
  148. {crawlo-1.2.4 → crawlo-1.2.6}/setup.cfg +0 -0
  149. {crawlo-1.2.4 → crawlo-1.2.6}/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -0
  150. {crawlo-1.2.4 → crawlo-1.2.6}/tests/__init__.py +0 -0
  151. {crawlo-1.2.4 → crawlo-1.2.6}/tests/advanced_tools_example.py +0 -0
  152. {crawlo-1.2.4 → crawlo-1.2.6}/tests/authenticated_proxy_example.py +0 -0
  153. {crawlo-1.2.4 → crawlo-1.2.6}/tests/cleaners_example.py +0 -0
  154. {crawlo-1.2.4 → crawlo-1.2.6}/tests/config_validation_demo.py +0 -0
  155. {crawlo-1.2.4 → crawlo-1.2.6}/tests/controlled_spider_example.py +0 -0
  156. {crawlo-1.2.4 → crawlo-1.2.6}/tests/date_tools_example.py +0 -0
  157. {crawlo-1.2.4 → crawlo-1.2.6}/tests/dynamic_loading_example.py +0 -0
  158. {crawlo-1.2.4 → crawlo-1.2.6}/tests/dynamic_loading_test.py +0 -0
  159. {crawlo-1.2.4 → crawlo-1.2.6}/tests/env_config_example.py +0 -0
  160. {crawlo-1.2.4 → crawlo-1.2.6}/tests/error_handling_example.py +0 -0
  161. {crawlo-1.2.4 → crawlo-1.2.6}/tests/redis_key_validation_demo.py +0 -0
  162. {crawlo-1.2.4 → crawlo-1.2.6}/tests/response_improvements_example.py +0 -0
  163. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_advanced_tools.py +0 -0
  164. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_all_redis_key_configs.py +0 -0
  165. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_authenticated_proxy.py +0 -0
  166. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_cleaners.py +0 -0
  167. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_comprehensive.py +0 -0
  168. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_config_validator.py +0 -0
  169. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_crawlo_proxy_integration.py +0 -0
  170. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_date_tools.py +0 -0
  171. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_default_header_middleware.py +0 -0
  172. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_double_crawlo_fix.py +0 -0
  173. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_double_crawlo_fix_simple.py +0 -0
  174. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_download_delay_middleware.py +0 -0
  175. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_downloader_proxy_compatibility.py +0 -0
  176. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_dynamic_downloaders_proxy.py +0 -0
  177. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_dynamic_proxy.py +0 -0
  178. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_dynamic_proxy_config.py +0 -0
  179. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_dynamic_proxy_real.py +0 -0
  180. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_edge_cases.py +0 -0
  181. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_enhanced_error_handler.py +0 -0
  182. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_env_config.py +0 -0
  183. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_error_handler_compatibility.py +0 -0
  184. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_final_validation.py +0 -0
  185. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_framework_env_usage.py +0 -0
  186. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_integration.py +0 -0
  187. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_item_dedup_redis_key.py +0 -0
  188. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_offsite_middleware.py +0 -0
  189. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_parsel.py +0 -0
  190. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_performance.py +0 -0
  191. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_api.py +0 -0
  192. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_health_check.py +0 -0
  193. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_middleware.py +0 -0
  194. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_middleware_enhanced.py +0 -0
  195. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_middleware_integration.py +0 -0
  196. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_providers.py +0 -0
  197. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_stats.py +0 -0
  198. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_proxy_strategies.py +0 -0
  199. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_queue_manager_double_crawlo.py +0 -0
  200. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_queue_manager_redis_key.py +0 -0
  201. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_real_scenario_proxy.py +0 -0
  202. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_redis_config.py +0 -0
  203. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_redis_connection_pool.py +0 -0
  204. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_redis_key_naming.py +0 -0
  205. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_redis_key_validator.py +0 -0
  206. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_redis_queue.py +0 -0
  207. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_request_ignore_middleware.py +0 -0
  208. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_request_serialization.py +0 -0
  209. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_response_code_middleware.py +0 -0
  210. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_response_filter_middleware.py +0 -0
  211. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_response_improvements.py +0 -0
  212. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_retry_middleware.py +0 -0
  213. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_scheduler.py +0 -0
  214. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_simple_response.py +0 -0
  215. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_telecom_spider_redis_key.py +0 -0
  216. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_template_content.py +0 -0
  217. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_template_redis_key.py +0 -0
  218. {crawlo-1.2.4 → crawlo-1.2.6}/tests/test_tools.py +0 -0
  219. {crawlo-1.2.4 → crawlo-1.2.6}/tests/tools_example.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.2.4
+ Version: 1.2.6
  Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder

crawlo/__version__.py
@@ -0,0 +1 @@
+ __version__ = "1.2.6"

crawlo/cli.py
@@ -10,12 +10,19 @@ from crawlo.commands import get_commands
  def main():
      # 获取框架版本号
      version_file = os.path.join(os.path.dirname(__file__), '__version__.py')
+     VERSION = '1.0.0'  # 默认版本号
      if os.path.exists(version_file):
-         with open(version_file, 'r') as f:
-             exec(f.read())
-         VERSION = locals().get('__version__', '1.0.0')
-     else:
-         VERSION = '1.0.0'
+         try:
+             with open(version_file, 'r') as f:
+                 content = f.read()
+             # 使用正则表达式提取版本号
+             import re
+             version_match = re.search(r"__version__\s*=\s*['\"]([^'\"]*)['\"]", content)
+             if version_match:
+                 VERSION = version_match.group(1)
+         except Exception:
+             # 如果读取失败,使用默认版本号
+             pass

      # 获取所有可用命令
      commands = get_commands()
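
Note: the crawlo/cli.py hunk above replaces exec() of __version__.py with a regex scan plus a '1.0.0' fallback. A minimal standalone sketch of the same pattern (the file name, regex, and fallback value are taken from the diff; the helper name is illustrative):

    import os
    import re

    def read_version(package_dir: str, default: str = "1.0.0") -> str:
        # Extract __version__ = "x.y.z" without executing the module.
        version_file = os.path.join(package_dir, "__version__.py")
        if not os.path.exists(version_file):
            return default
        try:
            with open(version_file, "r") as f:
                content = f.read()
            match = re.search(r"__version__\s*=\s*['\"]([^'\"]*)['\"]", content)
            return match.group(1) if match else default
        except Exception:
            return default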

crawlo/commands/startproject.py
@@ -108,7 +108,11 @@ def _copytree_with_templates(src, dst, context, template_type='default', modules

      for item in src_path.rglob('*'):
          rel_path = item.relative_to(src_path)
-         dst_item = dst_path / rel_path
+         # 对于run.py.tmpl文件,需要特殊处理,将其放到项目根目录
+         if item.name == 'run.py.tmpl':
+             dst_item = dst_path.parent / rel_path  # 放到项目根目录
+         else:
+             dst_item = dst_path / rel_path

          # 检查是否应该包含此文件
          path_str = str(rel_path).replace('\\', '/')
@@ -147,6 +151,9 @@ def _copytree_with_templates(src, dst, context, template_type='default', modules
          if item.name == 'settings.py.tmpl':
              # 特殊处理设置模板文件,统一生成为 settings.py
              final_dst = dst_item.parent / 'settings.py'
+         # 特殊处理run.py.tmpl文件
+         elif item.name == 'run.py.tmpl':
+             final_dst = dst_item.with_suffix('')  # 去掉.tmpl后缀
          else:
              final_dst = dst_item.with_suffix('')

@@ -171,8 +178,8 @@ def _should_include_file(rel_path, modules: List[str]) -> bool:
          'settings.py.tmpl',
          'spiders/__init__.py.tmpl',
          'items.py.tmpl',
-         'middlewares.py.tmpl',
-         'run.py.tmpl'
+         'middlewares.py.tmpl'
+         # 移除了'run.py.tmpl',因为它现在在模板根目录
      ]

      path_str = str(rel_path).replace('\\', '/')
@@ -364,16 +371,25 @@ def main(args):
      else:
          console.print("[yellow]⚠ 警告:[/yellow] 找不到模板 'crawlo.cfg.tmpl'。")

-     # 3. 复制并渲染项目包内容
+     # 3. 渲染 run.py.tmpl (放在项目根目录)
+     run_template = TEMPLATES_DIR / 'run.py.tmpl'
+     if run_template.exists():
+         run_content = _render_template(run_template, context)
+         (project_dir / 'run.py').write_text(run_content, encoding='utf-8')
+         console.print(f":white_check_mark: 已创建 [green]{project_dir / 'run.py'}[/green]")
+     else:
+         console.print("[yellow]⚠ 警告:[/yellow] 找不到模板 'run.py.tmpl'。")
+
+     # 4. 复制并渲染项目包内容
      package_dir = project_dir / project_name
      _copytree_with_templates(template_dir, package_dir, context, template_type, modules)
      console.print(f":white_check_mark: 已创建项目包: [green]{package_dir}[/green]")

-     # 4. 创建 logs 目录
+     # 5. 创建 logs 目录
      (project_dir / 'logs').mkdir(exist_ok=True)
      console.print(":white_check_mark: 已创建 logs 目录")

-     # 5. 创建 output 目录(用于数据输出)
+     # 6. 创建 output 目录(用于数据输出)
      (project_dir / 'output').mkdir(exist_ok=True)
      console.print(":white_check_mark: 已创建 output 目录")

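
Note: with the startproject.py hunks above, run.py.tmpl moves out of the per-project template package and is rendered directly into the new project's root (step 3 in main()). A hedged sketch of that step; string.Template substitution is an assumption here, the real _render_template in startproject.py may use a different placeholder syntax:

    from pathlib import Path
    from string import Template

    def render_run_script(templates_dir: Path, project_dir: Path, context: dict) -> None:
        # Render run.py.tmpl into the project root rather than into the package directory.
        tmpl = templates_dir / "run.py.tmpl"
        if tmpl.exists():
            text = Template(tmpl.read_text(encoding="utf-8")).safe_substitute(context)
            (project_dir / "run.py").write_text(text, encoding="utf-8")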

crawlo/core/engine.py
@@ -88,8 +88,9 @@ class Engine(object):
          self.downloader = downloader_cls(self.crawler)
          if hasattr(self.downloader, 'open'):
              if asyncio.iscoroutinefunction(self.downloader.open):
-                 await self.downloader.open()
+                 self.downloader.open()
              else:
+                 # DownloaderBase.open() 是同步方法,直接调用而不是await
                  self.downloader.open()

          self.processor = Processor(self.crawler)
@@ -97,6 +98,7 @@
          if asyncio.iscoroutinefunction(self.processor.open):
              await self.processor.open()
          else:
+             # Processor.open() 是同步方法
              self.processor.open()

          self.start_requests = iter(spider.start_requests())
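
Note: both engine.py hunks revolve around asyncio.iscoroutinefunction(), which decides whether a component's open() needs to be awaited; the comments added in 1.2.6 state that the downloader's and processor's open() are synchronous. For reference, the generic dispatch pattern looks roughly like this (illustrative only, not the exact code path above):

    import asyncio

    async def call_open(component) -> None:
        # Await open() only when it is declared as a coroutine function.
        if hasattr(component, "open"):
            if asyncio.iscoroutinefunction(component.open):
                await component.open()
            else:
                component.open()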

crawlo/core/scheduler.py
@@ -0,0 +1,240 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from typing import Optional, Callable
+ import traceback
+
+ from crawlo.utils.log import get_logger
+ from crawlo.utils.request import set_request
+ from crawlo.utils.request_serializer import RequestSerializer
+ from crawlo.utils.error_handler import ErrorHandler
+ from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
+ from crawlo.project import load_class, common_call
+
+
+ class Scheduler:
+     def __init__(self, crawler, dupe_filter, stats, log_level, priority):
+         self.crawler = crawler
+         self.queue_manager: Optional[QueueManager] = None
+         self.request_serializer = RequestSerializer()  # 专门处理序列化
+
+         self.logger = get_logger(name=self.__class__.__name__, level=log_level)
+         self.error_handler = ErrorHandler(self.__class__.__name__, log_level)
+         self.stats = stats
+         self.dupe_filter = dupe_filter
+         self.priority = priority
+
+     @classmethod
+     def create_instance(cls, crawler):
+         filter_cls = load_class(crawler.settings.get('FILTER_CLASS'))
+         o = cls(
+             crawler=crawler,
+             dupe_filter=filter_cls.create_instance(crawler),
+             stats=crawler.stats,
+             log_level=crawler.settings.get('LOG_LEVEL'),
+             priority=crawler.settings.get('DEPTH_PRIORITY')
+         )
+         return o
+
+     async def open(self):
+         """初始化调度器和队列"""
+         self.logger.info("开始初始化调度器...")
+         try:
+             # 创建队列配置
+             queue_config = QueueConfig.from_settings(self.crawler.settings)
+
+             # 创建队列管理器
+             self.queue_manager = QueueManager(queue_config)
+
+             # 初始化队列
+             self.logger.info("开始初始化队列管理器...")
+             needs_config_update = await self.queue_manager.initialize()
+
+             self.logger.info(f"队列初始化完成,needs_config_update: {needs_config_update}")
+             self.logger.info(f"当前队列类型: {self.queue_manager._queue_type}")
+
+             # 检查是否需要更新过滤器配置
+             if needs_config_update:
+                 # 如果返回True,说明队列类型发生了变化,需要检查当前队列类型来决定更新方向
+                 self.logger.info("需要更新配置...")
+                 if self.queue_manager._queue_type == QueueType.REDIS:
+                     self.logger.info("更新为Redis配置...")
+                     self._update_filter_config_for_redis()
+                 else:
+                     self.logger.info("更新为内存配置...")
+                     self._update_filter_config_if_needed()
+             else:
+                 # 检查是否需要更新配置(即使队列管理器没有要求更新)
+                 self.logger.debug("检查是否需要更新配置...")
+                 if self.queue_manager._queue_type == QueueType.REDIS:
+                     # 检查当前过滤器是否为内存过滤器
+                     current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+                     if 'memory_filter' in current_filter_class:
+                         self.logger.info("检测到需要更新为Redis配置...")
+                         self._update_filter_config_for_redis()
+                 elif self.queue_manager._queue_type == QueueType.MEMORY:
+                     # 检查当前过滤器是否为Redis过滤器
+                     current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+                     if 'aioredis_filter' in current_filter_class or 'redis_filter' in current_filter_class:
+                         self.logger.info("检测到需要更新为内存配置...")
+                         self._update_filter_config_if_needed()
+
+             # 只有在确实需要更新配置时才重新创建过滤器实例
+             # 检查是否真的进行了配置更新
+             filter_updated = (
+                 (self.queue_manager._queue_type == QueueType.REDIS and 'memory_filter' in self.crawler.settings.get('FILTER_CLASS', '')) or
+                 (self.queue_manager._queue_type == QueueType.MEMORY and ('aioredis_filter' in self.crawler.settings.get('FILTER_CLASS', '') or 'redis_filter' in self.crawler.settings.get('FILTER_CLASS', '')))
+             )
+
+             if needs_config_update or filter_updated:
+                 # 重新创建过滤器实例,确保使用更新后的配置
+                 self.logger.debug("重新创建过滤器实例...")
+                 filter_cls = load_class(self.crawler.settings.get('FILTER_CLASS'))
+                 self.dupe_filter = filter_cls.create_instance(self.crawler)
+                 self.logger.info(f"✅ 过滤器实例已更新为: {type(self.dupe_filter).__name__}")
+             else:
+                 self.logger.debug("过滤器配置无需更新,跳过重新创建")
+
+             # 输出队列状态和配置信息
+             status = self.queue_manager.get_status()
+             current_filter = self.crawler.settings.get('FILTER_CLASS')
+             current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE')
+
+             self.logger.info(f'队列类型: {status["type"]}, 状态: {status["health"]}')
+             self.logger.info(f'当前过滤器: {type(self.dupe_filter).__name__} ({current_filter})')
+             self.logger.info(f'当前去重管道: {current_dedup_pipeline}')
+             self.logger.info("调度器初始化完成")
+         except Exception as e:
+             self.logger.error(f"❌ 调度器初始化失败: {e}")
+             self.logger.debug(f"详细错误信息:\n{traceback.format_exc()}")
+             raise
+
+     def _update_filter_config_if_needed(self):
+         """如果队列类型切换到内存模式,则更新过滤器配置"""
+         if self.queue_manager and self.queue_manager._queue_type == QueueType.MEMORY:
+             # 检查当前过滤器是否为Redis过滤器
+             current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+             if 'aioredis_filter' in current_filter_class or 'redis_filter' in current_filter_class:
+                 # 更新为内存过滤器
+                 self.crawler.settings.set('FILTER_CLASS', 'crawlo.filters.memory_filter.MemoryFilter')
+                 self.logger.info("✅ 已更新过滤器配置为内存模式")
+
+             # 检查当前去重管道是否为Redis去重管道
+             current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE', '')
+             if 'redis_dedup_pipeline' in current_dedup_pipeline:
+                 # 更新为内存去重管道
+                 self.crawler.settings.set('DEFAULT_DEDUP_PIPELINE', 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline')
+                 # 同时更新PIPELINES列表中的去重管道
+                 pipelines = self.crawler.settings.get('PIPELINES', [])
+                 if current_dedup_pipeline in pipelines:
+                     # 找到并替换Redis去重管道为内存去重管道
+                     index = pipelines.index(current_dedup_pipeline)
+                     pipelines[index] = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
+                     self.crawler.settings.set('PIPELINES', pipelines)
+                 self.logger.info("✅ 已更新去重管道配置为内存模式")
+
+     def _update_filter_config_for_redis(self):
+         """如果队列类型是Redis,则更新过滤器配置为Redis实现"""
+         if self.queue_manager and self.queue_manager._queue_type == QueueType.REDIS:
+             # 检查当前过滤器是否为内存过滤器
+             current_filter_class = self.crawler.settings.get('FILTER_CLASS', '')
+             if 'memory_filter' in current_filter_class:
+                 # 更新为Redis过滤器
+                 self.crawler.settings.set('FILTER_CLASS', 'crawlo.filters.aioredis_filter.AioRedisFilter')
+                 self.logger.info("✅ 已更新过滤器配置为Redis模式")
+
+             # 检查当前去重管道是否为内存去重管道
+             current_dedup_pipeline = self.crawler.settings.get('DEFAULT_DEDUP_PIPELINE', '')
+             if 'memory_dedup_pipeline' in current_dedup_pipeline:
+                 # 更新为Redis去重管道
+                 self.crawler.settings.set('DEFAULT_DEDUP_PIPELINE', 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline')
+                 # 同时更新PIPELINES列表中的去重管道
+                 pipelines = self.crawler.settings.get('PIPELINES', [])
+                 if current_dedup_pipeline in pipelines:
+                     # 找到并替换内存去重管道为Redis去重管道
+                     index = pipelines.index(current_dedup_pipeline)
+                     pipelines[index] = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
+                     self.crawler.settings.set('PIPELINES', pipelines)
+                 self.logger.info("✅ 已更新去重管道配置为Redis模式")
+
+     async def next_request(self):
+         """获取下一个请求"""
+         if not self.queue_manager:
+             return None
+
+         try:
+             request = await self.queue_manager.get()
+
+             # 恢复 callback(从 Redis 队列取出时)
+             if request:
+                 spider = getattr(self.crawler, 'spider', None)
+                 request = self.request_serializer.restore_after_deserialization(request, spider)
+
+             return request
+         except Exception as e:
+             self.error_handler.handle_error(
+                 e,
+                 context="获取下一个请求失败",
+                 raise_error=False
+             )
+             return None
+
+     async def enqueue_request(self, request):
+         """将请求加入队列"""
+         if not request.dont_filter and await common_call(self.dupe_filter.requested, request):
+             self.dupe_filter.log_stats(request)
+             return False
+
+         if not self.queue_manager:
+             self.logger.error("队列管理器未初始化")
+             return False
+
+         set_request(request, self.priority)
+
+         try:
+             # 使用统一的队列接口
+             success = await self.queue_manager.put(request, priority=getattr(request, 'priority', 0))
+
+             if success:
+                 self.logger.debug(f"✅ 请求入队成功: {request.url}")
+
+             return success
+         except Exception as e:
+             self.error_handler.handle_error(
+                 e,
+                 context="请求入队失败",
+                 raise_error=False
+             )
+             return False
+
+     def idle(self) -> bool:
+         """检查队列是否为空"""
+         return len(self) == 0
+
+     async def async_idle(self) -> bool:
+         """异步检查队列是否为空(更精确)"""
+         if not self.queue_manager:
+             return True
+         # 使用队列管理器的异步empty方法
+         return await self.queue_manager.async_empty()
+
+     async def close(self):
+         """关闭调度器"""
+         try:
+             if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
+                 await closed()
+
+             if self.queue_manager:
+                 await self.queue_manager.close()
+         except Exception as e:
+             self.error_handler.handle_error(
+                 e,
+                 context="关闭调度器失败",
+                 raise_error=False
+             )
+
+     def __len__(self):
+         """获取队列大小"""
+         if not self.queue_manager:
+             return 0
+         # 返回同步的近似值,实际大小需要异步获取
+         return 0 if self.queue_manager.empty() else 1
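
Note: the new scheduler keeps the dedup filter and dedup pipeline aligned with whichever queue backend ends up active (_update_filter_config_for_redis and _update_filter_config_if_needed above). A compact, hypothetical lookup of that mapping, using only class paths that appear in the diff:

    # Redis queue -> Redis filter/dedup pipeline; memory queue -> in-memory equivalents.
    QUEUE_COMPONENTS = {
        "redis": {
            "FILTER_CLASS": "crawlo.filters.aioredis_filter.AioRedisFilter",
            "DEFAULT_DEDUP_PIPELINE": "crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline",
        },
        "memory": {
            "FILTER_CLASS": "crawlo.filters.memory_filter.MemoryFilter",
            "DEFAULT_DEDUP_PIPELINE": "crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline",
        },
    }

    def consistent_settings(queue_type: str) -> dict:
        # queue_type is "redis" or "memory"; returns the matching filter/pipeline pair.
        return QUEUE_COMPONENTS[queue_type]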

crawlo/filters/aioredis_filter.py
@@ -1,18 +1,6 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- Redis 过滤器实现
- =================
- 提供基于 Redis 的分布式请求去重功能。
-
- 特点:
- - 分布式支持: 多节点共享去重数据
- - TTL 支持: 自动过期清理
- - 高性能: 使用 Redis pipeline 优化
- - 容错设计: 网络异常自动重试
- """
- import redis.asyncio as aioredis
  from typing import Optional
+ import redis.asyncio as aioredis
+
  from crawlo.filters import BaseFilter
  from crawlo.utils.log import get_logger
  from crawlo.utils.request import request_fingerprint
@@ -70,6 +58,9 @@ class AioRedisFilter(BaseFilter):
          # 性能计数器
          self._redis_operations = 0
          self._pipeline_operations = 0
+
+         # 连接状态标记,避免重复尝试连接失败的Redis
+         self._connection_failed = False

      @classmethod
      def create_instance(cls, crawler) -> 'BaseFilter':
@@ -123,8 +114,17 @@

      async def _get_redis_client(self):
          """获取Redis客户端实例(延迟初始化)"""
+         # 如果之前连接失败,直接返回None
+         if self._connection_failed:
+             return None
+
          if self.redis is None and self._redis_pool is not None:
-             self.redis = await self._redis_pool.get_connection()
+             try:
+                 self.redis = await self._redis_pool.get_connection()
+             except Exception as e:
+                 self._connection_failed = True
+                 self.logger.error(f"Redis连接失败,将使用本地去重: {e}")
+                 return None
          return self.redis

      async def requested(self, request) -> bool:
@@ -136,13 +136,17 @@
          """
          try:
              # 确保Redis客户端已初始化
-             await self._get_redis_client()
+             redis_client = await self._get_redis_client()
+
+             # 如果Redis不可用,返回False表示不重复(避免丢失请求)
+             if redis_client is None:
+                 return False

              fp = str(request_fingerprint(request))
              self._redis_operations += 1

              # 使用 pipeline 优化性能
-             pipe = self.redis.pipeline()
+             pipe = redis_client.pipeline()
              pipe.sismember(self.redis_key, fp)

              results = await pipe.execute()
@@ -173,12 +177,16 @@
          """
          try:
              # 确保Redis客户端已初始化
-             await self._get_redis_client()
+             redis_client = await self._get_redis_client()
+
+             # 如果Redis不可用,返回False表示添加失败
+             if redis_client is None:
+                 return False

              fp = str(fp)

              # 使用 pipeline 优化性能
-             pipe = self.redis.pipeline()
+             pipe = redis_client.pipeline()
              pipe.sadd(self.redis_key, fp)

              if self.ttl and self.ttl > 0:
@@ -197,85 +205,30 @@
          except Exception as e:
              self.logger.error(f"添加指纹失败: {fp[:20]}... - {e}")
              return False
-
-     def __contains__(self, item: str) -> bool:
+
+     async def __contains__(self, fp: str) -> bool:
          """
-         同步版本的包含检查(不推荐在异步环境中使用)
+         检查指纹是否存在于Redis集合中

-         :param item: 要检查的指纹
-         :return: 是否已存在
+         :param fp: 请求指纹字符串
+         :return: 是否存在
          """
-         # 这是一个同步方法,不能直接调用异步Redis操作
-         # 建议使用 requested() 方法替代
-         raise NotImplementedError("请使用 requested() 方法进行异步检查")
-
-     async def get_stats(self) -> dict:
-         """获取过滤器详细统计信息"""
          try:
              # 确保Redis客户端已初始化
-             await self._get_redis_client()
-
-             count = await self.redis.scard(self.redis_key)
+             redis_client = await self._get_redis_client()

-             # 获取TTL信息
-             ttl_info = "TTL未设置"
-             if self.ttl:
-                 remaining_ttl = await self.redis.ttl(self.redis_key)
-                 if remaining_ttl > 0:
-                     ttl_info = f"剩余 {remaining_ttl} 秒"
-                 else:
-                     ttl_info = f"配置 {self.ttl} 秒"
-
-             stats = {
-                 'filter_type': 'AioRedisFilter',
-                 '指纹总数': count,
-                 'Redis键名': self.redis_key,
-                 'TTL配置': ttl_info,
-                 'Redis操作数': self._redis_operations,
-                 'Pipeline操作数': self._pipeline_operations,
-                 '性能优化率': f"{self._pipeline_operations / max(1, self._redis_operations) * 100:.1f}%"
-             }
-
-             # 合并基类统计
-             base_stats = super().get_stats()
-             stats.update(base_stats)
-
-             return stats
+             # 如果Redis不可用,返回False表示不存在
+             if redis_client is None:
+                 return False

+             # 检查指纹是否存在
+             exists = await redis_client.sismember(self.redis_key, str(fp))
+             return exists
          except Exception as e:
-             self.logger.error(f"获取统计信息失败: {e}")
-             return super().get_stats()
-
-     async def clear_all(self) -> int:
-         """清空所有指纹数据"""
-         try:
-             # 确保Redis客户端已初始化
-             await self._get_redis_client()
-
-             deleted = await self.redis.delete(self.redis_key)
-             self.logger.info(f"已清除指纹数: {deleted}")
-             return deleted
-         except Exception as e:
-             self.logger.error("清空指纹失败")
-             raise
+             self.logger.error(f"检查指纹存在性失败: {fp[:20]}... - {e}")
+             # 在网络异常时返回False,避免丢失请求
+             return False

-     async def closed(self, reason: Optional[str] = None) -> None:
-         """爬虫关闭时的清理操作"""
-         try:
-             # 确保Redis客户端已初始化
-             await self._get_redis_client()
-
-             if self.cleanup_fp:
-                 deleted = await self.redis.delete(self.redis_key)
-                 self.logger.info(f"爬虫关闭清理: 已删除{deleted}个指纹")
-             else:
-                 count = await self.redis.scard(self.redis_key)
-                 ttl_info = f"{self.ttl}秒" if self.ttl else "持久化"
-                 self.logger.info(f"保留指纹数: {count} (TTL: {ttl_info})")
-         finally:
-             await self._close_redis()

-     async def _close_redis(self) -> None:
-         """安全关闭Redis连接"""
-         # 连接池会自动管理连接,这里不需要显式关闭
-         self.logger.debug("Redis连接已释放")
+ # 为了兼容性,确保导出类
+ __all__ = ['AioRedisFilter']
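
Note: the aioredis_filter.py changes all follow one pattern: connect lazily, remember a failed connection, and treat an unreachable Redis as "not a duplicate" rather than raising. A self-contained sketch of that pattern; the pool's get_connection() call is taken from the diff, everything else is illustrative:

    class LazyRedisHandle:
        """Connect on first use; after one failure, stop retrying and report 'unavailable'."""

        def __init__(self, pool):
            self._pool = pool
            self._client = None
            self._connection_failed = False

        async def get(self):
            if self._connection_failed:
                return None  # a previous attempt failed; skip Redis from now on
            if self._client is None and self._pool is not None:
                try:
                    self._client = await self._pool.get_connection()
                except Exception:
                    self._connection_failed = True
                    return None
            return self._client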

crawlo/queue/queue_manager.py
@@ -4,11 +4,11 @@
  统一的队列管理器
  提供简洁、一致的队列接口,自动处理不同队列类型的差异
  """
- from typing import Optional, Dict, Any, Union
- from enum import Enum
+ import os
  import asyncio
  import traceback
- import os
+ from typing import Optional, Dict, Any, Union
+ from enum import Enum

  from crawlo.utils.log import get_logger
  from crawlo.utils.request_serializer import RequestSerializer
@@ -103,11 +103,24 @@
              self._queue_type = queue_type

              # 测试队列健康状态
-             await self._health_check()
+             health_check_result = await self._health_check()

              self.logger.info(f"✅ 队列初始化成功: {queue_type.value}")
-             self.logger.info(f"📊 队列配置: {self._get_queue_info()}")
-             return True
+             # 只在调试模式下输出详细配置信息
+             self.logger.debug(f"📊 队列配置: {self._get_queue_info()}")
+
+             # 如果健康检查返回True,表示队列类型发生了切换,需要更新配置
+             if health_check_result:
+                 return True
+
+             # 如果队列类型是Redis,检查是否需要更新配置
+             if queue_type == QueueType.REDIS:
+                 # 这个检查需要在调度器中进行,因为队列管理器无法访问crawler.settings
+                 # 但我们不需要总是返回True,只有在确实需要更新时才返回True
+                 # 调度器会进行更详细的检查
+                 pass
+
+             return False  # 默认不需要更新配置

          except Exception as e:
              # 记录详细的错误信息和堆栈跟踪
@@ -265,7 +278,15 @@
                  raise RuntimeError("Redis 队列不可用:未安装 redis 依赖")
              if not self.config.redis_url:
                  raise RuntimeError("Redis 队列不可用:未配置 REDIS_URL")
-             return QueueType.REDIS
+             # 测试 Redis 连接
+             try:
+                 test_queue = RedisPriorityQueue(self.config.redis_url)
+                 await test_queue.connect()
+                 await test_queue.close()
+                 return QueueType.REDIS
+             except Exception as e:
+                 # 如果强制使用Redis但连接失败,则抛出异常
+                 raise RuntimeError(f"Redis 队列不可用:无法连接到 Redis ({e})")

          elif self.config.queue_type == QueueType.MEMORY:
              return QueueType.MEMORY
@@ -307,7 +328,7 @@
          else:
              raise ValueError(f"不支持的队列类型: {queue_type}")

-     async def _health_check(self) -> None:
+     async def _health_check(self) -> bool:
          """健康检查"""
          try:
              if self._queue_type == QueueType.REDIS:
@@ -317,9 +338,27 @@
              else:
                  # 内存队列总是健康的
                  self._health_status = "healthy"
+                 return False  # 内存队列不需要更新配置
          except Exception as e:
              self.logger.warning(f"队列健康检查失败: {e}")
              self._health_status = "unhealthy"
+             # 如果是Redis队列且健康检查失败,尝试切换到内存队列
+             if self._queue_type == QueueType.REDIS and self.config.queue_type == QueueType.AUTO:
+                 self.logger.info("Redis队列不可用,尝试切换到内存队列...")
+                 try:
+                     await self._queue.close()
+                 except:
+                     pass
+                 self._queue = None
+                 # 重新创建内存队列
+                 self._queue = await self._create_queue(QueueType.MEMORY)
+                 self._queue_type = QueueType.MEMORY
+                 self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
+                 self._health_status = "healthy"
+                 self.logger.info("✅ 已切换到内存队列")
+                 # 返回一个信号,表示需要更新过滤器和去重管道配置
+                 return True
+             return False

      def _get_queue_info(self) -> Dict[str, Any]:
          """获取队列配置信息"""

crawlo/queue/redis_priority_queue.py
@@ -77,7 +77,13 @@
          """异步连接 Redis,支持重试"""
          async with self._lock:
              if self._redis is not None:
-                 return self._redis
+                 # 如果已经连接,测试连接是否仍然有效
+                 try:
+                     await self._redis.ping()
+                     return self._redis
+                 except Exception:
+                     # 连接失效,重新连接
+                     self._redis = None

              for attempt in range(max_retries):
                  try:
@@ -97,7 +103,8 @@

                      # 测试连接
                      await self._redis.ping()
-                     logger.info(f"✅ Redis 连接成功 (Module: {self.module_name})")
+                     # 只在调试模式下输出详细连接信息
+                     logger.debug(f"✅ Redis 连接成功 (Module: {self.module_name})")
                      return self._redis
                  except Exception as e:
                      error_msg = f"⚠️ Redis 连接失败 (尝试 {attempt + 1}/{max_retries}, Module: {self.module_name}): {e}"
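
Note: taken together, the queue_manager.py and redis_priority_queue.py hunks make Redis usage self-checking: a forced Redis backend is verified with a real connection attempt, an AUTO backend falls back to the in-memory queue when the health check fails, and a cached connection is re-validated with PING before reuse. A standalone sketch of the PING-validate-then-reconnect idea; building the client with redis.asyncio.from_url is an assumption, the queue class constructs its client with its own settings:

    import redis.asyncio as aioredis

    async def ensure_connection(url: str, cached=None, max_retries: int = 3):
        # Reuse the cached client only if it still answers PING; otherwise reconnect with retries.
        if cached is not None:
            try:
                await cached.ping()
                return cached
            except Exception:
                cached = None  # stale connection; fall through and reconnect
        last_error = None
        for attempt in range(max_retries):
            try:
                client = aioredis.from_url(url)
                await client.ping()
                return client
            except Exception as exc:
                last_error = exc
        raise RuntimeError(f"Redis unavailable after {max_retries} attempts: {last_error}")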