crawlo 1.3.5__tar.gz → 1.3.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (301) hide show
  1. {crawlo-1.3.5/crawlo.egg-info → crawlo-1.3.7}/PKG-INFO +74 -1
  2. {crawlo-1.3.5 → crawlo-1.3.7}/README.md +73 -0
  3. crawlo-1.3.7/crawlo/__version__.py +1 -0
  4. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/framework.py +3 -2
  5. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/queue/queue_manager.py +26 -7
  6. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/queue/redis_priority_queue.py +43 -2
  7. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/settings/default_settings.py +8 -8
  8. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings.py.tmpl +3 -0
  9. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings_distributed.py.tmpl +3 -0
  10. {crawlo-1.3.5 → crawlo-1.3.7/crawlo.egg-info}/PKG-INFO +74 -1
  11. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo.egg-info/SOURCES.txt +4 -0
  12. crawlo-1.3.7/tests/simple_queue_type_test.py +42 -0
  13. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_crawlo_proxy_integration.py +1 -1
  14. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_double_crawlo_fix.py +10 -13
  15. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_queue_manager_redis_key.py +39 -36
  16. crawlo-1.3.7/tests/test_queue_naming.py +155 -0
  17. crawlo-1.3.7/tests/test_queue_type.py +107 -0
  18. crawlo-1.3.7/tests/test_redis_queue_name_fix.py +176 -0
  19. crawlo-1.3.5/crawlo/__version__.py +0 -1
  20. {crawlo-1.3.5 → crawlo-1.3.7}/LICENSE +0 -0
  21. {crawlo-1.3.5 → crawlo-1.3.7}/MANIFEST.in +0 -0
  22. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/__init__.py +0 -0
  23. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/cli.py +0 -0
  24. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/__init__.py +0 -0
  25. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/check.py +0 -0
  26. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/genspider.py +0 -0
  27. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/help.py +0 -0
  28. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/list.py +0 -0
  29. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/run.py +0 -0
  30. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/startproject.py +0 -0
  31. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/stats.py +0 -0
  32. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/commands/utils.py +0 -0
  33. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/config.py +0 -0
  34. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/config_validator.py +0 -0
  35. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/core/__init__.py +0 -0
  36. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/core/engine.py +0 -0
  37. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/core/processor.py +0 -0
  38. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/core/scheduler.py +0 -0
  39. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/crawler.py +0 -0
  40. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/data/__init__.py +0 -0
  41. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/data/user_agents.py +0 -0
  42. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/__init__.py +0 -0
  43. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/aiohttp_downloader.py +0 -0
  44. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/cffi_downloader.py +0 -0
  45. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/httpx_downloader.py +0 -0
  46. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/hybrid_downloader.py +0 -0
  47. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/playwright_downloader.py +0 -0
  48. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/downloader/selenium_downloader.py +0 -0
  49. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/event.py +0 -0
  50. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/exceptions.py +0 -0
  51. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/__init__.py +0 -0
  52. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/health_check.py +0 -0
  53. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/log_interval.py +0 -0
  54. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/log_stats.py +0 -0
  55. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/logging_extension.py +0 -0
  56. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/memory_monitor.py +0 -0
  57. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/performance_profiler.py +0 -0
  58. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/extension/request_recorder.py +0 -0
  59. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/factories/__init__.py +0 -0
  60. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/factories/base.py +0 -0
  61. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/factories/crawler.py +0 -0
  62. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/factories/registry.py +0 -0
  63. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/filters/__init__.py +0 -0
  64. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/filters/aioredis_filter.py +0 -0
  65. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/filters/memory_filter.py +0 -0
  66. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/__init__.py +0 -0
  67. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/built_in.py +0 -0
  68. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/context.py +0 -0
  69. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/core.py +0 -0
  70. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/phases.py +0 -0
  71. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/initialization/registry.py +0 -0
  72. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/items/__init__.py +0 -0
  73. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/items/base.py +0 -0
  74. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/items/fields.py +0 -0
  75. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/items/items.py +0 -0
  76. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/logging/__init__.py +0 -0
  77. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/logging/config.py +0 -0
  78. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/logging/factory.py +0 -0
  79. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/logging/manager.py +0 -0
  80. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/__init__.py +0 -0
  81. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/default_header.py +0 -0
  82. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/download_delay.py +0 -0
  83. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/middleware_manager.py +0 -0
  84. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/offsite.py +0 -0
  85. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/proxy.py +0 -0
  86. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/request_ignore.py +0 -0
  87. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/response_code.py +0 -0
  88. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/response_filter.py +0 -0
  89. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/retry.py +0 -0
  90. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/middleware/simple_proxy.py +0 -0
  91. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/mode_manager.py +0 -0
  92. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/network/__init__.py +0 -0
  93. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/network/request.py +0 -0
  94. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/network/response.py +0 -0
  95. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/__init__.py +0 -0
  96. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
  97. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/console_pipeline.py +0 -0
  98. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/csv_pipeline.py +0 -0
  99. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
  100. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/json_pipeline.py +0 -0
  101. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
  102. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/mongo_pipeline.py +0 -0
  103. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/mysql_pipeline.py +0 -0
  104. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/pipeline_manager.py +0 -0
  105. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
  106. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/project.py +0 -0
  107. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/queue/__init__.py +0 -0
  108. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/queue/pqueue.py +0 -0
  109. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/settings/__init__.py +0 -0
  110. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/settings/setting_manager.py +0 -0
  111. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/spider/__init__.py +0 -0
  112. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/stats_collector.py +0 -0
  113. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/subscriber.py +0 -0
  114. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/task_manager.py +0 -0
  115. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  116. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/__init__.py.tmpl +0 -0
  117. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/items.py.tmpl +0 -0
  118. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/middlewares.py.tmpl +0 -0
  119. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/pipelines.py.tmpl +0 -0
  120. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
  121. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
  122. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings_minimal.py.tmpl +0 -0
  123. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
  124. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  125. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/run.py.tmpl +0 -0
  126. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/spider/spider.py.tmpl +0 -0
  127. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/templates/spiders_init.py.tmpl +0 -0
  128. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/__init__.py +0 -0
  129. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/anti_crawler.py +0 -0
  130. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/authenticated_proxy.py +0 -0
  131. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/data_formatter.py +0 -0
  132. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/data_validator.py +0 -0
  133. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/date_tools.py +0 -0
  134. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/distributed_coordinator.py +0 -0
  135. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/encoding_converter.py +0 -0
  136. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/network_diagnostic.py +0 -0
  137. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/request_tools.py +0 -0
  138. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/retry_mechanism.py +0 -0
  139. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/scenario_adapter.py +0 -0
  140. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/tools/text_cleaner.py +0 -0
  141. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/__init__.py +0 -0
  142. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/batch_processor.py +0 -0
  143. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/class_loader.py +0 -0
  144. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/controlled_spider_mixin.py +0 -0
  145. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/db_helper.py +0 -0
  146. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/enhanced_error_handler.py +0 -0
  147. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/env_config.py +0 -0
  148. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/error_handler.py +0 -0
  149. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/func_tools.py +0 -0
  150. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/large_scale_config.py +0 -0
  151. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/large_scale_helper.py +0 -0
  152. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/log.py +0 -0
  153. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/performance_monitor.py +0 -0
  154. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/queue_helper.py +0 -0
  155. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/redis_connection_pool.py +0 -0
  156. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/redis_key_validator.py +0 -0
  157. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/request.py +0 -0
  158. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/request_serializer.py +0 -0
  159. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/spider_loader.py +0 -0
  160. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/system.py +0 -0
  161. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/tools.py +0 -0
  162. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo/utils/url.py +0 -0
  163. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo.egg-info/dependency_links.txt +0 -0
  164. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo.egg-info/entry_points.txt +0 -0
  165. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo.egg-info/requires.txt +0 -0
  166. {crawlo-1.3.5 → crawlo-1.3.7}/crawlo.egg-info/top_level.txt +0 -0
  167. {crawlo-1.3.5 → crawlo-1.3.7}/examples/__init__.py +0 -0
  168. {crawlo-1.3.5 → crawlo-1.3.7}/pyproject.toml +0 -0
  169. {crawlo-1.3.5 → crawlo-1.3.7}/requirements.txt +0 -0
  170. {crawlo-1.3.5 → crawlo-1.3.7}/setup.cfg +0 -0
  171. {crawlo-1.3.5 → crawlo-1.3.7}/tests/__init__.py +0 -0
  172. {crawlo-1.3.5 → crawlo-1.3.7}/tests/advanced_tools_example.py +0 -0
  173. {crawlo-1.3.5 → crawlo-1.3.7}/tests/authenticated_proxy_example.py +0 -0
  174. {crawlo-1.3.5 → crawlo-1.3.7}/tests/baidu_performance_test.py +0 -0
  175. {crawlo-1.3.5 → crawlo-1.3.7}/tests/baidu_test.py +0 -0
  176. {crawlo-1.3.5 → crawlo-1.3.7}/tests/cleaners_example.py +0 -0
  177. {crawlo-1.3.5 → crawlo-1.3.7}/tests/comprehensive_framework_test.py +0 -0
  178. {crawlo-1.3.5 → crawlo-1.3.7}/tests/comprehensive_test.py +0 -0
  179. {crawlo-1.3.5 → crawlo-1.3.7}/tests/comprehensive_testing_summary.md +0 -0
  180. {crawlo-1.3.5 → crawlo-1.3.7}/tests/config_validation_demo.py +0 -0
  181. {crawlo-1.3.5 → crawlo-1.3.7}/tests/controlled_spider_example.py +0 -0
  182. {crawlo-1.3.5 → crawlo-1.3.7}/tests/date_tools_example.py +0 -0
  183. {crawlo-1.3.5 → crawlo-1.3.7}/tests/debug_configure.py +0 -0
  184. {crawlo-1.3.5 → crawlo-1.3.7}/tests/debug_framework_logger.py +0 -0
  185. {crawlo-1.3.5 → crawlo-1.3.7}/tests/debug_log_config.py +0 -0
  186. {crawlo-1.3.5 → crawlo-1.3.7}/tests/debug_log_levels.py +0 -0
  187. {crawlo-1.3.5 → crawlo-1.3.7}/tests/debug_pipelines.py +0 -0
  188. {crawlo-1.3.5 → crawlo-1.3.7}/tests/detailed_log_test.py +0 -0
  189. {crawlo-1.3.5 → crawlo-1.3.7}/tests/distributed_test.py +0 -0
  190. {crawlo-1.3.5 → crawlo-1.3.7}/tests/distributed_test_debug.py +0 -0
  191. {crawlo-1.3.5 → crawlo-1.3.7}/tests/dynamic_loading_example.py +0 -0
  192. {crawlo-1.3.5 → crawlo-1.3.7}/tests/dynamic_loading_test.py +0 -0
  193. {crawlo-1.3.5 → crawlo-1.3.7}/tests/env_config_example.py +0 -0
  194. {crawlo-1.3.5 → crawlo-1.3.7}/tests/error_handling_example.py +0 -0
  195. {crawlo-1.3.5 → crawlo-1.3.7}/tests/final_command_test_report.md +0 -0
  196. {crawlo-1.3.5 → crawlo-1.3.7}/tests/final_comprehensive_test.py +0 -0
  197. {crawlo-1.3.5 → crawlo-1.3.7}/tests/final_log_test.py +0 -0
  198. {crawlo-1.3.5 → crawlo-1.3.7}/tests/final_validation_test.py +0 -0
  199. {crawlo-1.3.5 → crawlo-1.3.7}/tests/fix_log_test.py +0 -0
  200. {crawlo-1.3.5 → crawlo-1.3.7}/tests/framework_performance_test.py +0 -0
  201. {crawlo-1.3.5 → crawlo-1.3.7}/tests/log_buffering_test.py +0 -0
  202. {crawlo-1.3.5 → crawlo-1.3.7}/tests/log_generation_timing_test.py +0 -0
  203. {crawlo-1.3.5 → crawlo-1.3.7}/tests/optimized_performance_test.py +0 -0
  204. {crawlo-1.3.5 → crawlo-1.3.7}/tests/performance_comparison.py +0 -0
  205. {crawlo-1.3.5 → crawlo-1.3.7}/tests/queue_blocking_test.py +0 -0
  206. {crawlo-1.3.5 → crawlo-1.3.7}/tests/queue_test.py +0 -0
  207. {crawlo-1.3.5 → crawlo-1.3.7}/tests/redis_key_validation_demo.py +0 -0
  208. {crawlo-1.3.5 → crawlo-1.3.7}/tests/request_params_example.py +0 -0
  209. {crawlo-1.3.5 → crawlo-1.3.7}/tests/response_improvements_example.py +0 -0
  210. {crawlo-1.3.5 → crawlo-1.3.7}/tests/scrapy_comparison/ofweek_scrapy.py +0 -0
  211. {crawlo-1.3.5 → crawlo-1.3.7}/tests/scrapy_comparison/scrapy_test.py +0 -0
  212. {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_command_test.py +0 -0
  213. {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_crawlo_test.py +0 -0
  214. {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_log_test.py +0 -0
  215. {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_log_test2.py +0 -0
  216. {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_optimization_test.py +0 -0
  217. {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_spider_test.py +0 -0
  218. {crawlo-1.3.5 → crawlo-1.3.7}/tests/simple_test.py +0 -0
  219. {crawlo-1.3.5 → crawlo-1.3.7}/tests/spider_log_timing_test.py +0 -0
  220. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_advanced_tools.py +0 -0
  221. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_all_commands.py +0 -0
  222. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_all_redis_key_configs.py +0 -0
  223. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_authenticated_proxy.py +0 -0
  224. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_batch_processor.py +0 -0
  225. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_cleaners.py +0 -0
  226. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_component_factory.py +0 -0
  227. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_comprehensive.py +0 -0
  228. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_config_consistency.py +0 -0
  229. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_config_merge.py +0 -0
  230. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_config_validator.py +0 -0
  231. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_controlled_spider_mixin.py +0 -0
  232. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_date_tools.py +0 -0
  233. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_default_header_middleware.py +0 -0
  234. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_distributed.py +0 -0
  235. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_double_crawlo_fix_simple.py +0 -0
  236. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_download_delay_middleware.py +0 -0
  237. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_downloader_proxy_compatibility.py +0 -0
  238. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_dynamic_downloaders_proxy.py +0 -0
  239. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_dynamic_proxy.py +0 -0
  240. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_dynamic_proxy_config.py +0 -0
  241. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_dynamic_proxy_real.py +0 -0
  242. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_edge_cases.py +0 -0
  243. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_enhanced_error_handler.py +0 -0
  244. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_enhanced_error_handler_comprehensive.py +0 -0
  245. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_env_config.py +0 -0
  246. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_error_handler_compatibility.py +0 -0
  247. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_factories.py +0 -0
  248. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_final_validation.py +0 -0
  249. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_framework_env_usage.py +0 -0
  250. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_framework_logger.py +0 -0
  251. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_framework_startup.py +0 -0
  252. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_get_component_logger.py +0 -0
  253. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_integration.py +0 -0
  254. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_item_dedup_redis_key.py +0 -0
  255. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_large_scale_config.py +0 -0
  256. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_large_scale_helper.py +0 -0
  257. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_logging_system.py +0 -0
  258. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_mode_change.py +0 -0
  259. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_mode_consistency.py +0 -0
  260. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_offsite_middleware.py +0 -0
  261. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_parsel.py +0 -0
  262. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_performance.py +0 -0
  263. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_performance_monitor.py +0 -0
  264. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_api.py +0 -0
  265. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_health_check.py +0 -0
  266. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_middleware.py +0 -0
  267. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_middleware_enhanced.py +0 -0
  268. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_middleware_integration.py +0 -0
  269. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_middleware_refactored.py +0 -0
  270. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_providers.py +0 -0
  271. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_stats.py +0 -0
  272. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_proxy_strategies.py +0 -0
  273. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_queue_empty_check.py +0 -0
  274. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_queue_manager_double_crawlo.py +0 -0
  275. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_random_user_agent.py +0 -0
  276. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_real_scenario_proxy.py +0 -0
  277. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_redis_config.py +0 -0
  278. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_redis_connection_pool.py +0 -0
  279. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_redis_key_naming.py +0 -0
  280. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_redis_key_validator.py +0 -0
  281. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_redis_queue.py +0 -0
  282. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_request_ignore_middleware.py +0 -0
  283. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_request_params.py +0 -0
  284. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_request_serialization.py +0 -0
  285. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_response_code_middleware.py +0 -0
  286. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_response_filter_middleware.py +0 -0
  287. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_response_improvements.py +0 -0
  288. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_retry_middleware.py +0 -0
  289. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_scheduler.py +0 -0
  290. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_scheduler_config_update.py +0 -0
  291. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_simple_response.py +0 -0
  292. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_telecom_spider_redis_key.py +0 -0
  293. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_template_content.py +0 -0
  294. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_template_redis_key.py +0 -0
  295. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_tools.py +0 -0
  296. {crawlo-1.3.5 → crawlo-1.3.7}/tests/test_user_agents.py +0 -0
  297. {crawlo-1.3.5 → crawlo-1.3.7}/tests/tools_example.py +0 -0
  298. {crawlo-1.3.5 → crawlo-1.3.7}/tests/untested_features_report.md +0 -0
  299. {crawlo-1.3.5 → crawlo-1.3.7}/tests/verify_debug.py +0 -0
  300. {crawlo-1.3.5 → crawlo-1.3.7}/tests/verify_distributed.py +0 -0
  301. {crawlo-1.3.5 → crawlo-1.3.7}/tests/verify_log_fix.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crawlo
3
- Version: 1.3.5
3
+ Version: 1.3.7
4
4
  Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
5
5
  Home-page: https://github.com/crawl-coder/Crawlo.git
6
6
  Author: crawl-coder
@@ -630,6 +630,51 @@ Crawlo支持三种队列类型,可通过`QUEUE_TYPE`配置项设置:
630
630
 
631
631
  推荐使用`auto`模式,让框架根据环境自动选择最适合的队列类型。
632
632
 
633
+ #### Redis Key 命名规范
634
+
635
+ 在分布式模式下,Crawlo框架使用Redis作为队列和去重存储。为了确保不同项目和爬虫之间的数据隔离,框架采用统一的Redis Key命名规范:
636
+
637
+ ##### 默认命名规则
638
+ Redis Key遵循以下命名格式:`crawlo:{PROJECT_NAME}:{component}:{identifier}`
639
+
640
+ 其中:
641
+ - `PROJECT_NAME`:项目名称,用于区分不同项目
642
+ - `component`:组件类型,如`queue`、`filter`、`item`
643
+ - `identifier`:具体标识符,如`requests`、`processing`、`failed`、`fingerprint`
644
+
645
+ ##### 具体Key格式
646
+ 1. **请求队列**:`crawlo:{PROJECT_NAME}:queue:requests`
647
+ - 用于存储待处理的请求任务
648
+
649
+ 2. **处理中队列**:`crawlo:{PROJECT_NAME}:queue:processing`
650
+ - 用于存储正在处理的请求任务
651
+
652
+ 3. **失败队列**:`crawlo:{PROJECT_NAME}:queue:failed`
653
+ - 用于存储处理失败的请求任务
654
+
655
+ 4. **请求去重**:`crawlo:{PROJECT_NAME}:filter:fingerprint`
656
+ - 用于存储请求URL的指纹,实现去重功能
657
+
658
+ 5. **数据项去重**:`crawlo:{PROJECT_NAME}:item:fingerprint`
659
+ - 用于存储数据项的指纹,防止重复存储
660
+
661
+ ##### 自定义队列名称
662
+ 用户可以通过`SCHEDULER_QUEUE_NAME`配置项自定义请求队列名称。处理中队列和失败队列会基于请求队列名称自动生成:
663
+ - 处理中队列:将`:queue:requests`替换为`:queue:processing`
664
+ - 失败队列:将`:queue:requests`替换为`:queue:failed`
665
+
666
+ 示例配置:
667
+ ```python
668
+ # settings.py
669
+ SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
670
+ ```
671
+
672
+ ##### 命名规范优势
673
+ 1. **命名空间隔离**:通过项目名称实现不同项目间的数据隔离
674
+ 2. **组件分类清晰**:通过组件类型区分不同功能模块
675
+ 3. **易于监控和管理**:统一的命名格式便于Redis监控和管理
676
+ 4. **防止命名冲突**:避免不同项目或组件间的Key冲突
677
+
633
678
  <!-- 配置系统 section -->
634
679
  <h2 align="center">🎛️ 配置系统</h2>
635
680
 
@@ -1095,6 +1140,34 @@ asyncio.run(process.crawl('my_spider_name'))
1095
1140
 
1096
1141
  ---
1097
1142
 
1143
+ <!-- Redis键名修复说明 section -->
1144
+ <h2 align="center">🔧 Redis键名修复说明</h2>
1145
+
1146
+ 在早期版本中,Crawlo框架存在Redis队列键名生成的双重前缀问题。具体表现为:
1147
+
1148
+ - **问题现象**:Redis队列键名出现双重"crawlo"前缀,如`crawlo:crawlo:queue:requests`而不是正确的`crawlo:{project_name}:queue:requests`
1149
+ - **影响范围**:影响分布式模式下的请求队列、处理队列和失败队列的正确识别和使用
1150
+ - **根本原因**:队列管理器中的项目名称提取逻辑未能正确处理不同格式的队列名称
1151
+
1152
+ **修复内容**:
1153
+
1154
+ 1. **队列管理器优化**:
1155
+ - 改进了[QueueConfig.from_settings](file:///Users/oscar/projects/Crawlo/crawlo/queue/queue_manager.py#L148-L180)方法,使其在`SCHEDULER_QUEUE_NAME`未设置时能正确使用基于项目名称的默认值
1156
+ - 修复了队列管理器中从队列名称提取项目名称的逻辑,确保能正确处理各种前缀情况
1157
+
1158
+ 2. **Redis队列实现改进**:
1159
+ - 在[RedisPriorityQueue](file:///Users/oscar/projects/Crawlo/crawlo/queue/redis_priority_queue.py#L39-L76)中添加了`_normalize_queue_name`方法来规范化队列名称
1160
+ - 处理了多重"crawlo"前缀的情况,确保队列名称符合统一规范
1161
+
1162
+ 3. **配置文件调整**:
1163
+ - 将`SCHEDULER_QUEUE_NAME`设置为注释状态,提供更大的配置灵活性
1164
+ - 在所有模板和示例项目的配置文件中保持了一致性
1165
+
1166
+ **验证测试**:
1167
+ 通过专门的测试脚本验证了修复效果,确保在各种队列命名情况下都能正确生成和识别Redis键名。
1168
+
1169
+ ---
1170
+
1098
1171
  <!-- 文档 section -->
1099
1172
  <h2 align="center">📚 文档</h2>
1100
1173
 
@@ -580,6 +580,51 @@ Crawlo支持三种队列类型,可通过`QUEUE_TYPE`配置项设置:
580
580
 
581
581
  推荐使用`auto`模式,让框架根据环境自动选择最适合的队列类型。
582
582
 
583
+ #### Redis Key 命名规范
584
+
585
+ 在分布式模式下,Crawlo框架使用Redis作为队列和去重存储。为了确保不同项目和爬虫之间的数据隔离,框架采用统一的Redis Key命名规范:
586
+
587
+ ##### 默认命名规则
588
+ Redis Key遵循以下命名格式:`crawlo:{PROJECT_NAME}:{component}:{identifier}`
589
+
590
+ 其中:
591
+ - `PROJECT_NAME`:项目名称,用于区分不同项目
592
+ - `component`:组件类型,如`queue`、`filter`、`item`
593
+ - `identifier`:具体标识符,如`requests`、`processing`、`failed`、`fingerprint`
594
+
595
+ ##### 具体Key格式
596
+ 1. **请求队列**:`crawlo:{PROJECT_NAME}:queue:requests`
597
+ - 用于存储待处理的请求任务
598
+
599
+ 2. **处理中队列**:`crawlo:{PROJECT_NAME}:queue:processing`
600
+ - 用于存储正在处理的请求任务
601
+
602
+ 3. **失败队列**:`crawlo:{PROJECT_NAME}:queue:failed`
603
+ - 用于存储处理失败的请求任务
604
+
605
+ 4. **请求去重**:`crawlo:{PROJECT_NAME}:filter:fingerprint`
606
+ - 用于存储请求URL的指纹,实现去重功能
607
+
608
+ 5. **数据项去重**:`crawlo:{PROJECT_NAME}:item:fingerprint`
609
+ - 用于存储数据项的指纹,防止重复存储
610
+
611
+ ##### 自定义队列名称
612
+ 用户可以通过`SCHEDULER_QUEUE_NAME`配置项自定义请求队列名称。处理中队列和失败队列会基于请求队列名称自动生成:
613
+ - 处理中队列:将`:queue:requests`替换为`:queue:processing`
614
+ - 失败队列:将`:queue:requests`替换为`:queue:failed`
615
+
616
+ 示例配置:
617
+ ```python
618
+ # settings.py
619
+ SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
620
+ ```
621
+
622
+ ##### 命名规范优势
623
+ 1. **命名空间隔离**:通过项目名称实现不同项目间的数据隔离
624
+ 2. **组件分类清晰**:通过组件类型区分不同功能模块
625
+ 3. **易于监控和管理**:统一的命名格式便于Redis监控和管理
626
+ 4. **防止命名冲突**:避免不同项目或组件间的Key冲突
627
+
583
628
  <!-- 配置系统 section -->
584
629
  <h2 align="center">🎛️ 配置系统</h2>
585
630
 
@@ -1045,6 +1090,34 @@ asyncio.run(process.crawl('my_spider_name'))
1045
1090
 
1046
1091
  ---
1047
1092
 
1093
+ <!-- Redis键名修复说明 section -->
1094
+ <h2 align="center">🔧 Redis键名修复说明</h2>
1095
+
1096
+ 在早期版本中,Crawlo框架存在Redis队列键名生成的双重前缀问题。具体表现为:
1097
+
1098
+ - **问题现象**:Redis队列键名出现双重"crawlo"前缀,如`crawlo:crawlo:queue:requests`而不是正确的`crawlo:{project_name}:queue:requests`
1099
+ - **影响范围**:影响分布式模式下的请求队列、处理队列和失败队列的正确识别和使用
1100
+ - **根本原因**:队列管理器中的项目名称提取逻辑未能正确处理不同格式的队列名称
1101
+
1102
+ **修复内容**:
1103
+
1104
+ 1. **队列管理器优化**:
1105
+ - 改进了[QueueConfig.from_settings](file:///Users/oscar/projects/Crawlo/crawlo/queue/queue_manager.py#L148-L180)方法,使其在`SCHEDULER_QUEUE_NAME`未设置时能正确使用基于项目名称的默认值
1106
+ - 修复了队列管理器中从队列名称提取项目名称的逻辑,确保能正确处理各种前缀情况
1107
+
1108
+ 2. **Redis队列实现改进**:
1109
+ - 在[RedisPriorityQueue](file:///Users/oscar/projects/Crawlo/crawlo/queue/redis_priority_queue.py#L39-L76)中添加了`_normalize_queue_name`方法来规范化队列名称
1110
+ - 处理了多重"crawlo"前缀的情况,确保队列名称符合统一规范
1111
+
1112
+ 3. **配置文件调整**:
1113
+ - 将`SCHEDULER_QUEUE_NAME`设置为注释状态,提供更大的配置灵活性
1114
+ - 在所有模板和示例项目的配置文件中保持了一致性
1115
+
1116
+ **验证测试**:
1117
+ 通过专门的测试脚本验证了修复效果,确保在各种队列命名情况下都能正确生成和识别Redis键名。
1118
+
1119
+ ---
1120
+
1048
1121
  <!-- 文档 section -->
1049
1122
  <h2 align="center">📚 文档</h2>
1050
1123
 
@@ -0,0 +1 @@
1
+ __version__ = '1.3.7'
@@ -57,9 +57,10 @@ class CrawloFramework:
57
57
 
58
58
  self._logger.info(f"Crawlo Framework Started {version}")
59
59
 
60
- # 获取运行模式并记录日志
60
+ # 获取运行模式和队列类型并记录日志
61
61
  run_mode = self._settings.get('RUN_MODE', 'unknown')
62
- self._logger.info(f"Run mode: {run_mode}")
62
+ queue_type = self._settings.get('QUEUE_TYPE', 'unknown')
63
+ self._logger.info(f"RunMode: {run_mode}, QueueType: {queue_type}")
63
64
 
64
65
  # 记录项目名称
65
66
  project_name = self._settings.get('PROJECT_NAME', 'unknown')
@@ -146,6 +146,17 @@ class QueueConfig:
146
146
  @classmethod
147
147
  def from_settings(cls, settings) -> 'QueueConfig':
148
148
  """Create configuration from settings"""
149
+ # 获取项目名称,用于生成默认队列名称
150
+ project_name = settings.get('PROJECT_NAME', 'default')
151
+ default_queue_name = f"crawlo:{project_name}:queue:requests"
152
+
153
+ # 如果设置了SCHEDULER_QUEUE_NAME,则使用该值,否则使用基于项目名称的默认值
154
+ scheduler_queue_name = settings.get('SCHEDULER_QUEUE_NAME')
155
+ if scheduler_queue_name is not None:
156
+ queue_name = scheduler_queue_name
157
+ else:
158
+ queue_name = default_queue_name
159
+
149
160
  return cls(
150
161
  queue_type=settings.get('QUEUE_TYPE', QueueType.AUTO),
151
162
  redis_url=settings.get('REDIS_URL'),
@@ -153,7 +164,7 @@ class QueueConfig:
153
164
  redis_port=settings.get_int('REDIS_PORT', 6379),
154
165
  redis_password=settings.get('REDIS_PASSWORD'),
155
166
  redis_db=settings.get_int('REDIS_DB', 0),
156
- queue_name=settings.get('SCHEDULER_QUEUE_NAME', 'crawlo:requests'),
167
+ queue_name=queue_name,
157
168
  max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
158
169
  max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
159
170
  timeout=settings.get_int('QUEUE_TIMEOUT', 300)
@@ -423,15 +434,23 @@ class QueueManager:
423
434
  except ImportError as e:
424
435
  raise RuntimeError(f"Redis队列不可用:未能导入RedisPriorityQueue ({e})")
425
436
 
426
- # 简化项目名称提取逻辑
437
+ # 修复项目名称提取逻辑,严格按照测试文件中的逻辑实现
427
438
  project_name = "default"
428
439
  if ':' in self.config.queue_name:
429
440
  parts = self.config.queue_name.split(':')
430
- # 跳过所有"crawlo"前缀,取第一个非"crawlo"部分作为项目名称
431
- for part in parts:
432
- if part != "crawlo":
433
- project_name = part
434
- break
441
+ if len(parts) >= 2:
442
+ # 处理可能的双重 crawlo 前缀
443
+ if parts[0] == "crawlo" and parts[1] == "crawlo":
444
+ # 双重 crawlo 前缀,取"crawlo"作为项目名称
445
+ project_name = "crawlo"
446
+ elif parts[0] == "crawlo":
447
+ # 正常的 crawlo 前缀,取第二个部分作为项目名称
448
+ project_name = parts[1]
449
+ else:
450
+ # 没有 crawlo 前缀,使用第一个部分作为项目名称
451
+ project_name = parts[0]
452
+ else:
453
+ project_name = self.config.queue_name or "default"
435
454
  else:
436
455
  project_name = self.config.queue_name or "default"
437
456
 
@@ -63,8 +63,8 @@ class RedisPriorityQueue:
63
63
  if queue_name is None:
64
64
  self.queue_name = f"crawlo:{module_name}:queue:requests"
65
65
  else:
66
- # 保持用户提供的队列名称不变,不做修改
67
- self.queue_name = queue_name
66
+ # 处理多重 crawlo 前缀,规范化队列名称
67
+ self.queue_name = self._normalize_queue_name(queue_name)
68
68
 
69
69
  # 如果未提供 processing_queue,则根据 queue_name 自动生成
70
70
  if processing_queue is None:
@@ -92,6 +92,47 @@ class RedisPriorityQueue:
92
92
  self._lock = asyncio.Lock() # 用于连接初始化的锁
93
93
  self.request_serializer = RequestSerializer() # 处理序列化
94
94
 
95
+ def _normalize_queue_name(self, queue_name: str) -> str:
96
+ """
97
+ 规范化队列名称,处理多重 crawlo 前缀
98
+
99
+ :param queue_name: 原始队列名称
100
+ :return: 规范化后的队列名称
101
+ """
102
+ # 如果队列名称已经符合规范(以 crawlo: 开头且不是 crawlo:crawlo:),则保持不变
103
+ if queue_name.startswith("crawlo:") and not queue_name.startswith("crawlo:crawlo:"):
104
+ return queue_name
105
+
106
+ # 处理三重 crawlo 前缀,简化为标准格式
107
+ if queue_name.startswith("crawlo:crawlo:crawlo:"):
108
+ # 三重 crawlo 前缀,简化为标准 crawlo: 格式
109
+ remaining = queue_name[21:] # 去掉 "crawlo:crawlo:crawlo:" 前缀
110
+ if remaining:
111
+ return f"crawlo:{remaining}"
112
+ else:
113
+ return "crawlo:requests" # 默认名称
114
+
115
+ # 处理双重 crawlo 前缀
116
+ elif queue_name.startswith("crawlo:crawlo:"):
117
+ # 双重 crawlo 前缀,简化为标准 crawlo: 格式
118
+ remaining = queue_name[14:] # 去掉 "crawlo:crawlo:" 前缀
119
+ if remaining:
120
+ return f"crawlo:{remaining}"
121
+ else:
122
+ return "crawlo:requests" # 默认名称
123
+
124
+ # 处理无 crawlo 前缀的情况
125
+ elif not queue_name.startswith("crawlo:"):
126
+ # 无 crawlo 前缀,添加 crawlo: 前缀
127
+ if queue_name:
128
+ return f"crawlo:{queue_name}"
129
+ else:
130
+ return "crawlo:requests" # 默认名称
131
+
132
+ # 其他情况,保持不变
133
+ else:
134
+ return queue_name
135
+
95
136
  async def connect(self, max_retries=3, delay=1):
96
137
  """异步连接 Redis,支持重试"""
97
138
  async with self._lock:
@@ -60,7 +60,8 @@ REQUEST_GENERATION_INTERVAL = 0.01 # 请求生成间隔(秒)
60
60
  ENABLE_CONTROLLED_REQUEST_GENERATION = False # 是否启用受控请求生成
61
61
 
62
62
  # 调度器队列名称(遵循统一命名规范)
63
- SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
63
+ # 当使用Redis队列时,取消注释并设置此值,或在项目配置文件中设置
64
+ # SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
64
65
 
65
66
  # 队列类型:memory/redis/auto
66
67
  QUEUE_TYPE = 'auto'
@@ -97,13 +98,12 @@ if REDIS_PASSWORD:
97
98
  else:
98
99
  REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
99
100
 
100
- # 统一的Redis key命名规范配置
101
- # REDIS_KEY_PREFIX 已移至各组件中,使用统一的命名规范
102
- # crawlo:{PROJECT_NAME}:filter:fingerprint (请求去重)
103
- # crawlo:{PROJECT_NAME}:item:fingerprint (数据项去重)
104
- # crawlo:{PROJECT_NAME}:queue:requests (请求队列)
105
- # crawlo:{PROJECT_NAME}:queue:processing (处理中队列)
106
- # crawlo:{PROJECT_NAME}:queue:failed (失败队列)
101
+ # Redis key命名规范已封装到框架内部组件中,用户无需手动配置:
102
+ # - 请求去重: crawlo:{PROJECT_NAME}:filter:fingerprint
103
+ # - 数据项去重: crawlo:{PROJECT_NAME}:item:fingerprint
104
+ # - 请求队列: crawlo:{PROJECT_NAME}:queue:requests
105
+ # - 处理中队列: crawlo:{PROJECT_NAME}:queue:processing
106
+ # - 失败队列: crawlo:{PROJECT_NAME}:queue:failed
107
107
 
108
108
  REDIS_TTL = 0 # 指纹过期时间(0 表示永不过期)
109
109
  CLEANUP_FP = 0 # 程序结束时是否清理指纹(0=不清理)
@@ -29,6 +29,9 @@ DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
29
29
  # ============================== 队列配置 ==============================
30
30
  # 队列类型: 'memory', 'redis', 'auto'
31
31
  QUEUE_TYPE = 'memory'
32
+ # 当使用Redis队列时,可自定义队列名称
33
+ # 队列名称遵循统一命名规范: crawlo:{PROJECT_NAME}:queue:requests
34
+ # SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
32
35
 
33
36
  # ============================== 去重过滤器 ==============================
34
37
  FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
@@ -28,6 +28,9 @@ DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
28
28
 
29
29
  # ============================== 队列配置 ==============================
30
30
  QUEUE_TYPE = 'redis'
31
+ # 当使用Redis队列时,可自定义队列名称
32
+ # 队列名称遵循统一命名规范: crawlo:{PROJECT_NAME}:queue:requests
33
+ # SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
31
34
 
32
35
  # ============================== 去重过滤器 ==============================
33
36
  FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crawlo
3
- Version: 1.3.5
3
+ Version: 1.3.7
4
4
  Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
5
5
  Home-page: https://github.com/crawl-coder/Crawlo.git
6
6
  Author: crawl-coder
@@ -630,6 +630,51 @@ Crawlo支持三种队列类型,可通过`QUEUE_TYPE`配置项设置:
630
630
 
631
631
  推荐使用`auto`模式,让框架根据环境自动选择最适合的队列类型。
632
632
 
633
+ #### Redis Key 命名规范
634
+
635
+ 在分布式模式下,Crawlo框架使用Redis作为队列和去重存储。为了确保不同项目和爬虫之间的数据隔离,框架采用统一的Redis Key命名规范:
636
+
637
+ ##### 默认命名规则
638
+ Redis Key遵循以下命名格式:`crawlo:{PROJECT_NAME}:{component}:{identifier}`
639
+
640
+ 其中:
641
+ - `PROJECT_NAME`:项目名称,用于区分不同项目
642
+ - `component`:组件类型,如`queue`、`filter`、`item`
643
+ - `identifier`:具体标识符,如`requests`、`processing`、`failed`、`fingerprint`
644
+
645
+ ##### 具体Key格式
646
+ 1. **请求队列**:`crawlo:{PROJECT_NAME}:queue:requests`
647
+ - 用于存储待处理的请求任务
648
+
649
+ 2. **处理中队列**:`crawlo:{PROJECT_NAME}:queue:processing`
650
+ - 用于存储正在处理的请求任务
651
+
652
+ 3. **失败队列**:`crawlo:{PROJECT_NAME}:queue:failed`
653
+ - 用于存储处理失败的请求任务
654
+
655
+ 4. **请求去重**:`crawlo:{PROJECT_NAME}:filter:fingerprint`
656
+ - 用于存储请求URL的指纹,实现去重功能
657
+
658
+ 5. **数据项去重**:`crawlo:{PROJECT_NAME}:item:fingerprint`
659
+ - 用于存储数据项的指纹,防止重复存储
660
+
661
+ ##### 自定义队列名称
662
+ 用户可以通过`SCHEDULER_QUEUE_NAME`配置项自定义请求队列名称。处理中队列和失败队列会基于请求队列名称自动生成:
663
+ - 处理中队列:将`:queue:requests`替换为`:queue:processing`
664
+ - 失败队列:将`:queue:requests`替换为`:queue:failed`
665
+
666
+ 示例配置:
667
+ ```python
668
+ # settings.py
669
+ SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
670
+ ```
671
+
672
+ ##### 命名规范优势
673
+ 1. **命名空间隔离**:通过项目名称实现不同项目间的数据隔离
674
+ 2. **组件分类清晰**:通过组件类型区分不同功能模块
675
+ 3. **易于监控和管理**:统一的命名格式便于Redis监控和管理
676
+ 4. **防止命名冲突**:避免不同项目或组件间的Key冲突
677
+
633
678
  <!-- 配置系统 section -->
634
679
  <h2 align="center">🎛️ 配置系统</h2>
635
680
 
@@ -1095,6 +1140,34 @@ asyncio.run(process.crawl('my_spider_name'))
1095
1140
 
1096
1141
  ---
1097
1142
 
1143
+ <!-- Redis键名修复说明 section -->
1144
+ <h2 align="center">🔧 Redis键名修复说明</h2>
1145
+
1146
+ 在早期版本中,Crawlo框架存在Redis队列键名生成的双重前缀问题。具体表现为:
1147
+
1148
+ - **问题现象**:Redis队列键名出现双重"crawlo"前缀,如`crawlo:crawlo:queue:requests`而不是正确的`crawlo:{project_name}:queue:requests`
1149
+ - **影响范围**:影响分布式模式下的请求队列、处理队列和失败队列的正确识别和使用
1150
+ - **根本原因**:队列管理器中的项目名称提取逻辑未能正确处理不同格式的队列名称
1151
+
1152
+ **修复内容**:
1153
+
1154
+ 1. **队列管理器优化**:
1155
+ - 改进了 `QueueConfig.from_settings`([crawlo/queue/queue_manager.py](crawlo/queue/queue_manager.py))方法,使其在`SCHEDULER_QUEUE_NAME`未设置时能正确使用基于项目名称的默认值
1156
+ - 修复了队列管理器中从队列名称提取项目名称的逻辑,确保能正确处理各种前缀情况
1157
+
1158
+ 2. **Redis队列实现改进**:
1159
+ - 在 `RedisPriorityQueue`([crawlo/queue/redis_priority_queue.py](crawlo/queue/redis_priority_queue.py))中添加了`_normalize_queue_name`方法来规范化队列名称
1160
+ - 处理了多重"crawlo"前缀的情况,确保队列名称符合统一规范
1161
+
1162
+ 3. **配置文件调整**:
1163
+ - 将`SCHEDULER_QUEUE_NAME`设置为注释状态,提供更大的配置灵活性
1164
+ - 在所有模板和示例项目的配置文件中保持了一致性
1165
+
1166
+ **验证测试**:
1167
+ 通过专门的测试脚本验证了修复效果,确保在各种队列命名情况下都能正确生成和识别Redis键名。
1168
+
1169
+ ---
1170
+
1098
1171
  <!-- 文档 section -->
1099
1172
  <h2 align="center">📚 文档</h2>
1100
1173
 
@@ -203,6 +203,7 @@ tests/simple_crawlo_test.py
203
203
  tests/simple_log_test.py
204
204
  tests/simple_log_test2.py
205
205
  tests/simple_optimization_test.py
206
+ tests/simple_queue_type_test.py
206
207
  tests/simple_spider_test.py
207
208
  tests/simple_test.py
208
209
  tests/spider_log_timing_test.py
@@ -264,6 +265,8 @@ tests/test_proxy_strategies.py
264
265
  tests/test_queue_empty_check.py
265
266
  tests/test_queue_manager_double_crawlo.py
266
267
  tests/test_queue_manager_redis_key.py
268
+ tests/test_queue_naming.py
269
+ tests/test_queue_type.py
267
270
  tests/test_random_user_agent.py
268
271
  tests/test_real_scenario_proxy.py
269
272
  tests/test_redis_config.py
@@ -271,6 +274,7 @@ tests/test_redis_connection_pool.py
271
274
  tests/test_redis_key_naming.py
272
275
  tests/test_redis_key_validator.py
273
276
  tests/test_redis_queue.py
277
+ tests/test_redis_queue_name_fix.py
274
278
  tests/test_request_ignore_middleware.py
275
279
  tests/test_request_params.py
276
280
  tests/test_request_serialization.py
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 简单测试 QUEUE_TYPE 配置获取
5
+ 验证我们的日志格式修改是否正常工作
6
+ """
7
+
8
+ import sys
9
+ import os
10
+
11
+ # 添加项目根目录到路径
12
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
13
+
14
+ from crawlo.config import CrawloConfig
15
+ from crawlo.framework import CrawloFramework
16
+
17
+
18
+ def test_log_format():
19
+ """测试日志格式修改是否正常工作"""
20
+ print("=== 测试日志格式修改 ===")
21
+
22
+ # 创建单机模式配置
23
+ config = CrawloConfig.standalone(concurrency=4)
24
+
25
+ # 创建框架实例,这会触发日志输出
26
+ framework = CrawloFramework(config.to_dict())
27
+
28
+ # 获取配置信息
29
+ run_mode = framework.settings.get('RUN_MODE', 'not found')
30
+ queue_type = framework.settings.get('QUEUE_TYPE', 'not found')
31
+
32
+ print(f"从配置中获取到的信息:")
33
+ print(f" RunMode: {run_mode}")
34
+ print(f" QueueType: {queue_type}")
35
+
36
+ print("\n✅ 日志格式修改测试完成")
37
+
38
+
39
+ if __name__ == "__main__":
40
+ print("开始简单测试 QUEUE_TYPE 配置获取...")
41
+ test_log_format()
42
+ print("\n测试结束!")
@@ -45,7 +45,7 @@ class ProxyTestSpider(Spider):
45
45
 
46
46
  item = TestItem(
47
47
  url=response.url,
48
- status=response.status,
48
+ status=response.status_code,
49
49
  proxy=str(response.meta.get('proxy', 'No proxy'))
50
50
  )
51
51
 
@@ -34,18 +34,18 @@ async def test_redis_queue_naming():
34
34
  {
35
35
  "name": "双重 crawlo 前缀",
36
36
  "queue_name": "crawlo:crawlo:queue:requests",
37
- "expected_module": "crawlo",
38
- "expected_queue": "crawlo:crawlo:queue:requests",
39
- "expected_processing": "crawlo:crawlo:queue:processing",
40
- "expected_failed": "crawlo:crawlo:queue:failed"
37
+ "expected_module": "test_project",
38
+ "expected_queue": "crawlo:queue:requests", # 修复后的期望值
39
+ "expected_processing": "crawlo:queue:processing",
40
+ "expected_failed": "crawlo:queue:failed"
41
41
  },
42
42
  {
43
43
  "name": "三重 crawlo 前缀",
44
44
  "queue_name": "crawlo:crawlo:crawlo:queue:requests",
45
- "expected_module": "crawlo",
46
- "expected_queue": "crawlo:crawlo:queue:requests",
47
- "expected_processing": "crawlo:crawlo:queue:processing",
48
- "expected_failed": "crawlo:crawlo:queue:failed"
45
+ "expected_module": "test_project",
46
+ "expected_queue": "crawlo:queue:requests", # 修复后的期望值
47
+ "expected_processing": "crawlo:queue:processing",
48
+ "expected_failed": "crawlo:queue:failed"
49
49
  },
50
50
  {
51
51
  "name": "无 crawlo 前缀",
@@ -138,11 +138,8 @@ async def test_queue_manager_naming():
138
138
  if len(parts) >= 2:
139
139
  # 处理可能的双重 crawlo 前缀
140
140
  if parts[0] == "crawlo" and parts[1] == "crawlo":
141
- # 双重 crawlo 前缀,取第三个部分作为项目名称
142
- if len(parts) >= 3:
143
- project_name = parts[2]
144
- else:
145
- project_name = "default"
141
+ # 双重 crawlo 前缀,取"crawlo"作为项目名称
142
+ project_name = "crawlo"
146
143
  elif parts[0] == "crawlo":
147
144
  # 正常的 crawlo 前缀,取第二个部分作为项目名称
148
145
  project_name = parts[1]