crawlo 1.3.0__tar.gz → 1.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (228)
  1. {crawlo-1.3.0/crawlo.egg-info → crawlo-1.3.1}/PKG-INFO +13 -4
  2. {crawlo-1.3.0 → crawlo-1.3.1}/README.md +12 -3
  3. crawlo-1.3.1/crawlo/__version__.py +1 -0
  4. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/commands/utils.py +12 -2
  5. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/crawler.py +91 -20
  6. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/middleware/offsite.py +2 -1
  7. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/mode_manager.py +9 -73
  8. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/pipelines/mysql_pipeline.py +5 -4
  9. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/pipelines/pipeline_manager.py +2 -1
  10. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/project.py +16 -3
  11. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/settings/default_settings.py +13 -4
  12. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/settings/setting_manager.py +29 -6
  13. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/log.py +4 -4
  14. {crawlo-1.3.0 → crawlo-1.3.1/crawlo.egg-info}/PKG-INFO +13 -4
  15. crawlo-1.3.0/crawlo/__version__.py +0 -1
  16. {crawlo-1.3.0 → crawlo-1.3.1}/LICENSE +0 -0
  17. {crawlo-1.3.0 → crawlo-1.3.1}/MANIFEST.in +0 -0
  18. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/__init__.py +0 -0
  19. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/cli.py +0 -0
  20. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/commands/__init__.py +0 -0
  21. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/commands/check.py +0 -0
  22. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/commands/genspider.py +0 -0
  23. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/commands/help.py +0 -0
  24. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/commands/list.py +0 -0
  25. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/commands/run.py +0 -0
  26. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/commands/startproject.py +0 -0
  27. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/commands/stats.py +0 -0
  28. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/config.py +0 -0
  29. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/config_validator.py +0 -0
  30. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/core/__init__.py +0 -0
  31. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/core/engine.py +0 -0
  32. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/core/processor.py +0 -0
  33. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/core/scheduler.py +0 -0
  34. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/data/__init__.py +0 -0
  35. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/data/user_agents.py +0 -0
  36. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/downloader/__init__.py +0 -0
  37. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/downloader/aiohttp_downloader.py +0 -0
  38. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/downloader/cffi_downloader.py +0 -0
  39. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/downloader/httpx_downloader.py +0 -0
  40. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/downloader/hybrid_downloader.py +0 -0
  41. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/downloader/playwright_downloader.py +0 -0
  42. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/downloader/selenium_downloader.py +0 -0
  43. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/event.py +0 -0
  44. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/exceptions.py +0 -0
  45. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/extension/__init__.py +0 -0
  46. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/extension/health_check.py +0 -0
  47. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/extension/log_interval.py +0 -0
  48. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/extension/log_stats.py +0 -0
  49. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/extension/logging_extension.py +0 -0
  50. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/extension/memory_monitor.py +0 -0
  51. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/extension/performance_profiler.py +0 -0
  52. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/extension/request_recorder.py +0 -0
  53. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/filters/__init__.py +0 -0
  54. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/filters/aioredis_filter.py +0 -0
  55. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/filters/memory_filter.py +0 -0
  56. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/items/__init__.py +0 -0
  57. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/items/base.py +0 -0
  58. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/items/fields.py +0 -0
  59. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/items/items.py +0 -0
  60. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/middleware/__init__.py +0 -0
  61. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/middleware/default_header.py +0 -0
  62. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/middleware/download_delay.py +0 -0
  63. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/middleware/middleware_manager.py +0 -0
  64. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/middleware/proxy.py +0 -0
  65. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/middleware/request_ignore.py +0 -0
  66. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/middleware/response_code.py +0 -0
  67. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/middleware/response_filter.py +0 -0
  68. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/middleware/retry.py +0 -0
  69. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/middleware/simple_proxy.py +0 -0
  70. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/network/__init__.py +0 -0
  71. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/network/request.py +0 -0
  72. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/network/response.py +0 -0
  73. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/pipelines/__init__.py +0 -0
  74. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
  75. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/pipelines/console_pipeline.py +0 -0
  76. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/pipelines/csv_pipeline.py +0 -0
  77. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
  78. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/pipelines/json_pipeline.py +0 -0
  79. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
  80. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/pipelines/mongo_pipeline.py +0 -0
  81. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
  82. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/queue/__init__.py +0 -0
  83. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/queue/pqueue.py +0 -0
  84. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/queue/queue_manager.py +0 -0
  85. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/queue/redis_priority_queue.py +0 -0
  86. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/settings/__init__.py +0 -0
  87. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/spider/__init__.py +0 -0
  88. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/stats_collector.py +0 -0
  89. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/subscriber.py +0 -0
  90. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/task_manager.py +0 -0
  91. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  92. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/project/__init__.py.tmpl +0 -0
  93. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/project/items.py.tmpl +0 -0
  94. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/project/middlewares.py.tmpl +0 -0
  95. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/project/pipelines.py.tmpl +0 -0
  96. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/project/settings.py.tmpl +0 -0
  97. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/project/settings_distributed.py.tmpl +0 -0
  98. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
  99. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
  100. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/project/settings_minimal.py.tmpl +0 -0
  101. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
  102. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  103. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/run.py.tmpl +0 -0
  104. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/templates/spider/spider.py.tmpl +0 -0
  105. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/__init__.py +0 -0
  106. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/anti_crawler.py +0 -0
  107. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/authenticated_proxy.py +0 -0
  108. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/data_formatter.py +0 -0
  109. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/data_validator.py +0 -0
  110. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/date_tools.py +0 -0
  111. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/distributed_coordinator.py +0 -0
  112. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/encoding_converter.py +0 -0
  113. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/request_tools.py +0 -0
  114. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/retry_mechanism.py +0 -0
  115. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/scenario_adapter.py +0 -0
  116. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/tools/text_cleaner.py +0 -0
  117. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/__init__.py +0 -0
  118. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/batch_processor.py +0 -0
  119. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/controlled_spider_mixin.py +0 -0
  120. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/db_helper.py +0 -0
  121. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/enhanced_error_handler.py +0 -0
  122. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/env_config.py +0 -0
  123. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/error_handler.py +0 -0
  124. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/func_tools.py +0 -0
  125. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/large_scale_config.py +0 -0
  126. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/large_scale_helper.py +0 -0
  127. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/performance_monitor.py +0 -0
  128. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/queue_helper.py +0 -0
  129. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/redis_connection_pool.py +0 -0
  130. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/redis_key_validator.py +0 -0
  131. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/request.py +0 -0
  132. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/request_serializer.py +0 -0
  133. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/spider_loader.py +0 -0
  134. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/system.py +0 -0
  135. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/tools.py +0 -0
  136. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo/utils/url.py +0 -0
  137. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo.egg-info/SOURCES.txt +0 -0
  138. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo.egg-info/dependency_links.txt +0 -0
  139. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo.egg-info/entry_points.txt +0 -0
  140. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo.egg-info/requires.txt +0 -0
  141. {crawlo-1.3.0 → crawlo-1.3.1}/crawlo.egg-info/top_level.txt +0 -0
  142. {crawlo-1.3.0 → crawlo-1.3.1}/examples/__init__.py +0 -0
  143. {crawlo-1.3.0 → crawlo-1.3.1}/pyproject.toml +0 -0
  144. {crawlo-1.3.0 → crawlo-1.3.1}/requirements.txt +0 -0
  145. {crawlo-1.3.0 → crawlo-1.3.1}/setup.cfg +0 -0
  146. {crawlo-1.3.0 → crawlo-1.3.1}/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -0
  147. {crawlo-1.3.0 → crawlo-1.3.1}/tests/__init__.py +0 -0
  148. {crawlo-1.3.0 → crawlo-1.3.1}/tests/advanced_tools_example.py +0 -0
  149. {crawlo-1.3.0 → crawlo-1.3.1}/tests/authenticated_proxy_example.py +0 -0
  150. {crawlo-1.3.0 → crawlo-1.3.1}/tests/cleaners_example.py +0 -0
  151. {crawlo-1.3.0 → crawlo-1.3.1}/tests/config_validation_demo.py +0 -0
  152. {crawlo-1.3.0 → crawlo-1.3.1}/tests/controlled_spider_example.py +0 -0
  153. {crawlo-1.3.0 → crawlo-1.3.1}/tests/date_tools_example.py +0 -0
  154. {crawlo-1.3.0 → crawlo-1.3.1}/tests/debug_pipelines.py +0 -0
  155. {crawlo-1.3.0 → crawlo-1.3.1}/tests/dynamic_loading_example.py +0 -0
  156. {crawlo-1.3.0 → crawlo-1.3.1}/tests/dynamic_loading_test.py +0 -0
  157. {crawlo-1.3.0 → crawlo-1.3.1}/tests/env_config_example.py +0 -0
  158. {crawlo-1.3.0 → crawlo-1.3.1}/tests/error_handling_example.py +0 -0
  159. {crawlo-1.3.0 → crawlo-1.3.1}/tests/redis_key_validation_demo.py +0 -0
  160. {crawlo-1.3.0 → crawlo-1.3.1}/tests/request_params_example.py +0 -0
  161. {crawlo-1.3.0 → crawlo-1.3.1}/tests/response_improvements_example.py +0 -0
  162. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_advanced_tools.py +0 -0
  163. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_all_redis_key_configs.py +0 -0
  164. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_authenticated_proxy.py +0 -0
  165. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_cleaners.py +0 -0
  166. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_comprehensive.py +0 -0
  167. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_config_consistency.py +0 -0
  168. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_config_merge.py +0 -0
  169. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_config_validator.py +0 -0
  170. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_crawlo_proxy_integration.py +0 -0
  171. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_date_tools.py +0 -0
  172. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_default_header_middleware.py +0 -0
  173. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_distributed.py +0 -0
  174. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_double_crawlo_fix.py +0 -0
  175. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_double_crawlo_fix_simple.py +0 -0
  176. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_download_delay_middleware.py +0 -0
  177. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_downloader_proxy_compatibility.py +0 -0
  178. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_dynamic_downloaders_proxy.py +0 -0
  179. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_dynamic_proxy.py +0 -0
  180. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_dynamic_proxy_config.py +0 -0
  181. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_dynamic_proxy_real.py +0 -0
  182. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_edge_cases.py +0 -0
  183. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_enhanced_error_handler.py +0 -0
  184. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_env_config.py +0 -0
  185. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_error_handler_compatibility.py +0 -0
  186. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_final_validation.py +0 -0
  187. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_framework_env_usage.py +0 -0
  188. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_integration.py +0 -0
  189. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_item_dedup_redis_key.py +0 -0
  190. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_mode_consistency.py +0 -0
  191. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_offsite_middleware.py +0 -0
  192. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_parsel.py +0 -0
  193. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_performance.py +0 -0
  194. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_proxy_api.py +0 -0
  195. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_proxy_health_check.py +0 -0
  196. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_proxy_middleware.py +0 -0
  197. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_proxy_middleware_enhanced.py +0 -0
  198. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_proxy_middleware_integration.py +0 -0
  199. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_proxy_middleware_refactored.py +0 -0
  200. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_proxy_providers.py +0 -0
  201. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_proxy_stats.py +0 -0
  202. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_proxy_strategies.py +0 -0
  203. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_queue_manager_double_crawlo.py +0 -0
  204. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_queue_manager_redis_key.py +0 -0
  205. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_random_user_agent.py +0 -0
  206. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_real_scenario_proxy.py +0 -0
  207. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_redis_config.py +0 -0
  208. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_redis_connection_pool.py +0 -0
  209. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_redis_key_naming.py +0 -0
  210. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_redis_key_validator.py +0 -0
  211. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_redis_queue.py +0 -0
  212. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_request_ignore_middleware.py +0 -0
  213. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_request_params.py +0 -0
  214. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_request_serialization.py +0 -0
  215. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_response_code_middleware.py +0 -0
  216. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_response_filter_middleware.py +0 -0
  217. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_response_improvements.py +0 -0
  218. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_retry_middleware.py +0 -0
  219. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_scheduler.py +0 -0
  220. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_scheduler_config_update.py +0 -0
  221. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_simple_response.py +0 -0
  222. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_telecom_spider_redis_key.py +0 -0
  223. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_template_content.py +0 -0
  224. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_template_redis_key.py +0 -0
  225. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_tools.py +0 -0
  226. {crawlo-1.3.0 → crawlo-1.3.1}/tests/test_user_agents.py +0 -0
  227. {crawlo-1.3.0 → crawlo-1.3.1}/tests/tools_example.py +0 -0
  228. {crawlo-1.3.0 → crawlo-1.3.1}/tests/verify_distributed.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.3.0
+Version: 1.3.1
 Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder
@@ -56,7 +56,7 @@ Requires-Dist: selenium>=3.141.0; extra == "all"
 
 <p align="center">
   <a href="https://www.python.org/downloads/">
-    <img src="https://img.shields.io/badge/python-3.8%2B-blue" alt="Python Version">
+    <img src="https://img.shields.io/badge/python-%3C%3D3.12-blue" alt="Python Version">
   </a>
   <a href="LICENSE">
     <img src="https://img.shields.io/badge/license-MIT-green" alt="License">
@@ -429,13 +429,22 @@ Crawlo框架的中间件、管道和扩展组件采用模块化设计,框架
 
 用户可以通过`CUSTOM_MIDDLEWARES`配置自定义中间件:
 
-```python
+``python
 # settings.py
 CUSTOM_MIDDLEWARES = [
     'myproject.middlewares.CustomMiddleware',
 ]
 ```
 
+> **注意**:DefaultHeaderMiddleware 和 OffsiteMiddleware 需要相应的配置才能启用:
+> - DefaultHeaderMiddleware 需要配置 `DEFAULT_REQUEST_HEADERS` 或 `USER_AGENT` 参数
+> - OffsiteMiddleware 需要配置 `ALLOWED_DOMAINS` 参数
+>
+> 如果未配置相应参数,这些中间件会因为 NotConfiguredError 而被禁用。
+
+> **注意**:中间件的顺序很重要。SimpleProxyMiddleware 通常放在列表末尾,
+> 这样可以在所有默认中间件处理后再应用代理设置。
+
 #### 管道配置
 
 框架默认加载以下管道:
@@ -930,7 +939,7 @@ Crawlo框架内置了多种中间件,其中代理中间件有两种实现:
 
 如果需要使用简化版代理中间件,可以在配置文件中替换默认的代理中间件:
 
-```python
+``python
 # settings.py
 MIDDLEWARES = [
     # 注释掉复杂版代理中间件
@@ -6,7 +6,7 @@
 
 <p align="center">
   <a href="https://www.python.org/downloads/">
-    <img src="https://img.shields.io/badge/python-3.8%2B-blue" alt="Python Version">
+    <img src="https://img.shields.io/badge/python-%3C%3D3.12-blue" alt="Python Version">
   </a>
   <a href="LICENSE">
     <img src="https://img.shields.io/badge/license-MIT-green" alt="License">
@@ -379,13 +379,22 @@ Crawlo框架的中间件、管道和扩展组件采用模块化设计,框架
 
 用户可以通过`CUSTOM_MIDDLEWARES`配置自定义中间件:
 
-```python
+``python
 # settings.py
 CUSTOM_MIDDLEWARES = [
     'myproject.middlewares.CustomMiddleware',
 ]
 ```
 
+> **注意**:DefaultHeaderMiddleware 和 OffsiteMiddleware 需要相应的配置才能启用:
+> - DefaultHeaderMiddleware 需要配置 `DEFAULT_REQUEST_HEADERS` 或 `USER_AGENT` 参数
+> - OffsiteMiddleware 需要配置 `ALLOWED_DOMAINS` 参数
+>
+> 如果未配置相应参数,这些中间件会因为 NotConfiguredError 而被禁用。
+
+> **注意**:中间件的顺序很重要。SimpleProxyMiddleware 通常放在列表末尾,
+> 这样可以在所有默认中间件处理后再应用代理设置。
+
 #### 管道配置
 
 框架默认加载以下管道:
@@ -880,7 +889,7 @@ Crawlo框架内置了多种中间件,其中代理中间件有两种实现:
 
 如果需要使用简化版代理中间件,可以在配置文件中替换默认的代理中间件:
 
-```python
+``python
 # settings.py
 MIDDLEWARES = [
     # 注释掉复杂版代理中间件
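The README additions above imply two easy-to-miss configuration requirements: DefaultHeaderMiddleware needs `DEFAULT_REQUEST_HEADERS` or `USER_AGENT`, and OffsiteMiddleware needs `ALLOWED_DOMAINS`; otherwise both are disabled with `NotConfiguredError`. A minimal illustrative `settings.py` sketch (the setting names come from the README excerpt above; the values and the custom middleware path are placeholders, not crawlo defaults):

```python
# settings.py — illustrative sketch only; adjust values for your project.

# Needed by DefaultHeaderMiddleware (either of these keys is enough).
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "en-US,en;q=0.9",
}
USER_AGENT = "Mozilla/5.0 (compatible; my-crawler/1.0)"

# Needed by OffsiteMiddleware; requests to other domains are filtered out.
ALLOWED_DOMAINS = ["example.com"]

# Per the README note, a proxy middleware such as SimpleProxyMiddleware is
# usually appended last so it runs after the default middlewares.
CUSTOM_MIDDLEWARES = [
    "myproject.middlewares.CustomMiddleware",
]
```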
@@ -0,0 +1 @@
+__version__ = '1.3.1'
@@ -133,8 +133,11 @@ def validate_spider_name(spider_name: str) -> bool:
        bool: 是否有效
    """
    import re
+   # 清理爬虫名称中的不可见字符
+   cleaned_name = ''.join(c for c in spider_name if not unicodedata.category(c).startswith('C'))
+
    # 爬虫名称应该是有效的Python标识符
-   return spider_name.isidentifier() and re.match(r'^[a-z][a-z0-9_]*$', spider_name)
+   return cleaned_name.isidentifier() and re.match(r'^[a-z][a-z0-9_]*$', cleaned_name)
 
 
 def format_file_size(size_bytes: int) -> str:
@@ -181,7 +184,14 @@ def is_valid_domain(domain: str) -> bool:
        bool: 是否有效
    """
    import re
+   # 清理域名中的不可见字符
+   cleaned_domain = ''.join(c for c in domain if not unicodedata.category(c).startswith('C'))
+
    pattern = re.compile(
        r'^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)*[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$'
    )
-   return bool(pattern.match(domain))
+   return bool(pattern.match(cleaned_domain))
+
+
+# 添加导入
+import unicodedata
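Both helpers now strip invisible characters before validating. The patch places `import unicodedata` at the end of the module; since module-level imports run at import time, the name is still bound before either function is called. A standalone sketch of the same cleanup step (same category check as the diff; the helper name and demo strings are mine):

```python
import unicodedata

def strip_invisible(text: str) -> str:
    # Drop characters whose Unicode category starts with "C"
    # (Cc control, Cf format, Cs surrogate, Co private use, Cn unassigned).
    return ''.join(c for c in text if not unicodedata.category(c).startswith('C'))

print(strip_invisible("my_spider\u200b"))  # zero-width space removed -> 'my_spider'
print(strip_invisible("example.com\r\n"))  # trailing CR/LF removed   -> 'example.com'
```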
@@ -36,13 +36,12 @@ from .spider import Spider, get_global_spider_registry
 from .core.engine import Engine
 from .subscriber import Subscriber
 from .extension import ExtensionManager
+from crawlo.utils.log import get_logger
 from .stats_collector import StatsCollector
 from .event import spider_opened, spider_closed
 from .settings.setting_manager import SettingManager
 from crawlo.project import merge_settings, get_settings
 
-# 使用自定义日志系统
-from crawlo.utils.log import get_logger
 logger = get_logger(__name__)
 
 
@@ -112,7 +111,12 @@ class Crawler:
     - Exception handling and cleanup
     """
 
-    def __init__(self, spider_cls: Type[Spider], settings: SettingManager, context: Optional[CrawlerContext] = None):
+    def __init__(
+            self,
+            spider_cls: Type[Spider],
+            settings: SettingManager,
+            context: Optional[CrawlerContext] = None
+    ):
         self.spider_cls = spider_cls
         self.spider: Optional[Spider] = None
         self.engine: Optional[Engine] = None
@@ -137,6 +141,22 @@
             'error_count': 0
         }
 
+        # Initialize components
+        self.subscriber = self._create_subscriber()
+        self.spider = self._create_spider()
+        self.engine = self._create_engine()
+        self.stats = self._create_stats()
+        # Note: Do not initialize extension manager here, let it initialize in the engine
+
+        # Validate crawler state
+        self._validate_crawler_state()
+
+        # 打印启动信息,确保在日志系统配置之后打印
+        self._log_startup_info()
+
+        # 将启动爬虫名称的日志移到这里,确保在日志系统配置之后打印
+        logger.info(f"Starting running {self.spider.name}")
+
     async def crawl(self):
         """
         Start the crawler core process
@@ -233,6 +253,52 @@
             return self._end_time - self._start_time
         return 0.0
 
+    def _log_startup_info(self):
+        """Print startup information, including run mode and key configuration checks"""
+        # Get run mode
+        run_mode = self.settings.get('RUN_MODE', 'standalone')
+
+        # Get version number
+        version = self.settings.get('VERSION', '1.0.0')
+        if not version or version == 'None':
+            version = '1.0.0'
+
+        # Print framework start info
+        logger.info(f"Crawlo Framework Started {version}")
+
+        # Add mode info if available
+        mode_info = self.settings.get('_mode_info')
+        if mode_info:
+            logger.info(mode_info)
+        else:
+            # 如果没有_mode_info,添加默认信息
+            logger.info("使用单机模式 - 简单快速,适合开发和中小规模爬取")
+
+        # Get actual queue type
+        queue_type = self.settings.get('QUEUE_TYPE', 'memory')
+
+        # Display information based on run mode and queue type combination
+        if run_mode == 'distributed':
+            logger.info("Run Mode: distributed")
+            logger.info("Distributed Mode - Multi-node collaboration supported")
+            # Show Redis configuration
+            redis_host = self.settings.get('REDIS_HOST', 'localhost')
+            redis_port = self.settings.get('REDIS_PORT', 6379)
+            logger.info(f"Redis Address: {redis_host}:{redis_port}")
+        elif run_mode == 'standalone':
+            if queue_type == 'redis':
+                logger.info("Run Mode: standalone+redis")
+                # Show Redis configuration
+                redis_host = self.settings.get('REDIS_HOST', 'localhost')
+                redis_port = self.settings.get('REDIS_PORT', 6379)
+                logger.info(f"Redis Address: {redis_host}:{redis_port}")
+            elif queue_type == 'auto':
+                logger.info("Run Mode: standalone+auto")
+            else:  # memory
+                logger.info("Run Mode: standalone")
+        else:
+            logger.info(f"Run Mode: {run_mode}")
+
     async def _ensure_cleanup(self):
         """Ensure resource cleanup"""
         try:
@@ -483,7 +549,10 @@ class CrawlerProcess:
             signal.signal(signal.SIGINT, self._shutdown)
             signal.signal(signal.SIGTERM, self._shutdown)
 
-        self._log_startup_info()
+        # 注意:移除在这里调用_log_startup_info(),因为这时候日志系统可能还没有被正确配置
+        # 日志系统的配置是在project.py的get_settings函数中进行的,而CrawlerProcess的实例化
+        # 是在get_settings函数返回之前进行的,所以这时候调用_log_startup_info()可能会导致
+        # 日志信息没有被正确写入到日志文件中
 
         logger.debug(
             f"CrawlerProcess initialized successfully\n"
@@ -983,39 +1052,41 @@
         if not version or version == 'None':
             version = '1.0.0'
 
-        # Build startup info log
-        startup_info = [
-            f"Crawlo Framework Started {version}"
-        ]
+        # Print framework start info
+        logger.info(f"Crawlo Framework Started {version}")
+
+        # Add mode info if available
+        mode_info = self.settings.get('_mode_info')
+        if mode_info:
+            logger.info(mode_info)
+        else:
+            # 如果没有_mode_info,添加默认信息
+            logger.info("使用单机模式 - 简单快速,适合开发和中小规模爬取")
 
         # Get actual queue type
         queue_type = self.settings.get('QUEUE_TYPE', 'memory')
 
         # Display information based on run mode and queue type combination
         if run_mode == 'distributed':
-            startup_info.append("Run Mode: distributed")
-            startup_info.append("Distributed Mode - Multi-node collaboration supported")
+            logger.info("Run Mode: distributed")
+            logger.info("Distributed Mode - Multi-node collaboration supported")
             # Show Redis configuration
             redis_host = self.settings.get('REDIS_HOST', 'localhost')
             redis_port = self.settings.get('REDIS_PORT', 6379)
-            startup_info.append(f"Redis Address: {redis_host}:{redis_port}")
+            logger.info(f"Redis Address: {redis_host}:{redis_port}")
         elif run_mode == 'standalone':
             if queue_type == 'redis':
-                startup_info.append("Run Mode: standalone+redis")
+                logger.info("Run Mode: standalone+redis")
                 # Show Redis configuration
                 redis_host = self.settings.get('REDIS_HOST', 'localhost')
                 redis_port = self.settings.get('REDIS_PORT', 6379)
-                startup_info.append(f"Redis Address: {redis_host}:{redis_port}")
+                logger.info(f"Redis Address: {redis_host}:{redis_port}")
             elif queue_type == 'auto':
-                startup_info.append("Run Mode: standalone+auto")
+                logger.info("Run Mode: standalone+auto")
             else:  # memory
-                startup_info.append("Run Mode: standalone")
+                logger.info("Run Mode: standalone")
         else:
-            startup_info.append(f"Run Mode: {run_mode}")
-
-        # Print startup information at INFO level
-        for info in startup_info:
-            logger.info(info)
+            logger.info(f"Run Mode: {run_mode}")
 
 
 # === Utility functions ===
@@ -45,7 +45,8 @@ class OffsiteMiddleware:
         # 编译域名正则表达式以提高性能
         o._compile_domains()
 
-        crawler.logger.info(f"OffsiteMiddleware已启用,允许的域名: {allowed_domains}")
+        # 使用中间件自己的logger而不是crawler.logger
+        o.logger.info(f"OffsiteMiddleware已启用,允许的域名: {allowed_domains}")
         return o
 
     def _compile_domains(self):
@@ -14,8 +14,6 @@ import os
 from enum import Enum
 from typing import Dict, Any, Optional
 
-from crawlo.utils.log import get_logger
-
 
 class RunMode(Enum):
     """运行模式枚举"""
@@ -28,7 +26,7 @@ class ModeManager:
     """运行模式管理器"""
 
     def __init__(self):
-        self.logger = get_logger(self.__class__.__name__)
+        pass
 
     @staticmethod
     def get_standalone_settings() -> Dict[str, Any]:
@@ -40,52 +38,6 @@
             'CONCURRENCY': 8,
             'MAX_RUNNING_SPIDERS': 1,
             'DOWNLOAD_DELAY': 1.0,
-            'LOG_LEVEL': 'INFO',
-        }
-
-    @staticmethod
-    def get_distributed_settings(
-            redis_host: str = '127.0.0.1',
-            redis_port: int = 6379,
-            redis_password: Optional[str] = None,
-            redis_db: int = 0,  # 添加 redis_db 参数
-            project_name: str = 'crawlo'
-    ) -> Dict[str, Any]:
-        """获取分布式模式配置"""
-        # 构建 Redis URL,使用传入的 redis_db 参数
-        if redis_password:
-            redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
-        else:
-            redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
-
-        return {
-            'PROJECT_NAME': project_name,  # 添加项目名称到配置中
-            'QUEUE_TYPE': 'redis',
-            'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
-            'REDIS_HOST': redis_host,
-            'REDIS_PORT': redis_port,
-            'REDIS_PASSWORD': redis_password,
-            'REDIS_DB': redis_db,  # 添加 Redis 数据库编号到配置中
-            'REDIS_URL': redis_url,
-            'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests',  # 使用统一命名规范
-            # Redis key配置已移至各组件中,使用统一的命名规范
-            # crawlo:{project_name}:filter:fingerprint (请求去重)
-            'CONCURRENCY': 16,
-            'MAX_RUNNING_SPIDERS': 1,
-            'DOWNLOAD_DELAY': 1.0,
-            'LOG_LEVEL': 'INFO',
-        }
-
-    @staticmethod
-    def get_auto_settings() -> Dict[str, Any]:
-        """获取自动检测模式配置"""
-        return {
-            'QUEUE_TYPE': 'auto',
-            'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',  # 默认内存过滤器
-            'CONCURRENCY': 12,
-            'MAX_RUNNING_SPIDERS': 1,
-            'DOWNLOAD_DELAY': 1.0,
-            'LOG_LEVEL': 'INFO',
         }
 
     def resolve_mode_settings(
@@ -104,13 +56,14 @@
             Dict[str, Any]: 配置字典
         """
         mode = RunMode(mode.lower())
+        mode_info = None
 
         if mode == RunMode.STANDALONE:
-            self.logger.info("使用单机模式 - 简单快速,适合开发和中小规模爬取")
+            mode_info = "使用单机模式 - 简单快速,适合开发和中小规模爬取"
             settings = self.get_standalone_settings()
 
         elif mode == RunMode.DISTRIBUTED:
-            self.logger.info("使用分布式模式 - 支持多节点扩展,适合大规模爬取")
+            mode_info = "使用分布式模式 - 支持多节点扩展,适合大规模爬取"
             settings = self.get_distributed_settings(
                 redis_host=kwargs.get('redis_host', '127.0.0.1'),
                 redis_port=kwargs.get('redis_port', 6379),
@@ -120,7 +73,7 @@
             )
 
         elif mode == RunMode.AUTO:
-            self.logger.info("使用自动检测模式 - 智能选择最佳运行方式")
+            mode_info = "使用自动检测模式 - 智能选择最佳运行方式"
             settings = self.get_auto_settings()
 
         else:
@@ -131,6 +84,9 @@
                          if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
         settings.update(user_settings)
 
+        # 将模式信息添加到配置中,供后续使用
+        settings['_mode_info'] = mode_info
+
         return settings
 
     def from_environment(self) -> Dict[str, Any]:
@@ -190,24 +146,4 @@ def auto_mode(**kwargs) -> Dict[str, Any]:
 def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
     """从环境变量创建配置"""
     # 移除直接使用 os.getenv(),要求通过 settings 配置
-    raise RuntimeError("环境变量配置已移除,请在 settings 中配置相关参数")
-
-    # 保留原有代码作为参考
-    # mode = os.getenv('CRAWLO_MODE', default_mode).lower()
-    #
-    # if mode == 'distributed':
-    #     return distributed_mode(
-    #         redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
-    #         redis_port=int(os.getenv('REDIS_PORT', 6379)),
-    #         redis_password=os.getenv('REDIS_PASSWORD'),
-    #         project_name=os.getenv('PROJECT_NAME', 'crawlo'),
-    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
-    #     )
-    # elif mode == 'auto':
-    #     return auto_mode(
-    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
-    #     )
-    # else:  # standalone
-    #     return standalone_mode(
-    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
-    #     )
+    raise RuntimeError("环境变量配置已移除,请在 settings 中配置相关参数")
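With this change `resolve_mode_settings` no longer logs directly; it stores the human-readable description under the `_mode_info` key so the crawler can log it after the logging system has been configured. A hedged usage sketch (standalone path only, since the distributed/auto helpers are removed by the hunk above; the keyword-override behaviour is read off the kwargs merge shown in the diff):

```python
from crawlo.mode_manager import ModeManager

mgr = ModeManager()
# Extra keyword arguments are merged over the mode defaults
# (redis_* / project_name keys are excluded from that merge).
settings = mgr.resolve_mode_settings('standalone', CONCURRENCY=4)

print(settings['CONCURRENCY'])  # 4 — user override wins over the standalone default of 8
print(settings['_mode_info'])   # "使用单机模式 - 简单快速,适合开发和中小规模爬取"
```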
@@ -7,6 +7,7 @@ from typing import Optional, List, Dict
 from crawlo.exceptions import ItemDiscard
 from crawlo.utils.db_helper import make_insert_sql, make_batch_sql
 from crawlo.utils.log import get_logger
+from . import BasePipeline
 
 
 class AsyncmyMySQLPipeline:
@@ -200,7 +201,7 @@
         crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')
 
     @classmethod
-    def create_instance(cls, crawler):
+    def from_crawler(cls, crawler):
         return cls(crawler)
 
     async def _init_pool(self):
@@ -213,12 +214,12 @@
         try:
             self.pool = await aiomysql.create_pool(
                 host=self.settings.get('MYSQL_HOST', 'localhost'),
-                port=self.settings.getint('MYSQL_PORT', 3306),
+                port=self.settings.get_int('MYSQL_PORT', 3306),
                 user=self.settings.get('MYSQL_USER', 'root'),
                 password=self.settings.get('MYSQL_PASSWORD', ''),
                 db=self.settings.get('MYSQL_DB', 'scrapy_db'),
-                minsize=self.settings.getint('MYSQL_POOL_MIN', 2),
-                maxsize=self.settings.getint('MYSQL_POOL_MAX', 5),
+                minsize=self.settings.get_int('MYSQL_POOL_MIN', 2),
+                maxsize=self.settings.get_int('MYSQL_POOL_MAX', 5),
                 cursorclass=aiomysql.DictCursor,
                 autocommit=False
             )
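The rename from `create_instance` to `from_crawler` (matched by the check in PipelineManager below) means a custom pipeline must either inherit `BasePipeline` or expose a `from_crawler` classmethod. A minimal hypothetical sketch (the class name, setting key and `process_item` signature are illustrative assumptions, not crawlo API guarantees):

```python
# Hypothetical custom pipeline; only the from_crawler hook is taken from the diff.
class MyMySQLTablePipeline:
    def __init__(self, crawler):
        self.settings = crawler.settings
        # MYSQL_TABLE is one of the defaults added in default_settings.py later in this diff.
        self.table = self.settings.get('MYSQL_TABLE', 'crawlo')

    @classmethod
    def from_crawler(cls, crawler):
        # PipelineManager instantiates pipelines through this hook.
        return cls(crawler)

    async def process_item(self, item, spider):
        # Persist or transform the item here, then hand it to the next pipeline.
        return item
```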
@@ -30,6 +30,7 @@ class PipelineManager:
             # 移除所有去重管道实例(如果存在)
             pipelines = [item for item in pipelines if item != dedup_pipeline]
             # 在开头插入去重管道
+            self.logger.debug(f"{dedup_pipeline} insert successful")
             pipelines.insert(0, dedup_pipeline)
 
         self._add_pipelines(pipelines)
@@ -46,7 +47,7 @@
                 pipeline_cls = load_class(pipeline)
                 if not hasattr(pipeline_cls, 'from_crawler'):
                     raise PipelineInitError(
-                        f"Pipeline init failed, must inherit from `BasePipeline` or have a `create_instance` method"
+                        f"Pipeline init failed, must inherit from `BasePipeline` or have a `from_crawler` method"
                     )
                 self.pipelines.append(pipeline_cls.from_crawler(self.crawler))
             except Exception as e:
@@ -268,12 +268,25 @@ def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
     except Exception as e:
         raise ImportError(f"加载 settings 模块失败 '{settings_module_path}': {e}")
 
-    # 5. 合并运行时配置
+    # 5. 根据 RUN_MODE 获取相应配置
+    run_mode = settings.get('RUN_MODE', 'standalone')
+    if run_mode:
+        from crawlo.mode_manager import ModeManager
+        mode_manager = ModeManager()
+        mode_settings = mode_manager.resolve_mode_settings(run_mode)
+        # 合并模式配置,但不覆盖用户已设置的配置
+        for key, value in mode_settings.items():
+            # 只有当用户没有设置该配置项时才应用模式配置
+            if key not in settings.attributes:
+                settings.set(key, value)
+        logger.debug(f"🔧 已应用 {run_mode} 模式配置")
+
+    # 6. 合并运行时配置
     if custom_settings:
         settings.update_attributes(custom_settings)
         logger.debug(f"🔧 已应用运行时自定义配置: {list(custom_settings.keys())}")
 
-    # 6. 显示核心配置摘要(INFO级别)
+    # 7. 显示核心配置摘要(INFO级别)
     # _log_settings_summary(settings)
 
     # 配置日志系统
@@ -281,4 +294,4 @@
 
     # 将项目初始化完成的消息改为DEBUG级别
     logger.debug("🎉 Crawlo 项目配置初始化完成!")
-    return settings
+    return settings
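After this change, `get_settings` reads `RUN_MODE` from the project settings, resolves the matching mode defaults through `ModeManager`, and only fills in keys the user has not already set. A hedged example of what a project `settings.py` might declare (keys taken from the diff; values are illustrative):

```python
# settings.py — illustrative values only.
RUN_MODE = 'standalone'      # or 'distributed' / 'auto'

# Explicit values always win: get_settings applies mode defaults
# only for keys that are missing from the user's settings.
CONCURRENCY = 4              # overrides the standalone default of 8
DOWNLOAD_DELAY = 0.5
```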
@@ -48,7 +48,18 @@ QUEUE_TYPE = 'auto'
 # 默认使用内存过滤器和去重管道,确保在无Redis环境下也能正常运行
 # 在auto模式下,如果Redis可用,框架会自动更新为Redis实现以提供更好的去重能力
 DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
-FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+
+
+MYSQL_HOST = '127.0.0.1'
+MYSQL_PORT = 3306
+MYSQL_USER = 'root'
+MYSQL_PASSWORD = '123456'
+MYSQL_DB = 'crawl_pro'
+MYSQL_TABLE = 'crawlo'
+MYSQL_BATCH_SIZE = 100
+MYSQL_USE_BATCH = False  # 是否启用批量插入
+
 
 # --- Redis 过滤器配置 ---
 # 使用环境变量配置工具获取 Redis 配置
@@ -85,7 +96,6 @@ MIDDLEWARES = [
     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',   # 1. 忽略无效请求
     'crawlo.middleware.download_delay.DownloadDelayMiddleware',   # 2. 控制请求频率
     'crawlo.middleware.default_header.DefaultHeaderMiddleware',   # 3. 添加默认请求头
-    # 'crawlo.middleware.proxy.ProxyMiddleware',                  # 4. 设置代理(默认不启用)
     'crawlo.middleware.offsite.OffsiteMiddleware',                 # 5. 站外请求过滤
 
     # === 响应处理阶段 ===
@@ -98,8 +108,7 @@
 
 # 框架数据处理管道列表(框架默认管道 + 用户自定义管道)
 PIPELINES = [
-    'crawlo.pipelines.console_pipeline.ConsolePipeline',  # 控制台输出
-    # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL 存储(可选)
+    'crawlo.pipelines.console_pipeline.ConsolePipeline',
 ]
 
 # 明确添加默认去重管道到管道列表开头
@@ -27,7 +27,14 @@ class SettingManager(MutableMapping):
             user_middlewares = user_config['MIDDLEWARES']
             # 如果用户配置了空列表,则仍然使用默认配置
             if user_middlewares:
-                self.attributes['MIDDLEWARES'] = default_middlewares + user_middlewares
+                # 过滤掉空值和注释
+                user_middlewares = [middleware for middleware in user_middlewares if middleware and not middleware.strip().startswith('#')]
+                # 合并默认中间件和用户中间件,去重但保持顺序
+                merged_middlewares = default_middlewares[:]
+                for middleware in user_middlewares:
+                    if middleware not in merged_middlewares:
+                        merged_middlewares.append(middleware)
+                self.attributes['MIDDLEWARES'] = merged_middlewares
 
         # 合并管道配置
         if 'PIPELINES' in user_config:
@@ -37,8 +44,12 @@
             if user_pipelines:
                 # 过滤掉空值和注释
                 user_pipelines = [pipeline for pipeline in user_pipelines if pipeline and not pipeline.strip().startswith('#')]
-                if user_pipelines:
-                    self.attributes['PIPELINES'] = user_pipelines
+                # 合并默认管道和用户管道,去重但保持顺序
+                merged_pipelines = default_pipelines[:]
+                for pipeline in user_pipelines:
+                    if pipeline not in merged_pipelines:
+                        merged_pipelines.append(pipeline)
+                self.attributes['PIPELINES'] = merged_pipelines
 
         # 特殊处理PIPELINES,确保去重管道在最前面
         dedup_pipeline = self.attributes.get('DEFAULT_DEDUP_PIPELINE')
@@ -56,8 +67,14 @@
             user_extensions = user_config['EXTENSIONS']
             # 如果用户配置了空列表,则仍然使用默认配置
             if user_extensions:
-                self.attributes['EXTENSIONS'] = default_extensions + user_extensions
-        # 如果用户没有配置扩展,则使用默认配置
+                # 过滤掉空值和注释
+                user_extensions = [extension for extension in user_extensions if extension and not extension.strip().startswith('#')]
+                # 合并默认扩展和用户扩展,去重但保持顺序
+                merged_extensions = default_extensions[:]
+                for extension in user_extensions:
+                    if extension not in merged_extensions:
+                        merged_extensions.append(extension)
+                self.attributes['EXTENSIONS'] = merged_extensions
 
         # 更新其他用户配置
         for key, value in user_config.items():
@@ -119,9 +136,15 @@
     def set_settings(self, module):
         if isinstance(module, str):
             module = import_module(module)
+
+        # 收集模块中的所有配置项
+        module_settings = {}
        for key in dir(module):
            if key.isupper():
-                self.set(key, getattr(module, key))
+                module_settings[key] = getattr(module, key)
+
+        # 使用合并逻辑而不是直接设置
+        self._merge_config(module_settings)
 
     # 实现 MutableMapping 必须的方法
     def __getitem__(self, item):
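The same merge pattern is now applied to MIDDLEWARES, PIPELINES and EXTENSIONS: drop empty or commented-out entries, start from the defaults, and append user entries that are not already present. A standalone sketch of that logic (hypothetical helper and example module paths, not part of SettingManager):

```python
from typing import List

def merge_component_lists(defaults: List[str], user: List[str]) -> List[str]:
    # Filter out empty strings and commented-out entries, as the patch does.
    cleaned = [c for c in user if c and not c.strip().startswith('#')]
    merged = defaults[:]                 # defaults keep their relative order
    for component in cleaned:
        if component not in merged:      # de-duplicate while preserving order
            merged.append(component)
    return merged

defaults = ['crawlo.pipelines.console_pipeline.ConsolePipeline']
user = ['# disabled.Pipeline',                       # dropped: commented out
        'myproject.pipelines.MySQLPipeline',         # hypothetical user pipeline
        'crawlo.pipelines.console_pipeline.ConsolePipeline']  # dropped: duplicate
print(merge_component_lists(defaults, user))
# ['crawlo.pipelines.console_pipeline.ConsolePipeline', 'myproject.pipelines.MySQLPipeline']
```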
@@ -17,8 +17,8 @@ class LoggerManager:
     logger_cache = {}
     _default_filename = None
     _default_level = DEBUG  # 设置为最低级别,由handler控制实际输出
-    _default_file_level = INFO  # 默认为INFO级别
-    _default_console_level = INFO  # 默认为INFO级别
+    _default_file_level = DEBUG  # 默认为DEBUG级别,确保所有INFO级别日志都能写入文件
+    _default_console_level = DEBUG  # 默认为DEBUG级别
     _default_log_format = LOG_FORMAT
     _default_encoding = 'utf-8'
     _configured = False  # 标记是否已配置
@@ -62,14 +62,14 @@
         level = get_val('LOG_LEVEL', 'INFO')  # 默认为INFO级别
         file_level = get_val('LOG_FILE_LEVEL', level)  # 默认继承LOG_LEVEL的值
         # 根据项目规范,已完全移除LOG_CONSOLE_LEVEL支持,统一使用LOG_LEVEL控制控制台和文件的日志输出级别
-        console_level = level  # 控制台日志级别直接使用LOG_LEVEL的值
         log_format = get_val('LOG_FORMAT', LOG_FORMAT)
         encoding = get_val('LOG_ENCODING', 'utf-8')
 
         cls._default_filename = filename
         cls._default_level = cls._to_level(level)
         cls._default_file_level = cls._to_level(file_level)
-        cls._default_console_level = cls._to_level(console_level)
+        # 控制台日志级别直接使用LOG_LEVEL的值,不再支持LOG_CONSOLE_LEVEL
+        cls._default_console_level = cls._default_level
         cls._default_log_format = log_format
         cls._default_encoding = encoding