crawlo 1.3.2.tar.gz → 1.3.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crawlo has been flagged as a potentially problematic release.

Files changed (299)
  1. {crawlo-1.3.2/crawlo.egg-info → crawlo-1.3.4}/PKG-INFO +120 -14
  2. {crawlo-1.3.2 → crawlo-1.3.4}/README.md +119 -13
  3. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/__init__.py +24 -0
  4. crawlo-1.3.4/crawlo/__version__.py +1 -0
  5. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/run.py +58 -32
  6. crawlo-1.3.4/crawlo/core/__init__.py +46 -0
  7. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/core/engine.py +119 -45
  8. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/core/scheduler.py +4 -3
  9. crawlo-1.3.4/crawlo/crawler.py +639 -0
  10. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/aiohttp_downloader.py +4 -2
  11. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/__init__.py +1 -1
  12. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/logging_extension.py +23 -7
  13. crawlo-1.3.4/crawlo/factories/__init__.py +28 -0
  14. crawlo-1.3.4/crawlo/factories/base.py +69 -0
  15. crawlo-1.3.4/crawlo/factories/crawler.py +104 -0
  16. crawlo-1.3.4/crawlo/factories/registry.py +85 -0
  17. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/filters/aioredis_filter.py +25 -2
  18. crawlo-1.3.4/crawlo/framework.py +292 -0
  19. crawlo-1.3.4/crawlo/initialization/__init__.py +40 -0
  20. crawlo-1.3.4/crawlo/initialization/built_in.py +426 -0
  21. crawlo-1.3.4/crawlo/initialization/context.py +142 -0
  22. crawlo-1.3.4/crawlo/initialization/core.py +194 -0
  23. crawlo-1.3.4/crawlo/initialization/phases.py +149 -0
  24. crawlo-1.3.4/crawlo/initialization/registry.py +146 -0
  25. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/items/base.py +2 -1
  26. crawlo-1.3.4/crawlo/logging/__init__.py +38 -0
  27. crawlo-1.3.4/crawlo/logging/config.py +97 -0
  28. crawlo-1.3.4/crawlo/logging/factory.py +129 -0
  29. crawlo-1.3.4/crawlo/logging/manager.py +112 -0
  30. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/middleware_manager.py +1 -1
  31. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/offsite.py +1 -1
  32. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/mode_manager.py +26 -1
  33. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/pipeline_manager.py +2 -1
  34. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/project.py +76 -46
  35. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/queue/pqueue.py +11 -5
  36. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/queue/queue_manager.py +143 -19
  37. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/queue/redis_priority_queue.py +69 -49
  38. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/settings/default_settings.py +110 -14
  39. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/settings/setting_manager.py +29 -13
  40. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/spider/__init__.py +34 -16
  41. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/stats_collector.py +17 -3
  42. crawlo-1.3.4/crawlo/task_manager.py +139 -0
  43. crawlo-1.3.4/crawlo/templates/project/settings.py.tmpl +168 -0
  44. crawlo-1.3.4/crawlo/templates/project/settings_distributed.py.tmpl +167 -0
  45. crawlo-1.3.4/crawlo/templates/project/settings_gentle.py.tmpl +167 -0
  46. crawlo-1.3.4/crawlo/templates/project/settings_high_performance.py.tmpl +168 -0
  47. crawlo-1.3.4/crawlo/templates/project/settings_minimal.py.tmpl +66 -0
  48. crawlo-1.3.4/crawlo/templates/project/settings_simple.py.tmpl +165 -0
  49. crawlo-1.3.4/crawlo/templates/project/spiders/__init__.py.tmpl +10 -0
  50. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/run.py.tmpl +10 -14
  51. crawlo-1.3.4/crawlo/templates/spiders_init.py.tmpl +10 -0
  52. crawlo-1.3.4/crawlo/tools/network_diagnostic.py +365 -0
  53. crawlo-1.3.4/crawlo/utils/class_loader.py +26 -0
  54. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/error_handler.py +76 -35
  55. crawlo-1.3.4/crawlo/utils/log.py +44 -0
  56. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/redis_connection_pool.py +43 -6
  57. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/request_serializer.py +8 -1
  58. {crawlo-1.3.2 → crawlo-1.3.4/crawlo.egg-info}/PKG-INFO +120 -14
  59. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo.egg-info/SOURCES.txt +61 -2
  60. {crawlo-1.3.2 → crawlo-1.3.4}/tests/authenticated_proxy_example.py +2 -2
  61. crawlo-1.3.4/tests/baidu_performance_test.py +109 -0
  62. crawlo-1.3.4/tests/baidu_test.py +60 -0
  63. crawlo-1.3.4/tests/comprehensive_framework_test.py +213 -0
  64. crawlo-1.3.4/tests/comprehensive_test.py +82 -0
  65. crawlo-1.3.4/tests/comprehensive_testing_summary.md +187 -0
  66. crawlo-1.3.4/tests/debug_configure.py +70 -0
  67. crawlo-1.3.4/tests/debug_framework_logger.py +85 -0
  68. crawlo-1.3.4/tests/debug_log_levels.py +64 -0
  69. crawlo-1.3.4/tests/distributed_test.py +67 -0
  70. crawlo-1.3.4/tests/distributed_test_debug.py +77 -0
  71. crawlo-1.3.4/tests/final_command_test_report.md +0 -0
  72. crawlo-1.3.4/tests/final_comprehensive_test.py +152 -0
  73. crawlo-1.3.4/tests/final_validation_test.py +183 -0
  74. crawlo-1.3.4/tests/framework_performance_test.py +203 -0
  75. crawlo-1.3.4/tests/optimized_performance_test.py +212 -0
  76. crawlo-1.3.4/tests/performance_comparison.py +246 -0
  77. crawlo-1.3.4/tests/queue_blocking_test.py +114 -0
  78. crawlo-1.3.4/tests/queue_test.py +90 -0
  79. crawlo-1.3.4/tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  80. crawlo-1.3.4/tests/scrapy_comparison/scrapy_test.py +134 -0
  81. crawlo-1.3.4/tests/simple_command_test.py +120 -0
  82. crawlo-1.3.4/tests/simple_crawlo_test.py +128 -0
  83. crawlo-1.3.4/tests/simple_log_test.py +58 -0
  84. crawlo-1.3.4/tests/simple_optimization_test.py +129 -0
  85. crawlo-1.3.4/tests/simple_spider_test.py +50 -0
  86. crawlo-1.3.4/tests/simple_test.py +48 -0
  87. crawlo-1.3.4/tests/test_all_commands.py +231 -0
  88. crawlo-1.3.4/tests/test_batch_processor.py +179 -0
  89. crawlo-1.3.4/tests/test_component_factory.py +175 -0
  90. crawlo-1.3.4/tests/test_controlled_spider_mixin.py +80 -0
  91. crawlo-1.3.4/tests/test_enhanced_error_handler_comprehensive.py +246 -0
  92. crawlo-1.3.4/tests/test_factories.py +253 -0
  93. crawlo-1.3.4/tests/test_framework_logger.py +67 -0
  94. crawlo-1.3.4/tests/test_framework_startup.py +65 -0
  95. crawlo-1.3.4/tests/test_large_scale_config.py +113 -0
  96. crawlo-1.3.4/tests/test_large_scale_helper.py +236 -0
  97. crawlo-1.3.4/tests/test_mode_change.py +73 -0
  98. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_mode_consistency.py +1 -1
  99. crawlo-1.3.4/tests/test_performance_monitor.py +116 -0
  100. crawlo-1.3.4/tests/test_queue_empty_check.py +42 -0
  101. crawlo-1.3.4/tests/untested_features_report.md +139 -0
  102. crawlo-1.3.4/tests/verify_debug.py +52 -0
  103. crawlo-1.3.4/tests/verify_log_fix.py +112 -0
  104. crawlo-1.3.2/crawlo/__version__.py +0 -1
  105. crawlo-1.3.2/crawlo/core/__init__.py +0 -2
  106. crawlo-1.3.2/crawlo/crawler.py +0 -1169
  107. crawlo-1.3.2/crawlo/task_manager.py +0 -30
  108. crawlo-1.3.2/crawlo/templates/project/settings.py.tmpl +0 -267
  109. crawlo-1.3.2/crawlo/templates/project/settings_distributed.py.tmpl +0 -180
  110. crawlo-1.3.2/crawlo/templates/project/settings_gentle.py.tmpl +0 -61
  111. crawlo-1.3.2/crawlo/templates/project/settings_high_performance.py.tmpl +0 -131
  112. crawlo-1.3.2/crawlo/templates/project/settings_minimal.py.tmpl +0 -35
  113. crawlo-1.3.2/crawlo/templates/project/settings_simple.py.tmpl +0 -102
  114. crawlo-1.3.2/crawlo/templates/project/spiders/__init__.py.tmpl +0 -6
  115. crawlo-1.3.2/crawlo/utils/log.py +0 -147
  116. crawlo-1.3.2/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  117. {crawlo-1.3.2 → crawlo-1.3.4}/LICENSE +0 -0
  118. {crawlo-1.3.2 → crawlo-1.3.4}/MANIFEST.in +0 -0
  119. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/cli.py +0 -0
  120. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/__init__.py +0 -0
  121. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/check.py +0 -0
  122. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/genspider.py +0 -0
  123. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/help.py +0 -0
  124. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/list.py +0 -0
  125. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/startproject.py +0 -0
  126. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/stats.py +0 -0
  127. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/commands/utils.py +0 -0
  128. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/config.py +0 -0
  129. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/config_validator.py +0 -0
  130. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/core/processor.py +0 -0
  131. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/data/__init__.py +0 -0
  132. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/data/user_agents.py +0 -0
  133. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/__init__.py +0 -0
  134. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/cffi_downloader.py +0 -0
  135. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/httpx_downloader.py +0 -0
  136. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/hybrid_downloader.py +0 -0
  137. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/playwright_downloader.py +0 -0
  138. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/downloader/selenium_downloader.py +0 -0
  139. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/event.py +0 -0
  140. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/exceptions.py +0 -0
  141. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/health_check.py +0 -0
  142. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/log_interval.py +0 -0
  143. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/log_stats.py +0 -0
  144. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/memory_monitor.py +0 -0
  145. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/performance_profiler.py +0 -0
  146. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/extension/request_recorder.py +0 -0
  147. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/filters/__init__.py +0 -0
  148. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/filters/memory_filter.py +0 -0
  149. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/items/__init__.py +0 -0
  150. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/items/fields.py +0 -0
  151. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/items/items.py +0 -0
  152. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/__init__.py +0 -0
  153. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/default_header.py +0 -0
  154. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/download_delay.py +0 -0
  155. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/proxy.py +0 -0
  156. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/request_ignore.py +0 -0
  157. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/response_code.py +0 -0
  158. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/response_filter.py +0 -0
  159. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/retry.py +0 -0
  160. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/middleware/simple_proxy.py +0 -0
  161. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/network/__init__.py +0 -0
  162. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/network/request.py +0 -0
  163. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/network/response.py +0 -0
  164. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/__init__.py +0 -0
  165. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
  166. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/console_pipeline.py +0 -0
  167. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/csv_pipeline.py +0 -0
  168. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
  169. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/json_pipeline.py +0 -0
  170. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
  171. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/mongo_pipeline.py +0 -0
  172. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/mysql_pipeline.py +0 -0
  173. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
  174. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/queue/__init__.py +0 -0
  175. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/settings/__init__.py +0 -0
  176. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/subscriber.py +0 -0
  177. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  178. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/project/__init__.py.tmpl +0 -0
  179. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/project/items.py.tmpl +0 -0
  180. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/project/middlewares.py.tmpl +0 -0
  181. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/project/pipelines.py.tmpl +0 -0
  182. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/templates/spider/spider.py.tmpl +0 -0
  183. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/__init__.py +0 -0
  184. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/anti_crawler.py +0 -0
  185. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/authenticated_proxy.py +0 -0
  186. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/data_formatter.py +0 -0
  187. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/data_validator.py +0 -0
  188. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/date_tools.py +0 -0
  189. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/distributed_coordinator.py +0 -0
  190. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/encoding_converter.py +0 -0
  191. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/request_tools.py +0 -0
  192. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/retry_mechanism.py +0 -0
  193. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/scenario_adapter.py +0 -0
  194. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/tools/text_cleaner.py +0 -0
  195. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/__init__.py +0 -0
  196. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/batch_processor.py +0 -0
  197. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/controlled_spider_mixin.py +0 -0
  198. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/db_helper.py +0 -0
  199. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/enhanced_error_handler.py +0 -0
  200. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/env_config.py +0 -0
  201. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/func_tools.py +0 -0
  202. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/large_scale_config.py +0 -0
  203. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/large_scale_helper.py +0 -0
  204. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/performance_monitor.py +0 -0
  205. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/queue_helper.py +0 -0
  206. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/redis_key_validator.py +0 -0
  207. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/request.py +0 -0
  208. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/spider_loader.py +0 -0
  209. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/system.py +0 -0
  210. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/tools.py +0 -0
  211. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo/utils/url.py +0 -0
  212. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo.egg-info/dependency_links.txt +0 -0
  213. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo.egg-info/entry_points.txt +0 -0
  214. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo.egg-info/requires.txt +0 -0
  215. {crawlo-1.3.2 → crawlo-1.3.4}/crawlo.egg-info/top_level.txt +0 -0
  216. {crawlo-1.3.2 → crawlo-1.3.4}/examples/__init__.py +0 -0
  217. {crawlo-1.3.2 → crawlo-1.3.4}/pyproject.toml +0 -0
  218. {crawlo-1.3.2 → crawlo-1.3.4}/requirements.txt +0 -0
  219. {crawlo-1.3.2 → crawlo-1.3.4}/setup.cfg +0 -0
  220. {crawlo-1.3.2 → crawlo-1.3.4}/tests/__init__.py +0 -0
  221. {crawlo-1.3.2 → crawlo-1.3.4}/tests/advanced_tools_example.py +0 -0
  222. {crawlo-1.3.2 → crawlo-1.3.4}/tests/cleaners_example.py +0 -0
  223. {crawlo-1.3.2 → crawlo-1.3.4}/tests/config_validation_demo.py +0 -0
  224. {crawlo-1.3.2 → crawlo-1.3.4}/tests/controlled_spider_example.py +0 -0
  225. {crawlo-1.3.2 → crawlo-1.3.4}/tests/date_tools_example.py +0 -0
  226. {crawlo-1.3.2 → crawlo-1.3.4}/tests/debug_pipelines.py +0 -0
  227. {crawlo-1.3.2 → crawlo-1.3.4}/tests/dynamic_loading_example.py +0 -0
  228. {crawlo-1.3.2 → crawlo-1.3.4}/tests/dynamic_loading_test.py +0 -0
  229. {crawlo-1.3.2 → crawlo-1.3.4}/tests/env_config_example.py +0 -0
  230. {crawlo-1.3.2 → crawlo-1.3.4}/tests/error_handling_example.py +0 -0
  231. {crawlo-1.3.2 → crawlo-1.3.4}/tests/redis_key_validation_demo.py +0 -0
  232. {crawlo-1.3.2 → crawlo-1.3.4}/tests/request_params_example.py +0 -0
  233. {crawlo-1.3.2 → crawlo-1.3.4}/tests/response_improvements_example.py +0 -0
  234. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_advanced_tools.py +0 -0
  235. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_all_redis_key_configs.py +0 -0
  236. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_authenticated_proxy.py +0 -0
  237. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_cleaners.py +0 -0
  238. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_comprehensive.py +0 -0
  239. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_config_consistency.py +0 -0
  240. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_config_merge.py +0 -0
  241. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_config_validator.py +0 -0
  242. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_crawlo_proxy_integration.py +0 -0
  243. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_date_tools.py +0 -0
  244. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_default_header_middleware.py +0 -0
  245. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_distributed.py +0 -0
  246. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_double_crawlo_fix.py +0 -0
  247. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_double_crawlo_fix_simple.py +0 -0
  248. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_download_delay_middleware.py +0 -0
  249. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_downloader_proxy_compatibility.py +0 -0
  250. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_dynamic_downloaders_proxy.py +0 -0
  251. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_dynamic_proxy.py +0 -0
  252. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_dynamic_proxy_config.py +0 -0
  253. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_dynamic_proxy_real.py +0 -0
  254. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_edge_cases.py +0 -0
  255. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_enhanced_error_handler.py +0 -0
  256. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_env_config.py +0 -0
  257. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_error_handler_compatibility.py +0 -0
  258. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_final_validation.py +0 -0
  259. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_framework_env_usage.py +0 -0
  260. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_integration.py +0 -0
  261. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_item_dedup_redis_key.py +0 -0
  262. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_offsite_middleware.py +0 -0
  263. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_parsel.py +0 -0
  264. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_performance.py +0 -0
  265. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_api.py +0 -0
  266. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_health_check.py +0 -0
  267. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_middleware.py +0 -0
  268. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_middleware_enhanced.py +0 -0
  269. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_middleware_integration.py +0 -0
  270. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_middleware_refactored.py +0 -0
  271. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_providers.py +0 -0
  272. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_stats.py +0 -0
  273. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_proxy_strategies.py +0 -0
  274. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_queue_manager_double_crawlo.py +0 -0
  275. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_queue_manager_redis_key.py +0 -0
  276. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_random_user_agent.py +0 -0
  277. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_real_scenario_proxy.py +0 -0
  278. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_redis_config.py +0 -0
  279. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_redis_connection_pool.py +0 -0
  280. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_redis_key_naming.py +0 -0
  281. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_redis_key_validator.py +0 -0
  282. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_redis_queue.py +0 -0
  283. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_request_ignore_middleware.py +0 -0
  284. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_request_params.py +0 -0
  285. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_request_serialization.py +0 -0
  286. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_response_code_middleware.py +0 -0
  287. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_response_filter_middleware.py +0 -0
  288. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_response_improvements.py +0 -0
  289. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_retry_middleware.py +0 -0
  290. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_scheduler.py +0 -0
  291. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_scheduler_config_update.py +0 -0
  292. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_simple_response.py +0 -0
  293. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_telecom_spider_redis_key.py +0 -0
  294. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_template_content.py +0 -0
  295. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_template_redis_key.py +0 -0
  296. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_tools.py +0 -0
  297. {crawlo-1.3.2 → crawlo-1.3.4}/tests/test_user_agents.py +0 -0
  298. {crawlo-1.3.2 → crawlo-1.3.4}/tests/tools_example.py +0 -0
  299. {crawlo-1.3.2 → crawlo-1.3.4}/tests/verify_distributed.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.3.2
+ Version: 1.3.4
  Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder
@@ -132,13 +132,13 @@ Requires-Dist: selenium>=3.141.0; extra == "all"

  ### Installation

- ```bash
+ ``bash
  pip install crawlo
  ```

  ### Creating a project

- ```bash
+ ``bash
  # Create a default project
  crawlo startproject myproject

@@ -153,7 +153,7 @@ cd myproject

  ### Generating a spider

- ```bash
+ ``bash
  # Generate a spider inside the project directory
  crawlo genspider news_spider news.example.com
  ```
@@ -182,7 +182,7 @@ class MySpider(Spider):

  ### Running a spider

- ```bash
+ ``bash
  # Run the spider with the command-line tool (recommended)
  crawlo run myspider

@@ -289,7 +289,7 @@ Crawlo provides several flexible configuration approaches to suit different usage scenarios and

  Using the `CrawloConfig` configuration factory is the recommended approach; it provides type safety and IDE hints.

- ```python
+ ``python
  from crawlo.config import CrawloConfig
  from crawlo.crawler import CrawlerProcess

@@ -363,7 +363,7 @@ process = CrawlerProcess(settings=config.to_dict())
  Suitable for development and debugging, small-scale data collection, and personal projects.

  **Recommended configuration:**
- ```python
+ ``python
  from crawlo.config import CrawloConfig
  config = CrawloConfig.standalone(concurrency=4, download_delay=1.0)
  process = CrawlerProcess(settings=config.to_dict())
@@ -379,7 +379,7 @@ process = CrawlerProcess(settings=config.to_dict())
  Suitable for large-scale data collection, multi-node cooperation, and high-concurrency workloads.

  **Recommended configuration:**
- ```python
+ ``python
  from crawlo.config import CrawloConfig
  config = CrawloConfig.distributed(
      redis_host='your_redis_host',
@@ -400,7 +400,7 @@ process = CrawlerProcess(settings=config.to_dict())
  Suitable when the best run mode should be chosen automatically based on the environment.

  **Recommended configuration:**
- ```python
+ ``python
  from crawlo.config import CrawloConfig
  config = CrawloConfig.auto(concurrency=12)
  process = CrawlerProcess(settings=config.to_dict())
@@ -453,7 +453,7 @@ CUSTOM_MIDDLEWARES = [

  Custom pipelines can be configured via `CUSTOM_PIPELINES`:

- ```python
+ ``python
  # settings.py
  CUSTOM_PIPELINES = [
      'crawlo.pipelines.json_pipeline.JsonPipeline',
@@ -839,7 +839,7 @@ request = Request(

  Multiple parameter types can be combined; the framework handles them automatically:

- ```python
+ ``python
  # A GET request using both params and form_data (both become query parameters)
  request = Request(
      url='https://api.example.com/search',
@@ -881,7 +881,7 @@ request = Request(

  The Request class supports method chaining to simplify configuration:

- ```python
+ ``python
  request = Request('https://example.com')\
      .add_header('User-Agent', 'Crawlo Bot')\
      .set_proxy('http://proxy.example.com:8080')\
@@ -894,7 +894,7 @@ request = Request('https://example.com')\

  Crawlo provides several predefined request priorities:

- ```python
+ ``python
  from crawlo import Request, RequestPriority

  # Set different priorities
@@ -909,7 +909,7 @@ background_request = Request('https://example.com', priority=RequestPriority.BAC

  For pages that require JavaScript rendering, the dynamic loader can be enabled:

- ```python
+ ``python
  # Enable the dynamic loader
  request = Request('https://example.com')\
      .set_dynamic_loader(use_dynamic=True)
@@ -980,12 +980,118 @@ PROXY_LIST = [

  ---

+ <!-- Advanced tools section -->
+ <h2 align="center">🛠️ Advanced Tools</h2>
+
+ The Crawlo framework ships a set of advanced tools that help developers handle large-scale crawling tasks and complex scenarios.
+
+ ### 1. Factory-pattern modules
+
+ **Features**:
+ - Component creation and dependency injection
+ - Singleton support
+ - A unified component-management mechanism
+
+ **Use cases**:
+ - Managing component creation in one place
+ - Dependency injection
+ - Singleton component instances
+
+ ### 2. Batch-processing tools
+
+ **Features**:
+ - Large-scale data processing
+ - Concurrency control
+ - Memory-usage optimization
+
+ **Use cases**:
+ - Processing large numbers of data items
+ - Limiting the degree of concurrency
+ - Memory-sensitive data-processing jobs
+
+ ### 3. Controlled spider mixin
+
+ **Features**:
+ - Throttling large-scale request generation
+ - Preventing out-of-memory situations
+ - Dynamic concurrency control
+
+ **Use cases**:
+ - Spiders that generate very large numbers of requests
+ - Memory-constrained environments
+ - Scenarios that need precise concurrency control
+
+ ### 4. Large-scale configuration tool
+
+ **Features**:
+ - Optimized configurations for different scenarios
+ - A simplified configuration process
+ - Better crawling efficiency and stability
+
+ **Configuration profiles**:
+ - **Conservative**: resource-constrained environments
+ - **Balanced**: typical production environments
+ - **Aggressive**: high-performance servers
+ - **Memory-optimized**: limited memory but a large number of requests
+
+ **Use cases**:
+ - Large-scale crawls handling tens of thousands of requests or more
+ - Adapting to environments with different performance characteristics
+ - Quick configuration tuning
+
+ ### 5. Large-scale crawling helpers
+
+ **Features**:
+ - Batch data processing
+ - Progress management and resumable crawls
+ - Memory-usage optimization
+ - Support for multiple data sources
+
+ **Components**:
+ - **LargeScaleHelper**: iterates over large data sets in batches
+ - **ProgressManager**: progress management
+ - **MemoryOptimizer**: memory optimization
+ - **DataSourceAdapter**: data-source adapter
+
+ **Use cases**:
+ - Spiders processing tens of thousands of URLs or more
+ - Crawls that need resume-from-checkpoint behaviour
+ - Memory-sensitive large-scale processing jobs
+
+ ### 6. Automatic spider-module import
+
+ **Features**:
+ - Automatic discovery and import of spider modules
+ - Spiders are registered without manual imports
+ - Smart scanning of spider files in the project
+
+ **Usage**:
+ The framework scans the configured `spider_modules` paths, imports every spider module it finds, and registers the spider classes automatically. Users only need to pass the `spider_modules` argument when creating the `CrawlerProcess`:
+
+ ```python
+ # Point at the spider module paths; the framework imports and registers all spiders
+ spider_modules = ['myproject.spiders']
+ process = CrawlerProcess(spider_modules=spider_modules)
+
+ # Run a spider by name (no manual import required)
+ asyncio.run(process.crawl('my_spider_name'))
+ ```
+
+ **Benefits**:
+ - A simpler project structure with less boilerplate
+ - Automated spider registration
+ - Higher development efficiency and fewer mistakes
+ - Cleaner, more consistent code
+
+ For detailed usage and real-world examples of these advanced tools, see the [advanced tools example project](examples/advanced_tools_example/).
+
  <!-- Example projects section -->
  <h2 align="center">📦 Example Projects</h2>

  - [OFweek distributed crawler](examples/ofweek_distributed/) - a more involved distributed crawler example with Redis-based deduplication
  - [OFweek standalone crawler](examples/ofweek_standalone/) - a crawler example that runs standalone
  - [OFweek hybrid-mode crawler](examples/ofweek_spider/) - a crawler example that can switch between standalone and distributed modes
+ - [Advanced tools example](examples/advanced_tools_example/) - demonstrates the advanced tools in the Crawlo framework, including the factory pattern, batch-processing tools, the controlled spider mixin, the large-scale configuration tool, and the large-scale crawling helpers

  ---
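The configuration hunks above each show only the opening lines of a `CrawloConfig` snippet. Pulled together, the three factory modes they reference look roughly like this; it is a consolidation of the lines visible in the diff, and the keyword arguments of `distributed()` that are truncated in the hunk are left out rather than guessed:

```python
from crawlo.config import CrawloConfig
from crawlo.crawler import CrawlerProcess

# Standalone mode: development, debugging, small-scale collection
config = CrawloConfig.standalone(concurrency=4, download_delay=1.0)

# Distributed mode: multi-node crawling backed by Redis
# (only the first keyword argument is visible in the hunk above)
config = CrawloConfig.distributed(
    redis_host='your_redis_host',
)

# Auto mode: pick standalone or distributed based on the environment
config = CrawloConfig.auto(concurrency=12)

process = CrawlerProcess(settings=config.to_dict())
```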
README.md

The README.md hunks are identical to the long-description hunks of PKG-INFO shown above, offset by 50 lines (PKG-INFO embeds README.md as the package's long description), so they are not repeated here: the same code-fence changes, the same new Advanced Tools (高级工具) section, and the same new entry in the example-project list.
crawlo/__init__.py

@@ -3,6 +3,8 @@
  """
  Crawlo - an asynchronous crawler framework
  """
+ from typing import TYPE_CHECKING
+
  from crawlo.spider import Spider
  from crawlo.items import Item, Field
  from crawlo.network.request import Request
@@ -24,9 +26,29 @@ from crawlo.utils import (
  )
  from crawlo import tools

+ # Framework core modules - use TYPE_CHECKING to avoid circular imports
+ if TYPE_CHECKING:
+     from crawlo.core.framework_initializer import get_framework_initializer, initialize_framework
+
  # For backward compatibility, expose the cleaners helpers from tools
  import crawlo.tools as cleaners

+ # Helper functions with deferred imports
+ def get_framework_initializer():
+     """Import get_framework_initializer lazily to avoid a circular dependency."""
+     from crawlo.core.framework_initializer import get_framework_initializer as _get_framework_initializer
+     return _get_framework_initializer()
+
+ def initialize_framework(custom_settings=None):
+     """Import initialize_framework lazily to avoid a circular dependency."""
+     from crawlo.core.framework_initializer import initialize_framework as _initialize_framework
+     return _initialize_framework(custom_settings)
+
+ # Backward-compatible alias
+ def get_bootstrap_manager():
+     """Backward-compatible alias."""
+     return get_framework_initializer()
+
  # Version number: prefer reading it from package metadata
  try:
      from importlib.metadata import version
@@ -60,5 +82,7 @@ __all__ = [
      'from_timestamp_with_tz',
      'cleaners',
      'tools',
+     'get_framework_initializer',
+     'get_bootstrap_manager',
      '__version__',
  ]
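The new module-level wrappers defer the `crawlo.core.framework_initializer` import until first call, which is what breaks the circular import the comments mention. A minimal usage sketch follows; the helper names come from the diff above, while the return values are assumptions:

```python
import crawlo

# Importing crawlo does not touch crawlo.core; the heavy import only
# happens inside the wrapper bodies, on the first call.
init_manager = crawlo.get_framework_initializer()

# initialize_framework() delegates the same way; the run.py diff below
# uses its return value as the loaded settings.
settings = crawlo.initialize_framework()

# Kept purely as a backward-compatible alias for the call above.
legacy_manager = crawlo.get_bootstrap_manager()
```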
crawlo/__version__.py (new file)

@@ -0,0 +1 @@
+ __version__ = '1.3.4'
crawlo/commands/run.py

@@ -21,10 +21,23 @@ from rich.text import Text
  from crawlo.commands.stats import record_stats
  from crawlo.crawler import CrawlerProcess
  from crawlo.project import get_settings, _find_project_root
- # Use the custom logging system
+ # Use the new unified initialization system
+ from crawlo.initialization import initialize_framework
+ from crawlo.core import get_framework_initializer
  from crawlo.utils.log import get_logger

- logger = get_logger(__name__)
+ # Fetch the logger lazily, after the logging system has been configured
+ _logger = None
+
+
+ def logger():
+     """Return the logger lazily, ensuring it is created after the logging system is configured."""
+     global _logger
+     if _logger is None:
+         # The improved logging system makes it safe to create the logger at any time
+         _logger = get_logger(__name__)
+     return _logger
+

  console = Console()

@@ -35,15 +48,15 @@ def check_redis_connection(settings):
      # Check whether we are running in distributed mode
      run_mode = settings.get('RUN_MODE', 'standalone')
      queue_type = settings.get('QUEUE_TYPE', 'memory')
-
+
      if run_mode == 'distributed' or queue_type == 'redis':
          import redis.asyncio as redis
          redis_url = settings.get('REDIS_URL', 'redis://127.0.0.1:6379/0')
          redis_host = settings.get('REDIS_HOST', '127.0.0.1')
          redis_port = settings.get('REDIS_PORT', 6379)
-
+
          console.print(f"检查 Redis 连接: {redis_host}:{redis_port}")
-
+
          # Create a Redis connection for the test
          async def _test_redis():
              try:
@@ -54,11 +67,11 @@ def check_redis_connection(settings):
              except Exception as e:
                  console.print(f"Redis 连接失败: {e}")
                  return False
-
+
          # Run the async test
          if not asyncio.run(_test_redis()):
              raise ConnectionError(f"无法连接到 Redis 服务器 {redis_host}:{redis_port}")
-
+
          console.print("Redis 连接正常")
          return True
      else:
@@ -78,11 +91,15 @@ def main(args):
      Usage:
          crawlo run <spider_name>|all [--json] [--no-stats]
      """
+     # Make sure the framework has been initialized
+     init_manager = get_framework_initializer()
+
      # Debug information
-     logger.debug("DEBUG: 进入main函数")
-
+     logger().debug("DEBUG: 进入main函数")
+
      if len(args) < 1:
-         console.print("[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
+         console.print(
+             "[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
          console.print("示例:")
          console.print("  [blue]crawlo run baidu[/blue]")
          console.print("  [blue]crawlo run all[/blue]")
@@ -153,9 +170,9 @@ def main(args):
          console.print(Panel(msg, title="导入错误", border_style="red"))
          return 1

-     # 4. Load settings and the spider modules
-     settings = get_settings()
-
+     # 4. Start the framework and load settings
+     settings = initialize_framework()
+
      # Check the Redis connection (for distributed mode)
      if not check_redis_connection(settings):
          if show_json:
@@ -163,9 +180,22 @@ def main(args):
              return 1
          else:
              return 1
-
-     spider_modules = [f"{project_package}.spiders"]
+
+     # Get SPIDER_MODULES from the settings
+     spider_modules = settings.get('SPIDER_MODULES', [f"{project_package}.spiders"])
+     logger().debug(f"SPIDER_MODULES from settings: {spider_modules}")
      process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
+
+     # Spider modules no longer need to be imported manually; the framework handles it
+     # Inspect the spiders in the registry
+     from crawlo.spider import get_global_spider_registry
+     registry = get_global_spider_registry()
+     spider_names = list(registry.keys())
+     logger().debug(f"Registered spiders after import: {spider_names}")
+
+     # Debug information
+     logger().debug(f"SPIDER_MODULES: {spider_modules}")
+     logger().debug(f"Available spiders: {process.get_spider_names()}")

      # === Case 1: run all spiders ===
      if spider_arg.lower() == "all":
@@ -193,19 +223,14 @@ def main(args):
          # Show the list of spiders about to run
          # Per user request, the detailed spider list is no longer shown

-         # Register stats recording (unless --no-stats)
-         if not no_stats:
-             for crawler in process.crawlers:
-                 crawler.signals.connect(record_stats, signal="spider_closed")
-
          # Run all spiders in parallel
          with Progress(
-             SpinnerColumn(),
-             TextColumn("[progress.description]{task.description}"),
-             transient=True,
+                 SpinnerColumn(),
+                 TextColumn("[progress.description]{task.description}"),
+                 transient=True,
          ) as progress:
              task = progress.add_task("正在运行所有爬虫...", total=None)
-             asyncio.run(process.crawl(spider_names))
+             asyncio.run(process.crawl_multiple(spider_names))

          if show_json:
              console.print_json(data={"success": True, "spiders": spider_names})
@@ -267,15 +292,16 @@ def main(args):
          # console.print()

          # Register stats recording
-         if not no_stats:
-             for crawler in process.crawlers:
-                 crawler.signals.connect(record_stats, signal="spider_closed")
+         # Note: CrawlerProcess has no `crawlers` attribute; registration has to happen at run time
+         # if not no_stats:
+         #     for crawler in process.crawlers:
+         #         crawler.signals.connect(record_stats, signal="spider_closed")

          # Run the spider
          with Progress(
-             SpinnerColumn(),
-             TextColumn("[progress.description]{task.description}"),
-             transient=True,
+                 SpinnerColumn(),
+                 TextColumn("[progress.description]{task.description}"),
+                 transient=True,
          ) as progress:
              task = progress.add_task(f"正在运行 {spider_name}...", total=None)
              asyncio.run(process.crawl(spider_name))
@@ -298,7 +324,7 @@ def main(args):
              console.print(f"[bold yellow]{msg}[/bold yellow]")
              return 1
      except Exception as e:
-         logger.exception("Exception during 'crawlo run'")
+         logger().exception("Exception during 'crawlo run'")
          msg = f"意外错误: {e}"
          if show_json:
              console.print_json(data={"success": False, "error": msg})
@@ -312,4 +338,4 @@ if __name__ == "__main__":
      Supports direct execution:
          python -m crawlo.commands.run spider_name
      """
-     sys.exit(main(sys.argv[1:]))
+     sys.exit(main(sys.argv[1:]))
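For reference, the call pattern the updated run command follows, pieced together from the hunks above. This is a sketch, not the full command: 'myproject.spiders' and 'my_spider_name' are placeholders, and the error handling and progress display are omitted.

```python
import asyncio

from crawlo.crawler import CrawlerProcess
from crawlo.initialization import initialize_framework

# Framework start-up replaces the old get_settings() call
settings = initialize_framework()
spider_modules = settings.get('SPIDER_MODULES', ['myproject.spiders'])  # placeholder default
process = CrawlerProcess(settings=settings, spider_modules=spider_modules)

run_all = True  # corresponds to `crawlo run all` vs `crawlo run <name>`
if run_all:
    # all registered spiders run in parallel via the new crawl_multiple()
    asyncio.run(process.crawl_multiple(process.get_spider_names()))
else:
    # a single spider runs by its registered name, as before
    asyncio.run(process.crawl('my_spider_name'))
```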